MLeTomatic/src/main.bak.rs

97 lines
2.2 KiB
Rust

use rxml::{EventRead, parser::ResolvedEvent, PullParser};
/// Reads the next event from `parser`, reporting and counting read errors.
///
/// Every failed read is printed to stdout together with the current value of
/// `nb_errors`, after which `nb_errors` is incremented and the read is retried.
///
/// Returns whatever `parser.read()` yields on success (`None` is passed through
/// unchanged), or `None` once `MAX_CONSECUTIVE_ERRORS` reads in a row have
/// failed.
///
/// NOTE(review): the retry budget exists because an unbounded retry loop never
/// terminates when the parser keeps returning an error; rxml's documentation
/// describes non-I/O errors as fatal and repeated on every later call — confirm
/// against the rxml version in use.
fn next_event<T: std::io::BufRead>(parser: &mut PullParser<T>, nb_errors: &mut usize) -> Option<ResolvedEvent> {
    // Upper bound on back-to-back failed reads before the stream is treated as finished.
    const MAX_CONSECUTIVE_ERRORS: usize = 8;

    for _ in 0..MAX_CONSECUTIVE_ERRORS {
        match parser.read() {
            Ok(event) => return event,
            Err(e) => {
                // The message shows the count *before* this error is added.
                println!("error {nb_errors}: {e:?}");
                *nb_errors += 1;
            }
        }
    }
    println!("giving up after {MAX_CONSECUTIVE_ERRORS} consecutive read errors");
    None
}
/// Program entry point: streams an XML dump and hands every `<page>` element
/// to `parse_page`.
///
/// The input path is taken from the first command-line argument; when none is
/// given, `frwiktionary-latest-pages-articles.xml` in the current directory is
/// used. If the file cannot be opened, a message is written to stderr and the
/// process exits with status 1.
///
/// Processing stops at end of input or after more than `MAX_EVENTS` top-level
/// events. The number of handled events and the number of read errors are
/// printed at the end.
fn main() {
    // Input used when no path is supplied on the command line.
    const DEFAULT_INPUT: &str = "frwiktionary-latest-pages-articles.xml";
    // Cap on events handled by this loop; a page consumed by `parse_page`
    // counts as a single event here.
    const MAX_EVENTS: usize = 1000;

    let path: std::path::PathBuf = std::env::args_os()
        .nth(1)
        .map(Into::into)
        .unwrap_or_else(|| DEFAULT_INPUT.into());
    let file = match std::fs::File::open(&path) {
        Ok(file) => file,
        Err(e) => {
            eprintln!("cannot open {}: {e}", path.display());
            std::process::exit(1);
        }
    };
    let mut file_reader = std::io::BufReader::new(file);
    let mut parser = PullParser::new(&mut file_reader);

    let mut i = 0usize;
    let mut nb_errors = 0usize;
    while let Some(event) = next_event(&mut parser, &mut nb_errors) {
        // Only start tags matter at this level; everything else is skipped.
        if let ResolvedEvent::StartElement(_, (_, name), _) = event {
            if name == "page" {
                parse_page(&mut parser, &mut nb_errors);
            }
        }
        i += 1;
        if i > MAX_EVENTS {
            println!("limit");
            break;
        }
    }
    println!("nb: {i}");
    println!("Errors: {nb_errors}");
}
/// Which child element of a `<page>` the parser is currently inside, i.e.
/// where incoming character data should be routed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParsePageCtx {
    /// Not inside any element whose character data is collected.
    Nothing,
    /// Inside a `<title>` element.
    Title,
    /// Inside a `<ns>` element.
    Ns,
    /// Inside a `<text>` element.
    Text,
}
/// Consumes the events of one `<page>` element and prints its title and text.
///
/// Expects to be called right after the `<page>` start tag has been read from
/// `parser`. Events are read through `next_event` (which updates `nb_errors`)
/// until the matching end tag is reached or the input ends.
///
/// Behaviour:
/// * The character data of `<title>`, `<ns>` and `<text>` elements is
///   collected. All `Text` events received for one element are concatenated,
///   because the parser may deliver an element's character data in several
///   consecutive chunks; keeping only the first chunk would truncate it.
/// * If an `<ns>` element has non-empty content other than `"4"`, the function
///   returns immediately without reading the remainder of the page. Pages with
///   no `<ns>` content are not filtered.
/// * When both a title and a text were collected, they are printed to stdout
///   as `"{title}:\n{text}\n\n"`; otherwise nothing is printed.
fn parse_page<T: std::io::BufRead>(parser: &mut PullParser<T>, nb_errors: &mut usize) {
    let mut title = None;
    let mut text = None;
    let mut ctx = ParsePageCtx::Nothing;
    // Character data gathered so far for the element selected by `ctx`;
    // committed (or checked, for `<ns>`) when that element's end tag arrives.
    let mut buf = String::new();
    // Open-element depth counted from the `<page>` start tag, which the caller
    // already consumed; it reaches 0 when `</page>` is read.
    let mut level = 1usize;
    while let Some(event) = next_event(parser, nb_errors) {
        match event {
            ResolvedEvent::StartElement(_, (_, name), _) => {
                level += 1;
                let tracked = match name.as_str() {
                    "title" => Some(ParsePageCtx::Title),
                    "ns" => Some(ParsePageCtx::Ns),
                    "text" => Some(ParsePageCtx::Text),
                    // Any other start tag leaves the current context untouched.
                    _ => None,
                };
                if let Some(new_ctx) = tracked {
                    ctx = new_ctx;
                    buf.clear();
                }
            }
            ResolvedEvent::Text(_, data) => {
                // Keep accumulating; the context is only closed by an end tag.
                if !matches!(ctx, ParsePageCtx::Nothing) {
                    buf.push_str(data.as_str());
                }
            }
            ResolvedEvent::EndElement(_) => {
                level -= 1;
                // An element that produced no character data leaves any
                // previously stored value as it is.
                if !buf.is_empty() {
                    match ctx {
                        ParsePageCtx::Title => title = Some(std::mem::take(&mut buf)),
                        ParsePageCtx::Ns => {
                            if buf != "4" {
                                return;
                            }
                            buf.clear();
                        }
                        ParsePageCtx::Text => text = Some(std::mem::take(&mut buf)),
                        ParsePageCtx::Nothing => {}
                    }
                }
                ctx = ParsePageCtx::Nothing;
            }
            _ => {}
        }
        if level == 0 {
            break;
        }
    }
    let (Some(title), Some(text)) = (title, text) else {
        return;
    };
    println!("{title}:\n{text}\n\n");
}