97 lines
2.2 KiB
Rust
97 lines
2.2 KiB
Rust
use rxml::{EventRead, parser::ResolvedEvent, PullParser};
|
|
|
|
/// Reads the next event from `parser`.
///
/// Returns `Some(event)` for every event the parser yields, and `None` when
/// `parser.read()` reports the end of the document or fails.
///
/// Every failure is printed to stdout (prefixed with the current value of
/// `nb_errors`) and then counted in `nb_errors`.
fn next_event<T: std::io::BufRead>(
    parser: &mut PullParser<T>,
    nb_errors: &mut usize,
) -> Option<ResolvedEvent> {
    match parser.read() {
        Ok(event) => event,
        Err(e) => {
            println!("error {nb_errors}: {e:?}");
            *nb_errors += 1;
            // Do not retry: per the rxml documentation of `EventRead::read`,
            // non-I/O errors are fatal and are returned again on every
            // subsequent call, so looping here would never terminate.
            // Returning `None` lets the callers' `while let Some(..)` loops end.
            None
        }
    }
}
|
|
|
|
fn main() {
|
|
let file = std::fs::File::open("frwiktionary-latest-pages-articles.xml").unwrap();
|
|
//let file = std::fs::File::open("/tmp/test.xml").unwrap();
|
|
let mut file_reader = std::io::BufReader::new(file);
|
|
|
|
let mut parser = PullParser::new(&mut file_reader);
|
|
|
|
/*parser.read_all(|d| {
|
|
dbg!(d);
|
|
}).unwrap();
|
|
return;*/
|
|
|
|
let mut i = 0usize;
|
|
let mut nb_errors = 0usize;
|
|
while let Some(event) = next_event(&mut parser, &mut nb_errors) {
|
|
match event {
|
|
ResolvedEvent::StartElement(_, (_, name), _) => {
|
|
if name == "page" {
|
|
parse_page(&mut parser, &mut nb_errors);
|
|
}
|
|
},
|
|
_ => {}
|
|
}
|
|
i += 1;
|
|
if i > 1000 {
|
|
println!("limit");
|
|
break
|
|
}
|
|
}
|
|
println!("nb: {i}");
|
|
println!("Errors: {nb_errors}");
|
|
}
|
|
|
|
/// Tracks which child element of a `<page>` the parser is currently inside,
/// so that `parse_page` knows what to do with the next text event.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParsePageCtx {
    /// Not inside any element whose text is of interest.
    Nothing,
    /// Inside a `<title>` element.
    Title,
    /// Inside an `<ns>` element.
    Ns,
    /// Inside a `<text>` element.
    Text,
}
|
|
|
|
fn parse_page<T: std::io::BufRead>(parser: &mut PullParser<T>, nb_errors: &mut usize) {
|
|
let mut title = None;
|
|
let mut text = None;
|
|
let mut ctx = ParsePageCtx::Nothing;
|
|
let mut level = 1usize;
|
|
while let Some(event) = next_event(parser, nb_errors) {
|
|
match event {
|
|
ResolvedEvent::StartElement(_, (_, name), _) => {
|
|
level += 1;
|
|
match name.as_str() {
|
|
"title" => ctx = ParsePageCtx::Title,
|
|
"ns" => ctx = ParsePageCtx::Ns,
|
|
"text" => ctx = ParsePageCtx::Text,
|
|
_ => {}
|
|
}
|
|
},
|
|
ResolvedEvent::Text(_, data) => {
|
|
match ctx {
|
|
ParsePageCtx::Title => title = Some(data.as_str().to_string()),
|
|
ParsePageCtx::Ns => if data.as_str() != "4" {
|
|
//dbg!(data.as_str());
|
|
return
|
|
},
|
|
ParsePageCtx::Text => text = Some(data.as_str().to_string()),
|
|
ParsePageCtx::Nothing => {}
|
|
}
|
|
ctx = ParsePageCtx::Nothing;
|
|
},
|
|
ResolvedEvent::EndElement(_) => {
|
|
level -= 1;
|
|
ctx = ParsePageCtx::Nothing;
|
|
}
|
|
_ => {}
|
|
}
|
|
if level == 0 {
|
|
break
|
|
}
|
|
}
|
|
let (Some(title), Some(text)) = (title, text) else {
|
|
return
|
|
};
|
|
println!("{title}:\n{text}\n\n");
|
|
}
|