use rxml::{EventRead, parser::ResolvedEvent, PullParser}; fn next_event(parser: &mut PullParser, nb_errors: &mut usize) -> Option { loop { match parser.read() { Ok(event) => return event, Err(e) => println!("error {nb_errors}: {e:?}"), } *nb_errors += 1; } } fn main() { let file = std::fs::File::open("frwiktionary-latest-pages-articles.xml").unwrap(); //let file = std::fs::File::open("/tmp/test.xml").unwrap(); let mut file_reader = std::io::BufReader::new(file); let mut parser = PullParser::new(&mut file_reader); /*parser.read_all(|d| { dbg!(d); }).unwrap(); return;*/ let mut i = 0usize; let mut nb_errors = 0usize; while let Some(event) = next_event(&mut parser, &mut nb_errors) { match event { ResolvedEvent::StartElement(_, (_, name), _) => { if name == "page" { parse_page(&mut parser, &mut nb_errors); } }, _ => {} } i += 1; if i > 1000 { println!("limit"); break } } println!("nb: {i}"); println!("Errors: {nb_errors}"); } enum ParsePageCtx { Nothing, Title, Ns, Text, } fn parse_page(parser: &mut PullParser, nb_errors: &mut usize) { let mut title = None; let mut text = None; let mut ctx = ParsePageCtx::Nothing; let mut level = 1usize; while let Some(event) = next_event(parser, nb_errors) { match event { ResolvedEvent::StartElement(_, (_, name), _) => { level += 1; match name.as_str() { "title" => ctx = ParsePageCtx::Title, "ns" => ctx = ParsePageCtx::Ns, "text" => ctx = ParsePageCtx::Text, _ => {} } }, ResolvedEvent::Text(_, data) => { match ctx { ParsePageCtx::Title => title = Some(data.as_str().to_string()), ParsePageCtx::Ns => if data.as_str() != "4" { //dbg!(data.as_str()); return }, ParsePageCtx::Text => text = Some(data.as_str().to_string()), ParsePageCtx::Nothing => {} } ctx = ParsePageCtx::Nothing; }, ResolvedEvent::EndElement(_) => { level -= 1; ctx = ParsePageCtx::Nothing; } _ => {} } if level == 0 { break } } let (Some(title), Some(text)) = (title, text) else { return }; println!("{title}:\n{text}\n\n"); }