Initial commit
This commit is contained in:
commit
8cb7237439
8 changed files with 551762 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
/target
|
2234
Cargo.lock
generated
Normal file
2234
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
17
Cargo.toml
Normal file
17
Cargo.toml
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
[package]
|
||||||
|
name = "mlet"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
once_cell = "1.17.1"
|
||||||
|
percent-encoding = "2.2.0"
|
||||||
|
rand = "0.8.5"
|
||||||
|
reqwest = { version = "0.11.16", features = ["blocking", "brotli", "deflate", "gzip"] }
|
||||||
|
tide = { version = "0.16.0", default-features = false, features = ["h1-server", "logger"] }
|
||||||
|
tokio = { version = "1.27.0", features = ["full"] }
|
||||||
|
#roxmltree = { version = "0.18.0", default-features = false, features = ["std"] }
|
||||||
|
#rxml = "0.9.1"
|
||||||
|
#rxml = { path = "/home/tuxmain/Téléchargements/rxml/rxml" }
|
4
README.md
Normal file
4
README.md
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
https://dumps.wikimedia.org/frwiktionary/latest/
|
||||||
|
|
||||||
|
problème:
|
||||||
|
* rxml ne supporte pas les erreurs UTF-8
|
549274
lefff-3.4.mlex
Normal file
549274
lefff-3.4.mlex
Normal file
File diff suppressed because it is too large
Load diff
8
rustfmt.toml
Normal file
8
rustfmt.toml
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
hard_tabs = true
|
||||||
|
newline_style = "unix"
|
||||||
|
|
||||||
|
unstable_features = true
|
||||||
|
format_code_in_doc_comments = true
|
||||||
|
format_macro_bodies = true
|
||||||
|
format_macro_matchers = true
|
||||||
|
format_strings = true
|
96
src/main.bak.rs
Normal file
96
src/main.bak.rs
Normal file
|
@ -0,0 +1,96 @@
|
||||||
|
use rxml::{EventRead, parser::ResolvedEvent, PullParser};
|
||||||
|
|
||||||
|
/// Pulls the next XML event from `parser`, skipping over parse errors.
///
/// Every error is printed to stdout and counted in `nb_errors`; the loop
/// keeps retrying until the parser yields an event, or `None` at end of
/// input.
fn next_event<T: std::io::BufRead>(parser: &mut PullParser<T>, nb_errors: &mut usize) -> Option<ResolvedEvent> {
	loop {
		let err = match parser.read() {
			Ok(event) => return event,
			Err(e) => e,
		};
		// Errors are 0-indexed: the counter is printed before incrementing.
		println!("error {nb_errors}: {err:?}");
		*nb_errors += 1;
	}
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let file = std::fs::File::open("frwiktionary-latest-pages-articles.xml").unwrap();
|
||||||
|
//let file = std::fs::File::open("/tmp/test.xml").unwrap();
|
||||||
|
let mut file_reader = std::io::BufReader::new(file);
|
||||||
|
|
||||||
|
let mut parser = PullParser::new(&mut file_reader);
|
||||||
|
|
||||||
|
/*parser.read_all(|d| {
|
||||||
|
dbg!(d);
|
||||||
|
}).unwrap();
|
||||||
|
return;*/
|
||||||
|
|
||||||
|
let mut i = 0usize;
|
||||||
|
let mut nb_errors = 0usize;
|
||||||
|
while let Some(event) = next_event(&mut parser, &mut nb_errors) {
|
||||||
|
match event {
|
||||||
|
ResolvedEvent::StartElement(_, (_, name), _) => {
|
||||||
|
if name == "page" {
|
||||||
|
parse_page(&mut parser, &mut nb_errors);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
i += 1;
|
||||||
|
if i > 1000 {
|
||||||
|
println!("limit");
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
println!("nb: {i}");
|
||||||
|
println!("Errors: {nb_errors}");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tracks which child element of `<page>` the parser is currently inside,
/// so that the next text event can be routed to the right variable in
/// `parse_page`.
enum ParsePageCtx {
	// Not inside an element whose character data we care about.
	Nothing,
	// Inside <title>.
	Title,
	// Inside <ns> (the page's namespace id).
	Ns,
	// Inside <text> (the page's wikitext body).
	Text,
}
|
||||||
|
|
||||||
|
/// Consumes events for the current `<page>` element (whose start tag was
/// already read by the caller), collecting the contents of its `<title>`
/// and `<text>` children, and prints them.
///
/// Returns early — skipping the page — as soon as an `<ns>` element with
/// text other than "4" is seen. NOTE(review): presumably namespace 4 is
/// the wanted one; confirm against the dump's siteinfo.
fn parse_page<T: std::io::BufRead>(parser: &mut PullParser<T>, nb_errors: &mut usize) {
	let mut title = None;
	let mut text = None;
	// Which element's character data the next Text event belongs to.
	let mut ctx = ParsePageCtx::Nothing;
	// Nesting depth relative to <page>; reaches 0 once </page> is consumed.
	let mut level = 1usize;
	while let Some(event) = next_event(parser, nb_errors) {
		match event {
			ResolvedEvent::StartElement(_, (_, name), _) => {
				level += 1;
				match name.as_str() {
					"title" => ctx = ParsePageCtx::Title,
					"ns" => ctx = ParsePageCtx::Ns,
					"text" => ctx = ParsePageCtx::Text,
					_ => {}
				}
			},
			ResolvedEvent::Text(_, data) => {
				match ctx {
					ParsePageCtx::Title => title = Some(data.as_str().to_string()),
					ParsePageCtx::Ns => if data.as_str() != "4" {
						return
					},
					// NOTE(review): if rxml ever splits one text node into
					// several Text events, later chunks overwrite earlier
					// ones here — confirm rxml coalesces character data.
					ParsePageCtx::Text => text = Some(data.as_str().to_string()),
					ParsePageCtx::Nothing => {}
				}
				// Character data ends the "inside interesting element" state.
				ctx = ParsePageCtx::Nothing;
			},
			ResolvedEvent::EndElement(_) => {
				level -= 1;
				ctx = ParsePageCtx::Nothing;
			}
			_ => {}
		}
		if level == 0 {
			break
		}
	}
	// Both title and text must have been seen; otherwise skip silently.
	let (Some(title), Some(text)) = (title, text) else {
		return
	};
	println!("{title}:\n{text}\n\n");
}
|
128
src/main.rs
Normal file
128
src/main.rs
Normal file
|
@ -0,0 +1,128 @@
|
||||||
|
use std::io::BufRead;
|
||||||
|
use rand::seq::SliceRandom;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
|
/// Font files from which one is chosen at random per request; the served
/// page loads the pick from `//txmn.tk/fonts/`.
static FONTS: &[&str] = &[
	"Albura-Regular.ttf",
	"AmaticSC-Bold.ttf",
	"Exo-Regular.otf",
	"EBGaramond08-Regular.otf",
	"Gidole-Regular.ttf",
	"LondrinaBook-Regular.otf",
	"PlayenSans-Regular.otf",
	"Playfulist.otf",
	"ProzaLibre-Regular.ttf",
];
|
||||||
|
|
||||||
|
static WORDS: Lazy<Vec<String>> = Lazy::new(|| {
|
||||||
|
let file = std::fs::File::open("lefff-3.4.mlex").unwrap();
|
||||||
|
let file_buf = std::io::BufReader::new(file);
|
||||||
|
|
||||||
|
let mut words = Vec::new();
|
||||||
|
for line in file_buf.lines() {
|
||||||
|
let line = line.unwrap();
|
||||||
|
if line.as_bytes().get(0) != Some(&b't') {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
let mut cols = line.split('\t');
|
||||||
|
let Some(word) = cols.next() else {
|
||||||
|
continue
|
||||||
|
};
|
||||||
|
let class = cols.next();
|
||||||
|
match class {
|
||||||
|
Some("nc") => {}
|
||||||
|
Some("adj") => {
|
||||||
|
let Some(flex) = cols.nth(1) else {
|
||||||
|
continue
|
||||||
|
};
|
||||||
|
if flex.contains('p') || flex.contains('m') {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => continue
|
||||||
|
}
|
||||||
|
let mut capitalized = String::from("T");
|
||||||
|
capitalized.push_str(&word[1..]);
|
||||||
|
words.push(capitalized);
|
||||||
|
}
|
||||||
|
//eprintln!("Words: {}", words.len());
|
||||||
|
words
|
||||||
|
});
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
Lazy::force(&WORDS);
|
||||||
|
|
||||||
|
tide::log::start();
|
||||||
|
let mut app = tide::new();
|
||||||
|
|
||||||
|
app.at("/").get(
|
||||||
|
move |req: tide::Request<()>| {
|
||||||
|
handle_request(
|
||||||
|
req
|
||||||
|
)
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
app.listen(std::net::SocketAddr::new(std::net::IpAddr::V4(std::net::Ipv4Addr::new(127, 0, 0, 1)), 8137)).await.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Handles `GET /`: picks a random word, scrapes the first Bing image
/// result for it, and returns an HTML page showing "Monnaie Libre et
/// {word}" in a random font above that image.
///
/// NOTE(review): this async handler uses `reqwest::blocking`, which blocks
/// a tokio worker thread for the whole outbound request — consider the
/// async reqwest client or `spawn_blocking`.
/// NOTE(review): the `unwrap`/`assert!`/`panic!` calls fire on ordinary
/// network failures or Bing markup changes, aborting the request task
/// instead of returning an error response.
/// NOTE(review): the `'a` lifetime parameter is unused.
async fn handle_request<'a>(_req: tide::Request<()>) -> tide::Result<tide::Response> {
	let mut rng = rand::thread_rng();
	let word = WORDS.choose(&mut rng).unwrap();

	// Browser-like User-Agent: Bing serves different markup to bots.
	let client = reqwest::blocking::Client::builder()
		.user_agent("Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0")
		.build().unwrap();

	let resp = client.get(format!("https://www.bing.com/images/search?q={word}&FORM=HDRSC3")).send().unwrap();
	assert!(resp.status().is_success());
	let page = resp.text().unwrap();

	// Scrape the first percent-encoded image URL out of the results page.
	// The URL runs from just after `mediaurl=` to the next '&'.
	let pattern = r#"mediaurl="#;
	let url_start = page.find(pattern).unwrap() + pattern.len();
	let url_len = page[url_start..].find('&').unwrap();
	// Sanity bound: an absurdly long match means the scrape went wrong.
	if url_len > 1024 {
		panic!("url too long ({url_len})");
	}
	let img_url_encoded = &page[url_start..url_start+url_len];
	let img_url = percent_encoding::percent_decode_str(img_url_encoded).decode_utf8().unwrap();
	// The URL is interpolated into a double-quoted HTML attribute below;
	// rejecting quotes prevents breaking out of it.
	if img_url.contains('"') {
		panic!("url contains quotes");
	}
	let font = *FONTS.choose(&mut rng).unwrap();

	// Permissive CORS so the page can be embedded from anywhere.
	Ok(tide::Response::builder(200)
		.header("Access-Control-Allow-Origin", "*")
		.header("Access-Control-Allow-Headers", "*")
		.content_type(tide::http::mime::HTML)
		.body(format!(r#"<!doctype html>
<html lang="fr">
<head>
<meta charset="utf-8"/>
<title>MLeTomatic</title>
<style type="text/css">
@font-face {{
font-family: CustomFont;
src: url("//txmn.tk/fonts/{font}");
}}
body {{
text-align: center;
}}
h1 {{
font-family: CustomFont;
font-weight: normal;
}}
#img {{
max-width: 100%;
max-height: 100vh;
}}
</style>
</head>
<body>
<h1>Monnaie Libre et {word}</h1>
<img id="img" alt="{word}" src="{img_url}"/>
</body>
</html>"#))
		.build())
}
|
Loading…
Reference in a new issue