Initial commit

This commit is contained in:
Pascal Engélibert 2023-03-29 20:17:29 +02:00
commit 8cb7237439
8 changed files with 551762 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/target

2234
Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

17
Cargo.toml Normal file
View file

@ -0,0 +1,17 @@
[package]
name = "mlet"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
once_cell = "1.17.1"
percent-encoding = "2.2.0"
rand = "0.8.5"
reqwest = { version = "0.11.16", features = ["blocking", "brotli", "deflate", "gzip"] }
tide = { version = "0.16.0", default-features = false, features = ["h1-server", "logger"] }
tokio = { version = "1.27.0", features = ["full"] }
#roxmltree = { version = "0.18.0", default-features = false, features = ["std"] }
#rxml = "0.9.1"
#rxml = { path = "/home/tuxmain/Téléchargements/rxml/rxml" }

4
README.md Normal file
View file

@ -0,0 +1,4 @@
https://dumps.wikimedia.org/frwiktionary/latest/
Problème connu :
* rxml ne gère pas les erreurs d'encodage UTF-8

549274
lefff-3.4.mlex Normal file

File diff suppressed because it is too large Load diff

8
rustfmt.toml Normal file
View file

@ -0,0 +1,8 @@
hard_tabs = true
newline_style = "unix"
unstable_features = true
format_code_in_doc_comments = true
format_macro_bodies = true
format_macro_matchers = true
format_strings = true

96
src/main.bak.rs Normal file
View file

@ -0,0 +1,96 @@
use rxml::{EventRead, parser::ResolvedEvent, PullParser};
/// Reads the next event from `parser`, skipping over parse errors.
///
/// Every error returned by the parser is printed together with the current
/// value of `nb_errors`, which is then incremented before reading again.
/// Returns whatever the first successful read yields (`None` when the parser
/// reports no further event).
fn next_event<T: std::io::BufRead>(parser: &mut PullParser<T>, nb_errors: &mut usize) -> Option<ResolvedEvent> {
	loop {
		let failure = match parser.read() {
			Ok(event) => return event,
			Err(failure) => failure,
		};
		println!("error {nb_errors}: {failure:?}");
		*nb_errors += 1;
	}
}
/// Debug driver: scans the French Wiktionary dump and hands every `<page>`
/// element to `parse_page`, stopping after a fixed number of top-level events.
fn main() {
	// For quick experiments, point this at a small sample such as "/tmp/test.xml".
	let dump = std::fs::File::open("frwiktionary-latest-pages-articles.xml").unwrap();
	let mut dump_reader = std::io::BufReader::new(dump);
	let mut parser = PullParser::new(&mut dump_reader);

	let mut nb_events = 0usize;
	let mut nb_errors = 0usize;
	while let Some(event) = next_event(&mut parser, &mut nb_errors) {
		if let ResolvedEvent::StartElement(_, (_, name), _) = event {
			if name == "page" {
				parse_page(&mut parser, &mut nb_errors);
			}
		}
		nb_events += 1;
		// Safety limit so that a debugging run does not walk the whole dump.
		if nb_events > 1000 {
			println!("limit");
			break;
		}
	}
	println!("nb: {nb_events}");
	println!("Errors: {nb_errors}");
}
/// Which child element of a `<page>` the parser is currently inside, as
/// tracked by `parse_page` to decide what to do with text events.
enum ParsePageCtx {
/// Not inside any element whose text is of interest.
Nothing,
/// Inside a `<title>` element.
Title,
/// Inside a `<ns>` element.
Ns,
/// Inside a `<text>` element.
Text,
}
/// Consumes the events of one `<page>` element and prints its title and text.
///
/// Must be called right after the `<page>` start tag has been read from
/// `parser`. Parse errors are skipped and counted in `nb_errors` (see
/// `next_event`).
///
/// The character data of an element may be delivered as several consecutive
/// text events, so the contents of `<title>`, `<ns>` and `<text>` are
/// accumulated until the matching end tag instead of keeping only the first
/// chunk.
///
/// Pages whose `<ns>` is not `"4"` are abandoned as soon as `</ns>` is read;
/// in that case the remaining events of the page are left in the parser for
/// the caller to skip. Nothing is printed unless both a title and a text were
/// found.
fn parse_page<T: std::io::BufRead>(parser: &mut PullParser<T>, nb_errors: &mut usize) {
	let mut title: Option<String> = None;
	let mut text: Option<String> = None;
	let mut ns = String::new();
	let mut ctx = ParsePageCtx::Nothing;
	// Depth relative to the parent of `<page>`: reaches 0 when `</page>` is read.
	let mut level = 1usize;
	while let Some(event) = next_event(parser, nb_errors) {
		match event {
			ResolvedEvent::StartElement(_, (_, name), _) => {
				level += 1;
				// Entering a tracked element discards any earlier value, so the
				// last occurrence wins (as it did before chunks were accumulated).
				match name.as_str() {
					"title" => {
						ctx = ParsePageCtx::Title;
						title = None;
					}
					"ns" => {
						ctx = ParsePageCtx::Ns;
						ns.clear();
					}
					"text" => {
						ctx = ParsePageCtx::Text;
						text = None;
					}
					_ => {}
				}
			}
			ResolvedEvent::Text(_, data) => match ctx {
				ParsePageCtx::Title => title.get_or_insert_with(String::new).push_str(data.as_str()),
				ParsePageCtx::Ns => ns.push_str(data.as_str()),
				ParsePageCtx::Text => text.get_or_insert_with(String::new).push_str(data.as_str()),
				ParsePageCtx::Nothing => {}
			},
			ResolvedEvent::EndElement(_) => {
				// The namespace is only complete once its end tag is reached.
				if matches!(ctx, ParsePageCtx::Ns) && ns != "4" {
					return;
				}
				level -= 1;
				ctx = ParsePageCtx::Nothing;
			}
			_ => {}
		}
		if level == 0 {
			break;
		}
	}
	let (Some(title), Some(text)) = (title, text) else {
		return;
	};
	println!("{title}:\n{text}\n\n");
}

128
src/main.rs Normal file
View file

@ -0,0 +1,128 @@
use std::io::BufRead;
use rand::seq::SliceRandom;
use once_cell::sync::Lazy;
/// File names of the fonts the generated page can use for its heading.
///
/// One of them is picked at random per request and loaded by the page from
/// the `//txmn.tk/fonts/` directory.
static FONTS: &[&str] = &[
	"Albura-Regular.ttf",
	"AmaticSC-Bold.ttf",
	"Exo-Regular.otf",
	"EBGaramond08-Regular.otf",
	"Gidole-Regular.ttf",
	"LondrinaBook-Regular.otf",
	"PlayenSans-Regular.otf",
	"Playfulist.otf",
	"ProzaLibre-Regular.ttf",
];
/// Candidate words, loaded on first access from the `lefff-3.4.mlex` lexicon.
///
/// Only entries whose line starts with `t` are kept, and only when their
/// second tab-separated column is `nc`, or `adj` with a fourth column that
/// contains neither `p` nor `m`. The leading `t` of each kept word is replaced
/// by `T`.
///
/// # Panics
///
/// Panics if the lexicon cannot be opened or if a line cannot be read.
static WORDS: Lazy<Vec<String>> = Lazy::new(|| {
	let lexicon = std::io::BufReader::new(std::fs::File::open("lefff-3.4.mlex").unwrap());
	lexicon
		.lines()
		.map(|entry| entry.unwrap())
		.filter_map(|entry| {
			if !entry.starts_with('t') {
				return None;
			}
			let mut fields = entry.split('\t');
			let form = fields.next()?;
			let wanted = match fields.next() {
				Some("nc") => true,
				Some("adj") => {
					// Two columns were consumed above, so this is the fourth one.
					let flags = fields.nth(1)?;
					!flags.contains('p') && !flags.contains('m')
				}
				_ => false,
			};
			// The first byte is the ASCII `t` checked above, so slicing at 1 is safe.
			wanted.then(|| format!("T{}", &form[1..]))
		})
		.collect()
});
/// Loads the word list, then serves the generated page on `127.0.0.1:8137`.
#[tokio::main]
async fn main() {
	// Load the lexicon up front rather than during the first request.
	Lazy::force(&WORDS);
	tide::log::start();

	let listen_addr = std::net::SocketAddr::from((std::net::Ipv4Addr::new(127, 0, 0, 1), 8137));

	let mut app = tide::new();
	app.at("/").get(|req: tide::Request<()>| handle_request(req));
	app.listen(listen_addr).await.unwrap();
}
async fn handle_request<'a>(_req: tide::Request<()>) -> tide::Result<tide::Response> {
let mut rng = rand::thread_rng();
let word = WORDS.choose(&mut rng).unwrap();
let client = reqwest::blocking::Client::builder()
.user_agent("Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0")
.build().unwrap();
let resp = client.get(format!("https://www.bing.com/images/search?q={word}&FORM=HDRSC3")).send().unwrap();
assert!(resp.status().is_success());
let page = resp.text().unwrap();
let pattern = r#"mediaurl="#;
let url_start = page.find(pattern).unwrap() + pattern.len();
let url_len = page[url_start..].find('&').unwrap();
if url_len > 1024 {
panic!("url too long ({url_len})");
}
let img_url_encoded = &page[url_start..url_start+url_len];
let img_url = percent_encoding::percent_decode_str(img_url_encoded).decode_utf8().unwrap();
if img_url.contains('"') {
panic!("url contains quotes");
}
let font = *FONTS.choose(&mut rng).unwrap();
Ok(tide::Response::builder(200)
.header("Access-Control-Allow-Origin", "*")
.header("Access-Control-Allow-Headers", "*")
.content_type(tide::http::mime::HTML)
.body(format!(r#"<!doctype html>
<html lang="fr">
<head>
<meta charset="utf-8"/>
<title>MLeTomatic</title>
<style type="text/css">
@font-face {{
font-family: CustomFont;
src: url("//txmn.tk/fonts/{font}");
}}
body {{
text-align: center;
}}
h1 {{
font-family: CustomFont;
font-weight: normal;
}}
#img {{
max-width: 100%;
max-height: 100vh;
}}
</style>
</head>
<body>
<h1>Monnaie Libre et {word}</h1>
<img id="img" alt="{word}" src="{img_url}"/>
</body>
</html>"#))
.build())
}