Initial commit
This commit is contained in:
commit
8cb7237439
8 changed files with 551762 additions and 0 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
/target
|
2234
Cargo.lock
generated
Normal file
2234
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
17
Cargo.toml
Normal file
17
Cargo.toml
Normal file
|
@ -0,0 +1,17 @@
|
|||
[package]
|
||||
name = "mlet"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
once_cell = "1.17.1"
|
||||
percent-encoding = "2.2.0"
|
||||
rand = "0.8.5"
|
||||
reqwest = { version = "0.11.16", features = ["blocking", "brotli", "deflate", "gzip"] }
|
||||
tide = { version = "0.16.0", default-features = false, features = ["h1-server", "logger"] }
|
||||
tokio = { version = "1.27.0", features = ["full"] }
|
||||
#roxmltree = { version = "0.18.0", default-features = false, features = ["std"] }
|
||||
#rxml = "0.9.1"
|
||||
#rxml = { path = "/home/tuxmain/Téléchargements/rxml/rxml" }
|
4
README.md
Normal file
4
README.md
Normal file
|
@ -0,0 +1,4 @@
|
|||
https://dumps.wikimedia.org/frwiktionary/latest/
|
||||
|
||||
problème:
|
||||
* rxml ne supporte pas les erreurs UTF-8
|
549274
lefff-3.4.mlex
Normal file
549274
lefff-3.4.mlex
Normal file
File diff suppressed because it is too large
Load diff
8
rustfmt.toml
Normal file
8
rustfmt.toml
Normal file
|
@ -0,0 +1,8 @@
|
|||
hard_tabs = true
|
||||
newline_style = "unix"
|
||||
|
||||
unstable_features = true
|
||||
format_code_in_doc_comments = true
|
||||
format_macro_bodies = true
|
||||
format_macro_matchers = true
|
||||
format_strings = true
|
96
src/main.bak.rs
Normal file
96
src/main.bak.rs
Normal file
|
@ -0,0 +1,96 @@
|
|||
use rxml::{EventRead, parser::ResolvedEvent, PullParser};
|
||||
|
||||
/// Reads the next event from `parser`, retrying after every read error.
///
/// Each error returned by `parser.read()` is printed to stdout together with the
/// current value of `nb_errors`; the counter is then incremented and the read is
/// attempted again. The result of the first successful read is returned as-is,
/// including `None`.
fn next_event<T: std::io::BufRead>(parser: &mut PullParser<T>, nb_errors: &mut usize) -> Option<ResolvedEvent> {
	loop {
		let failure = match parser.read() {
			Ok(event) => return event,
			Err(failure) => failure,
		};
		// NOTE(review): if the parser keeps returning an error without making
		// progress, this loop never terminates — confirm against rxml's behavior.
		println!("error {nb_errors}: {failure:?}");
		*nb_errors += 1;
	}
}
|
||||
|
||||
fn main() {
|
||||
let file = std::fs::File::open("frwiktionary-latest-pages-articles.xml").unwrap();
|
||||
//let file = std::fs::File::open("/tmp/test.xml").unwrap();
|
||||
let mut file_reader = std::io::BufReader::new(file);
|
||||
|
||||
let mut parser = PullParser::new(&mut file_reader);
|
||||
|
||||
/*parser.read_all(|d| {
|
||||
dbg!(d);
|
||||
}).unwrap();
|
||||
return;*/
|
||||
|
||||
let mut i = 0usize;
|
||||
let mut nb_errors = 0usize;
|
||||
while let Some(event) = next_event(&mut parser, &mut nb_errors) {
|
||||
match event {
|
||||
ResolvedEvent::StartElement(_, (_, name), _) => {
|
||||
if name == "page" {
|
||||
parse_page(&mut parser, &mut nb_errors);
|
||||
}
|
||||
},
|
||||
_ => {}
|
||||
}
|
||||
i += 1;
|
||||
if i > 1000 {
|
||||
println!("limit");
|
||||
break
|
||||
}
|
||||
}
|
||||
println!("nb: {i}");
|
||||
println!("Errors: {nb_errors}");
|
||||
}
|
||||
|
||||
/// Which child element of a `<page>` the parser is currently inside, i.e. how
/// the next text event should be interpreted by `parse_page`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParsePageCtx {
	/// Not inside any element whose text is collected.
	Nothing,
	/// Inside a `title` element.
	Title,
	/// Inside an `ns` element.
	Ns,
	/// Inside a `text` element.
	Text,
}
|
||||
|
||||
fn parse_page<T: std::io::BufRead>(parser: &mut PullParser<T>, nb_errors: &mut usize) {
|
||||
let mut title = None;
|
||||
let mut text = None;
|
||||
let mut ctx = ParsePageCtx::Nothing;
|
||||
let mut level = 1usize;
|
||||
while let Some(event) = next_event(parser, nb_errors) {
|
||||
match event {
|
||||
ResolvedEvent::StartElement(_, (_, name), _) => {
|
||||
level += 1;
|
||||
match name.as_str() {
|
||||
"title" => ctx = ParsePageCtx::Title,
|
||||
"ns" => ctx = ParsePageCtx::Ns,
|
||||
"text" => ctx = ParsePageCtx::Text,
|
||||
_ => {}
|
||||
}
|
||||
},
|
||||
ResolvedEvent::Text(_, data) => {
|
||||
match ctx {
|
||||
ParsePageCtx::Title => title = Some(data.as_str().to_string()),
|
||||
ParsePageCtx::Ns => if data.as_str() != "4" {
|
||||
//dbg!(data.as_str());
|
||||
return
|
||||
},
|
||||
ParsePageCtx::Text => text = Some(data.as_str().to_string()),
|
||||
ParsePageCtx::Nothing => {}
|
||||
}
|
||||
ctx = ParsePageCtx::Nothing;
|
||||
},
|
||||
ResolvedEvent::EndElement(_) => {
|
||||
level -= 1;
|
||||
ctx = ParsePageCtx::Nothing;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
if level == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
let (Some(title), Some(text)) = (title, text) else {
|
||||
return
|
||||
};
|
||||
println!("{title}:\n{text}\n\n");
|
||||
}
|
128
src/main.rs
Normal file
128
src/main.rs
Normal file
|
@ -0,0 +1,128 @@
|
|||
use std::io::BufRead;
|
||||
use rand::seq::SliceRandom;
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
/// Font file names; `handle_request` picks one at random for each response and
/// references it under `//txmn.tk/fonts/` in the generated page.
static FONTS: &[&str] = &[
	"Albura-Regular.ttf",
	"AmaticSC-Bold.ttf",
	"Exo-Regular.otf",
	"EBGaramond08-Regular.otf",
	"Gidole-Regular.ttf",
	"LondrinaBook-Regular.otf",
	"PlayenSans-Regular.otf",
	"Playfulist.otf",
	"ProzaLibre-Regular.ttf",
];
|
||||
|
||||
static WORDS: Lazy<Vec<String>> = Lazy::new(|| {
|
||||
let file = std::fs::File::open("lefff-3.4.mlex").unwrap();
|
||||
let file_buf = std::io::BufReader::new(file);
|
||||
|
||||
let mut words = Vec::new();
|
||||
for line in file_buf.lines() {
|
||||
let line = line.unwrap();
|
||||
if line.as_bytes().get(0) != Some(&b't') {
|
||||
continue
|
||||
}
|
||||
let mut cols = line.split('\t');
|
||||
let Some(word) = cols.next() else {
|
||||
continue
|
||||
};
|
||||
let class = cols.next();
|
||||
match class {
|
||||
Some("nc") => {}
|
||||
Some("adj") => {
|
||||
let Some(flex) = cols.nth(1) else {
|
||||
continue
|
||||
};
|
||||
if flex.contains('p') || flex.contains('m') {
|
||||
continue
|
||||
}
|
||||
}
|
||||
_ => continue
|
||||
}
|
||||
let mut capitalized = String::from("T");
|
||||
capitalized.push_str(&word[1..]);
|
||||
words.push(capitalized);
|
||||
}
|
||||
//eprintln!("Words: {}", words.len());
|
||||
words
|
||||
});
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
Lazy::force(&WORDS);
|
||||
|
||||
tide::log::start();
|
||||
let mut app = tide::new();
|
||||
|
||||
app.at("/").get(
|
||||
move |req: tide::Request<()>| {
|
||||
handle_request(
|
||||
req
|
||||
)
|
||||
}
|
||||
);
|
||||
|
||||
app.listen(std::net::SocketAddr::new(std::net::IpAddr::V4(std::net::Ipv4Addr::new(127, 0, 0, 1)), 8137)).await.unwrap();
|
||||
}
|
||||
|
||||
async fn handle_request<'a>(_req: tide::Request<()>) -> tide::Result<tide::Response> {
|
||||
let mut rng = rand::thread_rng();
|
||||
let word = WORDS.choose(&mut rng).unwrap();
|
||||
|
||||
let client = reqwest::blocking::Client::builder()
|
||||
.user_agent("Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0")
|
||||
.build().unwrap();
|
||||
|
||||
let resp = client.get(format!("https://www.bing.com/images/search?q={word}&FORM=HDRSC3")).send().unwrap();
|
||||
assert!(resp.status().is_success());
|
||||
let page = resp.text().unwrap();
|
||||
|
||||
let pattern = r#"mediaurl="#;
|
||||
let url_start = page.find(pattern).unwrap() + pattern.len();
|
||||
let url_len = page[url_start..].find('&').unwrap();
|
||||
if url_len > 1024 {
|
||||
panic!("url too long ({url_len})");
|
||||
}
|
||||
let img_url_encoded = &page[url_start..url_start+url_len];
|
||||
let img_url = percent_encoding::percent_decode_str(img_url_encoded).decode_utf8().unwrap();
|
||||
if img_url.contains('"') {
|
||||
panic!("url contains quotes");
|
||||
}
|
||||
let font = *FONTS.choose(&mut rng).unwrap();
|
||||
|
||||
Ok(tide::Response::builder(200)
|
||||
.header("Access-Control-Allow-Origin", "*")
|
||||
.header("Access-Control-Allow-Headers", "*")
|
||||
.content_type(tide::http::mime::HTML)
|
||||
.body(format!(r#"<!doctype html>
|
||||
<html lang="fr">
|
||||
<head>
|
||||
<meta charset="utf-8"/>
|
||||
<title>MLeTomatic</title>
|
||||
<style type="text/css">
|
||||
@font-face {{
|
||||
font-family: CustomFont;
|
||||
src: url("//txmn.tk/fonts/{font}");
|
||||
}}
|
||||
body {{
|
||||
text-align: center;
|
||||
}}
|
||||
h1 {{
|
||||
font-family: CustomFont;
|
||||
font-weight: normal;
|
||||
}}
|
||||
#img {{
|
||||
max-width: 100%;
|
||||
max-height: 100vh;
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Monnaie Libre et {word}</h1>
|
||||
<img id="img" alt="{word}" src="{img_url}"/>
|
||||
</body>
|
||||
</html>"#))
|
||||
.build())
|
||||
}
|
Loading…
Reference in a new issue