Switched to a locally patched docx-rs and changed the excluded characters to be read from a file

master
Wynd 2025-06-12 14:17:25 +03:00
parent c27cae4230
commit 7df4943a96
4 changed files with 18 additions and 39 deletions

2
.gitignore vendored
View File

@ -1,4 +1,4 @@
/target /target
excluded
*.docx *.docx
*.txt *.txt

8
Cargo.lock generated
View File

@ -22,9 +22,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]] [[package]]
name = "base64" name = "base64"
version = "0.13.1" version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
@ -82,9 +82,7 @@ dependencies = [
[[package]] [[package]]
name = "docx-rs" name = "docx-rs"
version = "0.4.17" version = "0.4.18-rc19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e593b51d4fe95d69d70fd40da4b314b029736302c986c3c760826e842fd27dc3"
dependencies = [ dependencies = [
"base64", "base64",
"image", "image",

View File

@ -5,5 +5,6 @@ edition = "2021"
[dependencies] [dependencies]
anyhow = "1.0.98" anyhow = "1.0.98"
docx-rs = "0.4.17" docx-rs = { path = "../../Tests/docx-rs/docx-core" }
# docx-rs = "0.4.17"
serde_json = "1.0.140" serde_json = "1.0.140"

View File

@ -1,5 +1,5 @@
use std::{ use std::{
env, env, fs,
io::{stdout, Read, Write}, io::{stdout, Read, Write},
}; };
@ -12,50 +12,30 @@ fn main() -> anyhow::Result<()> {
let mut lines: Vec<String> = vec![]; let mut lines: Vec<String> = vec![];
// for docx
parse_docx(file_name, &mut lines)?; parse_docx(file_name, &mut lines)?;
// for txt let ascii_upper_limit = u32::from_str_radix("7F", 16).unwrap();
// for line in fs::read_to_string(file_name).unwrap().lines() { let mut excluded = vec![];
// if !line.is_empty() { for line in fs::read_to_string("excluded")?.lines() {
// lines.push(line.to_string()) if let Some(c) = line.chars().next() {
// } let c = format!("{:x}", c as u32);
// } let u = u32::from_str_radix(&c, 16)?;
excluded.push(u);
}
}
let mut lock = stdout().lock(); let mut lock = stdout().lock();
let ascii_upper_limit = u32::from_str_radix("7F", 16).unwrap();
let excluded = [
"0000021B", // ț
"00000219", // ș
"00000103", // ă
"000000E2", // â
"000000EE", // î
"0000021A", // Ț
"00000218", // Ș
"00000102", // Ă
"000000C2", // Â
"000000CE", // Î
"0000201E", // „
"0000201D", // ”
"0000201A", // ‚
"00002019", // ’
];
let excluded: Vec<u32> = excluded
.into_iter()
.map(|c| u32::from_str_radix(c, 16).unwrap())
.collect();
for line in lines { for line in lines {
for c in line.chars() { for c in line.chars() {
let uc = c as u32; let uc = c as u32;
if uc > ascii_upper_limit && !excluded.contains(&uc) { if uc > ascii_upper_limit && !excluded.contains(&uc) {
writeln!(lock, "{c} - {line}").unwrap(); writeln!(lock, "{c} - {line}")?;
} }
} }
} }
lock.flush().unwrap(); lock.flush()?;
Ok(()) Ok(())
} }