Moved to a locally patched docx-rs and changed the excluded characters to be read from a file

master
Wynd 2025-06-12 14:17:25 +03:00
parent c27cae4230
commit 7df4943a96
4 changed files with 18 additions and 39 deletions

2
.gitignore vendored
View File

@ -1,4 +1,4 @@
/target
excluded
*.docx
*.txt

8
Cargo.lock generated
View File

@ -22,9 +22,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "base64"
version = "0.13.1"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
name = "bitflags"
@ -82,9 +82,7 @@ dependencies = [
[[package]]
name = "docx-rs"
version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e593b51d4fe95d69d70fd40da4b314b029736302c986c3c760826e842fd27dc3"
version = "0.4.18-rc19"
dependencies = [
"base64",
"image",

View File

@ -5,5 +5,6 @@ edition = "2021"
[dependencies]
anyhow = "1.0.98"
docx-rs = "0.4.17"
docx-rs = { path = "../../Tests/docx-rs/docx-core" }
# docx-rs = "0.4.17"
serde_json = "1.0.140"

View File

@ -1,5 +1,5 @@
use std::{
env,
env, fs,
io::{stdout, Read, Write},
};
@ -12,50 +12,30 @@ fn main() -> anyhow::Result<()> {
let mut lines: Vec<String> = vec![];
// for docx
parse_docx(file_name, &mut lines)?;
// for txt
// for line in fs::read_to_string(file_name).unwrap().lines() {
// if !line.is_empty() {
// lines.push(line.to_string())
// }
// }
let ascii_upper_limit = u32::from_str_radix("7F", 16).unwrap();
let mut excluded = vec![];
for line in fs::read_to_string("excluded")?.lines() {
if let Some(c) = line.chars().next() {
let c = format!("{:x}", c as u32);
let u = u32::from_str_radix(&c, 16)?;
excluded.push(u);
}
}
let mut lock = stdout().lock();
let ascii_upper_limit = u32::from_str_radix("7F", 16).unwrap();
// Hex code points of the non-ASCII characters that are allowed: the scan
// loop below skips any character whose code point appears in this list.
let excluded = [
"0000021B", // ț
"00000219", // ș
"00000103", // ă
"000000E2", // â
"000000EE", // î
"0000021A", // Ț
"00000218", // Ș
"00000102", // Ă
"000000C2", // Â
"000000CE", // Î
"0000201E", // „
"0000201D", // ”
"0000201A", // ‚
"00002019", // ’
];
// Parse every hex string into its numeric code point so it can be compared
// against `c as u32` directly.
let excluded: Vec<u32> = excluded
.into_iter()
.map(|c| u32::from_str_radix(c, 16).unwrap())
.collect();
for line in lines {
for c in line.chars() {
let uc = c as u32;
if uc > ascii_upper_limit && !excluded.contains(&uc) {
writeln!(lock, "{c} - {line}").unwrap();
writeln!(lock, "{c} - {line}")?;
}
}
}
lock.flush().unwrap();
lock.flush()?;
Ok(())
}