Moved to a locally patched docx-rs; the excluded characters are now read from a file
parent
c27cae4230
commit
7df4943a96
|
@ -1,4 +1,4 @@
|
||||||
/target
|
/target
|
||||||
|
excluded
|
||||||
*.docx
|
*.docx
|
||||||
*.txt
|
*.txt
|
||||||
|
|
|
@ -22,9 +22,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "base64"
|
name = "base64"
|
||||||
version = "0.13.1"
|
version = "0.22.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bitflags"
|
name = "bitflags"
|
||||||
|
@ -82,9 +82,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docx-rs"
|
name = "docx-rs"
|
||||||
version = "0.4.17"
|
version = "0.4.18-rc19"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "e593b51d4fe95d69d70fd40da4b314b029736302c986c3c760826e842fd27dc3"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"base64",
|
"base64",
|
||||||
"image",
|
"image",
|
||||||
|
|
|
@ -5,5 +5,6 @@ edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
anyhow = "1.0.98"
|
anyhow = "1.0.98"
|
||||||
docx-rs = "0.4.17"
|
docx-rs = { path = "../../Tests/docx-rs/docx-core" }
|
||||||
|
# docx-rs = "0.4.17"
|
||||||
serde_json = "1.0.140"
|
serde_json = "1.0.140"
|
||||||
|
|
44
src/main.rs
44
src/main.rs
|
@ -1,5 +1,5 @@
|
||||||
use std::{
|
use std::{
|
||||||
env,
|
env, fs,
|
||||||
io::{stdout, Read, Write},
|
io::{stdout, Read, Write},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -12,50 +12,30 @@ fn main() -> anyhow::Result<()> {
|
||||||
|
|
||||||
let mut lines: Vec<String> = vec![];
|
let mut lines: Vec<String> = vec![];
|
||||||
|
|
||||||
// for docx
|
|
||||||
parse_docx(file_name, &mut lines)?;
|
parse_docx(file_name, &mut lines)?;
|
||||||
|
|
||||||
// for txt
|
let ascii_upper_limit = u32::from_str_radix("7F", 16).unwrap();
|
||||||
// for line in fs::read_to_string(file_name).unwrap().lines() {
|
let mut excluded = vec![];
|
||||||
// if !line.is_empty() {
|
for line in fs::read_to_string("excluded")?.lines() {
|
||||||
// lines.push(line.to_string())
|
if let Some(c) = line.chars().next() {
|
||||||
// }
|
let c = format!("{:x}", c as u32);
|
||||||
// }
|
let u = u32::from_str_radix(&c, 16)?;
|
||||||
|
excluded.push(u);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let mut lock = stdout().lock();
|
let mut lock = stdout().lock();
|
||||||
|
|
||||||
let ascii_upper_limit = u32::from_str_radix("7F", 16).unwrap();
|
|
||||||
let excluded = [
|
|
||||||
"0000021B", // ț
|
|
||||||
"00000219", // ș
|
|
||||||
"00000103", // ă
|
|
||||||
"000000E2", // â
|
|
||||||
"000000EE", // î
|
|
||||||
"0000021A", // Ț
|
|
||||||
"00000218", // Ș
|
|
||||||
"00000102", // Ă
|
|
||||||
"000000C2", // Â
|
|
||||||
"000000CE", // Î
|
|
||||||
"0000201E", // „
|
|
||||||
"0000201D", // ”
|
|
||||||
"0000201A", // ‚
|
|
||||||
"00002019", // ’
|
|
||||||
];
|
|
||||||
let excluded: Vec<u32> = excluded
|
|
||||||
.into_iter()
|
|
||||||
.map(|c| u32::from_str_radix(c, 16).unwrap())
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
for line in lines {
|
for line in lines {
|
||||||
for c in line.chars() {
|
for c in line.chars() {
|
||||||
let uc = c as u32;
|
let uc = c as u32;
|
||||||
if uc > ascii_upper_limit && !excluded.contains(&uc) {
|
if uc > ascii_upper_limit && !excluded.contains(&uc) {
|
||||||
writeln!(lock, "{c} - {line}").unwrap();
|
writeln!(lock, "{c} - {line}")?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
lock.flush().unwrap();
|
lock.flush()?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue