Moved to a locally patched docx-rs and made it so the excluded chars are read from a file
parent
c27cae4230
commit
7df4943a96
|
@ -1,4 +1,4 @@
|
|||
/target
|
||||
|
||||
excluded
|
||||
*.docx
|
||||
*.txt
|
||||
|
|
|
@ -22,9 +22,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
|
|||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.13.1"
|
||||
version = "0.22.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
||||
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
|
@ -82,9 +82,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "docx-rs"
|
||||
version = "0.4.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e593b51d4fe95d69d70fd40da4b314b029736302c986c3c760826e842fd27dc3"
|
||||
version = "0.4.18-rc19"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"image",
|
||||
|
|
|
@ -5,5 +5,6 @@ edition = "2021"
|
|||
|
||||
[dependencies]
|
||||
anyhow = "1.0.98"
|
||||
docx-rs = "0.4.17"
|
||||
docx-rs = { path = "../../Tests/docx-rs/docx-core" }
|
||||
# docx-rs = "0.4.17"
|
||||
serde_json = "1.0.140"
|
||||
|
|
44
src/main.rs
44
src/main.rs
|
@ -1,5 +1,5 @@
|
|||
use std::{
|
||||
env,
|
||||
env, fs,
|
||||
io::{stdout, Read, Write},
|
||||
};
|
||||
|
||||
|
@ -12,50 +12,30 @@ fn main() -> anyhow::Result<()> {
|
|||
|
||||
let mut lines: Vec<String> = vec![];
|
||||
|
||||
// for docx
|
||||
parse_docx(file_name, &mut lines)?;
|
||||
|
||||
// for txt
|
||||
// for line in fs::read_to_string(file_name).unwrap().lines() {
|
||||
// if !line.is_empty() {
|
||||
// lines.push(line.to_string())
|
||||
// }
|
||||
// }
|
||||
let ascii_upper_limit = u32::from_str_radix("7F", 16).unwrap();
|
||||
let mut excluded = vec![];
|
||||
for line in fs::read_to_string("excluded")?.lines() {
|
||||
if let Some(c) = line.chars().next() {
|
||||
let c = format!("{:x}", c as u32);
|
||||
let u = u32::from_str_radix(&c, 16)?;
|
||||
excluded.push(u);
|
||||
}
|
||||
}
|
||||
|
||||
let mut lock = stdout().lock();
|
||||
|
||||
let ascii_upper_limit = u32::from_str_radix("7F", 16).unwrap();
|
||||
let excluded = [
|
||||
"0000021B", // ț
|
||||
"00000219", // ș
|
||||
"00000103", // ă
|
||||
"000000E2", // â
|
||||
"000000EE", // î
|
||||
"0000021A", // Ț
|
||||
"00000218", // Ș
|
||||
"00000102", // Ă
|
||||
"000000C2", // Â
|
||||
"000000CE", // Î
|
||||
"0000201E", // „
|
||||
"0000201D", // ”
|
||||
"0000201A", // ‚
|
||||
"00002019", // ’
|
||||
];
|
||||
let excluded: Vec<u32> = excluded
|
||||
.into_iter()
|
||||
.map(|c| u32::from_str_radix(c, 16).unwrap())
|
||||
.collect();
|
||||
|
||||
for line in lines {
|
||||
for c in line.chars() {
|
||||
let uc = c as u32;
|
||||
if uc > ascii_upper_limit && !excluded.contains(&uc) {
|
||||
writeln!(lock, "{c} - {line}").unwrap();
|
||||
writeln!(lock, "{c} - {line}")?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lock.flush().unwrap();
|
||||
lock.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue