From c27cae4230db532ddd3bec397618993ac8eed3a4 Mon Sep 17 00:00:00 2001 From: Wynd Date: Thu, 12 Jun 2025 01:03:41 +0300 Subject: [PATCH] Hastingly put together app to find non-ascii and non-excluded characters in a docx file --- .gitignore | 4 + Cargo.lock | 325 ++++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 9 ++ src/main.rs | 87 ++++++++++++++ 4 files changed, 425 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/main.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..881b3fd --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/target + +*.docx +*.txt diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..35c95cf --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,325 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bytemuck" +version = "1.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c76a5792e44e4abe34d3abf15636779261d45a7450612059293d1d2cfc63422" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cfg-if" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" + +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "docx-non-ascii" +version = "0.1.0" +dependencies = [ + "anyhow", + "docx-rs", + "serde_json", +] + +[[package]] +name = "docx-rs" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e593b51d4fe95d69d70fd40da4b314b029736302c986c3c760826e842fd27dc3" +dependencies = [ + "base64", + "image", + "serde", + "serde_json", + "thiserror", + "xml-rs", + "zip", +] + +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "flate2" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "gif" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb2d69b19215e18bb912fa30f7ce15846e301408695e44e0ef719f1da9e19f2" +dependencies = [ + "color_quant", + "weezl", +] + +[[package]] +name = "image" +version = "0.24.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5690139d2f55868e080017335e4b94cb7414274c74f1669c84fb5feba2c9f69d" +dependencies = [ + "bytemuck", + "byteorder", + "color_quant", + "gif", + "jpeg-decoder", + "num-traits", + "png", + "tiff", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "jpeg-decoder" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5d4a7da358eff58addd2877a45865158f0d78c911d43a5784ceb7bbf52833b0" + +[[package]] +name = "memchr" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "png" +version = "0.17.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82151a2fc869e011c153adc57cf2789ccb8d9906ce52c0b39a6b5697749d7526" +dependencies = [ + "bitflags", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + +[[package]] +name = "syn" +version = "2.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6397daf94fa90f058bd0fd88429dd9e5738999cca8d701813c80723add80462" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tiff" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba1310fcea54c6a9a4fd1aad794ecc02c31682f6bfbecdf460bf19533eed1e3e" +dependencies = [ + "flate2", + "jpeg-decoder", + "weezl", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "weezl" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a751b3277700db47d3e574514de2eced5e54dc8a5436a3bf7a0b248b2cee16f3" + +[[package]] +name = "xml-rs" +version = "0.8.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a62ce76d9b56901b19a74f19431b0d8b3bc7ca4ad685a746dfd78ca8f4fc6bda" + +[[package]] +name = "zip" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" +dependencies = [ + "byteorder", + "crc32fast", + "crossbeam-utils", + "flate2", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..d929945 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "docx-non-ascii" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0.98" +docx-rs = "0.4.17" +serde_json = "1.0.140" diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..989febd --- /dev/null +++ b/src/main.rs @@ -0,0 +1,87 @@ +use std::{ + env, + io::{stdout, Read, Write}, +}; + +use docx_rs::read_docx; +use serde_json::Value; + +fn main() -> anyhow::Result<()> { + let args: Vec = env::args().collect(); + let file_name = &args[1]; + + let mut lines: Vec = vec![]; + + // for docx + parse_docx(file_name, &mut lines)?; + + // for txt + // for line in fs::read_to_string(file_name).unwrap().lines() { + // if !line.is_empty() { + // lines.push(line.to_string()) + // } + // } + + let mut lock = stdout().lock(); + + let ascii_upper_limit = u32::from_str_radix("7F", 16).unwrap(); + let excluded = [ + "0000021B", // ț + "00000219", // ș + "00000103", // ă + "000000E2", // â + "000000EE", // î + "0000021A", // Ț + "00000218", // Ș + "00000102", // Ă + "000000C2", // Â + "000000CE", // Î + "0000201E", // „ + "0000201D", // ” + "0000201A", // ‚ + "00002019", // ’ + ]; + let excluded: Vec = excluded + .into_iter() + .map(|c| u32::from_str_radix(c, 16).unwrap()) + .collect(); + + for line in lines { + for c in line.chars() { + let uc = c as u32; + if uc > ascii_upper_limit && !excluded.contains(&uc) { + writeln!(lock, "{c} - {line}").unwrap(); + } + } + } + + lock.flush().unwrap(); + + Ok(()) +} + +fn read_to_vec(file_name: &str) -> anyhow::Result> { + let mut buf = Vec::new(); + std::fs::File::open(file_name)?.read_to_end(&mut buf)?; + Ok(buf) +} + +fn parse_docx(file_name: &str, lines: &mut Vec) -> anyhow::Result<()> { + let data: Value = serde_json::from_str(&read_docx(&read_to_vec(file_name)?)?.json())?; + if let Some(children) = data["document"]["children"].as_array() { + children.iter().for_each(|c| read_children(c, lines)); + } + Ok(()) +} + +fn read_children(node: &Value, lines: &mut Vec) { + if let Some(children) = node["data"]["children"].as_array() { + children.iter().for_each(|child| { + if child["type"] != "text" { + read_children(child, lines); + } else { + lines.push(child["data"]["text"].to_string()); + } + }); + } +}