Skip to content

Commit ff4e184

Browse files
fix: resolve clippy pedantic warnings (#67)
- Replace lazy_static with std::sync::LazyLock
- Use case-insensitive file extension comparison in pptx parser
- Add missing `# Errors` doc section to parse function
- Merge identical match arms in ApiError implementations
- Move const declaration before statements in benchmark
- Use integer div_ceil instead of float arithmetic
- Remove unused lazy_static dependency
1 parent 815db5d commit ff4e184

File tree

13 files changed

+141
-132
lines changed

13 files changed

+141
-132
lines changed

Cargo.lock

Lines changed: 76 additions & 77 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,9 @@ categories = ["text-processing", "parsing"]
1515
calamine = "0.32.0"
1616
docx-rs = "0.4.18"
1717
infer = "0.19.0"
18-
lazy_static = "1.5.0"
1918
mime = "0.3.17"
2019
pdf-extract = "0.10.0"
21-
regex = "1.12.2"
20+
regex = "1.12.3"
2221
tempfile = "3.24.0"
2322
tesseract = "0.15.2"
2423
zip = "7.2.0"
@@ -38,6 +37,9 @@ env_logger = "0.11.8"
3837
criterion = "0.8"
3938
num_cpus = "1.17.0"
4039

40+
[lints.clippy]
41+
pedantic = "warn"
42+
4143
[[bench]]
4244
name = "function_parse"
4345
harness = false

benches/function_parse.rs

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ fn benchmark_sequential_vs_parallel(c: &mut Criterion) {
5656
.par_iter()
5757
.map(|d| parse(black_box(d)))
5858
.collect::<Result<Vec<String>, ParserError>>()
59-
})
59+
});
6060
});
6161

6262
// Benchmark sequential parsing
@@ -66,7 +66,7 @@ fn benchmark_sequential_vs_parallel(c: &mut Criterion) {
6666
.iter()
6767
.map(|d| parse(black_box(d)))
6868
.collect::<Result<Vec<String>, ParserError>>()
69-
})
69+
});
7070
});
7171

7272
group.finish();
@@ -98,7 +98,7 @@ fn benchmark_parallel_efficiency(c: &mut Criterion) {
9898
.par_iter()
9999
.map(|d| parse(black_box(d)))
100100
.collect::<Result<Vec<String>, ParserError>>()
101-
})
101+
});
102102
});
103103
}
104104

@@ -123,7 +123,7 @@ fn benchmark_per_filetype(c: &mut Criterion) {
123123
.par_iter()
124124
.map(|d| parse(black_box(d)))
125125
.collect::<Result<Vec<String>, ParserError>>()
126-
})
126+
});
127127
});
128128
}
129129

@@ -144,7 +144,7 @@ fn benchmark_per_filetype(c: &mut Criterion) {
144144
.par_iter()
145145
.map(|d| parse(black_box(d)))
146146
.collect::<Result<Vec<String>, ParserError>>()
147-
})
147+
});
148148
});
149149
}
150150

@@ -153,6 +153,7 @@ fn benchmark_per_filetype(c: &mut Criterion) {
153153

154154
// Finds the threshold number of files for each type that takes less than 16ms
155155
fn benchmark_parallel_threshold(c: &mut Criterion) {
156+
const SAMPLE_COUNT: usize = 5;
156157
let max_time_threshold = Duration::from_millis(16);
157158

158159
// Read each test file only once
@@ -181,7 +182,6 @@ fn benchmark_parallel_threshold(c: &mut Criterion) {
181182
}
182183

183184
// Take multiple measurements and use median for robustness
184-
const SAMPLE_COUNT: usize = 5;
185185
let mut durations = Vec::with_capacity(SAMPLE_COUNT);
186186

187187
for _ in 0..SAMPLE_COUNT {
@@ -227,13 +227,16 @@ fn benchmark_parallel_threshold(c: &mut Criterion) {
227227
// The threshold count is now in 'low'
228228
let threshold_count = low;
229229

230-
// Define percentages to test around the threshold
231-
let percentages = [99.0, 99.9, 100.0, 100.1, 101.0];
230+
// Permille values for percentages: 99.0%, 99.9%, 100.0%, 100.1%, 101.0%
231+
let permille_values: [usize; 5] = [990, 999, 1000, 1001, 1010];
232232

233-
// Generate test points based on percentages of the threshold
234-
let mut test_points: Vec<usize> = percentages
233+
// Generate test points based on percentages of the threshold using integer math
234+
let mut test_points: Vec<usize> = permille_values
235235
.iter()
236-
.map(|&p| ((threshold_count as f64 * p / 100.0).ceil() as usize).max(1))
236+
.map(|&p| {
237+
let product = threshold_count.saturating_mul(p);
238+
product.div_ceil(1000).max(1)
239+
})
237240
.collect();
238241

239242
test_points.dedup();
@@ -251,7 +254,7 @@ fn benchmark_parallel_threshold(c: &mut Criterion) {
251254
.par_iter()
252255
.map(|d| parse(black_box(d)))
253256
.collect::<Result<Vec<String>, ParserError>>()
254-
})
257+
});
255258
});
256259
}
257260

src/core/constants.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,6 @@ pub const APPLICATION_DOCX: &str =
1313
pub const APPLICATION_XLSX: &str =
1414
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
1515

16-
/// MIME type for PPTX (Microsoft PowerPoint) presentations
16+
/// MIME type for PPTX (Microsoft `PowerPoint`) presentations
1717
pub const APPLICATION_PPTX: &str =
1818
"application/vnd.openxmlformats-officedocument.presentationml.presentation";

src/core/errors.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,14 +31,14 @@ pub enum ParserError {
3131
impl std::fmt::Display for ParserError {
3232
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
3333
match self {
34-
ParserError::IoError(msg) => write!(f, "IO error: {}", msg),
35-
ParserError::ParseError(msg) => write!(f, "Parse error: {}", msg),
36-
ParserError::InvalidFormat(msg) => write!(f, "Invalid format: {}", msg),
34+
ParserError::IoError(msg) => write!(f, "IO error: {msg}"),
35+
ParserError::ParseError(msg) => write!(f, "Parse error: {msg}"),
36+
ParserError::InvalidFormat(msg) => write!(f, "Invalid format: {msg}"),
3737
}
3838
}
3939
}
4040

41-
/// Implements the std::error::Error trait for ParserError to allow it to be used
41+
/// Implements the `std::error::Error` trait for `ParserError` to allow it to be used
4242
/// with the ? operator and to be boxed as a dyn Error.
4343
impl std::error::Error for ParserError {}
4444

src/core/parsers.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,12 @@ use super::{
2121
errors::ParserError,
2222
};
2323
use infer::Infer;
24-
use lazy_static::lazy_static;
2524
use mime::{IMAGE, Mime, TEXT, TEXT_PLAIN};
2625
use std::str;
26+
use std::sync::LazyLock;
2727

2828
// Create a static infer instance to avoid recreating it on every call
29-
lazy_static! {
30-
static ref INFER: Infer = Infer::new();
31-
}
29+
static INFER: LazyLock<Infer> = LazyLock::new(Infer::new);
3230

3331
/// Parses the given data into plain text.
3432
///
@@ -74,6 +72,11 @@ lazy_static! {
7472
/// // Verify the result
7573
/// assert_eq!(result, "Hello, world! This is a sample text file.");
7674
/// ```
75+
///
76+
/// # Errors
77+
///
78+
/// Returns [`ParserError::InvalidFormat`] if the file type is unsupported or unrecognized.
79+
/// May return other [`ParserError`] variants if an error occurs during parsing.
7780
pub fn parse(data: &[u8]) -> Result<String, ParserError> {
7881
match determine_mime_type(data) {
7982
Some(mime) if mime == APPLICATION_PDF => parse_pdf(data),
@@ -83,8 +86,7 @@ pub fn parse(data: &[u8]) -> Result<String, ParserError> {
8386
Some(mime) if mime.type_() == TEXT => parse_text(data),
8487
Some(mime) if mime.type_() == IMAGE => parse_image(data),
8588
Some(mime) => Err(ParserError::InvalidFormat(format!(
86-
"Unsupported file type: {}",
87-
mime
89+
"Unsupported file type: {mime}"
8890
))),
8991
None => Err(ParserError::InvalidFormat(
9092
"Could not determine file type.".to_string(),

src/core/parsers/docx.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
//! DOCX parser module.
22
//!
33
//! This module provides functionality for extracting text from Microsoft Word DOCX
4-
//! documents using the docx_rs library.
4+
//! documents using the `docx_rs` library.
55
66
use super::super::errors::ParserError;
77
use docx_rs::read_docx;
@@ -22,7 +22,7 @@ use docx_rs::read_docx;
2222
///
2323
/// # Implementation Notes
2424
///
25-
/// * Uses the docx_rs library for DOCX parsing
25+
/// * Uses the `docx_rs` library for DOCX parsing
2626
/// * Extracts text by traversing document structure: documents → paragraphs → runs → text
2727
/// * Joins paragraphs with newlines and trims whitespace from the result
2828
/// * TODO: Consider simplifying the document traversal logic

src/core/parsers/image.rs

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
//! various image formats including PNG, JPEG, and WebP.
66
77
use super::super::errors::ParserError;
8-
use lazy_static::lazy_static;
8+
use std::sync::LazyLock;
99
use std::{fs, io::Write};
1010
use tempfile::{NamedTempFile, TempDir};
1111
use tesseract::Tesseract;
@@ -20,20 +20,18 @@ const TESSDATA_FRA: &[u8] = include_bytes!(concat!(
2020
"/assets/ocr/fra.traineddata"
2121
));
2222

23-
lazy_static! {
24-
static ref TESSDATA_DIR: TempDir = {
25-
let dir = tempfile::tempdir().expect("Failed to create tessdata directory");
26-
let dir_path = dir.path();
23+
static TESSDATA_DIR: LazyLock<TempDir> = LazyLock::new(|| {
24+
let dir = tempfile::tempdir().expect("Failed to create tessdata directory");
25+
let dir_path = dir.path();
2726

28-
// Write language files to tessdata directory (only done once)
29-
fs::write(dir_path.join("eng.traineddata"), TESSDATA_ENG)
30-
.expect("Failed to write English training data");
31-
fs::write(dir_path.join("fra.traineddata"), TESSDATA_FRA)
32-
.expect("Failed to write French training data");
27+
// Write language files to tessdata directory (only done once)
28+
fs::write(dir_path.join("eng.traineddata"), TESSDATA_ENG)
29+
.expect("Failed to write English training data");
30+
fs::write(dir_path.join("fra.traineddata"), TESSDATA_FRA)
31+
.expect("Failed to write French training data");
3332

34-
dir
35-
};
36-
}
33+
dir
34+
});
3735

3836
/// Parses image data and extracts text using OCR.
3937
///

src/core/parsers/pdf.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
//! PDF parser module.
22
//!
33
//! This module provides functionality for extracting text from PDF documents using
4-
//! the pdf_extract library.
4+
//! the `pdf_extract` library.
55
66
use super::super::errors::ParserError;
77
use pdf_extract::extract_text_from_mem;
@@ -22,7 +22,7 @@ use pdf_extract::extract_text_from_mem;
2222
///
2323
/// # Implementation Notes
2424
///
25-
/// * Uses the pdf_extract library for PDF text extraction
25+
/// * Uses the `pdf_extract` library for PDF text extraction
2626
/// * Trims whitespace from the result before returning
2727
/// * TODO: Need to find a way to silence the output of that function since on
2828
/// unknown characters it outputs a lot of errors, cluttering the logs.

src/core/parsers/pptx.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//! PPTX parser module.
22
//!
3-
//! This module provides functionality for extracting text from Microsoft PowerPoint
3+
//! This module provides functionality for extracting text from Microsoft `PowerPoint`
44
//! PPTX presentation files. It uses the zip crate to extract slide XML files and
55
//! regex to extract text content.
66
@@ -46,7 +46,10 @@ pub(crate) fn parse_pptx(data: &[u8]) -> Result<String, ParserError> {
4646
let mut file = archive.by_index(i)?;
4747

4848
// Only process slide XML files
49-
if file.name().starts_with("ppt/slides/slide") && file.name().ends_with(".xml") {
49+
let is_xml = std::path::Path::new(file.name())
50+
.extension()
51+
.is_some_and(|ext| ext.eq_ignore_ascii_case("xml"));
52+
if file.name().starts_with("ppt/slides/slide") && is_xml {
5053
slide_count += 1;
5154
if slide_count > 1 {
5255
text.push_str("\n--- Slide ");

0 commit comments

Comments
 (0)