fix: resolve clippy pedantic warnings (#67)

excoffierleonard · web-flow · commit ff4e184350b8 · 2026-02-03T15:27:34.000-05:00
- Replace lazy_static with std::sync::LazyLock
- Use case-insensitive file extension comparison in pptx parser
- Add missing `# Errors` doc section to parse function
- Merge identical match arms in ApiError implementations
- Move const declaration before statements in benchmark
- Use integer div_ceil instead of float arithmetic
- Remove unused lazy_static dependency
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -15,10 +15,9 @@ categories = ["text-processing", "parsing"]
 calamine = "0.32.0"
 docx-rs = "0.4.18"
 infer = "0.19.0"
-lazy_static = "1.5.0"
 mime = "0.3.17"
 pdf-extract = "0.10.0"
-regex = "1.12.2"
+regex = "1.12.3"
 tempfile = "3.24.0"
 tesseract = "0.15.2"
 zip = "7.2.0"
@@ -38,6 +37,9 @@ env_logger = "0.11.8"
 criterion = "0.8"
 num_cpus = "1.17.0"
 
+[lints.clippy]
+pedantic = "warn"
+
 [[bench]]
 name = "function_parse"
 harness = false
diff --git a/benches/function_parse.rs b/benches/function_parse.rs
@@ -56,7 +56,7 @@ fn benchmark_sequential_vs_parallel(c: &mut Criterion) {
                 .par_iter()
                 .map(|d| parse(black_box(d)))
                 .collect::<Result<Vec<String>, ParserError>>()
-        })
+        });
     });
 
     // Benchmark sequential parsing
@@ -66,7 +66,7 @@ fn benchmark_sequential_vs_parallel(c: &mut Criterion) {
                 .iter()
                 .map(|d| parse(black_box(d)))
                 .collect::<Result<Vec<String>, ParserError>>()
-        })
+        });
     });
 
     group.finish();
@@ -98,7 +98,7 @@ fn benchmark_parallel_efficiency(c: &mut Criterion) {
                     .par_iter()
                     .map(|d| parse(black_box(d)))
                     .collect::<Result<Vec<String>, ParserError>>()
-            })
+            });
         });
     }
 
@@ -123,7 +123,7 @@ fn benchmark_per_filetype(c: &mut Criterion) {
                     .par_iter()
                     .map(|d| parse(black_box(d)))
                     .collect::<Result<Vec<String>, ParserError>>()
-            })
+            });
         });
     }
 
@@ -144,7 +144,7 @@ fn benchmark_per_filetype(c: &mut Criterion) {
                     .par_iter()
                     .map(|d| parse(black_box(d)))
                     .collect::<Result<Vec<String>, ParserError>>()
-            })
+            });
         });
     }
 
@@ -153,6 +153,7 @@ fn benchmark_per_filetype(c: &mut Criterion) {
 
 // Finds the threshold number of files for each type that takes less than 16ms
 fn benchmark_parallel_threshold(c: &mut Criterion) {
+    const SAMPLE_COUNT: usize = 5;
     let max_time_threshold = Duration::from_millis(16);
 
     // Read each test file only once
@@ -181,7 +182,6 @@ fn benchmark_parallel_threshold(c: &mut Criterion) {
             }
 
             // Take multiple measurements and use median for robustness
-            const SAMPLE_COUNT: usize = 5;
             let mut durations = Vec::with_capacity(SAMPLE_COUNT);
 
             for _ in 0..SAMPLE_COUNT {
@@ -227,13 +227,16 @@ fn benchmark_parallel_threshold(c: &mut Criterion) {
         // The threshold count is now in 'low'
         let threshold_count = low;
 
-        // Define percentages to test around the threshold
-        let percentages = [99.0, 99.9, 100.0, 100.1, 101.0];
+        // Permille values for percentages: 99.0%, 99.9%, 100.0%, 100.1%, 101.0%
+        let permille_values: [usize; 5] = [990, 999, 1000, 1001, 1010];
 
-        // Generate test points based on percentages of the threshold
-        let mut test_points: Vec<usize> = percentages
+        // Generate test points based on percentages of the threshold using integer math
+        let mut test_points: Vec<usize> = permille_values
             .iter()
-            .map(|&p| ((threshold_count as f64 * p / 100.0).ceil() as usize).max(1))
+            .map(|&p| {
+                let product = threshold_count.saturating_mul(p);
+                product.div_ceil(1000).max(1)
+            })
             .collect();
 
         test_points.dedup();
@@ -251,7 +254,7 @@ fn benchmark_parallel_threshold(c: &mut Criterion) {
                         .par_iter()
                         .map(|d| parse(black_box(d)))
                         .collect::<Result<Vec<String>, ParserError>>()
-                })
+                });
             });
         }
 
diff --git a/src/core/constants.rs b/src/core/constants.rs
@@ -13,6 +13,6 @@ pub const APPLICATION_DOCX: &str =
 pub const APPLICATION_XLSX: &str =
     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
 
-/// MIME type for PPTX (Microsoft PowerPoint) presentations
+/// MIME type for PPTX (Microsoft `PowerPoint`) presentations
 pub const APPLICATION_PPTX: &str =
     "application/vnd.openxmlformats-officedocument.presentationml.presentation";
diff --git a/src/core/errors.rs b/src/core/errors.rs
@@ -31,14 +31,14 @@ pub enum ParserError {
 impl std::fmt::Display for ParserError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            ParserError::IoError(msg) => write!(f, "IO error: {}", msg),
-            ParserError::ParseError(msg) => write!(f, "Parse error: {}", msg),
-            ParserError::InvalidFormat(msg) => write!(f, "Invalid format: {}", msg),
+            ParserError::IoError(msg) => write!(f, "IO error: {msg}"),
+            ParserError::ParseError(msg) => write!(f, "Parse error: {msg}"),
+            ParserError::InvalidFormat(msg) => write!(f, "Invalid format: {msg}"),
         }
     }
 }
 
-/// Implements the std::error::Error trait for ParserError to allow it to be used
+/// Implements the `std::error::Error` trait for `ParserError` to allow it to be used
 /// with the ? operator and to be boxed as a dyn Error.
 impl std::error::Error for ParserError {}
 
diff --git a/src/core/parsers.rs b/src/core/parsers.rs
@@ -21,14 +21,12 @@ use super::{
     errors::ParserError,
 };
 use infer::Infer;
-use lazy_static::lazy_static;
 use mime::{IMAGE, Mime, TEXT, TEXT_PLAIN};
 use std::str;
+use std::sync::LazyLock;
 
 // Create a static infer instance to avoid recreating it on every call
-lazy_static! {
-    static ref INFER: Infer = Infer::new();
-}
+static INFER: LazyLock<Infer> = LazyLock::new(Infer::new);
 
 /// Parses the given data into plain text.
 ///
@@ -74,6 +72,11 @@ lazy_static! {
 /// // Verify the result
 /// assert_eq!(result, "Hello, world! This is a sample text file.");
 /// ```
+///
+/// # Errors
+///
+/// Returns [`ParserError::InvalidFormat`] if the file type is unsupported or unrecognized.
+/// May return other [`ParserError`] variants if an error occurs during parsing.
 pub fn parse(data: &[u8]) -> Result<String, ParserError> {
     match determine_mime_type(data) {
         Some(mime) if mime == APPLICATION_PDF => parse_pdf(data),
@@ -83,8 +86,7 @@ pub fn parse(data: &[u8]) -> Result<String, ParserError> {
         Some(mime) if mime.type_() == TEXT => parse_text(data),
         Some(mime) if mime.type_() == IMAGE => parse_image(data),
         Some(mime) => Err(ParserError::InvalidFormat(format!(
-            "Unsupported file type: {}",
-            mime
+            "Unsupported file type: {mime}"
         ))),
         None => Err(ParserError::InvalidFormat(
             "Could not determine file type.".to_string(),
diff --git a/src/core/parsers/docx.rs b/src/core/parsers/docx.rs
@@ -1,7 +1,7 @@
 //! DOCX parser module.
 //!
 //! This module provides functionality for extracting text from Microsoft Word DOCX
-//! documents using the docx_rs library.
+//! documents using the `docx_rs` library.
 
 use super::super::errors::ParserError;
 use docx_rs::read_docx;
@@ -22,7 +22,7 @@ use docx_rs::read_docx;
 ///
 /// # Implementation Notes
 ///
-/// * Uses the docx_rs library for DOCX parsing
+/// * Uses the `docx_rs` library for DOCX parsing
 /// * Extracts text by traversing document structure: documents → paragraphs → runs → text
 /// * Joins paragraphs with newlines and trims whitespace from the result
 /// * TODO: Consider simplifying the document traversal logic
diff --git a/src/core/parsers/image.rs b/src/core/parsers/image.rs
@@ -5,7 +5,7 @@
 //! various image formats including PNG, JPEG, and WebP.
 
 use super::super::errors::ParserError;
-use lazy_static::lazy_static;
+use std::sync::LazyLock;
 use std::{fs, io::Write};
 use tempfile::{NamedTempFile, TempDir};
 use tesseract::Tesseract;
@@ -20,20 +20,18 @@ const TESSDATA_FRA: &[u8] = include_bytes!(concat!(
     "/assets/ocr/fra.traineddata"
 ));
 
-lazy_static! {
-    static ref TESSDATA_DIR: TempDir = {
-        let dir = tempfile::tempdir().expect("Failed to create tessdata directory");
-        let dir_path = dir.path();
+static TESSDATA_DIR: LazyLock<TempDir> = LazyLock::new(|| {
+    let dir = tempfile::tempdir().expect("Failed to create tessdata directory");
+    let dir_path = dir.path();
 
-        // Write language files to tessdata directory (only done once)
-        fs::write(dir_path.join("eng.traineddata"), TESSDATA_ENG)
-            .expect("Failed to write English training data");
-        fs::write(dir_path.join("fra.traineddata"), TESSDATA_FRA)
-            .expect("Failed to write French training data");
+    // Write language files to tessdata directory (only done once)
+    fs::write(dir_path.join("eng.traineddata"), TESSDATA_ENG)
+        .expect("Failed to write English training data");
+    fs::write(dir_path.join("fra.traineddata"), TESSDATA_FRA)
+        .expect("Failed to write French training data");
 
-        dir
-    };
-}
+    dir
+});
 
 /// Parses image data and extracts text using OCR.
 ///
diff --git a/src/core/parsers/pdf.rs b/src/core/parsers/pdf.rs
@@ -1,7 +1,7 @@
 //! PDF parser module.
 //!
 //! This module provides functionality for extracting text from PDF documents using
-//! the pdf_extract library.
+//! the `pdf_extract` library.
 
 use super::super::errors::ParserError;
 use pdf_extract::extract_text_from_mem;
@@ -22,7 +22,7 @@ use pdf_extract::extract_text_from_mem;
 ///
 /// # Implementation Notes
 ///
-/// * Uses the pdf_extract library for PDF text extraction
+/// * Uses the `pdf_extract` library for PDF text extraction
 /// * Trims whitespace from the result before returning
 /// * TODO: Need to find a way to silence the output of that function since on
 ///   unknown characters it outputs a lot of errors, cluttering the logs.
diff --git a/src/core/parsers/pptx.rs b/src/core/parsers/pptx.rs
@@ -1,6 +1,6 @@
 //! PPTX parser module.
 //!
-//! This module provides functionality for extracting text from Microsoft PowerPoint
+//! This module provides functionality for extracting text from Microsoft `PowerPoint`
 //! PPTX presentation files. It uses the zip crate to extract slide XML files and
 //! regex to extract text content.
 
@@ -46,7 +46,10 @@ pub(crate) fn parse_pptx(data: &[u8]) -> Result<String, ParserError> {
         let mut file = archive.by_index(i)?;
 
         // Only process slide XML files
-        if file.name().starts_with("ppt/slides/slide") && file.name().ends_with(".xml") {
+        let is_xml = std::path::Path::new(file.name())
+            .extension()
+            .is_some_and(|ext| ext.eq_ignore_ascii_case("xml"));
+        if file.name().starts_with("ppt/slides/slide") && is_xml {
             slide_count += 1;
             if slide_count > 1 {
                 text.push_str("\n--- Slide ");
diff --git a/src/core/parsers/xlsx.rs b/src/core/parsers/xlsx.rs
@@ -30,7 +30,7 @@ use std::io::Cursor;
 /// * Adds sheet headers for multi-sheet workbooks
 /// * Memory-efficient implementation using cursors instead of temporary files
 /// * TODO: Need proper logic to escape commas and quotes
-/// * TODO: Consider using the csv crate to convert each sheet and pass it through the parse_text function
+/// * TODO: Consider using the csv crate to convert each sheet and pass it through the `parse_text` function
 pub(crate) fn parse_xlsx(data: &[u8]) -> Result<String, ParserError> {
     // Create a cursor from the bytes for memory-based reading
     let cursor = Cursor::new(data);
@@ -42,7 +42,7 @@ pub(crate) fn parse_xlsx(data: &[u8]) -> Result<String, ParserError> {
     let mut csv_data = String::new();
 
     // Copy the sheet names to avoid borrowing issues
-    let sheet_names = excel.sheet_names().to_vec();
+    let sheet_names = excel.sheet_names().clone();
 
     for name in sheet_names {
         if let Ok(range) = excel.worksheet_range(&name) {
@@ -55,7 +55,7 @@ pub(crate) fn parse_xlsx(data: &[u8]) -> Result<String, ParserError> {
                 .rows()
                 .map(|row| {
                     row.iter()
-                        .map(|cell| cell.to_string())
+                        .map(std::string::ToString::to_string)
                         .collect::<Vec<String>>()
                         .join(",")
                 })
diff --git a/src/web/errors.rs b/src/web/errors.rs
@@ -26,9 +26,9 @@ pub enum ApiError {
 impl std::fmt::Display for ApiError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            ApiError::BadRequest(msg) => write!(f, "Bad Request: {}", msg),
-            ApiError::InternalError(msg) => write!(f, "Internal Error: {}", msg),
-            ApiError::ConfigError(msg) => write!(f, "Configuration Error: {}", msg),
+            ApiError::BadRequest(msg) => write!(f, "Bad Request: {msg}"),
+            ApiError::InternalError(msg) => write!(f, "Internal Error: {msg}"),
+            ApiError::ConfigError(msg) => write!(f, "Configuration Error: {msg}"),
         }
     }
 }
@@ -47,16 +47,18 @@ impl ResponseError for ApiError {
 
         match self {
             ApiError::BadRequest(_) => HttpResponse::BadRequest().json(error_response),
-            ApiError::InternalError(_) => HttpResponse::InternalServerError().json(error_response),
-            ApiError::ConfigError(_) => HttpResponse::InternalServerError().json(error_response),
+            ApiError::InternalError(_) | ApiError::ConfigError(_) => {
+                HttpResponse::InternalServerError().json(error_response)
+            }
         }
     }
 
     fn status_code(&self) -> StatusCode {
         match self {
             ApiError::BadRequest(_) => StatusCode::BAD_REQUEST,
-            ApiError::InternalError(_) => StatusCode::INTERNAL_SERVER_ERROR,
-            ApiError::ConfigError(_) => StatusCode::INTERNAL_SERVER_ERROR,
+            ApiError::InternalError(_) | ApiError::ConfigError(_) => {
+                StatusCode::INTERNAL_SERVER_ERROR
+            }
         }
     }
 }
diff --git a/tests/endpoints.rs b/tests/endpoints.rs
@@ -22,6 +22,6 @@ fn test_file_paths_exist() {
         let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
             .join("tests/assets")
             .join(name);
-        assert!(path.exists(), "Test file should exist: {:?}", path);
+        assert!(path.exists(), "Test file should exist: {path:?}");
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -31,14 +31,14 @@ pub enum ParserError {`
`31`	`31`	`impl std::fmt::Display for ParserError {`
`32`	`32`	`fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {`
`33`	`33`	`match self {`
`34`		`- ParserError::IoError(msg) => write!(f, "IO error: {}", msg),`
`35`		`- ParserError::ParseError(msg) => write!(f, "Parse error: {}", msg),`
`36`		`- ParserError::InvalidFormat(msg) => write!(f, "Invalid format: {}", msg),`
	`34`	`+ ParserError::IoError(msg) => write!(f, "IO error: {msg}"),`
	`35`	`+ ParserError::ParseError(msg) => write!(f, "Parse error: {msg}"),`
	`36`	`+ ParserError::InvalidFormat(msg) => write!(f, "Invalid format: {msg}"),`
`37`	`37`	`}`
`38`	`38`	`}`
`39`	`39`	`}`
`40`	`40`
`41`		`-/// Implements the std::error::Error trait for ParserError to allow it to be used`
	`41`	``+/// Implements the `std::error::Error` trait for `ParserError` to allow it to be used``
`42`	`42`	`/// with the ? operator and to be boxed as a dyn Error.`
`43`	`43`	`impl std::error::Error for ParserError {}`
`44`	`44`