Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 76 additions & 77 deletions Cargo.lock

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,9 @@ categories = ["text-processing", "parsing"]
calamine = "0.32.0"
docx-rs = "0.4.18"
infer = "0.19.0"
lazy_static = "1.5.0"
mime = "0.3.17"
pdf-extract = "0.10.0"
regex = "1.12.2"
regex = "1.12.3"
tempfile = "3.24.0"
tesseract = "0.15.2"
zip = "7.2.0"
Expand All @@ -38,6 +37,9 @@ env_logger = "0.11.8"
criterion = "0.8"
num_cpus = "1.17.0"

[lints.clippy]
pedantic = "warn"

[[bench]]
name = "function_parse"
harness = false
27 changes: 15 additions & 12 deletions benches/function_parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ fn benchmark_sequential_vs_parallel(c: &mut Criterion) {
.par_iter()
.map(|d| parse(black_box(d)))
.collect::<Result<Vec<String>, ParserError>>()
})
});
});

// Benchmark sequential parsing
Expand All @@ -66,7 +66,7 @@ fn benchmark_sequential_vs_parallel(c: &mut Criterion) {
.iter()
.map(|d| parse(black_box(d)))
.collect::<Result<Vec<String>, ParserError>>()
})
});
});

group.finish();
Expand Down Expand Up @@ -98,7 +98,7 @@ fn benchmark_parallel_efficiency(c: &mut Criterion) {
.par_iter()
.map(|d| parse(black_box(d)))
.collect::<Result<Vec<String>, ParserError>>()
})
});
});
}

Expand All @@ -123,7 +123,7 @@ fn benchmark_per_filetype(c: &mut Criterion) {
.par_iter()
.map(|d| parse(black_box(d)))
.collect::<Result<Vec<String>, ParserError>>()
})
});
});
}

Expand All @@ -144,7 +144,7 @@ fn benchmark_per_filetype(c: &mut Criterion) {
.par_iter()
.map(|d| parse(black_box(d)))
.collect::<Result<Vec<String>, ParserError>>()
})
});
});
}

Expand All @@ -153,6 +153,7 @@ fn benchmark_per_filetype(c: &mut Criterion) {

// Finds the threshold number of files for each type that takes less than 16ms
fn benchmark_parallel_threshold(c: &mut Criterion) {
const SAMPLE_COUNT: usize = 5;
let max_time_threshold = Duration::from_millis(16);

// Read each test file only once
Expand Down Expand Up @@ -181,7 +182,6 @@ fn benchmark_parallel_threshold(c: &mut Criterion) {
}

// Take multiple measurements and use median for robustness
const SAMPLE_COUNT: usize = 5;
let mut durations = Vec::with_capacity(SAMPLE_COUNT);

for _ in 0..SAMPLE_COUNT {
Expand Down Expand Up @@ -227,13 +227,16 @@ fn benchmark_parallel_threshold(c: &mut Criterion) {
// The threshold count is now in 'low'
let threshold_count = low;

// Define percentages to test around the threshold
let percentages = [99.0, 99.9, 100.0, 100.1, 101.0];
// Permille values for percentages: 99.0%, 99.9%, 100.0%, 100.1%, 101.0%
let permille_values: [usize; 5] = [990, 999, 1000, 1001, 1010];

// Generate test points based on percentages of the threshold
let mut test_points: Vec<usize> = percentages
// Generate test points based on percentages of the threshold using integer math
let mut test_points: Vec<usize> = permille_values
.iter()
.map(|&p| ((threshold_count as f64 * p / 100.0).ceil() as usize).max(1))
.map(|&p| {
let product = threshold_count.saturating_mul(p);
product.div_ceil(1000).max(1)
})
.collect();

test_points.dedup();
Expand All @@ -251,7 +254,7 @@ fn benchmark_parallel_threshold(c: &mut Criterion) {
.par_iter()
.map(|d| parse(black_box(d)))
.collect::<Result<Vec<String>, ParserError>>()
})
});
});
}

Expand Down
2 changes: 1 addition & 1 deletion src/core/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ pub const APPLICATION_DOCX: &str =
pub const APPLICATION_XLSX: &str =
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

/// MIME type for PPTX (Microsoft PowerPoint) presentations
/// MIME type for PPTX (Microsoft `PowerPoint`) presentations
pub const APPLICATION_PPTX: &str =
"application/vnd.openxmlformats-officedocument.presentationml.presentation";
8 changes: 4 additions & 4 deletions src/core/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ pub enum ParserError {
impl std::fmt::Display for ParserError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ParserError::IoError(msg) => write!(f, "IO error: {}", msg),
ParserError::ParseError(msg) => write!(f, "Parse error: {}", msg),
ParserError::InvalidFormat(msg) => write!(f, "Invalid format: {}", msg),
ParserError::IoError(msg) => write!(f, "IO error: {msg}"),
ParserError::ParseError(msg) => write!(f, "Parse error: {msg}"),
ParserError::InvalidFormat(msg) => write!(f, "Invalid format: {msg}"),
}
}
}

/// Implements the std::error::Error trait for ParserError to allow it to be used
/// Implements the `std::error::Error` trait for `ParserError` to allow it to be used
/// with the ? operator and to be boxed as a dyn Error.
impl std::error::Error for ParserError {}

Expand Down
14 changes: 8 additions & 6 deletions src/core/parsers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,12 @@ use super::{
errors::ParserError,
};
use infer::Infer;
use lazy_static::lazy_static;
use mime::{IMAGE, Mime, TEXT, TEXT_PLAIN};
use std::str;
use std::sync::LazyLock;

// Create a static infer instance to avoid recreating it on every call
lazy_static! {
static ref INFER: Infer = Infer::new();
}
static INFER: LazyLock<Infer> = LazyLock::new(Infer::new);

/// Parses the given data into plain text.
///
Expand Down Expand Up @@ -74,6 +72,11 @@ lazy_static! {
/// // Verify the result
/// assert_eq!(result, "Hello, world! This is a sample text file.");
/// ```
///
/// # Errors
///
/// Returns [`ParserError::InvalidFormat`] if the file type is unsupported or unrecognized.
/// May return other [`ParserError`] variants if an error occurs during parsing.
pub fn parse(data: &[u8]) -> Result<String, ParserError> {
match determine_mime_type(data) {
Some(mime) if mime == APPLICATION_PDF => parse_pdf(data),
Expand All @@ -83,8 +86,7 @@ pub fn parse(data: &[u8]) -> Result<String, ParserError> {
Some(mime) if mime.type_() == TEXT => parse_text(data),
Some(mime) if mime.type_() == IMAGE => parse_image(data),
Some(mime) => Err(ParserError::InvalidFormat(format!(
"Unsupported file type: {}",
mime
"Unsupported file type: {mime}"
))),
None => Err(ParserError::InvalidFormat(
"Could not determine file type.".to_string(),
Expand Down
4 changes: 2 additions & 2 deletions src/core/parsers/docx.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! DOCX parser module.
//!
//! This module provides functionality for extracting text from Microsoft Word DOCX
//! documents using the docx_rs library.
//! documents using the `docx_rs` library.

use super::super::errors::ParserError;
use docx_rs::read_docx;
Expand All @@ -22,7 +22,7 @@ use docx_rs::read_docx;
///
/// # Implementation Notes
///
/// * Uses the docx_rs library for DOCX parsing
/// * Uses the `docx_rs` library for DOCX parsing
/// * Extracts text by traversing document structure: documents → paragraphs → runs → text
/// * Joins paragraphs with newlines and trims whitespace from the result
/// * TODO: Consider simplifying the document traversal logic
Expand Down
24 changes: 11 additions & 13 deletions src/core/parsers/image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
//! various image formats including PNG, JPEG, and WebP.

use super::super::errors::ParserError;
use lazy_static::lazy_static;
use std::sync::LazyLock;
use std::{fs, io::Write};
use tempfile::{NamedTempFile, TempDir};
use tesseract::Tesseract;
Expand All @@ -20,20 +20,18 @@ const TESSDATA_FRA: &[u8] = include_bytes!(concat!(
"/assets/ocr/fra.traineddata"
));

lazy_static! {
static ref TESSDATA_DIR: TempDir = {
let dir = tempfile::tempdir().expect("Failed to create tessdata directory");
let dir_path = dir.path();
static TESSDATA_DIR: LazyLock<TempDir> = LazyLock::new(|| {
let dir = tempfile::tempdir().expect("Failed to create tessdata directory");
let dir_path = dir.path();

// Write language files to tessdata directory (only done once)
fs::write(dir_path.join("eng.traineddata"), TESSDATA_ENG)
.expect("Failed to write English training data");
fs::write(dir_path.join("fra.traineddata"), TESSDATA_FRA)
.expect("Failed to write French training data");
// Write language files to tessdata directory (only done once)
fs::write(dir_path.join("eng.traineddata"), TESSDATA_ENG)
.expect("Failed to write English training data");
fs::write(dir_path.join("fra.traineddata"), TESSDATA_FRA)
.expect("Failed to write French training data");

dir
};
}
dir
});

/// Parses image data and extracts text using OCR.
///
Expand Down
4 changes: 2 additions & 2 deletions src/core/parsers/pdf.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! PDF parser module.
//!
//! This module provides functionality for extracting text from PDF documents using
//! the pdf_extract library.
//! the `pdf_extract` library.

use super::super::errors::ParserError;
use pdf_extract::extract_text_from_mem;
Expand All @@ -22,7 +22,7 @@ use pdf_extract::extract_text_from_mem;
///
/// # Implementation Notes
///
/// * Uses the pdf_extract library for PDF text extraction
/// * Uses the `pdf_extract` library for PDF text extraction
/// * Trims whitespace from the result before returning
/// * TODO: Need to find a way to silence the output of that function since on
/// unknown characters it outputs a lot of errors, cluttering the logs.
Expand Down
7 changes: 5 additions & 2 deletions src/core/parsers/pptx.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! PPTX parser module.
//!
//! This module provides functionality for extracting text from Microsoft PowerPoint
//! This module provides functionality for extracting text from Microsoft `PowerPoint`
//! PPTX presentation files. It uses the zip crate to extract slide XML files and
//! regex to extract text content.

Expand Down Expand Up @@ -46,7 +46,10 @@ pub(crate) fn parse_pptx(data: &[u8]) -> Result<String, ParserError> {
let mut file = archive.by_index(i)?;

// Only process slide XML files
if file.name().starts_with("ppt/slides/slide") && file.name().ends_with(".xml") {
let is_xml = std::path::Path::new(file.name())
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("xml"));
if file.name().starts_with("ppt/slides/slide") && is_xml {
slide_count += 1;
if slide_count > 1 {
text.push_str("\n--- Slide ");
Expand Down
6 changes: 3 additions & 3 deletions src/core/parsers/xlsx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use std::io::Cursor;
/// * Adds sheet headers for multi-sheet workbooks
/// * Memory-efficient implementation using cursors instead of temporary files
/// * TODO: Need proper logic to escape commas and quotes
/// * TODO: Consider using the csv crate to convert each sheet and pass it through the parse_text function
/// * TODO: Consider using the csv crate to convert each sheet and pass it through the `parse_text` function
pub(crate) fn parse_xlsx(data: &[u8]) -> Result<String, ParserError> {
// Create a cursor from the bytes for memory-based reading
let cursor = Cursor::new(data);
Expand All @@ -42,7 +42,7 @@ pub(crate) fn parse_xlsx(data: &[u8]) -> Result<String, ParserError> {
let mut csv_data = String::new();

// Copy the sheet names to avoid borrowing issues
let sheet_names = excel.sheet_names().to_vec();
let sheet_names = excel.sheet_names().clone();

for name in sheet_names {
if let Ok(range) = excel.worksheet_range(&name) {
Expand All @@ -55,7 +55,7 @@ pub(crate) fn parse_xlsx(data: &[u8]) -> Result<String, ParserError> {
.rows()
.map(|row| {
row.iter()
.map(|cell| cell.to_string())
.map(std::string::ToString::to_string)
.collect::<Vec<String>>()
.join(",")
})
Expand Down
16 changes: 9 additions & 7 deletions src/web/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ pub enum ApiError {
impl std::fmt::Display for ApiError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ApiError::BadRequest(msg) => write!(f, "Bad Request: {}", msg),
ApiError::InternalError(msg) => write!(f, "Internal Error: {}", msg),
ApiError::ConfigError(msg) => write!(f, "Configuration Error: {}", msg),
ApiError::BadRequest(msg) => write!(f, "Bad Request: {msg}"),
ApiError::InternalError(msg) => write!(f, "Internal Error: {msg}"),
ApiError::ConfigError(msg) => write!(f, "Configuration Error: {msg}"),
}
}
}
Expand All @@ -47,16 +47,18 @@ impl ResponseError for ApiError {

match self {
ApiError::BadRequest(_) => HttpResponse::BadRequest().json(error_response),
ApiError::InternalError(_) => HttpResponse::InternalServerError().json(error_response),
ApiError::ConfigError(_) => HttpResponse::InternalServerError().json(error_response),
ApiError::InternalError(_) | ApiError::ConfigError(_) => {
HttpResponse::InternalServerError().json(error_response)
}
}
}

fn status_code(&self) -> StatusCode {
match self {
ApiError::BadRequest(_) => StatusCode::BAD_REQUEST,
ApiError::InternalError(_) => StatusCode::INTERNAL_SERVER_ERROR,
ApiError::ConfigError(_) => StatusCode::INTERNAL_SERVER_ERROR,
ApiError::InternalError(_) | ApiError::ConfigError(_) => {
StatusCode::INTERNAL_SERVER_ERROR
}
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion tests/endpoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@ fn test_file_paths_exist() {
let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("tests/assets")
.join(name);
assert!(path.exists(), "Test file should exist: {:?}", path);
assert!(path.exists(), "Test file should exist: {path:?}");
}
}