diff --git a/.gitignore b/.gitignore index 4cca0855..ac43cc6e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ build dist extlib_*/ scratch_* +perf/results/** # Created by https://www.toptal.com/developers/gitignore/api/python,macos,emacs,cmake,virtualenv # Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,emacs,cmake,virtualenv diff --git a/README.md b/README.md index 70843ae6..3dcd0257 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,8 @@ uv sync The latter will only work after a clean `git clone`. If you are developing and updating C++ code, please use, ```sh -uv pip install --force-reinstall --no-deps -e . +# uv pip install --force-reinstall --no-deps -e . +rm -rf .venv; uv venv; uv pip install --force-reinstall --no-deps -e ".[perf-tools]" ``` To test the package, run: diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 04b909b5..db0310df 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -446,8 +446,8 @@ def _to_segmented_page( "`words` will be created for segmented_page in an inefficient way!" ) self._create_word_cells(segmented_page, enforce_same_font=enforce_same_font) - else: - logging.warning("No `words` will be created for segmented_page") + # else: + # logging.warning("No `words` will be created for segmented_page") if create_textlines and ("line_cells" in page): segmented_page.textline_cells = self._to_cells(page["line_cells"]) @@ -459,8 +459,8 @@ def _to_segmented_page( self._create_textline_cells( segmented_page, enforce_same_font=enforce_same_font ) - else: - logging.warning("No `text_lines` will be created for segmented_page") + # else: + # logging.warning("No `text_lines` will be created for segmented_page") return segmented_page diff --git a/perf/README.md b/perf/README.md new file mode 100644 index 00000000..fc94d16a --- /dev/null +++ b/perf/README.md @@ -0,0 +1,22 @@ +Perf tools for page-level parsing benchmarking. 
def find_pdfs(path: Path, recursive: bool = False) -> List[Path]:
    """Collect the PDF files designated by *path*.

    Args:
        path: A single PDF file or a directory to search.
        recursive: When *path* is a directory, also descend into its
            subdirectories.

    Returns:
        A sorted list of PDF paths. The list is empty when *path* is a
        non-PDF file, does not exist, or contains no PDFs.
    """
    if path.is_file():
        return [path] if path.suffix.lower() == ".pdf" else []

    # Match the extension case-insensitively, exactly like the single-file
    # branch above; a literal "*.pdf" glob would silently skip "X.PDF".
    candidates = path.rglob("*") if recursive else path.glob("*")
    return sorted(
        p for p in candidates if p.suffix.lower() == ".pdf" and p.is_file()
    )
def percentile(values: List[float], p: float) -> float:
    """Return the *p*-th percentile of *values* by linear interpolation.

    Args:
        values: Samples in any order; the list is not modified.
        p: Percentile on a 0-100 scale. Values at or below 0 give the
            minimum, values at or above 100 give the maximum.

    Returns:
        The interpolated percentile, or ``0.0`` when *values* is empty.
    """
    if not values:
        return 0.0
    if p <= 0:
        return min(values)
    if p >= 100:
        return max(values)

    ordered = sorted(values)
    last = len(ordered) - 1
    # Fractional rank of the requested percentile within the sorted samples.
    rank = last * (p / 100.0)
    lower = int(rank)
    upper = min(lower + 1, last)
    if lower == upper:
        return ordered[lower]

    # Weight the two neighbouring samples by their distance to the rank.
    return ordered[lower] * (upper - rank) + ordered[upper] * (rank - lower)
def parse_with_pypdfium2(pdf_path: Path) -> Iterable[Row]:
    """Time page-by-page text extraction of *pdf_path* with pypdfium2.

    For every page the text page is loaded and the text of each of its
    rectangles is extracted; the elapsed time covers loading, extraction
    and closing of the page handles.

    Args:
        pdf_path: PDF file to benchmark.

    Returns:
        One ``Row`` per page (1-based page number). If pypdfium2 cannot be
        imported or the document cannot be opened, a single ``Row`` with
        page number ``-1`` describing the failure is returned instead.
    """
    try:
        import pypdfium2 as pdfium  # type: ignore
    except Exception as e:  # pragma: no cover
        return [Row(str(pdf_path), -1, 0.0, False, f"import pypdfium2: {e}")]

    rows: List[Row] = []
    try:
        doc = pdfium.PdfDocument(str(pdf_path))
    except Exception as e:  # pragma: no cover
        return [Row(str(pdf_path), -1, 0.0, False, f"open: {e}")]

    try:
        for i in range(len(doc)):
            t0 = time.perf_counter()
            ok = True
            err = ""
            page = None
            text_page = None
            try:
                page = doc[i]
                text_page = page.get_textpage()
                for rect_idx in range(text_page.count_rects()):
                    rect = text_page.get_rect(rect_idx)
                    # The text itself is discarded; only the cost matters.
                    text_page.get_text_bounded(*rect)
            except Exception as e:  # pragma: no cover
                ok = False
                err = str(e)
                print(f"error: {err}")
            finally:
                # Release the handles even when extraction failed, child
                # (text page) before parent (page).
                for handle in (text_page, page):
                    if handle is None:
                        continue
                    try:
                        handle.close()
                    except Exception as e:  # pragma: no cover
                        # Keep the first error; a close failure only marks
                        # the page as failed if nothing else went wrong.
                        if ok:
                            ok = False
                            err = str(e)
                            print(f"error: {err}")

            t1 = time.perf_counter()
            rows.append(Row(str(pdf_path), i + 1, t1 - t0, ok, err))
    finally:
        # best-effort cleanup of the document handle
        try:
            doc.close()
        except Exception:
            pass

    return rows
def compute_stats(rows: List[Row]) -> dict:
    """Aggregate run-wide timing statistics over all benchmark rows.

    Rows with a non-positive page number (file-level failures) count
    towards the number of files but not towards any page figure. Timing
    figures are computed from successful pages only and fall back to
    ``0.0`` when there are none.

    Args:
        rows: Every row produced during the run.

    Returns:
        A dict with file/page counts, total and average time, the
        p50/p90/p95/p99 percentiles, and the min/max page time.
    """
    page_rows = [row for row in rows if row.page_number > 0]
    ok_times = [row.elapsed_sec for row in page_rows if row.success]
    has_times = bool(ok_times)

    return {
        "files": len({row.filename for row in rows}),
        "pages_total": len(page_rows),
        "pages_ok": len(ok_times),
        "pages_failed": len(page_rows) - len(ok_times),
        "time_total_sec": sum(ok_times),
        "time_avg_sec": mean(ok_times) if has_times else 0.0,
        "p50_sec": percentile(ok_times, 50),
        "p90_sec": percentile(ok_times, 90),
        "p95_sec": percentile(ok_times, 95),
        "p99_sec": percentile(ok_times, 99),
        "min_sec": min(ok_times) if has_times else 0.0,
        "max_sec": max(ok_times) if has_times else 0.0,
    }
def compute_per_document_stats(rows: List[Row]) -> List[dict]:
    """Summarise page timings separately for every document.

    Only rows with a positive page number are considered, so a document
    that produced nothing but a file-level failure row does not appear.
    ``pages`` counts all page rows of a document; the timing figures use
    its successful pages only and are ``0.0`` when it has none.

    Args:
        rows: Every row produced during the run.

    Returns:
        One dict per document, ordered by file name.
    """
    page_counts: dict[str, int] = {}
    ok_times: dict[str, List[float]] = {}

    for row in rows:
        if row.page_number <= 0:
            continue
        page_counts[row.filename] = page_counts.get(row.filename, 0) + 1
        if row.success:
            ok_times.setdefault(row.filename, []).append(row.elapsed_sec)

    def summarize(fname: str) -> dict:
        times = ok_times.get(fname, [])
        return {
            "document": fname,
            "pages": page_counts.get(fname, 0),
            "total": sum(times) if times else 0.0,
            "mean": mean(times) if times else 0.0,
            "median": median(times) if times else 0.0,
            "min": min(times) if times else 0.0,
            "max": max(times) if times else 0.0,
            "p90": percentile(times, 90),
            "p95": percentile(times, 95),
            "p99": percentile(times, 99),
        }

    # Every file with successful pages also has a page count, so the page
    # count keys already cover all documents.
    return [summarize(fname) for fname in sorted(page_counts)]
def main(argv: List[str]) -> int:
    """Run the benchmark from the command line.

    Parses *argv*, runs the selected parser over every PDF found, writes
    the per-page results to a CSV file and prints summary statistics.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        Process exit code: ``0`` on success, ``2`` when no PDF was found.
    """
    ap = argparse.ArgumentParser(description="Page-level PDF parsing perf harness")
    ap.add_argument("input", help="Path to a PDF file or directory of PDFs")
    ap.add_argument(
        "--parser",
        "-p",
        default="docling",
        # Derived from the registry so new backends cannot get out of sync.
        choices=sorted(PARSERS),
        help="Parser backend to benchmark",
    )
    ap.add_argument(
        "--recursive",
        "-r",
        action="store_true",
        help="Recurse into subdirectories when input is a directory",
    )
    ap.add_argument(
        "--output",
        "-o",
        type=str,
        default=None,
        help="Output CSV path. Defaults to perf/results/perf_<parser>_<timestamp>.csv",
    )

    args = ap.parse_args(argv)

    parser_key = args.parser
    parser_fn = PARSERS[parser_key]
    input_path = Path(args.input)
    pdfs = find_pdfs(input_path, recursive=args.recursive)

    if not pdfs:
        print(f"No PDFs found at {input_path}", file=sys.stderr)
        return 2

    out_path = Path(args.output) if args.output else default_output_path(parser_key)
    ensure_parent_dir(out_path)

    rows: List[Row] = []
    started = time.perf_counter()
    for pdf in tqdm(pdfs, desc=f"Parsing PDFs with {parser_key}"):
        rows.extend(parser_fn(pdf))
    ended = time.perf_counter()

    # Write CSV. Error messages and file names may contain non-ASCII text,
    # so do not depend on the platform's default encoding.
    with out_path.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["filename", "page_number", "elapsed_sec", "success", "error"])
        w.writerows(
            [r.filename, r.page_number, f"{r.elapsed_sec:.9f}", int(r.success), r.error]
            for r in rows
        )

    # Print summary
    stats = compute_stats(rows)
    print_stats(stats, parser_key)
    print_per_document_table(rows)
    print(f"\nWrote: {out_path}")
    print(f"Total wall time: {fmt_seconds(ended - started)} sec")

    return 0
"autoflake>=2.3.1,<3.0.0" ] +perf-test = [ + "pdfplumber>=0.11.7", + "pymupdf>=1.26.4", + "pypdfium2>=4.30.0", +] + +[project.optional-dependencies] +perf-tools = [ + "pdfplumber>=0.11.7", + "pymupdf>=1.26.4", + "pypdfium2>=4.30.0", +] [tool.uv] package = true default-groups = "all" diff --git a/src/v2/pdf_decoders/document.h b/src/v2/pdf_decoders/document.h index a32bf8d5..98e84a49 100644 --- a/src/v2/pdf_decoders/document.h +++ b/src/v2/pdf_decoders/document.h @@ -298,11 +298,16 @@ namespace pdflib utils::timer page_timer; pdf_decoder page_decoder(pages.at(page_number), page_number); - - auto timings_ = page_decoder.decode_page(page_boundary, do_sanitization); - - update_timings(timings_, set_timer); - set_timer=false; + + { + //utils::timer decode_timer; + auto timings_ = page_decoder.decode_page(page_boundary, do_sanitization); + + //std::cout << "decode_timer: " << decode_timer.get_time() << "\n"; + + update_timings(timings_, set_timer); + set_timer=false; + } nlohmann::json page = page_decoder.get(keep_char_cells, keep_lines, keep_bitmaps, do_sanitization); @@ -319,11 +324,17 @@ namespace pdflib horizontal_cell_tolerance, enforce_same_font, space_width_factor_for_merge); + + // quadratic: might be slower ... + sanitizer.remove_duplicate_cells(word_cells, 0.5, true); + page["original"]["word_cells"] = word_cells.get(); } if(create_line_cells) { + //utils::timer line_cells_timer; + LOG_S(INFO) << "creating line-cells in `original` (2)"; double horizontal_cell_tolerance=1.00; @@ -336,7 +347,11 @@ namespace pdflib enforce_same_font, space_width_factor_for_merge, space_width_factor_for_merge_with_space); + // quadratic: might be slower ... 
+ sanitizer.remove_duplicate_cells(line_cells, 0.5, true); + page["original"]["line_cells"] = line_cells.get(); + //std::cout << "line_cells: " << line_cells_timer.get_time() << "\n"; } json_pages.push_back(page); diff --git a/src/v2/pdf_decoders/page.h b/src/v2/pdf_decoders/page.h index 659fa2ac..82d2a5e4 100644 --- a/src/v2/pdf_decoders/page.h +++ b/src/v2/pdf_decoders/page.h @@ -198,10 +198,19 @@ namespace pdflib { utils::timer timer; - json_page = to_json(qpdf_page); + { + //utils::timer _; + json_page = to_json(qpdf_page); + //std::cout << "json_page: " << _.get_time() << "\n"; + } - json_annots = extract_annots_in_json(qpdf_page); + { + //utils::timer _; + json_annots = extract_annots_in_json(qpdf_page); + //std::cout << "json_annots: " << _.get_time() << "\n"; + } + /* try { LOG_S(INFO) << "json_page: \n" << json_page.dump(2); @@ -211,32 +220,61 @@ namespace pdflib LOG_S(ERROR) << "could not dump the json-representation of the page with error: " << e.what(); } - + */ + decode_dimensions(); - decode_resources(); + { + //utils::timer _; + decode_resources(); + //std::cout << "decode_resources: " << _.get_time() << "\n"; + } - decode_contents(); + { + //utils::timer _; + decode_contents(); + //std::cout << "decode_contents: " << _.get_time() << "\n"; + } - decode_annots(); + { + //utils::timer _; + decode_annots(); + //std::cout << "decode_annots: " << _.get_time() << "\n"; + } rotate_contents(); // fix the orientiation { + //utils::timer _; + pdf_sanitator sanitator(page_dimension); sanitator.sanitize(page_boundary); // update the top-level bbox sanitator.sanitize(page_cells, page_boundary); sanitator.sanitize(page_lines, page_boundary); sanitator.sanitize(page_images, page_boundary); + + //std::cout << "pdf_sanitator: " << _.get_time() << "\n"; } { pdf_sanitator sanitator; - sanitator.remove_duplicate_chars(page_cells, 0.5); - sanitator.sanitize_text(page_cells); + { + //utils::timer _; + + //sanitator.remove_adjacent_cells(page_cells, 0.5); + 
sanitator.remove_duplicate_cells(page_cells, 0.5, true); + + //std::cout << "pdf_sanitator::remove_duplicate_chars " << _.get_time() << "\n"; + } + + { + //utils::timer _; + sanitator.sanitize_text(page_cells); + //std::cout << "pdf_sanitator::sanitize_text " << _.get_time() << "\n"; + } } if(do_sanitization) @@ -274,7 +312,7 @@ namespace pdflib auto parent = qpdf_page.getKey("/Parent"); if(parent.hasKey("/Resources")) { - LOG_S(INFO) << "parent of page has resources!: " << json_parent_resources.dump(2); + //LOG_S(INFO) << "parent of page has resources!: " << json_parent_resources.dump(2); qpdf_parent_resources = parent.getKey("/Resources"); json_parent_resources = to_json(qpdf_parent_resources); //json_page["/Resources"]; diff --git a/src/v2/pdf_decoders/stream.h b/src/v2/pdf_decoders/stream.h index bbc6f8d9..47e82b3e 100644 --- a/src/v2/pdf_decoders/stream.h +++ b/src/v2/pdf_decoders/stream.h @@ -505,7 +505,7 @@ namespace pdflib this->Q(); } - LOG_S(INFO) << "ending the execution of FORM XObject with name" << xobj_name; + LOG_S(INFO) << "ending the execution of FORM XObject with name `" << xobj_name << "`"; } break; diff --git a/src/v2/pdf_resources/page_cells.h b/src/v2/pdf_resources/page_cells.h index 7efbd876..1a9616b1 100644 --- a/src/v2/pdf_resources/page_cells.h +++ b/src/v2/pdf_resources/page_cells.h @@ -35,14 +35,19 @@ namespace pdflib itr_type erase(itr_type itr_0, itr_type itr_1) { return cells.erase(itr_0, itr_1); } pdf_resource& at(std::size_t i) { return cells.at(i); } + + void remove_inactive_cells(); private: std::vector > cells; }; - pdf_resource::pdf_resource() - {} + pdf_resource::pdf_resource(): + cells(0) // 0 elements + { + cells.reserve(1000000); // Reserve space for 1M elements + } pdf_resource::~pdf_resource() {} @@ -124,6 +129,42 @@ namespace pdflib cells.push_back(cell); } + void pdf_resource::remove_inactive_cells() + { + /* + auto itr=cells.begin(); + while(itr!=cells.end()) + { + if(itr->active) + { + itr++; + } + else + { + itr = 
cells.erase(itr); + } + } + */ + + std::size_t write_pos = 0; + for(std::size_t read_pos = 0; read_pos < cells.size(); ++read_pos) + { + if(cells[read_pos].active) + { + if(write_pos != read_pos) + { + cells[write_pos] = std::move(cells[read_pos]); + } + ++write_pos; + } + else + { + LOG_S(WARNING) << "removing inactive cell (text: " << cells[read_pos].text << ")"; + } + } + cells.resize(write_pos); + } + } #endif diff --git a/src/v2/pdf_resources/page_xobject.h b/src/v2/pdf_resources/page_xobject.h index 94266b00..03fb668a 100644 --- a/src/v2/pdf_resources/page_xobject.h +++ b/src/v2/pdf_resources/page_xobject.h @@ -152,7 +152,7 @@ namespace pdflib nlohmann::json& json_xobject_, QPDFObjectHandle qpdf_xobject_) { - LOG_S(INFO) << __FUNCTION__; + LOG_S(INFO) << __FUNCTION__ << ": " << xobject_key_; xobject_key = xobject_key_; @@ -161,7 +161,7 @@ namespace pdflib parse(); - LOG_S(INFO) << json_xobject.dump(2); + // LOG_S(INFO) << json_xobject.dump(2); } void pdf_resource::parse() @@ -172,7 +172,7 @@ namespace pdflib qpdf_xobject_dict = qpdf_xobject.getDict(); json_xobject_dict = to_json(qpdf_xobject_dict); - LOG_S(INFO) << "xobject-dict: " << json_xobject_dict.dump(2); + // LOG_S(INFO) << "xobject-dict: " << json_xobject_dict.dump(2); } { diff --git a/src/v2/pdf_resources/page_xobjects.h b/src/v2/pdf_resources/page_xobjects.h index d301fe49..d85a1727 100644 --- a/src/v2/pdf_resources/page_xobjects.h +++ b/src/v2/pdf_resources/page_xobjects.h @@ -77,17 +77,25 @@ namespace pdflib LOG_S(INFO) << __FUNCTION__; //LOG_S(INFO) << json_xobjects.dump(2); + int cnt = 0; + int len = json_xobjects.size(); + for(auto& pair : json_xobjects.items()) { - LOG_S(INFO) << "decoding xobject: " << pair.key(); + LOG_S(INFO) << "decoding xobject: " << pair.key() << "\t" << (++cnt) << "/" << len; - std::string key = pair.key(); - nlohmann::json& val = pair.value(); - - pdf_resource page_xobject; - page_xobject.set(key, val, qpdf_xobjects.getKey(key)); - - page_xobjects[key] = page_xobject; 
+ std::string key = pair.key(); + nlohmann::json& val = pair.value(); + + if(page_xobjects.count(key)>0) + { + LOG_S(ERROR) << key << "is already in page_xobjects, overwriting ..."; + } + + pdf_resource page_xobject; + page_xobject.set(key, val, qpdf_xobjects.getKey(key)); + + page_xobjects[key] = page_xobject; } } diff --git a/src/v2/pdf_sanitators/cells.h b/src/v2/pdf_sanitators/cells.h index 46d611f5..2c7b94aa 100644 --- a/src/v2/pdf_sanitators/cells.h +++ b/src/v2/pdf_sanitators/cells.h @@ -28,7 +28,9 @@ namespace pdflib double space_width_factor_for_merge_with_space); //=0.33); - void remove_duplicate_chars(pdf_resource& cells, double eps=1.0e-1); + //void remove_duplicate_chars(pdf_resource& cells, double eps=1.0e-1); + void remove_adjacent_cells(pdf_resource& cells, double eps); //=1.0e-1); + void remove_duplicate_cells(pdf_resource& cells, double eps, bool same_line); void sanitize_bbox(pdf_resource& cells, double horizontal_cell_tolerance, //=1.0, @@ -203,7 +205,8 @@ namespace pdflib //return to_records(line_cells); return line_cells; } - + + /* void pdf_sanitator::remove_duplicate_chars(pdf_resource& cells, double eps) { while(true) @@ -216,14 +219,14 @@ namespace pdflib { continue; } - + for(int j=i+1; j::remove_duplicate_chars(pdf_resource& cells, double eps) + void pdf_sanitator::remove_adjacent_cells(pdf_resource& cells, double eps) + { + for(int i=0; i=cells.size() or (not cells[j].active)) + { + continue; + } + + if(cells[i].font_name==cells[j].font_name and + cells[i].text==cells[j].text and + utils::values::distance(cells[i].r_x0, cells[i].r_y0, cells[j].r_x0, cells[j].r_y0)::remove_duplicate_cells(pdf_resource& cells, double eps, bool same_line) + { + for(int i=0; ieps) + { + break; + } + + if(not cells[j].active) + { + continue; + } + + if(cells[i].font_name==cells[j].font_name and + cells[i].text==cells[j].text and + utils::values::distance(cells[i].r_x0, cells[i].r_y0, cells[j].r_x0, cells[j].r_y0)::sanitize_text(pdf_resource& cells) { for(int 
i=0; i >& cells) { - LOG_S(INFO) << __FUNCTION__ << " with text='" << text << "', width=" << width; + // LOG_S(INFO) << __FUNCTION__ << " with text='" << text << "', width=" << width; bool left_to_right = (not utils::string::is_right_to_left(text)); @@ -475,6 +475,7 @@ namespace pdflib double font_ascent = font.get_ascent(); double font_capheight = font.get_capheight(); + /* if(left_to_right) { LOG_S(INFO) << "font_descent: " << font_descent << ", " @@ -491,14 +492,17 @@ namespace pdflib << "capheight/ascent: " << font_capheight/font_ascent << ", " << "left_to_right: " << left_to_right << ", text: " << text; } + */ double space_width=0; { double w0 = font.get_space_width(); double w1 = (w0 / 1000.0 * font_size * h_scaling);// + (char_spacing+word_spacing)*h_scaling; + /* LOG_S(INFO) << __FUNCTION__ << " -> w0: " << w0 << ", w1: " << w1 << ", " << "font_size: " << font_size << ", h_scaling: " << h_scaling; + */ std::array rect = compute_rect(font_descent, font_ascent, w1); space_width = std::sqrt((rect[2]-rect[0])*(rect[2]-rect[0])+ @@ -576,21 +580,21 @@ namespace pdflib std::vector > pdf_state::analyse_string(qpdf_instruction instruction) { - LOG_S(INFO) << __FUNCTION__ << " fontname: " << font_name << ", key: " << instruction.key << " => val: " << instruction.val; + //LOG_S(INFO) << __FUNCTION__ << " fontname: " << font_name << ", key: " << instruction.key << " => val: " << instruction.val; auto& font = page_fonts[font_name]; font_encoding_name encoding = font.get_encoding(); std::string values = instruction.to_char_string(); - LOG_S(INFO) << "values: " << values.size() << "\t" << values; + //LOG_S(INFO) << "values: " << values.size() << "\t" << values; std::vector > result; if(encoding == IDENTITY_H or encoding == IDENTITY_V ) // 2-byte string { - LOG_S(INFO) << "detected encoding: " << to_string(encoding); + //LOG_S(INFO) << "detected encoding: " << to_string(encoding); //assert(values.size()%2==0); @@ -632,7 +636,7 @@ namespace pdflib } else if(encoding == 
CMAP_RESOURCES) { - LOG_S(INFO) << "detected encoding: " << to_string(encoding); + //LOG_S(INFO) << "detected encoding: " << to_string(encoding); int l=0; @@ -685,7 +689,7 @@ namespace pdflib } else { - LOG_S(INFO) << "detected encoding: " << to_string(encoding); + //LOG_S(INFO) << "detected encoding: " << to_string(encoding); for(int l=0; l item(c,v); - LOG_S(INFO) << item.first << ": " << item.second; + //LOG_S(INFO) << item.first << ": " << item.second; result.push_back(item); } diff --git a/tests/data/cases/case_10.pdf b/tests/data/cases/case_10.pdf new file mode 100644 index 00000000..27adf84e Binary files /dev/null and b/tests/data/cases/case_10.pdf differ diff --git a/uv.lock b/uv.lock index 979d4f25..f28a62f1 100644 --- a/uv.lock +++ b/uv.lock @@ -187,6 +187,8 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/fc/97/c783634659c2920c3fc70419e3af40972dbaf758daa229a7d6ea6135c90d/cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824", size = 516621, upload-time = "2024-09-04T20:45:21.852Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/90/07/f44ca684db4e4f08a3fdc6eeb9a0d15dc6883efc7b8c90357fdbf74e186c/cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14", size = 182191, upload-time = "2024-09-04T20:43:30.027Z" }, + { url = "https://files.pythonhosted.org/packages/08/fd/cc2fedbd887223f9f5d170c96e57cbf655df9831a6546c1727ae13fa977a/cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67", size = 178592, upload-time = "2024-09-04T20:43:32.108Z" }, { url = "https://files.pythonhosted.org/packages/de/cc/4635c320081c78d6ffc2cab0a76025b691a91204f4aa317d568ff9280a2d/cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382", size = 426024, upload-time = "2024-09-04T20:43:34.186Z" }, { url = "https://files.pythonhosted.org/packages/b6/7b/3b2b250f3aab91abe5f8a51ada1b717935fdaec53f790ad4100fe2ec64d1/cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702", size = 448188, upload-time = "2024-09-04T20:43:36.286Z" }, { url = "https://files.pythonhosted.org/packages/d3/48/1b9283ebbf0ec065148d8de05d647a986c5f22586b18120020452fff8f5d/cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3", size = 455571, upload-time = "2024-09-04T20:43:38.586Z" }, @@ -195,6 +197,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/a0/62f00bcb411332106c02b663b26f3545a9ef136f80d5df746c05878f8c4b/cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8", size = 461325, upload-time = "2024-09-04T20:43:43.117Z" }, { url = "https://files.pythonhosted.org/packages/36/83/76127035ed2e7e27b0787604d99da630ac3123bfb02d8e80c633f218a11d/cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e", size = 438784, upload-time = "2024-09-04T20:43:45.256Z" }, { url = "https://files.pythonhosted.org/packages/21/81/a6cd025db2f08ac88b901b745c163d884641909641f9b826e8cb87645942/cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be", size = 461564, upload-time = "2024-09-04T20:43:46.779Z" }, + { url = "https://files.pythonhosted.org/packages/f8/fe/4d41c2f200c4a457933dbd98d3cf4e911870877bd94d9656cc0fcb390681/cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c", size = 
171804, upload-time = "2024-09-04T20:43:48.186Z" }, + { url = "https://files.pythonhosted.org/packages/d1/b6/0b0f5ab93b0df4acc49cae758c81fe4e5ef26c3ae2e10cc69249dfd8b3ab/cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15", size = 181299, upload-time = "2024-09-04T20:43:49.812Z" }, + { url = "https://files.pythonhosted.org/packages/6b/f4/927e3a8899e52a27fa57a48607ff7dc91a9ebe97399b357b85a0c7892e00/cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401", size = 182264, upload-time = "2024-09-04T20:43:51.124Z" }, + { url = "https://files.pythonhosted.org/packages/6c/f5/6c3a8efe5f503175aaddcbea6ad0d2c96dad6f5abb205750d1b3df44ef29/cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf", size = 178651, upload-time = "2024-09-04T20:43:52.872Z" }, { url = "https://files.pythonhosted.org/packages/94/dd/a3f0118e688d1b1a57553da23b16bdade96d2f9bcda4d32e7d2838047ff7/cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4", size = 445259, upload-time = "2024-09-04T20:43:56.123Z" }, { url = "https://files.pythonhosted.org/packages/2e/ea/70ce63780f096e16ce8588efe039d3c4f91deb1dc01e9c73a287939c79a6/cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41", size = 469200, upload-time = "2024-09-04T20:43:57.891Z" }, { url = "https://files.pythonhosted.org/packages/1c/a0/a4fa9f4f781bda074c3ddd57a572b060fa0df7655d2a4247bbe277200146/cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1", size = 477235, upload-time = "2024-09-04T20:44:00.18Z" }, @@ 
-203,6 +209,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/52/d9a0e523a572fbccf2955f5abe883cfa8bcc570d7faeee06336fbd50c9fc/cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6", size = 477999, upload-time = "2024-09-04T20:44:05.023Z" }, { url = "https://files.pythonhosted.org/packages/44/74/f2a2460684a1a2d00ca799ad880d54652841a780c4c97b87754f660c7603/cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f", size = 454242, upload-time = "2024-09-04T20:44:06.444Z" }, { url = "https://files.pythonhosted.org/packages/f8/4a/34599cac7dfcd888ff54e801afe06a19c17787dfd94495ab0c8d35fe99fb/cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b", size = 478604, upload-time = "2024-09-04T20:44:08.206Z" }, + { url = "https://files.pythonhosted.org/packages/34/33/e1b8a1ba29025adbdcda5fb3a36f94c03d771c1b7b12f726ff7fef2ebe36/cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655", size = 171727, upload-time = "2024-09-04T20:44:09.481Z" }, + { url = "https://files.pythonhosted.org/packages/3d/97/50228be003bb2802627d28ec0627837ac0bf35c90cf769812056f235b2d1/cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0", size = 181400, upload-time = "2024-09-04T20:44:10.873Z" }, + { url = "https://files.pythonhosted.org/packages/5a/84/e94227139ee5fb4d600a7a4927f322e1d4aea6fdc50bd3fca8493caba23f/cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4", size = 183178, upload-time = "2024-09-04T20:44:12.232Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/ee/fb72c2b48656111c4ef27f0f91da355e130a923473bf5ee75c5643d00cca/cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c", size = 178840, upload-time = "2024-09-04T20:44:13.739Z" }, { url = "https://files.pythonhosted.org/packages/cc/b6/db007700f67d151abadf508cbfd6a1884f57eab90b1bb985c4c8c02b0f28/cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36", size = 454803, upload-time = "2024-09-04T20:44:15.231Z" }, { url = "https://files.pythonhosted.org/packages/1a/df/f8d151540d8c200eb1c6fba8cd0dfd40904f1b0682ea705c36e6c2e97ab3/cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5", size = 478850, upload-time = "2024-09-04T20:44:17.188Z" }, { url = "https://files.pythonhosted.org/packages/28/c0/b31116332a547fd2677ae5b78a2ef662dfc8023d67f41b2a83f7c2aa78b1/cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff", size = 485729, upload-time = "2024-09-04T20:44:18.688Z" }, @@ -210,6 +220,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/d5/da47df7004cb17e4955df6a43d14b3b4ae77737dff8bf7f8f333196717bf/cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93", size = 479424, upload-time = "2024-09-04T20:44:21.673Z" }, { url = "https://files.pythonhosted.org/packages/0b/ac/2a28bcf513e93a219c8a4e8e125534f4f6db03e3179ba1c45e949b76212c/cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3", size = 484568, upload-time = "2024-09-04T20:44:23.245Z" }, { 
url = "https://files.pythonhosted.org/packages/d4/38/ca8a4f639065f14ae0f1d9751e70447a261f1a30fa7547a828ae08142465/cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8", size = 488736, upload-time = "2024-09-04T20:44:24.757Z" }, + { url = "https://files.pythonhosted.org/packages/86/c5/28b2d6f799ec0bdecf44dced2ec5ed43e0eb63097b0f58c293583b406582/cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65", size = 172448, upload-time = "2024-09-04T20:44:26.208Z" }, + { url = "https://files.pythonhosted.org/packages/50/b9/db34c4755a7bd1cb2d1603ac3863f22bcecbd1ba29e5ee841a4bc510b294/cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903", size = 181976, upload-time = "2024-09-04T20:44:27.578Z" }, + { url = "https://files.pythonhosted.org/packages/8d/f8/dd6c246b148639254dad4d6803eb6a54e8c85c6e11ec9df2cffa87571dbe/cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e", size = 182989, upload-time = "2024-09-04T20:44:28.956Z" }, + { url = "https://files.pythonhosted.org/packages/8b/f1/672d303ddf17c24fc83afd712316fda78dc6fce1cd53011b839483e1ecc8/cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2", size = 178802, upload-time = "2024-09-04T20:44:30.289Z" }, { url = "https://files.pythonhosted.org/packages/0e/2d/eab2e858a91fdff70533cab61dcff4a1f55ec60425832ddfdc9cd36bc8af/cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3", size = 454792, upload-time = "2024-09-04T20:44:32.01Z" }, { url = 
"https://files.pythonhosted.org/packages/75/b2/fbaec7c4455c604e29388d55599b99ebcc250a60050610fadde58932b7ee/cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683", size = 478893, upload-time = "2024-09-04T20:44:33.606Z" }, { url = "https://files.pythonhosted.org/packages/4f/b7/6e4a2162178bf1935c336d4da8a9352cccab4d3a5d7914065490f08c0690/cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5", size = 485810, upload-time = "2024-09-04T20:44:35.191Z" }, @@ -217,6 +231,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/26/9f/1aab65a6c0db35f43c4d1b4f580e8df53914310afc10ae0397d29d697af4/cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd", size = 479447, upload-time = "2024-09-04T20:44:38.492Z" }, { url = "https://files.pythonhosted.org/packages/5f/e4/fb8b3dd8dc0e98edf1135ff067ae070bb32ef9d509d6cb0f538cd6f7483f/cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed", size = 484358, upload-time = "2024-09-04T20:44:40.046Z" }, { url = "https://files.pythonhosted.org/packages/f1/47/d7145bf2dc04684935d57d67dff9d6d795b2ba2796806bb109864be3a151/cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9", size = 488469, upload-time = "2024-09-04T20:44:41.616Z" }, + { url = "https://files.pythonhosted.org/packages/bf/ee/f94057fa6426481d663b88637a9a10e859e492c73d0384514a17d78ee205/cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d", size = 172475, upload-time = "2024-09-04T20:44:43.733Z" }, + { url = 
"https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009, upload-time = "2024-09-04T20:44:45.309Z" }, + { url = "https://files.pythonhosted.org/packages/b9/ea/8bb50596b8ffbc49ddd7a1ad305035daa770202a6b782fc164647c2673ad/cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16", size = 182220, upload-time = "2024-09-04T20:45:01.577Z" }, + { url = "https://files.pythonhosted.org/packages/ae/11/e77c8cd24f58285a82c23af484cf5b124a376b32644e445960d1a4654c3a/cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36", size = 178605, upload-time = "2024-09-04T20:45:03.837Z" }, { url = "https://files.pythonhosted.org/packages/ed/65/25a8dc32c53bf5b7b6c2686b42ae2ad58743f7ff644844af7cdb29b49361/cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8", size = 424910, upload-time = "2024-09-04T20:45:05.315Z" }, { url = "https://files.pythonhosted.org/packages/42/7a/9d086fab7c66bd7c4d0f27c57a1b6b068ced810afc498cc8c49e0088661c/cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576", size = 447200, upload-time = "2024-09-04T20:45:06.903Z" }, { url = "https://files.pythonhosted.org/packages/da/63/1785ced118ce92a993b0ec9e0d0ac8dc3e5dbfbcaa81135be56c69cabbb6/cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87", size = 454565, upload-time = "2024-09-04T20:45:08.975Z" }, @@ -225,6 +243,8 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/5b/95/b34462f3ccb09c2594aa782d90a90b045de4ff1f70148ee79c69d37a0a5a/cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595", size = 460486, upload-time = "2024-09-04T20:45:13.935Z" }, { url = "https://files.pythonhosted.org/packages/fc/fc/a1e4bebd8d680febd29cf6c8a40067182b64f00c7d105f8f26b5bc54317b/cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a", size = 437911, upload-time = "2024-09-04T20:45:15.696Z" }, { url = "https://files.pythonhosted.org/packages/e6/c3/21cab7a6154b6a5ea330ae80de386e7665254835b9e98ecc1340b3a7de9a/cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e", size = 460632, upload-time = "2024-09-04T20:45:17.284Z" }, + { url = "https://files.pythonhosted.org/packages/cb/b5/fd9f8b5a84010ca169ee49f4e4ad6f8c05f4e3545b72ee041dbbcb159882/cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7", size = 171820, upload-time = "2024-09-04T20:45:18.762Z" }, + { url = "https://files.pythonhosted.org/packages/8c/52/b08750ce0bce45c143e1b5d7357ee8c55341b52bdef4b0f081af1eb248c2/cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662", size = 181290, upload-time = "2024-09-04T20:45:20.226Z" }, ] [[package]] @@ -419,6 +439,7 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/fe/c8/a2a376a8711c1e11708b9c9972e0c3223f5fc682552c82d8db844393d6ce/cryptography-45.0.4.tar.gz", hash = "sha256:7405ade85c83c37682c8fe65554759800a4a8c54b2d96e0f8ad114d31b808d57", size = 744890, upload-time = "2025-06-10T00:03:51.297Z" } wheels = [ + { url = 
"https://files.pythonhosted.org/packages/cc/1c/92637793de053832523b410dbe016d3f5c11b41d0cf6eef8787aabb51d41/cryptography-45.0.4-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:425a9a6ac2823ee6e46a76a21a4e8342d8fa5c01e08b823c1f19a8b74f096069", size = 7055712, upload-time = "2025-06-10T00:02:38.826Z" }, { url = "https://files.pythonhosted.org/packages/ba/14/93b69f2af9ba832ad6618a03f8a034a5851dc9a3314336a3d71c252467e1/cryptography-45.0.4-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:680806cf63baa0039b920f4976f5f31b10e772de42f16310a6839d9f21a26b0d", size = 4205335, upload-time = "2025-06-10T00:02:41.64Z" }, { url = "https://files.pythonhosted.org/packages/67/30/fae1000228634bf0b647fca80403db5ca9e3933b91dd060570689f0bd0f7/cryptography-45.0.4-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4ca0f52170e821bc8da6fc0cc565b7bb8ff8d90d36b5e9fdd68e8a86bdf72036", size = 4431487, upload-time = "2025-06-10T00:02:43.696Z" }, { url = "https://files.pythonhosted.org/packages/6d/5a/7dffcf8cdf0cb3c2430de7404b327e3db64735747d641fc492539978caeb/cryptography-45.0.4-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:f3fe7a5ae34d5a414957cc7f457e2b92076e72938423ac64d215722f6cf49a9e", size = 4208922, upload-time = "2025-06-10T00:02:45.334Z" }, @@ -428,6 +449,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/db/b7/a84bdcd19d9c02ec5807f2ec2d1456fd8451592c5ee353816c09250e3561/cryptography-45.0.4-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:2882338b2a6e0bd337052e8b9007ced85c637da19ef9ecaf437744495c8c2999", size = 4463623, upload-time = "2025-06-10T00:02:52.542Z" }, { url = "https://files.pythonhosted.org/packages/d8/84/69707d502d4d905021cac3fb59a316344e9f078b1da7fb43ecde5e10840a/cryptography-45.0.4-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:23b9c3ea30c3ed4db59e7b9619272e94891f8a3a5591d0b656a7582631ccf750", size = 4332447, upload-time = "2025-06-10T00:02:54.63Z" }, { url = 
"https://files.pythonhosted.org/packages/f3/ee/d4f2ab688e057e90ded24384e34838086a9b09963389a5ba6854b5876598/cryptography-45.0.4-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b0a97c927497e3bc36b33987abb99bf17a9a175a19af38a892dc4bbb844d7ee2", size = 4572830, upload-time = "2025-06-10T00:02:56.689Z" }, + { url = "https://files.pythonhosted.org/packages/70/d4/994773a261d7ff98034f72c0e8251fe2755eac45e2265db4c866c1c6829c/cryptography-45.0.4-cp311-abi3-win32.whl", hash = "sha256:e00a6c10a5c53979d6242f123c0a97cff9f3abed7f064fc412c36dc521b5f257", size = 2932769, upload-time = "2025-06-10T00:02:58.467Z" }, + { url = "https://files.pythonhosted.org/packages/5a/42/c80bd0b67e9b769b364963b5252b17778a397cefdd36fa9aa4a5f34c599a/cryptography-45.0.4-cp311-abi3-win_amd64.whl", hash = "sha256:817ee05c6c9f7a69a16200f0c90ab26d23a87701e2a284bd15156783e46dbcc8", size = 3410441, upload-time = "2025-06-10T00:03:00.14Z" }, + { url = "https://files.pythonhosted.org/packages/ce/0b/2488c89f3a30bc821c9d96eeacfcab6ff3accc08a9601ba03339c0fd05e5/cryptography-45.0.4-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:964bcc28d867e0f5491a564b7debb3ffdd8717928d315d12e0d7defa9e43b723", size = 7031836, upload-time = "2025-06-10T00:03:01.726Z" }, { url = "https://files.pythonhosted.org/packages/fe/51/8c584ed426093aac257462ae62d26ad61ef1cbf5b58d8b67e6e13c39960e/cryptography-45.0.4-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6a5bf57554e80f75a7db3d4b1dacaa2764611ae166ab42ea9a72bcdb5d577637", size = 4195746, upload-time = "2025-06-10T00:03:03.94Z" }, { url = "https://files.pythonhosted.org/packages/5c/7d/4b0ca4d7af95a704eef2f8f80a8199ed236aaf185d55385ae1d1610c03c2/cryptography-45.0.4-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:46cf7088bf91bdc9b26f9c55636492c1cce3e7aaf8041bbf0243f5e5325cfb2d", size = 4424456, upload-time = "2025-06-10T00:03:05.589Z" }, { url = 
"https://files.pythonhosted.org/packages/1d/45/5fabacbc6e76ff056f84d9f60eeac18819badf0cefc1b6612ee03d4ab678/cryptography-45.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:7bedbe4cc930fa4b100fc845ea1ea5788fcd7ae9562e669989c11618ae8d76ee", size = 4198495, upload-time = "2025-06-10T00:03:09.172Z" }, @@ -437,14 +461,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/c0/85fa358ddb063ec588aed4a6ea1df57dc3e3bc1712d87c8fa162d02a65fc/cryptography-45.0.4-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:06509dc70dd71fa56eaa138336244e2fbaf2ac164fc9b5e66828fccfd2b680d6", size = 4451442, upload-time = "2025-06-10T00:03:16.248Z" }, { url = "https://files.pythonhosted.org/packages/33/67/362d6ec1492596e73da24e669a7fbbaeb1c428d6bf49a29f7a12acffd5dc/cryptography-45.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:5f31e6b0a5a253f6aa49be67279be4a7e5a4ef259a9f33c69f7d1b1191939872", size = 4325038, upload-time = "2025-06-10T00:03:18.4Z" }, { url = "https://files.pythonhosted.org/packages/53/75/82a14bf047a96a1b13ebb47fb9811c4f73096cfa2e2b17c86879687f9027/cryptography-45.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:944e9ccf67a9594137f942d5b52c8d238b1b4e46c7a0c2891b7ae6e01e7c80a4", size = 4560964, upload-time = "2025-06-10T00:03:20.06Z" }, + { url = "https://files.pythonhosted.org/packages/cd/37/1a3cba4c5a468ebf9b95523a5ef5651244693dc712001e276682c278fc00/cryptography-45.0.4-cp37-abi3-win32.whl", hash = "sha256:c22fe01e53dc65edd1945a2e6f0015e887f84ced233acecb64b4daadb32f5c97", size = 2924557, upload-time = "2025-06-10T00:03:22.563Z" }, + { url = "https://files.pythonhosted.org/packages/2a/4b/3256759723b7e66380397d958ca07c59cfc3fb5c794fb5516758afd05d41/cryptography-45.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:627ba1bc94f6adf0b0a2e35d87020285ead22d9f648c7e75bb64f367375f3b22", size = 3395508, upload-time = "2025-06-10T00:03:24.586Z" }, + { url = 
"https://files.pythonhosted.org/packages/16/33/b38e9d372afde56906a23839302c19abdac1c505bfb4776c1e4b07c3e145/cryptography-45.0.4-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a77c6fb8d76e9c9f99f2f3437c1a4ac287b34eaf40997cfab1e9bd2be175ac39", size = 3580103, upload-time = "2025-06-10T00:03:26.207Z" }, { url = "https://files.pythonhosted.org/packages/c4/b9/357f18064ec09d4807800d05a48f92f3b369056a12f995ff79549fbb31f1/cryptography-45.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7aad98a25ed8ac917fdd8a9c1e706e5a0956e06c498be1f713b61734333a4507", size = 4143732, upload-time = "2025-06-10T00:03:27.896Z" }, { url = "https://files.pythonhosted.org/packages/c4/9c/7f7263b03d5db329093617648b9bd55c953de0b245e64e866e560f9aac07/cryptography-45.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:3530382a43a0e524bc931f187fc69ef4c42828cf7d7f592f7f249f602b5a4ab0", size = 4385424, upload-time = "2025-06-10T00:03:29.992Z" }, { url = "https://files.pythonhosted.org/packages/a6/5a/6aa9d8d5073d5acc0e04e95b2860ef2684b2bd2899d8795fc443013e263b/cryptography-45.0.4-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:6b613164cb8425e2f8db5849ffb84892e523bf6d26deb8f9bb76ae86181fa12b", size = 4142438, upload-time = "2025-06-10T00:03:31.782Z" }, { url = "https://files.pythonhosted.org/packages/42/1c/71c638420f2cdd96d9c2b287fec515faf48679b33a2b583d0f1eda3a3375/cryptography-45.0.4-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:96d4819e25bf3b685199b304a0029ce4a3caf98947ce8a066c9137cc78ad2c58", size = 4384622, upload-time = "2025-06-10T00:03:33.491Z" }, + { url = "https://files.pythonhosted.org/packages/ef/ab/e3a055c34e97deadbf0d846e189237d3385dca99e1a7e27384c3b2292041/cryptography-45.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b97737a3ffbea79eebb062eb0d67d72307195035332501722a9ca86bab9e3ab2", size = 3328911, upload-time = "2025-06-10T00:03:35.035Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/ba/cf442ae99ef363855ed84b39e0fb3c106ac66b7a7703f3c9c9cfe05412cb/cryptography-45.0.4-pp311-pypy311_pp73-macosx_10_9_x86_64.whl", hash = "sha256:4828190fb6c4bcb6ebc6331f01fe66ae838bb3bd58e753b59d4b22eb444b996c", size = 3590512, upload-time = "2025-06-10T00:03:36.982Z" }, { url = "https://files.pythonhosted.org/packages/28/9a/a7d5bb87d149eb99a5abdc69a41e4e47b8001d767e5f403f78bfaafc7aa7/cryptography-45.0.4-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:03dbff8411206713185b8cebe31bc5c0eb544799a50c09035733716b386e61a4", size = 4146899, upload-time = "2025-06-10T00:03:38.659Z" }, { url = "https://files.pythonhosted.org/packages/17/11/9361c2c71c42cc5c465cf294c8030e72fb0c87752bacbd7a3675245e3db3/cryptography-45.0.4-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:51dfbd4d26172d31150d84c19bbe06c68ea4b7f11bbc7b3a5e146b367c311349", size = 4388900, upload-time = "2025-06-10T00:03:40.233Z" }, { url = "https://files.pythonhosted.org/packages/c0/76/f95b83359012ee0e670da3e41c164a0c256aeedd81886f878911581d852f/cryptography-45.0.4-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:0339a692de47084969500ee455e42c58e449461e0ec845a34a6a9b9bf7df7fb8", size = 4146422, upload-time = "2025-06-10T00:03:41.827Z" }, { url = "https://files.pythonhosted.org/packages/09/ad/5429fcc4def93e577a5407988f89cf15305e64920203d4ac14601a9dc876/cryptography-45.0.4-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:0cf13c77d710131d33e63626bd55ae7c0efb701ebdc2b3a7952b9b23a0412862", size = 4388475, upload-time = "2025-06-10T00:03:43.493Z" }, + { url = "https://files.pythonhosted.org/packages/99/49/0ab9774f64555a1b50102757811508f5ace451cf5dc0a2d074a4b9deca6a/cryptography-45.0.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:bbc505d1dc469ac12a0a064214879eac6294038d6b24ae9f71faae1448a9608d", size = 3337594, upload-time = "2025-06-10T00:03:45.523Z" }, ] [[package]] @@ -515,7 +545,7 @@ wheels = [ [[package]] name = 
"docling-parse" -version = "4.3.0" +version = "4.4.0" source = { editable = "." } dependencies = [ { name = "docling-core" }, @@ -525,6 +555,13 @@ dependencies = [ { name = "tabulate" }, ] +[package.optional-dependencies] +perf-tools = [ + { name = "pdfplumber" }, + { name = "pymupdf" }, + { name = "pypdfium2" }, +] + [package.dev-dependencies] build = [ { name = "cibuildwheel" }, @@ -546,15 +583,24 @@ dev = [ { name = "python-semantic-release" }, { name = "tqdm" }, ] +perf-test = [ + { name = "pdfplumber" }, + { name = "pymupdf" }, + { name = "pypdfium2" }, +] [package.metadata] requires-dist = [ { name = "docling-core", specifier = ">=2.44.1" }, + { name = "pdfplumber", marker = "extra == 'perf-tools'", specifier = ">=0.11.7" }, { name = "pillow", specifier = ">=10.0.0,<12.0.0" }, { name = "pydantic", specifier = ">=2.0.0" }, + { name = "pymupdf", marker = "extra == 'perf-tools'", specifier = ">=1.26.4" }, + { name = "pypdfium2", marker = "extra == 'perf-tools'", specifier = ">=4.30.0" }, { name = "pywin32", marker = "sys_platform == 'win32'", specifier = ">=305" }, { name = "tabulate", specifier = ">=0.9.0,<1.0.0" }, ] +provides-extras = ["perf-tools"] [package.metadata.requires-dev] build = [ @@ -577,6 +623,11 @@ dev = [ { name = "python-semantic-release", specifier = ">=7.32.2,<8.0.0" }, { name = "tqdm", specifier = ">=4.67.0,<5.0.0" }, ] +perf-test = [ + { name = "pdfplumber", specifier = ">=0.11.7" }, + { name = "pymupdf", specifier = ">=1.26.4" }, + { name = "pypdfium2", specifier = ">=4.30.0" }, +] [[package]] name = "docutils" @@ -1338,6 +1389,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, ] +[[package]] +name = "pdfminer-six" +version = "20250506" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "charset-normalizer" }, + { name = "cryptography" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/78/46/5223d613ac4963e1f7c07b2660fe0e9e770102ec6bda8c038400113fb215/pdfminer_six-20250506.tar.gz", hash = "sha256:b03cc8df09cf3c7aba8246deae52e0bca7ebb112a38895b5e1d4f5dd2b8ca2e7", size = 7387678, upload-time = "2025-05-06T16:17:00.787Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/73/16/7a432c0101fa87457e75cb12c879e1749c5870a786525e2e0f42871d6462/pdfminer_six-20250506-py3-none-any.whl", hash = "sha256:d81ad173f62e5f841b53a8ba63af1a4a355933cfc0ffabd608e568b9193909e3", size = 5620187, upload-time = "2025-05-06T16:16:58.669Z" }, +] + +[[package]] +name = "pdfplumber" +version = "0.11.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pdfminer-six" }, + { name = "pillow" }, + { name = "pypdfium2" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/0d/4135821aa7b1a0b77a29fac881ef0890b46b0b002290d04915ed7acc0043/pdfplumber-0.11.7.tar.gz", hash = "sha256:fa67773e5e599de1624255e9b75d1409297c5e1d7493b386ce63648637c67368", size = 115518, upload-time = "2025-06-12T11:30:49.864Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/e0/52b67d4f00e09e497aec4f71bc44d395605e8ebcea52543242ed34c25ef9/pdfplumber-0.11.7-py3-none-any.whl", hash = "sha256:edd2195cca68bd770da479cf528a737e362968ec2351e62a6c0b71ff612ac25e", size = 60029, upload-time = "2025-06-12T11:30:48.89Z" }, +] + [[package]] name = "pexpect" version = "4.9.0" @@ -1671,6 +1749,41 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293, upload-time = "2025-01-06T17:26:25.553Z" }, ] +[[package]] +name = "pymupdf" +version = "1.26.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url 
= "https://files.pythonhosted.org/packages/90/35/031556dfc0d332d8e9ed9b61ca105138606d3f8971b9eb02e20118629334/pymupdf-1.26.4.tar.gz", hash = "sha256:be13a066d42bfaed343a488168656637c4d9843ddc63b768dc827c9dfc6b9989", size = 83077563, upload-time = "2025-08-25T14:20:29.499Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/ae/3be722886cc7be2093585cd94f466db1199133ab005645a7a567b249560f/pymupdf-1.26.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:cb95562a0a63ce906fd788bdad5239063b63068cf4a991684f43acb09052cb99", size = 23061974, upload-time = "2025-08-25T14:16:58.811Z" }, + { url = "https://files.pythonhosted.org/packages/fc/b0/9a451d837e1fe18ecdbfbc34a6499f153c8a008763229cc634725383a93f/pymupdf-1.26.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:67e9e6b45832c33726651c2a031e9a20108fd9e759140b9e843f934de813a7ff", size = 22410112, upload-time = "2025-08-25T14:17:24.511Z" }, + { url = "https://files.pythonhosted.org/packages/d8/13/0916e8e02cb5453161fb9d9167c747d0a20d58633e30728645374153f815/pymupdf-1.26.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:2604f687dd02b6a1b98c81bd8becfc0024899a2d2085adfe3f9e91607721fd22", size = 23454948, upload-time = "2025-08-25T21:20:07.71Z" }, + { url = "https://files.pythonhosted.org/packages/4e/c6/d3cfafc75d383603884edeabe4821a549345df954a88d79e6764e2c87601/pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:973a6dda61ebd34040e4df3753bf004b669017663fbbfdaa294d44eceba98de0", size = 24060686, upload-time = "2025-08-25T14:17:56.536Z" }, + { url = "https://files.pythonhosted.org/packages/72/08/035e9d22c801e801bba50c6745bc90ba8696a042fe2c68793e28bf0c3b07/pymupdf-1.26.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:299a49797df5b558e695647fa791329ba3911cbbb31ed65f24a6266c118ef1a7", size = 24265046, upload-time = "2025-08-25T14:18:21.238Z" }, + { url = 
"https://files.pythonhosted.org/packages/28/8c/c201e4846ec0fb6ae5d52aa3a5d66f9355f0c69fb94230265714df0de65e/pymupdf-1.26.4-cp39-abi3-win32.whl", hash = "sha256:51b38379aad8c71bd7a8dd24d93fbe7580c2a5d9d7e1f9cd29ebbba315aa1bd1", size = 17127332, upload-time = "2025-08-25T14:18:39.132Z" }, + { url = "https://files.pythonhosted.org/packages/d1/c4/87d27b108c2f6d773aa5183c5ae367b2a99296ea4bc16eb79f453c679e30/pymupdf-1.26.4-cp39-abi3-win_amd64.whl", hash = "sha256:0b6345a93a9afd28de2567e433055e873205c52e6b920b129ca50e836a3aeec6", size = 18743491, upload-time = "2025-08-25T14:19:01.104Z" }, +] + +[[package]] +name = "pypdfium2" +version = "4.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/14/838b3ba247a0ba92e4df5d23f2bea9478edcfd72b78a39d6ca36ccd84ad2/pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16", size = 140239, upload-time = "2024-05-09T18:33:17.552Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/9a/c8ff5cc352c1b60b0b97642ae734f51edbab6e28b45b4fcdfe5306ee3c83/pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab", size = 2837254, upload-time = "2024-05-09T18:32:48.653Z" }, + { url = "https://files.pythonhosted.org/packages/21/8b/27d4d5409f3c76b985f4ee4afe147b606594411e15ac4dc1c3363c9a9810/pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de", size = 2707624, upload-time = "2024-05-09T18:32:51.458Z" }, + { url = "https://files.pythonhosted.org/packages/11/63/28a73ca17c24b41a205d658e177d68e198d7dde65a8c99c821d231b6ee3d/pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854", size = 2793126, upload-time = "2024-05-09T18:32:53.581Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/96/53b3ebf0955edbd02ac6da16a818ecc65c939e98fdeb4e0958362bd385c8/pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2", size = 2591077, upload-time = "2024-05-09T18:32:55.99Z" }, + { url = "https://files.pythonhosted.org/packages/ec/ee/0394e56e7cab8b5b21f744d988400948ef71a9a892cbeb0b200d324ab2c7/pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad", size = 2864431, upload-time = "2024-05-09T18:32:57.911Z" }, + { url = "https://files.pythonhosted.org/packages/65/cd/3f1edf20a0ef4a212a5e20a5900e64942c5a374473671ac0780eaa08ea80/pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f", size = 2812008, upload-time = "2024-05-09T18:32:59.886Z" }, + { url = "https://files.pythonhosted.org/packages/c8/91/2d517db61845698f41a2a974de90762e50faeb529201c6b3574935969045/pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163", size = 6181543, upload-time = "2024-05-09T18:33:02.597Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c4/ed1315143a7a84b2c7616569dfb472473968d628f17c231c39e29ae9d780/pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e", size = 6175911, upload-time = "2024-05-09T18:33:05.376Z" }, + { url = "https://files.pythonhosted.org/packages/7a/c4/9e62d03f414e0e3051c56d5943c3bf42aa9608ede4e19dc96438364e9e03/pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be", size = 6267430, upload-time = "2024-05-09T18:33:08.067Z" }, + { url = 
"https://files.pythonhosted.org/packages/90/47/eda4904f715fb98561e34012826e883816945934a851745570521ec89520/pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e", size = 2775951, upload-time = "2024-05-09T18:33:10.567Z" }, + { url = "https://files.pythonhosted.org/packages/25/bd/56d9ec6b9f0fc4e0d95288759f3179f0fcd34b1a1526b75673d2f6d5196f/pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c", size = 2892098, upload-time = "2024-05-09T18:33:13.107Z" }, + { url = "https://files.pythonhosted.org/packages/be/7a/097801205b991bc3115e8af1edb850d30aeaf0118520b016354cf5ccd3f6/pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29", size = 2752118, upload-time = "2024-05-09T18:33:15.489Z" }, +] + [[package]] name = "pytest" version = "7.4.4"