Skip to content

PDF that breaks dlparse_v4 #160

@vku-ibm

Description

@vku-ibm

Link to the original PDF:
http://media.turuz.com/users/kesli-2017/Bolum_1_Files_0-499/0207-CHaghatay_lugatleri_uzre_notlar.pdf

Logged error:

INFO:docling.pipeline.base_pipeline:Processing document 8107320367458434831.pdf
WARNING:docling.pipeline.base_pipeline:Encountered an error during conversion of document 1fdabc0b786a08982baa89fa258d542e682449ab31140d917082fceb76a63f0f:
Traceback (most recent call last):

  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 164, in _build_document
    for p in pipeline_pages:  # Must exhaust!

  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 129, in _apply_on_pages
    yield from page_batch

  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/page_assemble_model.py", line 70, in __call__
    for page in page_batch:

  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/table_structure_model.py", line 177, in __call__
    for page in page_batch:

  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/layout_model.py", line 152, in __call__
    pages = list(page_batch)
            ^^^^^^^^^^^^^^^^

  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/easyocr_model.py", line 130, in __call__
    yield from page_batch

  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/page_preprocessing_model.py", line 48, in __call__
    page = self._parse_page_cells(conv_res, page)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/page_preprocessing_model.py", line 72, in _parse_page_cells
    page.parsed_page = page._backend.get_segmented_page()
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/backend/docling_parse_v4_backend.py", line 96, in get_segmented_page
    self._ensure_parsed()

  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/backend/docling_parse_v4_backend.py", line 48, in _ensure_parsed
    seg_page = self._dp_doc.get_page(
               ^^^^^^^^^^^^^^^^^^^^^^

  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling_parse/pdf_parser.py", line 136, in get_page
    doc_dict = self._parser.parse_pdf_from_key_on_page(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

RuntimeError: font_name [NULL] is not known: /F0, /F1, /F2, /F24, /F28, /F29, /F3, /F30, /F31, /F5, /F52, /F53, /F54, /F55, /F78

Traceback (most recent call last):
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/convert-batch-s3.py", line 133, in <module>
    for item in result_processor.process_documents(
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/local_utils/results_processor.py", line 63, in process_documents
    for i, conv_res in enumerate(results):
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/document_converter.py", line 267, in convert_all
    for conv_res in conv_res_iter:
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/document_converter.py", line 339, in _convert
    for item in map(
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/document_converter.py", line 386, in _process_document
    conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/document_converter.py", line 409, in _execute_pipeline
    conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 57, in execute
    raise e
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 49, in execute
    conv_res = self._build_document(conv_res)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 204, in _build_document
    raise e
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 164, in _build_document
    for p in pipeline_pages:  # Must exhaust!
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/pipeline/base_pipeline.py", line 129, in _apply_on_pages
    yield from page_batch
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/page_assemble_model.py", line 70, in __call__
    for page in page_batch:
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/table_structure_model.py", line 177, in __call__
    for page in page_batch:
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/layout_model.py", line 152, in __call__
    pages = list(page_batch)
            ^^^^^^^^^^^^^^^^
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/easyocr_model.py", line 130, in __call__
    yield from page_batch
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/page_preprocessing_model.py", line 48, in __call__
    page = self._parse_page_cells(conv_res, page)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/models/page_preprocessing_model.py", line 72, in _parse_page_cells
    page.parsed_page = page._backend.get_segmented_page()
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/backend/docling_parse_v4_backend.py", line 96, in get_segmented_page
    self._ensure_parsed()
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling/backend/docling_parse_v4_backend.py", line 48, in _ensure_parsed
    seg_page = self._dp_doc.get_page(
               ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/vku/Documents/cloud/bluevela-stuff/docling-convert-s3-to-s3/.venv/lib/python3.11/site-packages/docling_parse/pdf_parser.py", line 136, in get_page
    doc_dict = self._parser.parse_pdf_from_key_on_page(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: font_name [NULL] is not known: /F0, /F1, /F2, /F24, /F28, /F29, /F3, /F30, /F31, /F5, /F52, /F53, /F54, /F55, /F78

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions