Merge pull request #842 from datalab-to/dev

VikParuchuri · web-flow · commit 272a0fbf2f85 · 2025-08-20T12:46:08.000-04:00
Optional block ids
diff --git a/marker/processors/equation.py b/marker/processors/equation.py
@@ -133,8 +133,8 @@ def get_latex_batched(
             recognition_batch_size=self.get_batch_size(),
             sort_lines=False,
             drop_repeated_text=self.drop_repeated_text,
-            max_tokens=1024,
-            max_sliding_window=1200,
+            max_tokens=2048,
+            max_sliding_window=2148,
         )
 
         equation_predictions = [
diff --git a/marker/processors/llm/__init__.py b/marker/processors/llm/__init__.py
@@ -156,7 +156,9 @@ def rewrite_blocks(self, document: Document):
             return
 
         pbar = tqdm(
-            desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm
+            total=total_blocks,
+            desc=f"{self.__class__.__name__} running",
+            disable=self.disable_tqdm
         )
         with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
             for future in as_completed(
diff --git a/marker/processors/llm/llm_mathblock.py b/marker/processors/llm/llm_mathblock.py
@@ -142,7 +142,9 @@ def rewrite_blocks(self, document: Document):
             return
 
         pbar = tqdm(
-            desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm
+            total=total_blocks,
+            desc=f"{self.__class__.__name__} running",
+            disable=self.disable_tqdm
         )
         with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
             for future in as_completed(
diff --git a/marker/processors/llm/llm_table.py b/marker/processors/llm/llm_table.py
@@ -161,7 +161,7 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Table):
                 batch_bbox[3] = block_image.size[1]
 
             batch_image = block_image.crop(batch_bbox)
-            block_html = block.format_cells(document, [], batch_cells)
+            block_html = block.format_cells(document, [], None, batch_cells)
             batch_image = self.handle_image_rotation(batch_cells, batch_image)
             batch_parsed_cells = self.rewrite_single_chunk(
                 page, block, block_html, batch_cells, batch_image
diff --git a/marker/processors/llm/llm_table_merge.py b/marker/processors/llm/llm_table_merge.py
@@ -158,8 +158,7 @@ def rewrite_blocks(self, document: Document):
         if self.no_merge_tables_across_pages:
             logger.info("Skipping table merging across pages due to --no_merge_tables_across_pages flag")
             return
-            
-        pbar = tqdm(desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm)
+
         table_runs = []
         table_run = []
         prev_block = None
@@ -221,6 +220,17 @@ def rewrite_blocks(self, document: Document):
         if table_run:
             table_runs.append(table_run)
 
+        # Don't show progress if there is nothing to process
+        total_table_runs = len(table_runs)
+        if total_table_runs == 0:
+            return
+
+        pbar = tqdm(
+            total=total_table_runs,
+            desc=f"{self.__class__.__name__} running",
+            disable=self.disable_tqdm,
+        )
+
         with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
             for future in as_completed([
                 executor.submit(self.process_rewriting, document, blocks)
diff --git a/marker/processors/table.py b/marker/processors/table.py
@@ -64,6 +64,7 @@ class TableProcessor(BaseProcessor):
         bool,
         "Whether to disable the tqdm progress bar.",
     ] = False
+    disable_ocr_math: Annotated[bool, "Disable inline math recognition in OCR"] = False
     drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
 
     def __init__(
@@ -475,6 +476,7 @@ def assign_ocr_lines(self, ocr_blocks: list):
             det_predictor=self.detection_model,
             recognition_batch_size=self.get_recognition_batch_size(),
             detection_batch_size=self.get_detection_batch_size(),
+            math_mode=not self.disable_ocr_math,
             drop_repeated_text=self.drop_repeated_text,
         )
 
diff --git a/marker/renderers/__init__.py b/marker/renderers/__init__.py
@@ -29,13 +29,17 @@ class BaseRenderer:
     keep_pagefooter_in_output: Annotated[
         bool, "Keep the page footer in the output HTML."
     ] = False
+    add_block_ids: Annotated[bool, "Whether to add block IDs to the output HTML."] = (
+        False
+    )
 
     def __init__(self, config: Optional[BaseModel | dict] = None):
         assign_config(self, config)
 
         self.block_config = {
             "keep_pageheader_in_output": self.keep_pageheader_in_output,
             "keep_pagefooter_in_output": self.keep_pagefooter_in_output,
+            "add_block_ids": self.add_block_ids,
         }
 
     def __call__(self, document):
diff --git a/marker/renderers/html.py b/marker/renderers/html.py
@@ -47,6 +47,37 @@ def extract_image(self, document, image_id):
         )
         return cropped
 
+    def insert_block_id(self, soup, block_id: BlockId):
+        """
+        Tag the rendered fragment in `soup` with a `data-block-id` attribute.
+
+        Nothing is changed for Line/Span blocks or when `add_block_ids` is off.
+        Otherwise the first tag among the top-level nodes receives the id; a
+        fragment with nodes but no tag is wrapped in a new <span> carrying it.
+        Returns the same `soup` object in every case.
+        """
+        if block_id.block_type in (BlockTypes.Line, BlockTypes.Span):
+            return soup
+        if not self.add_block_ids:
+            return soup
+
+        # First top-level node that is a real tag (text nodes have no name)
+        first_tag = next(
+            (node for node in soup.contents if getattr(node, "name", None)),
+            None,
+        )
+        if first_tag:
+            first_tag["data-block-id"] = str(block_id)
+        elif soup.contents:
+            # Text-only fragment: move every node into an id-carrying span
+            span = soup.new_tag("span")
+            span["data-block-id"] = str(block_id)
+            for node in list(soup.contents):
+                node.extract()
+                span.append(node)
+            soup.append(span)
+        return soup
+
     def extract_html(self, document, document_output, level=0):
         soup = BeautifulSoup(document_output.html, "html.parser")
 
@@ -69,22 +100,24 @@ def extract_html(self, document, document_output, level=0):
                     image = self.extract_image(document, ref_block_id)
                     image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
                     images[image_name] = image
-                    ref.replace_with(
-                        BeautifulSoup(
-                            f"<p>{content}<img src='{image_name}'></p>", "html.parser"
-                        )
+                    element = BeautifulSoup(
+                        f"<p>{content}<img src='{image_name}'></p>", "html.parser"
                     )
+                    ref.replace_with(self.insert_block_id(element, ref_block_id))
                 else:
                     # This will be the image description if using llm mode, or empty if not
-                    ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
+                    element = BeautifulSoup(f"{content}", "html.parser")
+                    ref.replace_with(self.insert_block_id(element, ref_block_id))
             elif ref_block_id.block_type in self.page_blocks:
                 images.update(sub_images)
                 if self.paginate_output:
                     content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
-                ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
+                element = BeautifulSoup(f"{content}", "html.parser")
+                ref.replace_with(self.insert_block_id(element, ref_block_id))
             else:
                 images.update(sub_images)
-                ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
+                element = BeautifulSoup(f"{content}", "html.parser")
+                ref.replace_with(self.insert_block_id(element, ref_block_id))
 
         output = str(soup)
         if level == 0:
diff --git a/marker/schema/blocks/basetable.py b/marker/schema/blocks/basetable.py
@@ -11,7 +11,7 @@ class BaseTable(Block):
 
     @staticmethod
     def format_cells(
-        document, child_blocks, child_cells: List[TableCell] | None = None
+        document, child_blocks, block_config, child_cells: List[TableCell] | None = None
     ):
         if child_cells is None:
             child_cells: List[TableCell] = [
@@ -28,7 +28,9 @@ def format_cells(
             )
             html_repr += "<tr>"
             for cell in row_cells:
-                html_repr += cell.assemble_html(document, child_blocks, None, None)
+                html_repr += cell.assemble_html(
+                    document, child_blocks, None, block_config
+                )
             html_repr += "</tr>"
         html_repr += "</tbody></table>"
         return html_repr
@@ -56,7 +58,7 @@ def assemble_html(
             return template + self.html
         elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
             # Table processor
-            return template + self.format_cells(document, child_blocks)
+            return template + self.format_cells(document, child_blocks, block_config)
         else:
             # Default text lines and spans
             return f"<p>{template}</p>"
diff --git a/marker/schema/blocks/tablecell.py b/marker/schema/blocks/tablecell.py
@@ -21,12 +21,16 @@ def text(self):
     def assemble_html(
         self, document, child_blocks, parent_structure=None, block_config=None
     ):
+        add_cell_id = block_config and block_config.get("add_block_ids", False)
+
         tag_cls = "th" if self.is_header else "td"
         tag = f"<{tag_cls}"
         if self.rowspan > 1:
             tag += f" rowspan={self.rowspan}"
         if self.colspan > 1:
             tag += f" colspan={self.colspan}"
+        if add_cell_id:
+            tag += f' data-block-id="{self.id}"'
         if self.text_lines is None:
             self.text_lines = []
         text = "<br>".join(self.text_lines)
diff --git a/marker/services/gemini.py b/marker/services/gemini.py
@@ -95,6 +95,20 @@ def __call__(
                 else:
                     logger.error(f"APIError: {e}")
                     break
+            except json.JSONDecodeError as e:
+                # The response was not valid JSON
+                if tries == total_tries:
+                    # Last attempt failed. Give up
+                    logger.error(
+                        f"JSONDecodeError: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
+                    )
+                    break
+                else:
+                    wait_time = tries * self.retry_wait_time
+                    logger.warning(
+                        f"JSONDecodeError: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
+                    )
+                    time.sleep(wait_time)
             except Exception as e:
                 logger.error(f"Exception: {e}")
                 traceback.print_exc()
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.8.3"
+version = "1.8.4"
 description = "Convert documents to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
@@ -26,7 +26,7 @@ torch = "^2.7.0"
 tqdm = "^4.66.1"
 ftfy = "^6.1.1"
 rapidfuzz = "^3.8.1"
-surya-ocr = "^0.15.2"
+surya-ocr = "^0.15.4"
 regex = "^2024.4.28"
 pdftext = "~0.6.3"
 markdownify = "^1.1.0"
diff --git a/tests/renderers/test_html_renderer.py b/tests/renderers/test_html_renderer.py
@@ -0,0 +1,19 @@
+import pytest
+
+from marker.renderers.html import HTMLRenderer
+
+
+@pytest.mark.config(
+    {
+        "page_range": [0],
+        "disable_ocr": True,
+        "add_block_ids": True,
+        "paginate_output": True,
+    }
+)
+def test_html_renderer_block_ids(pdf_document, config):
+    """Rendering with add_block_ids enabled must emit a known block ID."""
+    rendered = HTMLRenderer(config)(pdf_document)
+
+    # Spot-check one expected block ID string in the rendered HTML
+    assert "/page/0/Text/1" in rendered.html