datalab-to
diff --git a/.github/workflows/cla.yml b/.github/workflows/cla.yml
Lines changed: 1 addition & 1 deletion
diff --git a/marker/processors/llm/llm_table.py b/marker/processors/llm/llm_table.py
Lines changed: 29 additions & 8 deletions
diff --git a/marker/processors/table.py b/marker/processors/table.py
Lines changed: 166 additions & 38 deletions
@@ -29,4 +29,4 @@ jobs:
           path-to-document: 'https://github.com/VikParuchuri/marker/blob/master/CLA.md'
           # branch should not be protected
           branch: 'master'
-          allowlist: VikParuchuri
+          allowlist: VikParuchuri,Sandy
@@ -36,6 +36,10 @@ class LLMTableProcessor(BaseLLMComplexBlockProcessor):
         float,
         "The maximum width/height ratio for table cells for a table to be considered rotated.",
     ] = 0.6
+    max_table_iterations: Annotated[
+        int,
+        "The maximum number of iterations to attempt rewriting a table.",
+    ] = 2
     table_rewriting_prompt: Annotated[
         str,
         "The prompt to use for rewriting text.",
@@ -58,6 +62,7 @@ class LLMTableProcessor(BaseLLMComplexBlockProcessor):
 2. Analyze the html representation of the table.
 3. Write a comparison of the image and the html representation, paying special attention to the column headers matching the correct column values.
 4. If the html representation is completely correct, or you cannot read the image properly, then write "No corrections needed."  If the html representation has errors, generate the corrected html representation.  Output only either the corrected html representation or "No corrections needed."
+5. If you made corrections, analyze your corrections against the original image, and provide a score from 1-5, indicating how well the corrected html matches the image, with 5 being perfect.
 **Example:**
 Input:
 ```html
@@ -70,7 +75,6 @@ class LLMTableProcessor(BaseLLMComplexBlockProcessor):
     <tr>
         <td>John</td>
         <td>Doe</td>
-        <td>25</td>
     </tr>
 </table>
 ```
@@ -79,6 +83,8 @@ class LLMTableProcessor(BaseLLMComplexBlockProcessor):
 ```html
 No corrections needed.
 ```
+analysis: I did not make any corrections, as the html representation was already accurate.
+score: 5
 **Input:**
 ```html
 {block_html}
@@ -186,6 +192,7 @@ def rewrite_single_chunk(
         block_html: str,
         children: List[TableCell],
         image: Image.Image,
+        total_iterations: int = 0,
     ):
         prompt = self.table_rewriting_prompt.replace("{block_html}", block_html)
 
@@ -202,19 +209,31 @@ def rewrite_single_chunk(
             return
 
         corrected_html = corrected_html.strip().lstrip("```html").rstrip("```").strip()
+
+        # Re-iterate if low score
+        total_iterations += 1
+        score = response.get("score", 5)
+        analysis = response.get("analysis", "")
+        logger.debug(f"Got table rewriting score {score} with analysis: {analysis}")
+        if total_iterations < self.max_table_iterations and score < 4:
+            logger.info(
+                f"Table rewriting low score {score}, on iteration {total_iterations}"
+            )
+            block_html = corrected_html
+            return self.rewrite_single_chunk(
+                page, block, block_html, children, image, total_iterations
+            )
+
         parsed_cells = self.parse_html_table(corrected_html, block, page)
         if len(parsed_cells) <= 1:
             block.update_metadata(llm_error_count=1)
+            logger.debug(f"Table parsing issue, only {len(parsed_cells)} cells found")
             return
 
         if not corrected_html.endswith("</table>"):
-            block.update_metadata(llm_error_count=1)
-            return
-
-        parsed_cell_text = "".join([cell.text for cell in parsed_cells])
-        orig_cell_text = "".join([cell.text for cell in children])
-        # Potentially a partial response
-        if len(parsed_cell_text) < len(orig_cell_text) * 0.5:
+            logger.debug(
+                "Table parsing issue, corrected html does not end with </table>"
+            )
             block.update_metadata(llm_error_count=1)
             return
 
@@ -304,3 +323,5 @@ def parse_html_table(
 class TableSchema(BaseModel):
     comparison: str
     corrected_html: str
+    analysis: str
+    score: int
@@ -6,7 +6,8 @@
 from PIL import Image
 
 from ftfy import fix_text
-from surya.recognition import RecognitionPredictor, OCRResult, TextLine
+from surya.detection import DetectionPredictor, TextDetectionResult
+from surya.recognition import RecognitionPredictor, TextLine
 from surya.table_rec import TableRecPredictor
 from surya.table_rec.schema import TableResult, TableCell as SuryaTableCell
 from pdftext.extraction import table_output
@@ -35,6 +36,11 @@ class TableProcessor(BaseProcessor):
         "The batch size to use for the table recognition model.",
         "Default is None, which will use the default batch size for the model.",
     ] = None
+    detection_batch_size: Annotated[
+        int,
+        "The batch size to use for the table detection model.",
+        "Default is None, which will use the default batch size for the model.",
+    ] = None
     recognition_batch_size: Annotated[
         int,
         "The batch size to use for the table recognition model.",
@@ -56,27 +62,34 @@ class TableProcessor(BaseProcessor):
         bool,
         "Whether to disable the tqdm progress bar.",
     ] = False
-    drop_repeated_table_text: Annotated[bool, "Drop repeated text in OCR results."] = False
+    drop_repeated_table_text: Annotated[bool, "Drop repeated text in OCR results."] = (
+        False
+    )
     filter_tag_list = ["p", "table", "td", "tr", "th", "tbody"]
     disable_ocr_math: Annotated[bool, "Disable inline math recognition in OCR"] = False
+    disable_ocr: Annotated[bool, "Disable OCR entirely."] = False
 
     def __init__(
         self,
         recognition_model: RecognitionPredictor,
         table_rec_model: TableRecPredictor,
+        detection_model: DetectionPredictor,
         config=None,
     ):
         super().__init__(config)
 
         self.recognition_model = recognition_model
         self.table_rec_model = table_rec_model
+        self.detection_model = detection_model
 
     def __call__(self, document: Document):
         filepath = document.filepath  # Path to original pdf file
 
         table_data = []
         for page in document.pages:
             for block in page.contained_blocks(document, self.block_types):
+                if block.block_type == BlockTypes.Table:
+                    block.polygon = block.polygon.expand(0.01, 0.01)
                 image = block.get_image(document, highres=True)
                 image_poly = block.polygon.rescale(
                     (page.polygon.width, page.polygon.height),
@@ -105,6 +118,9 @@ def __call__(self, document: Document):
             [t["table_image"] for t in table_data],
             batch_size=self.get_table_rec_batch_size(),
         )
+        assert len(tables) == len(table_data), (
+            "Number of table results should match the number of tables"
+        )
 
         # Assign cell text if we don't need OCR
         # We do this at a line level
@@ -180,21 +196,21 @@ def finalize_cell_text(self, cell: SuryaTableCell):
             # Unspaced sequences: "...", "---", "___", "……"
             text = re.sub(r"[.\-_…]{2,}", "", text)
             # Remove mathbf formatting if there is only digits with decimals/commas/currency symbols inside
-            text = re.sub(r'\\mathbf\{([0-9.,$€£]+)\}', r'<b>\1</b>', text)
+            text = re.sub(r"\\mathbf\{([0-9.,$€£]+)\}", r"<b>\1</b>", text)
             # Drop empty tags like \overline{}
-            text = re.sub(r'\\[a-zA-Z]+\{\s*\}', '', text)
+            text = re.sub(r"\\[a-zA-Z]+\{\s*\}", "", text)
             # Drop \phantom{...} (remove contents too)
-            text = re.sub(r'\\phantom\{.*?\}', '', text)
+            text = re.sub(r"\\phantom\{.*?\}", "", text)
             # Drop \quad
-            text = re.sub(r'\\quad', '', text)
+            text = re.sub(r"\\quad", "", text)
             # Drop \,
-            text = re.sub(r'\\,', '', text)
+            text = re.sub(r"\\,", "", text)
             # Unwrap \mathsf{...}
-            text = re.sub(r'\\mathsf\{([^}]*)\}', r'\1', text)
+            text = re.sub(r"\\mathsf\{([^}]*)\}", r"\1", text)
             # Handle unclosed tags: keep contents, drop the command
-            text = re.sub(r'\\[a-zA-Z]+\{([^}]*)$', r'\1', text)
+            text = re.sub(r"\\[a-zA-Z]+\{([^}]*)$", r"\1", text)
             # If the whole string is \text{...} → unwrap
-            text = re.sub(r'^\s*\\text\{([^}]*)\}\s*$', r'\1', text)
+            text = re.sub(r"^\s*\\text\{([^}]*)\}\s*$", r"\1", text)
 
             # In case the above steps left no more latex math - We can unwrap
             text = unwrap_math(text)
@@ -479,31 +495,134 @@ def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
                 "Number of tables and table inputs must match"
             )
 
-    def needs_ocr(self, tables: List[TableResult]):
+    def align_table_cells(
+        self, table: TableResult, table_detection_result: TextDetectionResult
+    ):
+        table_cells = table.cells
+        table_text_lines = table_detection_result.bboxes
+
+        text_line_bboxes = [t.bbox for t in table_text_lines]
+        table_cell_bboxes = [c.bbox for c in table_cells]
+
+        intersection_matrix = matrix_intersection_area(
+            text_line_bboxes, table_cell_bboxes
+        )
+
+        # Map cells -> list of assigned text lines
+        cell_text = defaultdict(list)
+        for text_line_idx, table_text_line in enumerate(table_text_lines):
+            intersections = intersection_matrix[text_line_idx]
+            if intersections.sum() == 0:
+                continue
+            max_intersection = intersections.argmax()
+            cell_text[max_intersection].append(table_text_line)
+
+        # Adjust cell polygons in place
+        for cell_idx, cell in enumerate(table_cells):
+            # all intersecting lines
+            intersecting_line_indices = [
+                i for i, area in enumerate(intersection_matrix[:, cell_idx]) if area > 0
+            ]
+            if not intersecting_line_indices:
+                continue
+
+            assigned_lines = cell_text.get(cell_idx, [])
+            # Expand to fit assigned lines - **Only in the y direction**
+            for assigned_line in assigned_lines:
+                x1 = cell.bbox[0]
+                x2 = cell.bbox[2]
+                y1 = min(cell.bbox[1], assigned_line.bbox[1])
+                y2 = max(cell.bbox[3], assigned_line.bbox[3])
+                cell.polygon = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
+
+            # Clear out non-assigned lines
+            non_assigned_lines = [
+                table_text_lines[i]
+                for i in intersecting_line_indices
+                if table_text_lines[i] not in cell_text.get(cell_idx, [])
+            ]
+            if non_assigned_lines:
+                # Find top-most and bottom-most non-assigned boxes
+                top_box = min(
+                    non_assigned_lines, key=lambda line: line.bbox[1]
+                )  # smallest y0
+                bottom_box = max(
+                    non_assigned_lines, key=lambda line: line.bbox[3]
+                )  # largest y1
+
+                # Current cell bbox (from polygon)
+                x0, y0, x1, y1 = cell.bbox
+
+                # Adjust y-limits based on non-assigned boxes
+                new_y0 = max(y0, top_box.bbox[3])  # top moves down
+                new_y1 = min(y1, bottom_box.bbox[1])  # bottom moves up
+
+                if new_y0 < new_y1:
+                    # Replace polygon with a new shrunken rectangle
+                    cell.polygon = [
+                        [x0, new_y0],
+                        [x1, new_y0],
+                        [x1, new_y1],
+                        [x0, new_y1],
+                    ]
+
+    def needs_ocr(self, tables: List[TableResult], table_blocks: List[dict]):
         ocr_tables = []
-        ocr_polys = []
         ocr_idxs = []
-        for j, result in enumerate(tables):
-            table_cells: List[SuryaTableCell] = result.cells
-            if any([tc.text_lines is None for tc in table_cells]):
-                ocr_tables.append(result)
-                polys = [tc for tc in table_cells if tc.text_lines is None]
-                ocr_polys.append(polys)
+        for j, (table_result, table_block) in enumerate(zip(tables, table_blocks)):
+            table_cells: List[SuryaTableCell] = table_result.cells
+            text_lines_need_ocr = any([tc.text_lines is None for tc in table_cells])
+            if (
+                table_block["ocr_block"]
+                and text_lines_need_ocr
+                and not self.disable_ocr
+            ):
+                logger.debug(
+                    f"Table {j} needs OCR, info table block needs ocr: {table_block['ocr_block']}, text_lines {text_lines_need_ocr}"
+                )
+                ocr_tables.append(table_result)
                 ocr_idxs.append(j)
+
+        detection_results: List[TextDetectionResult] = self.detection_model(
+            images=[table_blocks[i]["table_image"] for i in ocr_idxs],
+            batch_size=self.get_detection_batch_size(),
+        )
+        assert len(detection_results) == len(ocr_idxs), (
+            "Every OCRed table requires a text detection result"
+        )
+
+        for idx, table_detection_result in zip(ocr_idxs, detection_results):
+            self.align_table_cells(tables[idx], table_detection_result)
+
+        ocr_polys = []
+        for ocr_idx in ocr_idxs:
+            table_cells = tables[ocr_idx].cells
+            polys = [tc for tc in table_cells if tc.text_lines is None]
+            ocr_polys.append(polys)
         return ocr_tables, ocr_polys, ocr_idxs
 
-    def get_ocr_results(self, table_images: List[Image.Image], ocr_polys: List[List[SuryaTableCell]]):
-        ocr_polys_blank = []
+    def get_ocr_results(
+        self, table_images: List[Image.Image], ocr_polys: List[List[SuryaTableCell]]
+    ):
+        ocr_polys_bad = []
 
         for table_image, polys in zip(table_images, ocr_polys):
-            table_polys_blank = [is_blank_image(table_image.crop(poly.bbox), poly.polygon) for poly in polys]
-            ocr_polys_blank.append(table_polys_blank)
-                
+            table_polys_bad = [
+                any(
+                    [
+                        poly.height < 6,
+                        is_blank_image(table_image.crop(poly.bbox), poly.polygon),
+                    ]
+                )
+                for poly in polys
+            ]
+            ocr_polys_bad.append(table_polys_bad)
+
         filtered_polys = []
-        for table_polys, table_polys_blank in zip(ocr_polys, ocr_polys_blank):
+        for table_polys, table_polys_bad in zip(ocr_polys, ocr_polys_bad):
             filtered_table_polys = []
-            for p, is_blank in zip(table_polys, table_polys_blank):
-                if is_blank:
+            for p, is_bad in zip(table_polys, table_polys_bad):
+                if is_bad:
                     continue
                 polygon = p.polygon
                 # Round the polygon
@@ -527,19 +646,21 @@ def get_ocr_results(self, table_images: List[Image.Image], ocr_polys: List[List[
         )
 
         # Re-align the predictions to the original length, since we skipped some predictions
-        for table_ocr_result, table_polys_blank in zip(ocr_results, ocr_polys_blank):
+        for table_ocr_result, table_polys_bad in zip(ocr_results, ocr_polys_bad):
             updated_lines = []
             idx = 0
-            for is_blank in table_polys_blank:
-                if is_blank:
-                    updated_lines.append(TextLine(
-                        text = "",
-                        polygon=[[0, 0], [0, 0], [0, 0], [0, 0]],
-                        confidence=1,
-                        chars=[],
-                        original_text_good=False,
-                        words=None
-                    ))
+            for is_bad in table_polys_bad:
+                if is_bad:
+                    updated_lines.append(
+                        TextLine(
+                            text="",
+                            polygon=[[0, 0], [0, 0], [0, 0], [0, 0]],
+                            confidence=1,
+                            chars=[],
+                            original_text_good=False,
+                            words=None,
+                        )
+                    )
                 else:
                     updated_lines.append(table_ocr_result.text_lines[idx])
                     idx += 1
@@ -548,7 +669,7 @@ def get_ocr_results(self, table_images: List[Image.Image], ocr_polys: List[List[
         return ocr_results
 
     def assign_ocr_lines(self, tables: List[TableResult], table_blocks: list):
-        ocr_tables, ocr_polys, ocr_idxs = self.needs_ocr(tables)
+        ocr_tables, ocr_polys, ocr_idxs = self.needs_ocr(tables, table_blocks)
         det_images = [
             t["table_image"] for i, t in enumerate(table_blocks) if i in ocr_idxs
         ]
@@ -589,3 +710,10 @@ def get_recognition_batch_size(self):
         elif settings.TORCH_DEVICE_MODEL == "cuda":
             return 48
         return 32
+
+    def get_detection_batch_size(self):
+        if self.detection_batch_size is not None:
+            return self.detection_batch_size
+        elif settings.TORCH_DEVICE_MODEL == "cuda":
+            return 10
+        return 4