Skip to content

Commit 272a0fb

Browse files
authored
Merge pull request #842 from datalab-to/dev
Optional block ids
2 parents 62b3c75 + b772240 commit 272a0fb

File tree

14 files changed

+115
-23
lines changed

14 files changed

+115
-23
lines changed

marker/processors/equation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,8 @@ def get_latex_batched(
133133
recognition_batch_size=self.get_batch_size(),
134134
sort_lines=False,
135135
drop_repeated_text=self.drop_repeated_text,
136-
max_tokens=1024,
137-
max_sliding_window=1200,
136+
max_tokens=2048,
137+
max_sliding_window=2148,
138138
)
139139

140140
equation_predictions = [

marker/processors/llm/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,9 @@ def rewrite_blocks(self, document: Document):
156156
return
157157

158158
pbar = tqdm(
159-
desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm
159+
total=total_blocks,
160+
desc=f"{self.__class__.__name__} running",
161+
disable=self.disable_tqdm
160162
)
161163
with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
162164
for future in as_completed(

marker/processors/llm/llm_mathblock.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,9 @@ def rewrite_blocks(self, document: Document):
142142
return
143143

144144
pbar = tqdm(
145-
desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm
145+
total=total_blocks,
146+
desc=f"{self.__class__.__name__} running",
147+
disable=self.disable_tqdm
146148
)
147149
with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
148150
for future in as_completed(

marker/processors/llm/llm_table.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Table):
161161
batch_bbox[3] = block_image.size[1]
162162

163163
batch_image = block_image.crop(batch_bbox)
164-
block_html = block.format_cells(document, [], batch_cells)
164+
block_html = block.format_cells(document, [], None, batch_cells)
165165
batch_image = self.handle_image_rotation(batch_cells, batch_image)
166166
batch_parsed_cells = self.rewrite_single_chunk(
167167
page, block, block_html, batch_cells, batch_image

marker/processors/llm/llm_table_merge.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,7 @@ def rewrite_blocks(self, document: Document):
158158
if self.no_merge_tables_across_pages:
159159
logger.info("Skipping table merging across pages due to --no_merge_tables_across_pages flag")
160160
return
161-
162-
pbar = tqdm(desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm)
161+
163162
table_runs = []
164163
table_run = []
165164
prev_block = None
@@ -221,6 +220,17 @@ def rewrite_blocks(self, document: Document):
221220
if table_run:
222221
table_runs.append(table_run)
223222

223+
# Don't show progress if there is nothing to process
224+
total_table_runs = len(table_runs)
225+
if total_table_runs == 0:
226+
return
227+
228+
pbar = tqdm(
229+
total=total_table_runs,
230+
desc=f"{self.__class__.__name__} running",
231+
disable=self.disable_tqdm,
232+
)
233+
224234
with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
225235
for future in as_completed([
226236
executor.submit(self.process_rewriting, document, blocks)

marker/processors/table.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ class TableProcessor(BaseProcessor):
6464
bool,
6565
"Whether to disable the tqdm progress bar.",
6666
] = False
67+
disable_ocr_math: Annotated[bool, "Disable inline math recognition in OCR"] = False
6768
drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
6869

6970
def __init__(
@@ -475,6 +476,7 @@ def assign_ocr_lines(self, ocr_blocks: list):
475476
det_predictor=self.detection_model,
476477
recognition_batch_size=self.get_recognition_batch_size(),
477478
detection_batch_size=self.get_detection_batch_size(),
479+
math_mode=not self.disable_ocr_math,
478480
drop_repeated_text=self.drop_repeated_text,
479481
)
480482

marker/renderers/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,17 @@ class BaseRenderer:
2929
keep_pagefooter_in_output: Annotated[
3030
bool, "Keep the page footer in the output HTML."
3131
] = False
32+
add_block_ids: Annotated[bool, "Whether to add block IDs to the output HTML."] = (
33+
False
34+
)
3235

3336
def __init__(self, config: Optional[BaseModel | dict] = None):
3437
assign_config(self, config)
3538

3639
self.block_config = {
3740
"keep_pageheader_in_output": self.keep_pageheader_in_output,
3841
"keep_pagefooter_in_output": self.keep_pagefooter_in_output,
42+
"add_block_ids": self.add_block_ids,
3943
}
4044

4145
def __call__(self, document):

marker/renderers/html.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,37 @@ def extract_image(self, document, image_id):
4747
)
4848
return cropped
4949

50+
def insert_block_id(self, soup, block_id: BlockId):
    """
    Tag rendered block HTML with its block ID via a ``data-block-id`` attribute.

    Line and Span blocks are never tagged, and nothing is changed unless
    ``self.add_block_ids`` is set. Otherwise the first tag found among the
    soup's top-level nodes receives the attribute. When the soup holds only
    non-tag nodes (e.g. bare text), those nodes are moved into a new
    ``<span>`` that carries the attribute. An empty soup is left untouched.

    The soup is modified in place and the same object is returned.
    """
    untagged_types = [BlockTypes.Line, BlockTypes.Span]
    if block_id.block_type in untagged_types or not self.add_block_ids:
        return soup

    # First top-level node that is a real tag (has a non-empty name)
    first_tag = next(
        (node for node in soup.contents if hasattr(node, "name") and node.name),
        None,
    )
    if first_tag:
        first_tag["data-block-id"] = str(block_id)
        return soup

    if soup.contents:
        # Only text-like nodes present: wrap them all in a span carrying the ID
        holder = soup.new_tag("span")
        holder["data-block-id"] = str(block_id)

        # Iterate over a snapshot, since extract() mutates soup.contents
        for node in list(soup.contents):
            node.extract()
            holder.append(node)
        soup.append(holder)
    return soup
80+
5081
def extract_html(self, document, document_output, level=0):
5182
soup = BeautifulSoup(document_output.html, "html.parser")
5283

@@ -69,22 +100,24 @@ def extract_html(self, document, document_output, level=0):
69100
image = self.extract_image(document, ref_block_id)
70101
image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
71102
images[image_name] = image
72-
ref.replace_with(
73-
BeautifulSoup(
74-
f"<p>{content}<img src='{image_name}'></p>", "html.parser"
75-
)
103+
element = BeautifulSoup(
104+
f"<p>{content}<img src='{image_name}'></p>", "html.parser"
76105
)
106+
ref.replace_with(self.insert_block_id(element, ref_block_id))
77107
else:
78108
# This will be the image description if using llm mode, or empty if not
79-
ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
109+
element = BeautifulSoup(f"{content}", "html.parser")
110+
ref.replace_with(self.insert_block_id(element, ref_block_id))
80111
elif ref_block_id.block_type in self.page_blocks:
81112
images.update(sub_images)
82113
if self.paginate_output:
83114
content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
84-
ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
115+
element = BeautifulSoup(f"{content}", "html.parser")
116+
ref.replace_with(self.insert_block_id(element, ref_block_id))
85117
else:
86118
images.update(sub_images)
87-
ref.replace_with(BeautifulSoup(f"{content}", "html.parser"))
119+
element = BeautifulSoup(f"{content}", "html.parser")
120+
ref.replace_with(self.insert_block_id(element, ref_block_id))
88121

89122
output = str(soup)
90123
if level == 0:

marker/schema/blocks/basetable.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ class BaseTable(Block):
1111

1212
@staticmethod
1313
def format_cells(
14-
document, child_blocks, child_cells: List[TableCell] | None = None
14+
document, child_blocks, block_config, child_cells: List[TableCell] | None = None
1515
):
1616
if child_cells is None:
1717
child_cells: List[TableCell] = [
@@ -28,7 +28,9 @@ def format_cells(
2828
)
2929
html_repr += "<tr>"
3030
for cell in row_cells:
31-
html_repr += cell.assemble_html(document, child_blocks, None, None)
31+
html_repr += cell.assemble_html(
32+
document, child_blocks, None, block_config
33+
)
3234
html_repr += "</tr>"
3335
html_repr += "</tbody></table>"
3436
return html_repr
@@ -56,7 +58,7 @@ def assemble_html(
5658
return template + self.html
5759
elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
5860
# Table processor
59-
return template + self.format_cells(document, child_blocks)
61+
return template + self.format_cells(document, child_blocks, block_config)
6062
else:
6163
# Default text lines and spans
6264
return f"<p>{template}</p>"

marker/schema/blocks/tablecell.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,16 @@ def text(self):
2121
def assemble_html(
2222
self, document, child_blocks, parent_structure=None, block_config=None
2323
):
24+
add_cell_id = block_config and block_config.get("add_block_ids", False)
25+
2426
tag_cls = "th" if self.is_header else "td"
2527
tag = f"<{tag_cls}"
2628
if self.rowspan > 1:
2729
tag += f" rowspan={self.rowspan}"
2830
if self.colspan > 1:
2931
tag += f" colspan={self.colspan}"
32+
if add_cell_id:
33+
tag += f' data-block-id="{self.id}"'
3034
if self.text_lines is None:
3135
self.text_lines = []
3236
text = "<br>".join(self.text_lines)

0 commit comments

Comments
 (0)