Skip to content
This repository was archived by the owner on Apr 11, 2025. It is now read-only.

Commit 64a83c9

Browse files
committed
[REF]: Network parser search header -> merge_zones and extract_zones
Split the search header method to reduce complexity.
1 parent e1b16b0 commit 64a83c9

File tree

1 file changed

+99
-53
lines changed

1 file changed

+99
-53
lines changed

camelot/parsers/network.py

Lines changed: 99 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -125,19 +125,105 @@ def find_closest_tls( # noqa: C901
125125
}
126126

127127

128-
def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
129-
"""Expand a bbox vertically up by looking for plausible headers.
128+
def _extract_zones(
    all_above: list[Any], max_v_gap: float, top: float
) -> tuple[list[list[float]], float]:
    """Extract the x-ranges of the textlines forming the row right above ``top``.

    Every textline whose bottom edge (``y0``) lies below the current ``top``
    is considered part of the row. Whenever such a textline reaches higher
    than ``top`` (``y1 > top``), ``top`` is raised to that value and another
    sweep is performed, since textlines rejected earlier may now overlap the
    taller row.

    Parameters
    ----------
    all_above : List[Any]
        Candidate textlines (objects exposing ``x0``, ``x1``, ``y0`` and
        ``y1``). The list is modified in place: textlines that are absorbed
        into the row are removed from it, the others keep their order.
    max_v_gap : float
        The maximum vertical gap allowed. Not used by this function; it is
        kept so the signature stays compatible with existing callers.
    top : float
        The current top boundary of the row.

    Returns
    -------
    Tuple[List[List[float]], float]
        The ``[x0, x1]`` range of every textline absorbed into the row (in
        the order they were absorbed) and the new top boundary.
    """
    tls_in_new_row = []
    pushed_up = True

    while pushed_up:
        pushed_up = False
        # Partition in a single pass instead of calling ``list.remove`` while
        # walking a copy: same result, but linear per sweep and independent
        # of the textlines' equality semantics.
        remaining = []
        for textline in all_above:
            if textline.y0 < top:
                # The bottom of this element is within our row so we add it.
                tls_in_new_row.append(textline)
                if textline.y1 > top:
                    # The top of this element raises our row's band, so we
                    # need another sweep to catch items that now overlap it.
                    top = textline.y1
                    pushed_up = True
            else:
                remaining.append(textline)
        # Slice assignment keeps the caller's list object, only its content
        # changes (absorbed textlines are dropped).
        all_above[:] = remaining

    return [[textline.x0, textline.x1] for textline in tls_in_new_row], top
167+
168+
169+
def _merge_zones(zones: list[list[float]]) -> list[list[float]]:
    """Merge overlapping zones into consolidated zones.

    Zones are processed from left to right (ordered by their left
    coordinate). A zone is folded into the previous merged zone when its
    left edge does not lie strictly to the right of that zone's right edge,
    i.e. zones that overlap or merely touch are merged.

    The input is left untouched: neither the outer list nor the individual
    zones are modified.

    Parameters
    ----------
    zones : List[List[float]]
        A list of zones defined by their x-coordinates, as ``[left, right]``.

    Returns
    -------
    List[List[float]]
        A new list of merged zones, sorted by left coordinate.
    """
    merged_zones: list[list[float]] = []

    # ``sorted`` (rather than ``zones.sort``) keeps the caller's list intact.
    for zone in sorted(zones, key=lambda z: z[0]):
        left, right = zone[0], zone[1]
        if not merged_zones or merged_zones[-1][1] < left:
            # Store a fresh pair so that widening it later never leaks into
            # the caller's zone objects.
            merged_zones.append([left, right])
        else:
            merged_zones[-1][1] = max(merged_zones[-1][1], right)  # Merge the zones

    return merged_zones
192+
193+
194+
def search_header_from_body_bbox(
195+
body_bbox: tuple[float, float, float, float],
196+
textlines: list[Any],
197+
col_anchors: list[float],
198+
max_v_gap: float,
199+
) -> tuple[float, float, float, float]:
200+
"""Expand a bounding box (bbox) vertically by looking for plausible headers.
130201
131202
The core algorithm is based on fairly strict alignment of text. It works
132-
for the table body, but might fail on tables' headers since they tend to be
133-
in a different font, alignment (e.g. vertical), etc.
134-
This method evalutes the area above the table body's bbox for
135-
characteristics of a table header: close to the top of the body, with cells
136-
that fit within the horizontal bounds identified.
203+
for the table body but might fail on table headers since they tend to be
204+
in a different font, alignment (e.g., vertical), etc. This method evaluates
205+
the area above the table body's bbox for characteristics of a table header:
206+
close to the top of the body, with cells that fit within the horizontal bounds identified.
207+
208+
Parameters
209+
----------
210+
body_bbox : Tuple[float, float, float, float]
211+
The bounding box of the body in the format (left, bottom, right, top).
212+
textlines : List[Any]
213+
A list of textline objects, each with properties x0, x1, y0, and y1.
214+
col_anchors : List[float]
215+
A list of x-coordinates representing column anchors.
216+
max_v_gap : float
217+
The maximum vertical gap allowed to consider a header plausible.
218+
219+
Returns
220+
-------
221+
Tuple[float, float, float, float]
222+
The expanded bounding box in the format (left, bottom, right, top).
137223
"""
138224
new_bbox = body_bbox
139225
(left, bottom, right, top) = body_bbox
140-
zones = []
226+
zones: list[list[float]] = []
141227

142228
keep_searching = True
143229
while keep_searching:
@@ -154,55 +240,15 @@ def search_header_from_body_bbox(body_bbox, textlines, col_anchors, max_v_gap):
154240
closest_above = min(all_above, key=lambda tl: tl.y0, default=None)
155241

156242
if closest_above and closest_above.y0 < top + max_v_gap:
157-
# b/ We have a candidate cell that is within the correct
243+
# We have a candidate cell that is within the correct
158244
# vertical band, and directly above the table. Starting from
159245
# this anchor, we list all the textlines within the same row.
160-
tls_in_new_row = []
161-
top = closest_above.y1
162-
pushed_up = True
163-
while pushed_up:
164-
pushed_up = False
165-
# Iterate and extract elements that fit in the row
166-
# from our list
167-
for i in range(len(all_above) - 1, -1, -1):
168-
textline = all_above[i]
169-
if textline.y0 < top:
170-
# The bottom of this element is within our row
171-
# so we add it.
172-
tls_in_new_row.append(textline)
173-
all_above.pop(i)
174-
if textline.y1 > top:
175-
# If the top of this element raises our row's
176-
# band, we'll need to keep on searching for
177-
# overlapping items
178-
top = textline.y1
179-
pushed_up = True
180-
181-
# Get the x-ranges for all the textlines, and merge the
182-
# x-ranges that overlap
183-
zones = zones + list(
184-
map(lambda textline: [textline.x0, textline.x1], tls_in_new_row)
185-
)
186-
zones.sort(key=lambda z: z[0]) # Sort by left coordinate
187-
# Starting from the right, if two zones overlap horizontally,
188-
# merge them
189-
merged_something = True
190-
while merged_something:
191-
merged_something = False
192-
for i in range(len(zones) - 1, 0, -1):
193-
zone_right = zones[i]
194-
zone_left = zones[i - 1]
195-
if zone_left[1] >= zone_right[0]:
196-
zone_left[1] = max(zone_right[1], zone_left[1])
197-
zones.pop(i)
198-
merged_something = True
246+
zones, top = _extract_zones(all_above, max_v_gap, closest_above.y1)
247+
# Merge the zones that overlap (or touch) horizontally, sweeping left to right
248+
merged_zones = _merge_zones(zones)
199249

200250
max_spread = max(
201-
list(
202-
map(
203-
lambda zone: column_spread(zone[0], zone[1], col_anchors), zones
204-
)
205-
)
251+
column_spread(zone[0], zone[1], col_anchors) for zone in merged_zones
206252
)
207253

208254
# Accept textlines that cross column boundaries, as long as they

0 commit comments

Comments
 (0)