Skip to content

Commit 8bd3e98

Browse files
David Huggins-Daines (dhdaines)
authored and committed
fix: handle case of overlapping cells in contract_cells_into_lines_v2
Signed-off-by: David Huggins-Daines <dhd@ecolingui.ca>
1 parent 514b6fe commit 8bd3e98

File tree

2 files changed

+69
-9
lines changed

2 files changed

+69
-9
lines changed

src/v2/pdf_resources/page_cell.h

Lines changed: 67 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ namespace pdflib
2727

2828
// Returns true when `other` starts where this cell ends: the lower-right
// corner of this cell must be closer than `delta` to the lower-left corner
// of `other`, and likewise the upper-right corner of this cell to the
// upper-left corner of `other`.
bool is_adjacent_to(pdf_resource<PAGE_CELL>& other, double delta);
2929

30+
// Returns true when the bounding quadrilateral of this cell overlaps the
// bounding quadrilateral of `other`.
bool intersects(pdf_resource<PAGE_CELL>& other);
31+
32+
// Returns true when the point (x, y) lies inside the quadrilateral spanned
// by this cell's corners (r_x0, r_y0) .. (r_x3, r_y3), as decided by the
// even-odd rule.
bool contains(double x, double y);
33+
3034
bool has_same_reading_orientation(pdf_resource<PAGE_CELL>& other);
3135

3236
bool merge_with(pdf_resource<PAGE_CELL>& other, double delta);
@@ -259,13 +263,64 @@ namespace pdflib
259263

260264
return (num_chars>0? len/num_chars : 0.0);
261265
}
262-
266+
267+
// Orientation of the turn a -> b -> c: the z-component of the cross product
// (b - a) x (c - a). Positive for a counter-clockwise turn, negative for a
// clockwise turn, zero when the three points are collinear.
inline double page_cell_turn(double ax, double ay,
                             double bx, double by,
                             double cx, double cy)
{
  return (bx - ax) * (cy - ay) - (by - ay) * (cx - ax);
}

// True when the segments p0-p1 and q0-q1 properly cross, i.e. the end
// points of each segment lie strictly on opposite sides of the other
// segment. Touching or collinear segments do not count as crossing.
inline bool page_cell_segments_cross(double p0x, double p0y, double p1x, double p1y,
                                     double q0x, double q0y, double q1x, double q1y)
{
  const double q_side_p0 = page_cell_turn(q0x, q0y, q1x, q1y, p0x, p0y);
  const double q_side_p1 = page_cell_turn(q0x, q0y, q1x, q1y, p1x, p1y);
  const double p_side_q0 = page_cell_turn(p0x, p0y, p1x, p1y, q0x, q0y);
  const double p_side_q1 = page_cell_turn(p0x, p0y, p1x, p1y, q1x, q1y);

  const bool p_straddles_q = ((q_side_p0 > 0.0 and q_side_p1 < 0.0)
                              or (q_side_p0 < 0.0 and q_side_p1 > 0.0));
  const bool q_straddles_p = ((p_side_q0 > 0.0 and p_side_q1 < 0.0)
                              or (p_side_q0 < 0.0 and p_side_q1 > 0.0));

  return (p_straddles_q and q_straddles_p);
}

// Returns true when the bounding quadrilateral of this cell overlaps the
// bounding quadrilateral of `other`.
//
// Two tests are combined:
//  1. corner containment (point-in-polygon via the even-odd rule): some
//     corner of one quadrilateral lies inside the other one;
//  2. edge crossing: some edge of this quadrilateral properly crosses an
//     edge of the other one. This catches overlaps in which no corner of
//     either quadrilateral is inside the other (e.g. a tall narrow cell
//     crossing a wide flat one), which corner containment alone misses.
bool pdf_resource<PAGE_CELL>::intersects(pdf_resource<PAGE_CELL>& other)
{
  if (contains(other.r_x0, other.r_y0)
      or contains(other.r_x1, other.r_y1)
      or contains(other.r_x2, other.r_y2)
      or contains(other.r_x3, other.r_y3)
      or other.contains(r_x0, r_y0)
      or other.contains(r_x1, r_y1)
      or other.contains(r_x2, r_y2)
      or other.contains(r_x3, r_y3))
    {
      return true;
    }

  // Corners in the order 0 -> 1 -> 2 -> 3 (-> 0), the same edge loop that
  // contains() walks.
  const double ax[4] = {r_x0, r_x1, r_x2, r_x3};
  const double ay[4] = {r_y0, r_y1, r_y2, r_y3};
  const double bx[4] = {other.r_x0, other.r_x1, other.r_x2, other.r_x3};
  const double by[4] = {other.r_y0, other.r_y1, other.r_y2, other.r_y3};

  for (int i = 0; i < 4; i++)
    {
      const int i_next = (i + 1) % 4;
      for (int j = 0; j < 4; j++)
        {
          const int j_next = (j + 1) % 4;
          if (page_cell_segments_cross(ax[i], ay[i], ax[i_next], ay[i_next],
                                       bx[j], by[j], bx[j_next], by[j_next]))
            {
              return true;
            }
        }
    }

  return false;
}
280+
281+
// Single-edge step of the even-odd point-in-polygon test.
//
// Considers the edge (xi, yi) -> (xj, yj) and a horizontal ray that starts
// at (x, y) and runs towards +x. Returns true when the edge spans the ray's
// y level (exactly one end point has a y coordinate greater than y) and the
// point lies strictly to the left of the edge at that level.
inline bool inside_plane(double x, double y, double xi, double yi, double xj, double yj)
{
  const bool start_above = (yi > y);
  const bool end_above = (yj > y);

  // Both end points on the same side of the ray: no crossing. This also
  // rules out yi == yj, so the division below never divides by zero.
  if (start_above == end_above)
    {
      return false;
    }

  // x coordinate at which the edge reaches height y.
  const double edge_x_at_y = (xj - xi) * (y - yi) / (yj - yi) + xi;

  return (x < edge_x_at_y);
}
285+
286+
// Point-in-polygon test (even-odd rule) for the point (x, y) against the
// quadrilateral with corners (r_x0, r_y0), (r_x1, r_y1), (r_x2, r_y2) and
// (r_x3, r_y3). Each edge that inside_plane() reports as crossed toggles
// the result, so an odd number of crossed edges means "inside".
bool pdf_resource<PAGE_CELL>::contains(double x, double y)
{
  const double corner_x[4] = {r_x0, r_x1, r_x2, r_x3};
  const double corner_y[4] = {r_y0, r_y1, r_y2, r_y3};

  bool odd_crossings = false;

  // Walk the edges prev -> curr in the order 3->0, 0->1, 1->2, 2->3.
  for (int curr = 0, prev = 3; curr < 4; prev = curr++)
    {
      if (inside_plane(x, y,
                       corner_x[prev], corner_y[prev],
                       corner_x[curr], corner_y[curr]))
        {
          odd_crossings = not odd_crossings;
        }
    }

  return odd_crossings;
}
300+
301+
263302
// Decides whether `other` directly continues this cell, using `eps` as the
// largest tolerated corner-to-corner distance (exclusive).
//
// NOTE: `other` is assumed to lie to the right of this cell, even for
// right-to-left text; the calling code appears to guarantee that.
bool pdf_resource<PAGE_CELL>::is_adjacent_to(pdf_resource<PAGE_CELL>& other, double eps)
{
  // Gap between lower_right(this) and lower_left(other).
  const double bottom_dx = other.r_x0 - r_x1;
  const double bottom_dy = other.r_y0 - r_y1;
  const double bottom_gap = std::sqrt(bottom_dx * bottom_dx + bottom_dy * bottom_dy);

  // Gap between upper_right(this) and upper_left(other).
  const double top_dx = other.r_x3 - r_x2;
  const double top_dy = other.r_y3 - r_y2;
  const double top_gap = std::sqrt(top_dx * top_dx + top_dy * top_dy);

  // A cell is rejected as soon as one of the gaps reaches eps. The test is
  // deliberately phrased as "not (gap >= eps)" rather than "gap < eps" so
  // that the outcome for NaN gaps stays the same as with the early returns.
  const bool bottom_too_far = (bottom_gap >= eps);
  const bool top_too_far = (top_gap >= eps);

  return ((not bottom_too_far) and (not top_too_far));
}
270325

271326
bool pdf_resource<PAGE_CELL>::has_same_reading_orientation(pdf_resource<PAGE_CELL>& other)
@@ -285,11 +340,16 @@ namespace pdflib
285340
LOG_S(ERROR) << "inconsistent merging of cells!";
286341
}
287342

288-
double d0 = std::sqrt((r_x1-other.r_x0)*(r_x1-other.r_x0) + (r_y1-other.r_y0)*(r_y1-other.r_y0));
343+
// FIXME: Redundant calculation with is_adjacent_to
344+
double dx0 = other.r_x0 - r_x1;
345+
double dy0 = other.r_y0 - r_y1;
346+
double d0 = std::sqrt(dx0 * dx0 + dy0 * dy0);
347+
289348

290349
if((not left_to_right) or (not other.left_to_right))
291350
{
292-
if(delta<d0)
351+
// FIXME: Redundant calculation of intersects here as well...
352+
if(d0 >= delta and not intersects(other))
293353
{
294354
text = " " + text;
295355
}
@@ -299,7 +359,7 @@ namespace pdflib
299359
}
300360
else
301361
{
302-
if(delta<d0)
362+
if(d0 >= delta and not intersects(other))
303363
{
304364
text += " ";
305365
}

src/v2/pdf_sanitators/cells.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ namespace pdflib
283283
*/
284284

285285
void pdf_sanitator<PAGE_CELLS>::contract_cells_into_lines_v2(pdf_resource<PAGE_CELLS>& cells,
286-
double horizontal_cell_tolerance,
286+
double horizontal_cell_tolerance, // FIXME: UNUSED
287287
bool enforce_same_font,
288288
double space_width_factor_for_merge,
289289
double space_width_factor_for_merge_with_space)
@@ -319,7 +319,7 @@ namespace pdflib
319319
double delta_0 = cells[i].average_char_width()*space_width_factor_for_merge;
320320
double delta_1 = cells[i].average_char_width()*space_width_factor_for_merge_with_space;
321321

322-
if(cells[i].is_adjacent_to(cells[j], delta_0))
322+
if(cells[i].is_adjacent_to(cells[j], delta_0) or cells[i].intersects(cells[j]))
323323
{
324324
cells[i].merge_with(cells[j], delta_1);
325325

0 commit comments

Comments
 (0)