Skip to content

Commit 4ed034c

Browse files
fix: out-of-range vector error (#15)
* fix out-of-range vector error
  Signed-off-by: Peter Staar <[email protected]>
* reformat code
  Signed-off-by: Peter Staar <[email protected]>
---------
Signed-off-by: Peter Staar <[email protected]>
1 parent 9c8cccc commit 4ed034c

File tree

3 files changed

+55
-23
lines changed

3 files changed

+55
-23
lines changed

docling_parse/run.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@ def main():
4747

4848
# Load the document
4949
success = parser.load_document(doc_key, doc_file)
50+
# parser.set_loglevel(args.log_level)
5051

5152
# Get number of pages
5253
num_pages = parser.number_of_pages(doc_key)
@@ -56,7 +57,10 @@ def main():
5657
json_doc = parser.parse_pdf_from_key_on_page(doc_key, page)
5758

5859
if "pages" not in json_doc: # page could not get parsed
60+
print(f"ERROR: page {page} is not parsed ... ")
5961
continue
62+
else:
63+
print(f"page {page} is parsed ... ")
6064

6165
json_page = json_doc["pages"][0]
6266

@@ -80,10 +84,13 @@ def main():
8084
]
8185
)
8286

83-
print(f"cells of page: {page}")
84-
print(
85-
tabulate(cells, headers=["page", "cell-id", "text", "x0", "y0", "x1", "y1"])
86-
)
87+
if False:
88+
print(f"cells of page: {page}")
89+
print(
90+
tabulate(
91+
cells, headers=["page", "cell-id", "text", "x0", "y0", "x1", "y1"]
92+
)
93+
)
8794

8895
# find bitmap images
8996
images = []

src/proj_folders/pdf_parser/post_processors/build_hv_lines.h

Lines changed: 34 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -121,16 +121,16 @@ namespace pdf_lib
121121
std::vector<scalar_type>& x,
122122
std::vector<scalar_type>& y)
123123
{
124-
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
124+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
125125

126126
if(j+1<x.size())
127127
return false;
128128

129-
scalar_type x0 = x[j+0];
130-
scalar_type y0 = y[j+0];
129+
scalar_type x0 = x.at(j+0);
130+
scalar_type y0 = y.at(j+0);
131131

132-
scalar_type x1 = x[j+1];
133-
scalar_type y1 = y[j+1];
132+
scalar_type x1 = x.at(j+1);
133+
scalar_type y1 = y.at(j+1);
134134

135135
if(std::abs(x1-x0)>1.e-3 and
136136
std::abs(y1-y0)<1.e-3)
@@ -144,16 +144,16 @@ namespace pdf_lib
144144
std::vector<scalar_type>& x,
145145
std::vector<scalar_type>& y)
146146
{
147-
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
147+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
148148

149149
if(j+1<x.size())
150150
return false;
151151

152-
scalar_type x0 = x[j+0];
153-
scalar_type y0 = y[j+0];
152+
scalar_type x0 = x.at(j+0);
153+
scalar_type y0 = y.at(j+0);
154154

155-
scalar_type x1 = x[j+1];
156-
scalar_type y1 = y[j+1];
155+
scalar_type x1 = x.at(j+1);
156+
scalar_type y1 = y.at(j+1);
157157

158158
if(std::abs(x1-x0)<1.e-3 and
159159
std::abs(y1-y0)>1.e-3)
@@ -167,7 +167,7 @@ namespace pdf_lib
167167
scalar_type x1, scalar_type y1,
168168
std::vector<horizontal_line<scalar_type> >& hlines_)
169169
{
170-
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
170+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
171171

172172
horizontal_line<scalar_type> hline;
173173
hline.y = y0;
@@ -183,7 +183,7 @@ namespace pdf_lib
183183
scalar_type x1, scalar_type y1,
184184
std::vector<vertical_line<scalar_type> >& vlines_)
185185
{
186-
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
186+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
187187

188188
vertical_line<scalar_type> vline;
189189
vline.x = x0;
@@ -199,7 +199,7 @@ namespace pdf_lib
199199
std::vector<vertical_line <scalar_type> >& vlines_,
200200
std::vector<horizontal_line<scalar_type> >& hlines_)
201201
{
202-
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
202+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t" << __FUNCTION__;
203203

204204
hlines_.clear();
205205
vlines_.clear();
@@ -213,15 +213,31 @@ namespace pdf_lib
213213
x <= paths[k][core::keys<core::PATH>::x_values()];
214214
y <= paths[k][core::keys<core::PATH>::y_values()];
215215

216+
/*
217+
logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t #-subpaths: " << subpaths.size();
218+
for(int i=0; i<subpaths.size(); i++)
219+
{
220+
logging_lib::info("pdf-parser") << i << "\t" << subpaths.at(i);
221+
}
222+
*/
223+
216224
for(int i=0; i<subpaths.size()-1; i++)
217225
{
218-
for(int j=subpaths[i+0]; j<subpaths[i+1]; j++)
226+
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t subpath (" << i << "): " << subpaths.size();
227+
for(int j=subpaths.at(i+0); j<subpaths.at(i+1); j++)
219228
{
220-
scalar_type x0 = x[j+0];
221-
scalar_type y0 = y[j+0];
229+
//logging_lib::info("pdf-parser") << __FILE__ << ":" << __LINE__ << "\t x/y: " << j << "/" << x.size() << ":" << y.size();
230+
231+
if(j+1>=x.size() or j+1>=y.size()) // skip
232+
{
233+
continue;
234+
}
235+
236+
scalar_type x0 = x.at(j+0);
237+
scalar_type y0 = y.at(j+0);
222238

223-
scalar_type x1 = x[j+1];
224-
scalar_type y1 = y[j+1];
239+
scalar_type x1 = x.at(j+1);
240+
scalar_type y1 = y.at(j+1);
225241

226242
if(std::abs(y1-y0)<1.e-3 and std::abs(x1-x0)>1.e-3)
227243
register_hline(x0, y0, x1, y1, hlines_);

src/proj_folders/pdf_parser/post_processors/split_textcells.h

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -515,6 +515,13 @@ namespace pdf_lib
515515
while(splitting)
516516
{
517517
splitting=false;
518+
519+
/*
520+
for(int j=0; j<vlines.size(); j+=1)
521+
{
522+
logging_lib::info("pdf-parser") << "vline (" << j << "): " << vlines[j].x << "," << vlines[j].y0 << "," << vlines[j].y1;
523+
}
524+
*/
518525

519526
for(int i=0; i<cells.get_size(); i+=1)
520527
{
@@ -530,13 +537,15 @@ namespace pdf_lib
530537
//auto height = bbox.height();
531538
//auto width = bbox.width();
532539

533-
//logging_lib::info("pdf-parser") << "\t cell \""<< cell_m.text << "\"";
540+
//logging_lib::info("pdf-parser") << cell_m.text << "\t" << x0 << "," << y0 << "," << x1 << "," << y1 << "\n";
534541
for(int j=0; j<vlines.size(); j+=1)
535542
{
536543
//if((x0+0.00*width < vlines[j].x and vlines[j].x < x1-0.00*width) and
537544
//(vlines[j].y0 < y0+0.05*height and y1-0.05*height < vlines[j].y1))
538545
if(post_processor<BUILD_HV_LINES, scalar_type>::is_split_by_vline(x0, y0, x1, y1, vlines[j]))
539546
{
547+
//logging_lib::info("pdf-parser") << "vline: " << vlines[j].x << "," << vlines[j].y0 << "," << vlines[j].y1;
548+
540549
logging_lib::warn("pdf-parser") << "\t --> splitting cell \""<< cell_m.text << "\"";
541550
splitting=split_cell_by_vline_on_page(i, cells, vlines[j]);
542551
}

0 commit comments

Comments
 (0)