1+ """Contains the core functions to parse tables from PDFs."""
2+
13import math
24import os
35import sqlite3
3032
3133class TextAlignment :
3234 """Represents a list of textlines sharing an alignment on a coordinate.
35+
3336 The alignment can be left/right/middle or top/bottom/center.
3437 (PDF coordinate space)
3538
@@ -59,7 +62,7 @@ def __init__(self, coord, textline, align):
5962 self .textlines = [textline ]
6063 self .align = align
6164
62- def __repr__ (self ):
65+ def __repr__ (self ): # noqa D105
6366 text_inside = " | " .join (
6467 map (lambda x : x .get_text (), self .textlines [:2 ])
6568 ).replace ("\n " , "" )
@@ -79,8 +82,9 @@ def register_aligned_textline(self, textline, coord):
7982
8083
8184class TextEdge (TextAlignment ):
82- """Defines a text edge coordinates relative to a left-bottom
83- origin. (PDF coordinate space)
85+ """Defines a text edge coordinates relative to a left-bottom origin.
86+
87+ (PDF coordinate space)
8488 An edge is an alignment bounded over a segment.
8589
8690 Parameters
@@ -108,7 +112,7 @@ def __init__(self, coord, textline, align):
108112 self .y1 = textline .y1
109113 self .is_valid = False
110114
111- def __repr__ (self ):
115+ def __repr__ (self ): # noqa D105
112116 x = round (self .coord , 2 )
113117 y0 = round (self .y0 , 2 )
114118 y1 = round (self .y1 , 2 )
@@ -117,7 +121,9 @@ def __repr__(self):
117121 )
118122
119123 def update_coords (self , x , textline , edge_tol = 50 ):
120- """Updates the text edge's x and bottom y coordinates and sets
124+ """Update text edge coordinates.
125+
126+ Update the text edge's x and bottom y coordinates and set
121127 the is_valid attribute.
122128 """
123129 if math .isclose (self .y0 , textline .y0 , abs_tol = edge_tol ):
@@ -146,7 +152,7 @@ def _update_alignment(self, alignment, coord, textline):
146152 return NotImplemented
147153
148154 def _register_textline (self , textline ):
149- """Updates an existing text edge in the current dict."""
155+ """Update an existing text edge in the current dict."""
150156 coords = get_textline_coords (textline )
151157 for alignment_id , alignment_array in self ._text_alignments .items ():
152158 coord = coords [alignment_id ]
@@ -180,7 +186,9 @@ def _register_textline(self, textline):
180186
181187
182188class TextEdges (TextAlignments ):
183- """Defines a dict of left, right and middle text edges found on
189+ """Defines a dict of text edges found on the PDF page.
190+
191+ The dict contains the left, right and middle text edges found on
184192 the PDF page. The dict has three keys based on the alignments,
185193 and each key's value is a list of camelot.core.TextEdge objects.
186194 """
@@ -194,25 +202,24 @@ def _create_new_text_alignment(self, coord, textline, align):
194202 return TextEdge (coord , textline , align )
195203
196204 def add (self , coord , textline , align ):
197- """Adds a new text edge to the current dict."""
205+ """Add a new text edge to the current dict."""
198206 te = self ._create_new_text_alignment (coord , textline , align )
199207 self ._text_alignments [align ].append (te )
200208
201209 def _update_alignment (self , alignment , coord , textline ):
202210 alignment .update_coords (coord , textline , self .edge_tol )
203211
204212 def generate (self , textlines ):
205- """Generates the text edges dict based on horizontal text
206- rows.
207- """
213+ """Generates the text edges dict based on horizontal text rows."""
208214 for tl in textlines :
209215 if len (tl .get_text ().strip ()) > 1 : # TODO: hacky
210216 self ._register_textline (tl )
211217
212218 def get_relevant (self ):
213- """Returns the list of relevant text edges (all share the same
214- alignment) based on which list intersects horizontal text rows
215- the most.
219+ """Return the list of relevant text edges.
220+
221+ The returned edges all share the same alignment, chosen
222+ based on which list intersects horizontal text rows the most.
216223 """
217224 intersections_sum = {
218225 "left" : sum (
@@ -239,8 +246,9 @@ def get_relevant(self):
239246 )
240247
241248 def get_table_areas (self , textlines , relevant_textedges ):
242- """Returns a dict of interesting table areas on the PDF page
243- calculated using relevant text edges.
249+ """Return a dict of interesting table areas on the PDF page.
250+
251+ The table areas are calculated using relevant text edges.
244252 """
245253
246254 def pad (area , average_row_height ):
@@ -312,7 +320,9 @@ def pad(area, average_row_height):
312320
313321
314322class Cell :
315- """Defines a cell in a table with coordinates relative to a
323+ """Defines a cell in a table.
324+
325+ The cell's coordinates are relative to a
316326 left-bottom origin. (PDF coordinate space)
317327
318328 Parameters
@@ -370,19 +380,19 @@ def __init__(self, x1, y1, x2, y2):
370380 self .vspan = False
371381 self ._text = ""
372382
373- def __repr__ (self ):
383+ def __repr__ (self ): # noqa D105
374384 x1 = round (self .x1 )
375385 y1 = round (self .y1 )
376386 x2 = round (self .x2 )
377387 y2 = round (self .y2 )
378388 return f"<Cell x1={ x1 } y1={ y1 } x2={ x2 } y2={ y2 } >"
379389
380390 @property
381- def text (self ):
391+ def text (self ): # noqa D102
382392 return self ._text
383393
384394 @text .setter
385- def text (self , t ):
395+ def text (self , t ): # noqa D102
386396 self ._text = "" .join ([self ._text , t ])
387397
388398 @property
@@ -392,8 +402,9 @@ def bound(self):
392402
393403
394404class Table :
395- """Defines a table with coordinates relative to a left-bottom
396- origin. (PDF coordinate space)
405+ """Defines a table with coordinates relative to a left-bottom origin.
406+
407+ (PDF coordinate space)
397408
398409 Parameters
399410 ----------
@@ -443,9 +454,28 @@ def __init__(self, cols, rows):
443454 self ._image_path = None # Temporary file to hold an image of the pdf
444455
445456 def __repr__ (self ):
457+ """Return a string representation of the table.
458+
459+ Returns
460+ -------
461+ str
462+ The class name followed by the table's ``shape``.
463+ """
446464 return f"<{ self .__class__ .__name__ } shape={ self .shape } >"
447465
448466 def __lt__ (self , other ):
467+ """Return True if this table sorts before ``other``.
468+
469+ Parameters
470+ ----------
471+ other : Table
472+ The table to compare against; tables on the same page are compared by ``order``.
473+
474+ Returns
475+ -------
476+ bool
477+ True when this table is ordered before ``other``.
478+ """
449479 if self .page == other .page :
450480 if self .order < other .order :
451481 return True
@@ -462,7 +492,9 @@ def data(self):
462492
463493 @property
464494 def parsing_report (self ):
465- """Returns a parsing report with %accuracy, %whitespace,
495+ """Returns a parsing report.
496+
497+ The report contains % accuracy, % whitespace,
466498 table number on page and page number.
467499 """
468500 # pretty?
@@ -475,7 +507,7 @@ def parsing_report(self):
475507 return report
476508
477509 def record_metadata (self , parser ):
478- """Record data about the origin of the table"""
510+ """Record data about the origin of the table. """
479511 self .flavor = parser .id
480512 self .filename = parser .filename
481513 self .debug_info = parser .debug_info
@@ -489,7 +521,7 @@ def record_metadata(self, parser):
489521 self .pdf_size = (parser .pdf_width , parser .pdf_height )
490522
491523 def get_pdf_image (self ):
492- """Compute pdf image and cache it"""
524+ """Compute pdf image and cache it. """
493525 if self ._image is None :
494526 if self ._image_path is None :
495527 self ._image_path = build_file_path_in_temp_dir (
@@ -501,14 +533,16 @@ def get_pdf_image(self):
501533 return self ._image
502534
503535 def set_all_edges (self ):
504- """Sets all table edges to True."""
536+ """Set all table edges to True."""
505537 for row in self .cells :
506538 for cell in row :
507539 cell .left = cell .right = cell .top = cell .bottom = True
508540 return self
509541
510542 def set_edges (self , vertical , horizontal , joint_tol = 2 ):
511- """Sets a cell's edges to True depending on whether the cell's
543+ """Set the edges of the table's cells.
544+
545+ Set a cell's edges to True depending on whether the cell's
512546 coordinates overlap with the line's coordinates within a
513547 tolerance.
514548
@@ -518,7 +552,8 @@ def set_edges(self, vertical, horizontal, joint_tol=2):
518552 List of detected vertical lines.
519553 horizontal : list
520554 List of detected horizontal lines.
521-
555+ joint_tol : int, optional
556+ Tolerance within which a cell's coordinates are matched to a line's coordinates, by default 2
522557 """
523558
524559 def find_close_point (over , coord , joint_tol ):
@@ -584,7 +619,9 @@ def set_border(self):
584619 return self
585620
586621 def set_span (self ):
587- """Sets a cell's hspan or vspan attribute to True depending
622+ """Set a cell's hspan or vspan attribute.
623+
624+ Set the cell's hspan or vspan attribute to True depending
588625 on whether the cell spans horizontally or vertically.
589626 """
590627 for row in self .cells :
@@ -616,13 +653,15 @@ def set_span(self):
616653
617654 def copy_spanning_text (self , copy_text = None ):
618655 """Copies over text in empty spanning cells.
656+
619657 Parameters
620658 ----------
621659 copy_text : list, optional (default: None)
622660 {'h', 'v'}
623661 Select one or more strings from above and pass them as a list
624662 to specify the direction in which text should be copied over
625663 when a cell spans multiple rows or columns.
664+
626665 Returns
627666 -------
628667 t : camelot.core.Table
@@ -643,7 +682,7 @@ def copy_spanning_text(self, copy_text=None):
643682 return self
644683
645684 def to_csv (self , path , ** kwargs ):
646- """Writes Table to a comma-separated values (csv) file.
685+ """Write Table(s) to a comma-separated values (csv) file.
647686
648687 For kwargs, check :meth:`pandas.DataFrame.to_csv`.
649688
@@ -658,7 +697,7 @@ def to_csv(self, path, **kwargs):
658697 self .df .to_csv (path , ** kw )
659698
660699 def to_json (self , path , ** kwargs ):
661- """Writes Table to a JSON file.
700+ """Write Table(s) to a JSON file.
662701
663702 For kwargs, check :meth:`pandas.DataFrame.to_json`.
664703
@@ -675,7 +714,7 @@ def to_json(self, path, **kwargs):
675714 f .write (json_string )
676715
677716 def to_excel (self , path , ** kwargs ):
678- """Writes Table to an Excel file.
717+ """Write Table(s) to an Excel file.
679718
680719 For kwargs, check :meth:`pandas.DataFrame.to_excel`.
681720
@@ -695,7 +734,7 @@ def to_excel(self, path, **kwargs):
695734 writer .save ()
696735
697736 def to_html (self , path , ** kwargs ):
698- """Writes Table to an HTML file.
737+ """Write Table(s) to an HTML file.
699738
700739 For kwargs, check :meth:`pandas.DataFrame.to_html`.
701740
@@ -710,7 +749,7 @@ def to_html(self, path, **kwargs):
710749 f .write (html_string )
711750
712751 def to_markdown (self , path , ** kwargs ):
713- """Writes Table to a Markdown file.
752+ """Write Table(s) to a Markdown file.
714753
715754 For kwargs, check :meth:`pandas.DataFrame.to_markdown`.
716755
@@ -725,7 +764,7 @@ def to_markdown(self, path, **kwargs):
725764 f .write (md_string )
726765
727766 def to_sqlite (self , path , ** kwargs ):
728- """Writes Table to sqlite database.
767+ """Write Table(s) to sqlite database.
729768
730769 For kwargs, check :meth:`pandas.DataFrame.to_sql`.
731770
@@ -745,8 +784,9 @@ def to_sqlite(self, path, **kwargs):
745784
746785
747786class TableList :
748- """Defines a list of camelot.core.Table objects. Each table can
749- be accessed using its index.
787+ """Defines a list of camelot.core.Table objects.
788+
789+ Each table can be accessed using its index.
750790
751791 Attributes
752792 ----------
@@ -755,19 +795,19 @@ class TableList:
755795
756796 """
757797
758- def __init__ (self , tables ):
798+ def __init__ (self , tables ): # noqa D107
759799 self ._tables = tables
760800
761- def __repr__ (self ):
801+ def __repr__ (self ): # noqa D105
762802 return f"<{ self .__class__ .__name__ } n={ self .n } >"
763803
764- def __len__ (self ):
804+ def __len__ (self ): # noqa D105
765805 return len (self ._tables )
766806
767- def __getitem__ (self , idx ):
807+ def __getitem__ (self , idx ): # noqa D105
768808 return self ._tables [idx ]
769809
770- def __iter__ (self ):
810+ def __iter__ (self ): # noqa D105
771811 yield from self ._tables
772812
773813 @staticmethod
@@ -776,6 +816,7 @@ def _format_func(table, f):
776816
777817 @property
778818 def n (self ):
819+ """The number of tables in the list."""
779820 return len (self )
780821
781822 def _write_file (self , f = None , ** kwargs ):
@@ -801,7 +842,7 @@ def _compress_dir(self, **kwargs):
801842 z .write (filepath , os .path .basename (filepath ))
802843
803844 def export (self , path , f = "csv" , compress = False ):
804- """Exports the list of tables to specified file format.
845+ """Export the list of tables to specified file format.
805846
806847 Parameters
807848 ----------
0 commit comments