1
1
#!/usr/bin/env python
2
2
3
3
import sys
4
- from bs4 import BeautifulSoup
4
+ from bs4 import BeautifulSoup , Comment
5
5
import fire
6
6
from pathlib import Path
7
7
import pandas as pd
8
+ import numpy as np
8
9
import json
10
+ import re
11
+ from ast import literal_eval
9
12
10
13
11
14
class Tabular :
@@ -21,15 +24,64 @@ def flatten_tables(soup):
21
24
for elem in inner .select ("tr, td, colgroup, tbody, col" ):
22
25
elem .name = 'div'
23
26
24
- def html2data (filename , table ):
27
+
28
def escape(s):
    """Return *s* rendered as a Python string literal (its ``repr``).

    Cell text is escaped this way so it survives the round trip through
    pandas/CSV and can later be recovered with ``unescape``.
    """
    literal = repr(s)
    return literal
30
+
31
+
32
def unescape(r):
    """Inverse of ``escape``: evaluate a ``repr``'d literal back to its value.

    Uses ``ast.literal_eval``, which only accepts Python literals and never
    executes arbitrary code.
    """
    value = literal_eval(r)
    return value
34
+
35
+
36
# Matches the htlatex multirow marker comment, e.g. "rows=3".
# FIX: the original pattern used "(P<rows>\d+)" — missing the "?" of the
# named-group syntax — so it matched the literal text "P<rows>" and
# match.group('rows') could never succeed.
multirow_re = re.compile(r"^\s*rows=(?P<rows>\d+)\s*$")
# Collapses line breaks and runs of two or more whitespace characters;
# callers substitute a single space for each match.
whitespace_re = re.compile(r'[\r\n]+|\s{2,}')
38
+
39
def _multirow_count(cell):
    """Return the row count (as a string) from a cell's ``div.multirow``
    marker comment, or ``None`` when the cell is not a multirow cell."""
    marker = cell.find("div", class_="multirow", recursive=False)
    if not (marker and marker.contents):
        return None
    first_child = marker.contents[0]
    if not isinstance(first_child, Comment):
        return None
    match = multirow_re.match(str(first_child))
    return match.group("rows") if match else None


def escape_table_content(soup):
    """Replace the text of every ``td``/``th`` in *soup* with an escaped
    (``repr``'d) version of its whitespace-collapsed text content.

    Cells that carry an htlatex ``div.multirow`` marker comment are tagged
    with a ``"multirow=N;"`` prefix so they can be expanded later, after
    pandas has parsed the table into a DataFrame.
    """
    for cell in soup.find_all(["td", "th"]):
        collapsed = whitespace_re.sub(" ", cell.get_text().strip())
        new_value = escape(collapsed)

        rows = _multirow_count(cell)
        if rows is not None:
            new_value = f"multirow={rows};{new_value}"

        cell.string = new_value
50
+
51
+
52
def fix_htlatex_multirow(df):
    """Expand ``"multirow=N;value"`` cells in place.

    htlatex emits a multirow value in the first spanned row only;
    ``escape_table_content`` tags such cells with a ``"multirow=N;"``
    prefix.  This fills the value down over all N spanned rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of escaped cell strings; the rows a multirow cell spans are
        expected to be missing (NaN) — TODO confirm against the actual
        htlatex output, since upstream ``fillna`` may pre-fill them.

    Raises
    ------
    ValueError
        If the rows a multirow cell should span are not all missing
        (was a bare ``assert``, which is stripped under ``python -O``).
    """
    prefix = "multirow="
    n_rows, n_cols = df.shape

    for col in range(n_cols):
        for row in range(n_rows):
            cell = df.iloc[row, col]
            # Non-string cells (e.g. NaN) cannot carry the marker; the
            # original code would raise AttributeError on them.
            if not (isinstance(cell, str) and cell.startswith(prefix)):
                continue

            sep = cell.find(';')
            span = int(cell[len(prefix):sep])
            # Refuse to silently overwrite real data in the spanned rows.
            if not df.iloc[row + 1: row + span, col].isna().all():
                raise ValueError(
                    f"multirow cell at ({row}, {col}) overlaps non-empty cells"
                )
            df.iloc[row: row + span, col] = cell[sep + 1:]
63
+
64
+
65
def unescape_table_content(df):
    """Return a copy of *df* with every cell un-``repr``'d back to its value.

    Uses ``DataFrame.map`` where available — ``applymap`` was deprecated in
    pandas 2.1 — and falls back to ``applymap`` on older pandas versions.
    """
    if hasattr(df, "map"):
        return df.map(unescape)
    return df.applymap(unescape)
67
+
68
+
69
def fix_table(df):
    """Clean up a DataFrame freshly parsed from an htlatex HTML table.

    Fills missing cells with the escaped empty string, expands multirow
    cells, drops rows/columns that are entirely empty, and finally
    un-escapes every cell back to plain text.
    """
    # Make every cell a string so fix_htlatex_multirow can inspect it.
    df = df.fillna(repr(''))
    fix_htlatex_multirow(df)
    # Temporarily treat escaped-empty cells as missing so fully empty
    # rows/columns can be dropped, then restore the escaped empty string.
    # FIX: np.NaN was removed in NumPy 2.0; np.nan is the portable spelling.
    df = (
        df.replace("''", np.nan)
        .dropna(how='all')
        .dropna(axis='columns', how='all')
        .fillna("''")
    )
    return unescape_table_content(df)
74
+
75
+
76
def html2data(table):
    """Parse a single ``<table>`` element into a DataFrame.

    Returns the DataFrame, or ``None`` when pandas extracts no table.
    Raises ``ValueError`` when the element unexpectedly yields more than
    one table.
    """
    frames = pd.read_html(str(table), match='')
    count = len(frames)
    if count > 1:
        raise ValueError(f"<table> element contains wrong number of tables: {count}")
    if count == 1:
        return frames[0]
    return None
29
81
30
82
31
83
def save_table(data, filename):
    """Write the table *data* to *filename* as a bare CSV — no header
    row and no index column."""
    csv_options = {"header": None, "index": None}
    data.to_csv(filename, **csv_options)
33
85
34
86
35
87
def save_tables (data , outdir ):
@@ -44,25 +96,31 @@ def save_tables(data, outdir):
44
96
45
97
46
98
def deepclone(elem):
    """Return an independent copy of *elem* by re-parsing its markup.

    NOTE(review): the "lxml" parser wraps fragments in html/body tags,
    unlike "html.parser" — callers appear to rely on this parser choice.
    """
    markup = str(elem)
    return BeautifulSoup(markup, "lxml")
48
100
49
101
50
102
def extract_tables (filename , outdir ):
51
103
with open (filename , "rb" ) as f :
52
104
html = f .read ()
53
- outdir = Path (outdir )
54
- soup = BeautifulSoup (html , "html.parser" )
105
+ outdir = Path (outdir ) / Path (filename ).stem
106
+ outdir .mkdir (parents = True , exist_ok = True )
107
+ soup = BeautifulSoup (html , "lxml" )
55
108
flatten_tables (soup )
56
109
tables = soup .select ("div.tabular" )
57
110
58
111
data = []
59
112
for table in tables :
60
113
if table .find ("table" ) is not None :
61
114
float_div = table .find_parent ("div" , class_ = "float" )
62
- tab = html2data (filename , table )
115
+ #print(table)
116
+ escape_table_content (table )
117
+ #print(table)
118
+ tab = html2data (table )
63
119
if tab is None :
64
120
continue
65
121
122
+ tab = fix_table (tab )
123
+
66
124
caption = None
67
125
if float_div is not None :
68
126
float_div = deepclone (float_div )
0 commit comments