@@ -51,12 +51,14 @@ def read_str_csv(filename):
51
51
52
52
53
53
class Table :
54
- def __init__ (self , df , layout , caption = None , figure_id = None , annotations = None , old_name = None , guessed_tags = None ):
54
+ def __init__ (self , df , layout , caption = None , figure_id = None , annotations = None , migrate = False , old_name = None , guessed_tags = None ):
55
55
self .df = df
56
56
self .caption = caption
57
57
self .figure_id = figure_id
58
58
self .df = df .applymap (str2cell )
59
- self .old_name = old_name
59
+
60
+ if migrate :
61
+ self .old_name = old_name
60
62
61
63
if layout is not None :
62
64
#self.layout = layout
@@ -74,41 +76,49 @@ def __init__(self, df, layout, caption=None, figure_id=None, annotations=None, o
74
76
tags = annotations .matrix_gold_tags
75
77
gt_rows = len (tags )
76
78
if gt_rows == 0 and len (self .df ) > 0 :
77
- #print(f"Gold tags size mismatch: 0 vs {len(self.df)} in old name {old_name}")
78
- self .old_name = None
79
+ print (f"Gold tags size mismatch: 0 vs { len (self .df )} in old name { old_name } " )
80
+ if migrate :
81
+ self .old_name = None
79
82
elif gt_rows > 0 :
80
83
gt_cols = len (tags [0 ])
81
84
if self .df .shape != (0 ,0 ) and self .df .shape == (gt_rows , gt_cols ):
82
85
for r , row in enumerate (tags ):
83
86
for c , cell in enumerate (row ):
84
87
self .df .iloc [r ,c ].gold_tags = cell .strip ()
85
88
else :
86
- if guessed_tags is not None :
87
- print (f"Gold tags size mismatch: { gt_rows } ,{ gt_cols } vs { self .df .shape } " )
89
+ print (f"Gold tags size mismatch: { gt_rows } ,{ gt_cols } vs { self .df .shape } " )
88
90
# print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}")
89
91
# print(annotations.matrix_gold_tags)
90
92
# print(self.df.applymap(lambda c:c.value))
91
- self .old_name = None
93
+ if migrate :
94
+ self .old_name = None
92
95
else :
93
96
self .gold_tags = ''
94
97
self .dataset_text = ''
95
98
self .notes = ''
96
99
97
100
@classmethod
def from_file(cls, path, metadata, annotations=None, migrate=False, match_name=None, guessed_tags=None):
    """Build a table from the CSV file(s) described by one metadata entry.

    Args:
        path: Directory that contains the table files.
        metadata: Mapping with a ``'filename'`` key and optional
            ``'layout'``, ``'caption'`` and ``'figure_id'`` keys.
        annotations: Object exposing ``table_set.filter(...)``, or ``None``
            when no annotations are available.
        migrate: When false, the annotation is looked up by the table's own
            filename (restricted to the ``"latexml"`` parser). When true, it
            is looked up by ``match_name`` instead.
        match_name: Name used for the lookup in migration mode; also
            forwarded to the constructor as ``old_name``.
        guessed_tags: Forwarded unchanged to the constructor.

    Returns:
        A new instance of ``cls``.
    """
    path = Path(path)
    df = read_str_csv(path / metadata['filename'])
    layout = read_str_csv(path / metadata['layout']) if 'layout' in metadata else None

    # Decide once which filter criteria apply; None means "no lookup".
    if annotations is None:
        criteria = None
    elif not migrate:
        # TODO: remove parser after migration is fully finished
        criteria = {'name': metadata['filename'], 'parser': "latexml"}
    elif match_name is not None:
        criteria = {'name': match_name}
    else:
        criteria = None

    table_ann = None
    if criteria is not None:
        # Appending None makes [0] fall back to None when nothing matches
        # (presumably filter() returns a list -- verify against table_set).
        table_ann = (annotations.table_set.filter(**criteria) + [None])[0]

    # Optional arguments are passed by keyword so they cannot be mis-bound
    # if the constructor's parameter order changes again.
    return cls(
        df,
        layout,
        metadata.get('caption'),
        metadata.get('figure_id'),
        table_ann,
        migrate=migrate,
        old_name=match_name,
        guessed_tags=guessed_tags,
    )
112
122
113
123
def display (self ):
114
124
display_table (self .df .applymap (lambda x : x .value ).values , self .df .applymap (lambda x : x .gold_tags ).values )
@@ -241,16 +251,21 @@ def _match_tables_by_content(path, annotations, metadata):
241
251
return matched , new_tags
242
252
####
243
253
244
def read_tables(path, annotations, migrate=False):
    """Load every table listed in ``metadata.json`` under *path*.

    Args:
        path: Directory containing ``metadata.json`` and the table files.
        annotations: Annotation object forwarded to ``Table.from_file``
            (and to the content matcher in migration mode).
        migrate: When true, tables are matched to previously annotated
            tables by content, and the matched names and guessed tags are
            forwarded to ``Table.from_file``. When false, no matching is
            done and both are passed as ``None``.

    Returns:
        A list with one ``Table`` per metadata entry, in file order.
    """
    path = Path(path)
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(path / "metadata.json", "r", encoding="utf-8") as f:
        metadata = json.load(f)

    if migrate:
        # Caption-based matching is disabled, so the content matches are
        # the only source of old names.
        matched_by_content, guessed_tags = _match_tables_by_content(path, annotations, metadata)
        matched_names = dict(matched_by_content)
    else:
        matched_names = {}
        guessed_tags = {}

    return [
        Table.from_file(
            path,
            m,
            annotations,
            migrate=migrate,
            match_name=matched_names.get(m["filename"]),
            guessed_tags=guessed_tags.get(m["filename"]),
        )
        for m in metadata
    ]
0 commit comments