1
1
import pandas as pd
2
+ import numpy as np
2
3
import json
3
4
from pathlib import Path
4
5
import re
9
10
@dataclass
10
11
class Cell :
11
12
value : str
13
+ raw_value : str
12
14
gold_tags : str = ''
13
15
refs : List [str ] = field (default_factory = list )
14
16
layout : str = ''
@@ -36,7 +38,7 @@ def extract_references(s):
36
38
37
39
def str2cell(s):
    """Convert the raw text of one table cell into a ``Cell``.

    The text is passed through ``extract_references``, which returns the
    cell value together with the references found in it.  The untouched
    input is kept in ``raw_value`` alongside the extracted value.
    """
    value, refs = extract_references(s)
    return Cell(value=value, raw_value=s, refs=refs)
40
42
41
43
def read_str_csv (filename ):
42
44
try :
@@ -49,34 +51,51 @@ def read_str_csv(filename):
49
51
50
52
51
53
class Table :
52
def __init__(self, df, layout, caption=None, figure_id=None, annotations=None, old_name=None, guessed_tags=None):
    """Wrap a table of raw strings as a frame of ``Cell`` objects.

    Parameters:
        df: DataFrame of raw cell strings; every value is converted with
            ``str2cell``.
        layout: optional DataFrame with one layout entry per cell; each
            entry is copied onto the matching cell's ``layout`` attribute.
        caption, figure_id: stored unchanged on the instance.
        annotations: optional annotation record providing ``gold_tags``,
            ``dataset_text``, ``notes`` and ``matrix_gold_tags``.
        old_name: name of the matched table in the annotations; reset to
            ``None`` when the per-cell tags cannot be applied.
        guessed_tags: optional DataFrame of per-cell tags that takes
            precedence over ``annotations.matrix_gold_tags``.
    """
    self.caption = caption
    self.figure_id = figure_id
    self.df = df.applymap(str2cell)
    self.old_name = old_name

    # The layout is stored per cell only; there is no table-level attribute.
    if layout is not None:
        # iterrows() yields index *labels*; count rows explicitly because
        # the cells are addressed by position through .iloc.
        for r, (_, row) in enumerate(layout.iterrows()):
            for c, cell in enumerate(row):
                self.df.iloc[r, c].layout = cell

    if annotations is None:
        self.gold_tags = ''
        self.dataset_text = ''
        self.notes = ''
        return

    self.gold_tags = annotations.gold_tags.strip()
    self.dataset_text = annotations.dataset_text.strip()
    self.notes = annotations.notes.strip()

    if guessed_tags is not None:
        tags = guessed_tags.values
    else:
        tags = annotations.matrix_gold_tags

    gt_rows = len(tags)
    if gt_rows == 0:
        # No per-cell tags at all: a non-empty table cannot be matched.
        if len(self.df) > 0:
            self.old_name = None
        return

    gt_cols = len(tags[0])
    if self.df.shape != (0, 0) and self.df.shape == (gt_rows, gt_cols):
        for r, row in enumerate(tags):
            for c, cell in enumerate(row):
                self.df.iloc[r, c].gold_tags = cell.strip()
    else:
        # Tag matrix and table disagree in size: drop the match.  The
        # mismatch is only reported for guessed tags.
        if guessed_tags is not None:
            print(f"Gold tags size mismatch: {gt_rows},{gt_cols} vs {self.df.shape}")
        self.old_name = None
77
96
78
97
@classmethod
79
- def from_file (cls , path , metadata , annotations = None , match_name = None ):
98
+ def from_file (cls , path , metadata , annotations = None , match_name = None , guessed_tags = None ):
80
99
path = Path (path )
81
100
filename = path / metadata ['filename' ]
82
101
df = read_str_csv (filename )
@@ -89,7 +108,7 @@ def from_file(cls, path, metadata, annotations=None, match_name=None):
89
108
table_ann = table_ann [0 ]
90
109
else :
91
110
table_ann = None
92
- return cls (df , layout , metadata .get ('caption' ), metadata .get ('figure_id' ), table_ann , match_name )
111
+ return cls (df , layout , metadata .get ('caption' ), metadata .get ('figure_id' ), table_ann , match_name , guessed_tags )
93
112
94
113
def display (self ):
95
114
display_table (self .df .applymap (lambda x : x .value ).values , self .df .applymap (lambda x : x .gold_tags ).values )
@@ -104,11 +123,15 @@ def display(self):
104
123
import string
105
124
from collections import Counter
106
125
126
# Leading "table <n>" / "figure <n>" prefix, with an arabic or lower-case
# roman number.  Raw string so that "\s" is a regex escape rather than an
# invalid Python string escape.
figure_prefix_re = re.compile(r'^(table|figure)\s+([0-9]+|[ivxl]+)?')
# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)


def normalize_string(s):
    """Reduce a string to a canonical form used for comparisons.

    The input is stripped and lower-cased, a leading "table N" / "figure N"
    prefix is removed, non-breaking and regular spaces are dropped, the
    text is passed through ``unidecode`` and ASCII punctuation is deleted.
    ``None`` is normalized to the empty string.
    """
    if s is None:
        return ""

    s = s.strip().lower()
    s = figure_prefix_re.sub('', s).strip()
    # Remove non-breaking spaces *before* unidecode: it would turn them
    # into plain spaces that survive the punctuation filter.
    s = s.replace('\xa0', '').replace(' ', '')
    return unidecode(s).translate(punctuation_table)
112
135
113
136
def _remove_almost_empty_values(d, min_length=10):
    """Return a copy of ``d`` without entries whose value is too short.

    An entry is kept when ``len(value) >= min_length``; the default of 10
    is the threshold that was previously hard-coded.
    """
    return {k: v for k, v in d.items() if len(v) >= min_length}
@@ -128,11 +151,106 @@ def _match_tables_by_captions(annotations, metadata):
128
151
old_captions_reverse = {v :k for k ,v in old_captions .items ()}
129
152
return {new_name :old_captions_reverse [caption ] for new_name , caption in new_captions .items () if caption in old_captions_reverse }
130
153
154
def normalize_cell(s):
    """Normalize the text of a table cell for content comparison.

    Currently a plain delegation to ``normalize_string``; references in
    the cell text are not masked before normalization.
    """
    return normalize_string(s)
157
+
158
+ # begin of guess annotations mapping
159
def create_cell_contexts(df):
    """Build neighbourhood contexts for every cell of a table.

    The table is padded with a one-cell border of empty strings, so border
    cells see ``''`` where they have no neighbour.

    Returns a tuple of four arrays whose first two axes match ``df``:
        box_context:  shape (rows, cols, 9), the 3x3 window around a cell,
                      row by row;
        row_context:  shape (rows, cols, 3), left neighbour, cell, right
                      neighbour;
        col_context:  shape (rows, cols, 3), upper neighbour, cell, lower
                      neighbour;
        cell_context: shape (rows, cols, 1), the cell itself.
    """
    # Force object dtype: padding with '' raises on numeric arrays.
    cell_context = np.asarray(df.values, dtype=object)
    cells = np.pad(cell_context, 1, mode='constant', constant_values='')

    # Offsets -1, 0 and +1 relative to a cell, as slices of the padded array.
    slices = [slice(None, -2), slice(1, -1), slice(2, None)]

    row_context = np.stack([cells[1:-1, s] for s in slices], axis=-1)
    col_context = np.stack([cells[s, 1:-1] for s in slices], axis=-1)
    box_context = np.stack([cells[s1, s2] for s1 in slices for s2 in slices], axis=-1)
    return box_context, row_context, col_context, cell_context[..., None]
169
+
170
def map_context(context, values):
    """Map each distinct context to the set of values observed with it.

    Parameters:
        context: array whose last axis holds the context of one cell.
        values: array with one value per cell; it is flattened and paired
            with the flattened contexts in order.

    Returns a dict from context tuples to sets of values.  A context that
    occurs with several different values maps to a set with more than one
    element.
    """
    ctx_len = context.shape[-1]
    flat_contexts = context.reshape((-1, ctx_len))
    flat_values = values.reshape(-1)

    mapping = {}
    for ctx, val in zip(flat_contexts, flat_values):
        mapping.setdefault(tuple(ctx), set()).add(val)
    return mapping
176
+
177
# Tag given to every cell for which no unambiguous tag could be guessed.
REANNOTATE_TAG = 'reannotate'


def guess_annotations(old_table, gold_tags, new_table):
    """Transfer per-cell tags from an annotated table to a new table.

    For every cell of ``new_table`` the contexts produced by
    ``create_cell_contexts`` are tried in the order they are returned.
    The first context that occurred in ``old_table`` with exactly one tag
    decides the tag of the cell.

    Parameters:
        old_table: DataFrame of cell texts of the annotated table.
        gold_tags: DataFrame of tags; its values are paired with the cells
            of ``old_table`` in flattened order.
        new_table: DataFrame of cell texts to annotate.

    Returns ``(matched, tags)``: the number of cells that received a
    guessed tag, and a DataFrame shaped like ``new_table`` in which every
    other cell holds ``REANNOTATE_TAG``.
    """
    df = pd.DataFrame(REANNOTATE_TAG, index=new_table.index, columns=new_table.columns)
    if old_table.empty:
        return 0, df

    old_contexts = create_cell_contexts(old_table)
    old_mappings = [map_context(ctx, gold_tags.values) for ctx in old_contexts]
    new_contexts = create_cell_contexts(new_table)

    rows, cols = new_table.shape
    matched = 0
    for row in range(rows):
        for col in range(cols):
            for mapping, context in zip(old_mappings, new_contexts):
                values = mapping.get(tuple(context[row, col]), set())
                # Accept only unambiguous contexts; otherwise try the next one.
                if len(values) == 1:
                    (val,) = values
                    df.iloc[row, col] = val
                    matched += 1
                    break
    return matched, df
200
+
201
+ # end of guess annotations mapping
202
+
203
+
204
def same_table(old_table, new_table):
    """Return whether the two tables are equal according to ``old_table.equals``."""
    return old_table.equals(new_table)
206
+
207
# Directory name of the single paper for which ``deb`` prints its tables.
DEB_PAPER = "1607.00036v2"


def deb(path, old_name, old_table, new_name, new_table):
    """Debugging aid: print both tables for one hard-coded paper and table.

    Prints only when ``path.name`` equals ``DEB_PAPER`` and both table
    names are "table_02.csv"; otherwise it does nothing.
    """
    is_debug_table = old_name == "table_02.csv" and new_name == "table_02.csv"
    if path.name == DEB_PAPER and is_debug_table:
        print(old_table)
        print(new_table)
213
+
214
def _match_tables_by_content(path, annotations, metadata):
    """Match newly extracted tables to annotated tables by cell content.

    Every table listed in ``metadata`` is compared with every table in
    ``annotations.table_set`` using ``guess_annotations``.  The annotated
    table with the most guessed cells wins, and the match is accepted only
    if more than half of the new table's cells received a tag.

    Returns ``(matched, new_tags)``: ``matched`` maps a new file name to
    the name of the matched annotated table, and ``new_tags`` maps it to
    the DataFrame of guessed tags.  Both are empty when ``annotations`` is
    ``None``.
    """
    if annotations is None:
        return {}, {}

    old_tables = {
        x.name: (
            pd.DataFrame(x.matrix).applymap(normalize_cell),
            pd.DataFrame(x.matrix_gold_tags),
        )
        for x in annotations.table_set
    }
    new_tables = {
        m['filename']: Table.from_file(path, m, None, None).df.applymap(lambda c: normalize_cell(c.value))
        for m in metadata
    }

    matched = {}
    new_tags = {}
    for new_name, new_table in new_tables.items():
        max_hits = 0
        matched_name = None
        guessed_tags = None
        size = np.prod(new_table.shape)
        for old_name, (old_table, gold_tags) in old_tables.items():
            hits, tags = guess_annotations(old_table, gold_tags, new_table)
            # Strict comparison: on a tie the earlier annotated table is kept.
            if hits > max_hits:
                max_hits = hits
                matched_name = old_name
                guessed_tags = tags
        if max_hits > size / 2:
            matched[new_name] = matched_name
            new_tags[new_name] = guessed_tags
    return matched, new_tags
131
242
####
132
243
133
244
def read_tables(path, annotations):
    """Load every table listed in ``<path>/metadata.json``.

    Tables are matched to annotated tables by content through
    ``_match_tables_by_content``.  Matching by caption is currently
    disabled, so its result is an empty mapping.  Each table is built with
    ``Table.from_file`` and receives the matched name and the guessed tags
    found for its file name, if any.

    Returns a list of ``Table`` objects in metadata order.
    """
    path = Path(path)
    # JSON is read explicitly as UTF-8 instead of the platform default.
    with open(path / "metadata.json", "r", encoding="utf-8") as f:
        metadata = json.load(f)

    _matched_names_by_captions = {}  # caption matching is disabled
    _matched_names_by_content, _guessed_tags = _match_tables_by_content(path, annotations, metadata)

    _matched_names = _matched_names_by_captions
    for new_name, old_name in _matched_names_by_content.items():
        if new_name in _matched_names and _matched_names[new_name] != old_name:
            # Conflicting matches: keep the caption match and report it.
            print(f"Multiple matches for table {path}/{new_name}: {_matched_names[new_name]} by caption and {old_name} by content")
        else:
            _matched_names[new_name] = old_name

    return [
        Table.from_file(
            path,
            m,
            annotations,
            match_name=_matched_names.get(m["filename"]),
            guessed_tags=_guessed_tags.get(m["filename"]),
        )
        for m in metadata
    ]
0 commit comments