22
22
UDCOREF_ADDN = 0 if not IS_UDCOREF_FORMAT else 1
23
23
24
24
def process_documents (docs , augment = False ):
25
+ # docs = sections
25
26
processed_section = []
26
27
27
28
for idx , (doc , doc_id , lang ) in enumerate (tqdm (docs )):
@@ -67,6 +68,7 @@ def process_documents(docs, augment=False):
67
68
span_clusters = defaultdict (list )
68
69
word_clusters = defaultdict (list )
69
70
head2span = []
71
+ is_zero = []
70
72
word_total = 0
71
73
SPANS = re .compile (r"(\(\w+|[%\w]+\))" )
72
74
for parsed_sentence in doc .sentences :
@@ -114,8 +116,23 @@ def process_documents(docs, augment=False):
114
116
coref_spans .append ([int (k ), i [0 ], i [1 ]])
115
117
sentence_upos = [x .upos for x in parsed_sentence .all_words ]
116
118
sentence_heads = [x .head - 1 if x .head and x .head > 0 else None for x in parsed_sentence .all_words ]
119
+ sentence_text = [x .text for x in parsed_sentence .all_words ]
120
+
121
+ # if "_" in sentence_text and sentence_text.index("_") in [j for i in coref_spans for j in i]:
122
+ # import ipdb
123
+ # ipdb.set_trace()
117
124
118
125
for span in coref_spans :
126
+ zero = False
127
+ if sentence_text [span [1 ]] == "_" and span [1 ] == span [2 ]:
128
+ is_zero .append ([span [0 ], True ])
129
+ zero = True
130
+ # oh! that's a zero coref, we should merge it forwards
131
+ # i.e. we pick the next word as the head!
132
+ span = [span [0 ], span [1 ]+ 1 , span [2 ]+ 1 ]
133
+ else :
134
+ is_zero .append ([span [0 ], False ])
135
+
119
136
# input is expected to be start word, end word + 1
120
137
# counting from 0
121
138
# whereas the OntoNotes coref_span is [start_word, end_word] inclusive
@@ -124,10 +141,13 @@ def process_documents(docs, augment=False):
124
141
# if it's a zero coref (i.e. coref, but the head is None), we call
125
142
# the beginning of the span (i.e. the zero itself) the head
126
143
127
- try :
128
- candidate_head = find_cconj_head (sentence_heads , sentence_upos , span [1 ], span [2 ]+ 1 )
129
- except RecursionError :
144
+ if zero :
130
145
candidate_head = span [1 ]
146
+ else :
147
+ try :
148
+ candidate_head = find_cconj_head (sentence_heads , sentence_upos , span [1 ], span [2 ]+ 1 )
149
+ except RecursionError :
150
+ candidate_head = span [1 ]
131
151
132
152
if candidate_head is None :
133
153
for candidate_head in range (span [1 ], span [2 ] + 1 ):
@@ -153,6 +173,7 @@ def process_documents(docs, augment=False):
153
173
span_clusters = sorted ([sorted (values ) for _ , values in span_clusters .items ()])
154
174
word_clusters = sorted ([sorted (values ) for _ , values in word_clusters .items ()])
155
175
head2span = sorted (head2span )
176
+ is_zero = [i for _ ,i in sorted (is_zero )]
156
177
157
178
processed = {
158
179
"document_id" : doc_id ,
@@ -165,7 +186,8 @@ def process_documents(docs, augment=False):
165
186
"span_clusters" : span_clusters ,
166
187
"word_clusters" : word_clusters ,
167
188
"head2span" : head2span ,
168
- "lang" : lang
189
+ "lang" : lang ,
190
+ "is_zero" : is_zero
169
191
}
170
192
processed_section .append (processed )
171
193
return processed_section
@@ -183,6 +205,7 @@ def process_dataset(short_name, coref_output_path, split_test, train_files, dev_
183
205
lang = load .split ("/" )[- 1 ].split ("_" )[0 ]
184
206
print ("Ingesting %s from %s of lang %s" % (section , load , lang ))
185
207
docs = CoNLL .conll2multi_docs (load , ignore_gapping = False )
208
+ # sections = docs[:10]
186
209
print (" Ingested %d documents" % len (docs ))
187
210
if split_test and section == 'train' :
188
211
test_section = []
@@ -302,5 +325,17 @@ def main():
302
325
process_dataset (project , coref_output_path , args .split_test , train_filenames , dev_filenames )
303
326
304
327
if __name__ == '__main__' :
305
- main ()
328
+ # main()
329
+
330
+ project = "test"
331
+
332
+ paths = get_default_paths ()
333
+ coref_output_path = paths ['COREF_DATA_DIR' ]
334
+ process_dataset (
335
+ project ,
336
+ coref_output_path ,
337
+ False ,
338
+ ["./extern_data/coref/corefud_v1_3/hu_szegedkoref-corefud-dev.conllu" ],
339
+ ["./extern_data/coref/corefud_v1_3/hu_szegedkoref-corefud-dev.conllu" ]
340
+ )
306
341
0 commit comments