Skip to content

Commit 59a0036

Browse files
committed
wip underscore converter
1 parent e41acd4 commit 59a0036

File tree

1 file changed

+40
-5
lines changed

1 file changed

+40
-5
lines changed

stanza/utils/datasets/coref/convert_udcoref.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
UDCOREF_ADDN = 0 if not IS_UDCOREF_FORMAT else 1
2323

2424
def process_documents(docs, augment=False):
25+
# docs = sections
2526
processed_section = []
2627

2728
for idx, (doc, doc_id, lang) in enumerate(tqdm(docs)):
@@ -67,6 +68,7 @@ def process_documents(docs, augment=False):
6768
span_clusters = defaultdict(list)
6869
word_clusters = defaultdict(list)
6970
head2span = []
71+
is_zero = []
7072
word_total = 0
7173
SPANS = re.compile(r"(\(\w+|[%\w]+\))")
7274
for parsed_sentence in doc.sentences:
@@ -114,8 +116,23 @@ def process_documents(docs, augment=False):
114116
coref_spans.append([int(k), i[0], i[1]])
115117
sentence_upos = [x.upos for x in parsed_sentence.all_words]
116118
sentence_heads = [x.head - 1 if x.head and x.head > 0 else None for x in parsed_sentence.all_words]
119+
sentence_text = [x.text for x in parsed_sentence.all_words]
120+
121+
# if "_" in sentence_text and sentence_text.index("_") in [j for i in coref_spans for j in i]:
122+
# import ipdb
123+
# ipdb.set_trace()
117124

118125
for span in coref_spans:
126+
zero = False
127+
if sentence_text[span[1]] == "_" and span[1] == span[2]:
128+
is_zero.append([span[0], True])
129+
zero = True
130+
# oh! that's a zero coref, we should merge it forwards
131+
# i.e. we pick the next word as the head!
132+
span = [span[0], span[1]+1, span[2]+1]
133+
else:
134+
is_zero.append([span[0], False])
135+
119136
# input is expected to be start word, end word + 1
120137
# counting from 0
121138
# whereas the OntoNotes coref_span is [start_word, end_word] inclusive
@@ -124,10 +141,13 @@ def process_documents(docs, augment=False):
124141
# if it's a zero coref (i.e. coref, but the head is None), we call
125142
# the beginning of the span (i.e. the zero itself) the head
126143

127-
try:
128-
candidate_head = find_cconj_head(sentence_heads, sentence_upos, span[1], span[2]+1)
129-
except RecursionError:
144+
if zero:
130145
candidate_head = span[1]
146+
else:
147+
try:
148+
candidate_head = find_cconj_head(sentence_heads, sentence_upos, span[1], span[2]+1)
149+
except RecursionError:
150+
candidate_head = span[1]
131151

132152
if candidate_head is None:
133153
for candidate_head in range(span[1], span[2] + 1):
@@ -153,6 +173,7 @@ def process_documents(docs, augment=False):
153173
span_clusters = sorted([sorted(values) for _, values in span_clusters.items()])
154174
word_clusters = sorted([sorted(values) for _, values in word_clusters.items()])
155175
head2span = sorted(head2span)
176+
is_zero = [i for _,i in sorted(is_zero)]
156177

157178
processed = {
158179
"document_id": doc_id,
@@ -165,7 +186,8 @@ def process_documents(docs, augment=False):
165186
"span_clusters": span_clusters,
166187
"word_clusters": word_clusters,
167188
"head2span": head2span,
168-
"lang": lang
189+
"lang": lang,
190+
"is_zero": is_zero
169191
}
170192
processed_section.append(processed)
171193
return processed_section
@@ -183,6 +205,7 @@ def process_dataset(short_name, coref_output_path, split_test, train_files, dev_
183205
lang = load.split("/")[-1].split("_")[0]
184206
print("Ingesting %s from %s of lang %s" % (section, load, lang))
185207
docs = CoNLL.conll2multi_docs(load, ignore_gapping=False)
208+
# sections = docs[:10]
186209
print(" Ingested %d documents" % len(docs))
187210
if split_test and section == 'train':
188211
test_section = []
@@ -302,5 +325,17 @@ def main():
302325
process_dataset(project, coref_output_path, args.split_test, train_filenames, dev_filenames)
303326

304327
if __name__ == '__main__':
305-
main()
328+
# main()
329+
330+
project = "test"
331+
332+
paths = get_default_paths()
333+
coref_output_path = paths['COREF_DATA_DIR']
334+
process_dataset(
335+
project,
336+
coref_output_path,
337+
False,
338+
["./extern_data/coref/corefud_v1_3/hu_szegedkoref-corefud-dev.conllu"],
339+
["./extern_data/coref/corefud_v1_3/hu_szegedkoref-corefud-dev.conllu"]
340+
)
306341

0 commit comments

Comments
 (0)