Skip to content

Commit c55d3d3

Browse files
committed
Add a lang=he field to all of the documents produced by the Hebrew coref conversion
1 parent 4554b47 commit c55d3d3

File tree

2 files changed: 4 additions and 2 deletions.

stanza/utils/datasets/coref/convert_hebrew_iahlt.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ def main():
156 156   input_filename = os.path.join(hebrew_base_path, input_filename)
157 157   assert os.path.exists(input_filename)
158 158   docs = read_doc(tokenizer, input_filename)
159     - dataset = [process_document(pipe, doc.doc_id, "", doc.sentences, doc.coref_spans, None) for doc in tqdm(docs)]
    159 + dataset = [process_document(pipe, doc.doc_id, "", doc.sentences, doc.coref_spans, None, lang="he") for doc in tqdm(docs)]
160 160
161 161   output_filename = os.path.join(paths["COREF_DATA_DIR"], output_filename)
162 162   write_json_file(output_filename, dataset)

stanza/utils/datasets/coref/utils.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def find_cconj_head(heads, upos, start, end):
56 56   return cc_indexes[0] + start
57 57   return None
58 58
59    - def process_document(pipe, doc_id, part_id, sentences, coref_spans, sentence_speakers, use_cconj_heads=True):
   59 + def process_document(pipe, doc_id, part_id, sentences, coref_spans, sentence_speakers, use_cconj_heads=True, lang=None):
60 60   """
61 61   doc_id: a string naming the document
62 62   part_id: if the document has a particular subpart (can be blank)
@@ -154,4 +154,6 @@ def process_document(pipe, doc_id, part_id, sentences, coref_spans, sentence_spe
154 154   }
155 155   if part_id is not None:
156 156   processed["part_id"] = part_id
    157 + if lang is not None:
    158 + processed["lang"] = lang
157 159   return processed

0 commit comments

Comments
 (0)