@@ -400,6 +400,7 @@ def from_conll2012_file(
400400 max_span_size : int ,
401401 tokens_split_idx : int ,
402402 corefs_split_idx : int ,
403+ separator : str = "\t " ,
403404 ) -> CoreferenceDataset :
404405 """
405406 :param tokens_split_idx: index of the tokens column in the
@@ -435,7 +436,7 @@ def from_conll2012_file(
435436 open_mentions = {}
436437 continue
437438
438- splitted = line .split (" \t " )
439+ splitted = line .split (separator )
439440
440441 # - tokens
441442 document_tokens .append (splitted [tokens_split_idx ])
@@ -453,7 +454,7 @@ def from_conll2012_file(
453454 # - A ending parenthesis indicate the end of a mention
454455 # - The middle number indicates the ID of the coreference chain
455456 # the mention belongs to
456- if splitted [4 ] == "-" :
457+ if splitted [corefs_split_idx ] == "-" :
457458 continue
458459
459460 coref_datas_list = splitted [corefs_split_idx ].split ("|" )
@@ -635,6 +636,21 @@ def load_litbank_dataset(
635636 )
636637
637638
def load_democrat_dataset(
    root_path: str, tokenizer: PreTrainedTokenizerFast, max_span_size: int
) -> CoreferenceDataset:
    """Load the Democrat dataset from the boberle/coreference_databases
    repository.

    :param root_path: directory containing the ``democrat_dem1921``
        folder; a leading ``~`` is expanded to the user's home
    :param tokenizer: forwarded unchanged to
        ``CoreferenceDataset.from_conll2012_file``
    :param max_span_size: forwarded unchanged to
        ``CoreferenceDataset.from_conll2012_file``

    :return: the dataset parsed from
        ``democrat_dem1921/dem1921_base.conll`` under ``root_path``
    """
    # Join path components instead of concatenating strings: an empty
    # ``root_path`` then stays relative to the working directory rather
    # than silently turning into a path under the filesystem root.
    conll_path = os.path.join(
        os.path.expanduser(root_path),
        "democrat_dem1921",
        "dem1921_base.conll",
    )
    return CoreferenceDataset.from_conll2012_file(
        conll_path,
        tokenizer,
        max_span_size,
        # Column indices are passed by keyword so the call site says
        # which column is which.
        tokens_split_idx=3,
        corefs_split_idx=11,
        # This file is split on a single space, not the tab-based default.
        separator=" ",
    )
652+
653+
638654def load_fr_litbank_dataset (
639655 root_path : str , tokenizer : PreTrainedTokenizerFast , max_span_size : int
640656):
0 commit comments