Skip to content

Commit 06877ad

Browse files
committed
allow loading invalid CoNLL-U files with more than 1 empty line
1 parent a15082d commit 06877ad

File tree

1 file changed

+13
-8
lines changed

1 file changed

+13
-8
lines changed

udapi/block/read/conllu.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,22 +82,27 @@ def parse_comment_line(self, line, root):
8282

8383
def read_trees(self):
8484
if not self.max_docs:
85+
# Valid CoNLL-U files must have sentences separated by a single empty line.
86+
# However, some users have to work with invalid files e.g. ending with two empty lines.
87+
# It is obvious how to parse such files and re.split(r'\n\n+', s) is only twice as slow
88+
# as s.split('\n\n') and this time is negligible
89+
# relative to the main CoNLL-U parsing in read_tree_from_lines().
8590
return [self.read_tree_from_lines(s.split('\n')) for s in
86-
self.filehandle.read().split('\n\n') if s]
91+
re.split(r'\n\n+', self.filehandle.read()) if s]
8792
# udapi.core.basereader takes care of the max_docs parameter.
8893
# However, we can make the loading much faster by not reading
8994
# the whole file if the user wants just first N documents.
9095
trees, lines, loaded_docs = [], [], 0
9196
for line in self.filehandle:
9297
line = line.rstrip()
9398
if line == '':
94-
tree = self.read_tree_from_lines(lines)
95-
lines = []
96-
if tree.newdoc:
97-
if loaded_docs == self.max_docs:
98-
return trees
99-
loaded_docs += 1
100-
trees.append(tree)
99+
tree = self.read_tree_from_lines(lines)
100+
lines = []
101+
if tree.newdoc:
102+
if loaded_docs == self.max_docs:
103+
return trees
104+
loaded_docs += 1
105+
trees.append(tree)
101106
else:
102107
lines.append(line)
103108
return

0 commit comments

Comments
 (0)