Skip to content

Commit e5b5186

Browse files
committed
A new block to survey fixed multiword expressions in UD treebanks.
1 parent 9760e69 commit e5b5186

File tree

1 file changed

+104
-0
lines changed

1 file changed

+104
-0
lines changed

udapi/block/ud/printfixed.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
"""
2+
Block PrintFixed prints occurrences of fixed multiword expressions in UD. It
3+
can be run twice in a row, first collecting known fixed expressions and then
4+
also reporting other occurrences of these expressions where they are not
5+
annotated as fixed.
6+
7+
Usage:
8+
udapy ud.PrintFixed only_forms=1 < in.conllu | sort -u > fixed_expressions.txt
9+
udapy ud.PrintFixed known_expressions=fixed_expressions.txt < in.conllu | sort | uniq -c | less
10+
11+
Author: Dan Zeman
12+
"""
13+
from udapi.core.block import Block
14+
import re
15+
import logging
16+
17+
class PrintFixed(Block):
    """
    Print fixed multiword expressions.

    For every node that has at least one child attached via the 'fixed'
    relation, the expression (parent plus fixed children) is printed. If a
    list of known expressions was supplied, word sequences that match a known
    expression but are not annotated as fixed are reported as well.
    """

    def __init__(self, only_forms=False, known_expressions=None, **kwargs):
        """
        Create the PrintFixed block.

        Parameters:
        only_forms=1: print the word forms but not tags and other info;
        This can be used to create the list of known forms that we want to
        identify even if they are not annotated as fixed.
        known_expressions: the name of the text file with the expressions
        (UTF-8, one expression per line, words separated by single spaces).
        The expressions are compared with lowercased word forms, so they
        should be lowercase (as produced by a run with only_forms=1).
        """
        super().__init__(**kwargs)
        self.only_forms = only_forms
        # Known expression -> number of lines of the file that contained it.
        self.known_expressions = {}
        # First words of the known expressions (the values are always 1;
        # the dictionary is used only for membership tests).
        self.first_words = {}
        # Number of words of the longest known expression (at least 2).
        self.max_length = 2
        if known_expressions:
            self._load_known_expressions(known_expressions)

    def _load_known_expressions(self, path):
        """Read known expressions from the file `path`, one per line."""
        # The context manager makes sure that the file is closed even if
        # reading fails in the middle.
        with open(path, 'r', encoding='utf-8') as fh:
            for line in fh:
                expression = line.rstrip('\n')
                # A blank line is not an expression. Registering it would
                # add the empty string among the known first words.
                if not expression.strip():
                    continue
                if expression in self.known_expressions:
                    self.known_expressions[expression] += 1
                else:
                    self.known_expressions[expression] = 1
                    logging.info("Read known fixed expression '%s'", expression)
                words = expression.split(' ')
                self.first_words[words[0]] = 1
                self.max_length = max(self.max_length, len(words))
        logging.info('Read %d known fixed expressions.', len(self.known_expressions))

    def process_node(self, node):
        """
        Print the fixed expression headed by `node`, if there is one.
        Otherwise, if the form of `node` is the first word of a known
        expression, check whether a known expression starts here.
        """
        fixed_children = [x for x in node.children if x.udeprel == 'fixed']
        if fixed_children:
            self._print_annotated(node, fixed_children[-1])
        elif node.form.lower() in self.first_words:
            # If this is not the first word of a fixed expression, check whether
            # something that looks like a known fixed expression starts here.
            # Note that it is also possible that a known expression starts here
            # but only a subset is actually marked as such; we currently do not
            # account for this.
            self._print_unannotated(node)

    def _print_annotated(self, node, last_fixed_child):
        """Print the expression from `node` up to `last_fixed_child`."""
        # Fixed children are expected to be to the right of the parent. But
        # there may be other nodes in between that are not fixed children (for
        # example, there may be punctuation that is attached to one of the
        # fixed nodes). Such nodes are represented by 'X'.
        list_of_forms = [node.form.lower()]
        list_of_tags = [node.upos]
        current = node
        while current != last_fixed_child:
            current = current.next_node
            if not current:
                # We ran past the last node without meeting the fixed child,
                # i.e., it does not follow its parent. Report it instead of
                # crashing on the missing node.
                logging.warning("Fixed child '%s' does not follow its parent '%s'; skipping.",
                                last_fixed_child.form, node.form)
                return
            if current.parent == node and current.udeprel == 'fixed':
                list_of_forms.append(current.form.lower())
                list_of_tags.append(current.upos)
            else:
                list_of_forms.append('X')
                list_of_tags.append('X')
        forms = ' '.join(list_of_forms)
        if self.only_forms:
            print(forms)
        else:
            tags = ' '.join(list_of_tags)
            print("%s / %s / %s" % (forms, tags, node.deprel))

    def _print_unannotated(self, node):
        """Print the shortest known expression starting at `node`, if any."""
        list_of_forms = [node.form.lower()]
        list_of_tags = [node.upos]
        current = node
        # No known expression is longer than max_length words, so there is
        # no point in looking further ahead.
        for _ in range(self.max_length - 1):
            current = current.next_node
            if not current:
                break
            ###!!! At present we cannot identify known expressions with gaps ('X').
            list_of_forms.append(current.form.lower())
            list_of_tags.append(current.upos)
            forms = ' '.join(list_of_forms)
            if forms in self.known_expressions:
                if self.only_forms:
                    print(forms)
                else:
                    tags = ' '.join(list_of_tags)
                    print("%s / %s / NOT FIXED" % (forms, tags))
                break

0 commit comments

Comments
 (0)