|
| 1 | +""" |
| 2 | +Block PrintFixed prints occurrences of fixed multiword expressions in UD. It |
| 3 | +can be run twice in a row, first collecting known fixed expressions and then |
| 4 | +also reporting other occurrences of these expressions where they are not |
| 5 | +annotated as fixed. |
| 6 | +
|
| 7 | +Usage: |
| 8 | +udapy ud.PrintFixed only_forms=1 < in.conllu | sort -u > fixed_expressions.txt |
| 9 | +udapy ud.PrintFixed known_expressions=fixed_expressions.txt < in.conllu | sort | uniq -c | less |
| 10 | +
|
| 11 | +Author: Dan Zeman |
| 12 | +""" |
| 13 | +from udapi.core.block import Block |
| 14 | +import re |
| 15 | +import logging |
| 16 | + |
class PrintFixed(Block):
    """
    Print fixed multiword expressions.

    For every node that has at least one child attached via the 'fixed'
    relation, the expression (lowercased forms, optionally with UPOS tags and
    the deprel of the head) is printed to standard output. If a list of known
    expressions was supplied, word sequences that match a known expression
    but are not annotated as fixed are reported as well.
    """

    def __init__(self, only_forms=False, known_expressions=None, **kwargs):
        """
        Create the PrintFixed block.

        Parameters:
        only_forms=1: print the word forms but not tags and other info;
            This can be used to create the list of known forms that we want to
            identify even if they are not annotated as fixed.
        known_expressions: the name of the text file with the expressions
            (UTF-8, one expression per line, words separated by one space)
        """
        super().__init__(**kwargs)
        self.only_forms = only_forms
        # expression (space-joined lowercased forms) -> number of lines in the
        # input file on which it occurred
        self.known_expressions = {}
        # first word of every known expression; used as a cheap pre-filter so
        # that most nodes are rejected without building a candidate string
        self.first_words = {}
        # length (in words) of the longest known expression, never below 2
        self.max_length = 2
        if known_expressions:
            self._load_known_expressions(known_expressions)

    def _load_known_expressions(self, path):
        """Read known expressions from the file `path`, one per line."""
        n = 0
        # The context manager guarantees the file is closed even if reading
        # or decoding fails halfway through.
        with open(path, 'r', encoding='utf-8') as fh:
            for line in fh:
                expression = line.replace('\n', '')
                if not expression:
                    # A blank line cannot match any word form.
                    continue
                if expression in self.known_expressions:
                    self.known_expressions[expression] += 1
                    continue
                self.known_expressions[expression] = 1
                logging.info("Read known fixed expression '%s'", expression)
                n += 1
                words = expression.split(' ')
                self.first_words[words[0]] = 1
                if len(words) > self.max_length:
                    self.max_length = len(words)
        logging.info('Read %d known fixed expressions.', n)

    def process_node(self, node):
        """
        Print the fixed expression headed by `node`, if there is one.

        If `node` has no fixed children but its form is the first word of a
        known expression, check whether a known expression starts here and
        report it as not annotated.
        """
        fixed_children = [x for x in node.children if x.udeprel == 'fixed']
        if fixed_children:
            self._print_annotated(node, fixed_children[-1])
        # If this is not the first word of a fixed expression, check whether
        # something that looks like a known fixed expression starts here.
        # Note that it is also possible that a known expression starts here
        # but only a subset is actually marked as such; we currently do not
        # account for this.
        elif node.form.lower() in self.first_words:
            self._print_unannotated(node)

    def _print_annotated(self, node, last_fixed):
        """Print the expression spanning from `node` to `last_fixed`."""
        # Fixed children are always to the right of the parent. But there
        # may be other nodes in between that are not fixed children (for
        # example, there may be punctuation that is attached to one of the
        # fixed nodes). Such intervening nodes are shown as 'X'.
        list_of_forms = [node.form.lower()]
        list_of_tags = [node.upos]
        current = node
        while current != last_fixed:
            current = current.next_node
            if current is None:
                # The end of the sentence was reached without meeting the
                # last fixed child, i.e., the child precedes its parent
                # (an annotation error). Stop instead of crashing.
                break
            if current.parent == node and current.udeprel == 'fixed':
                list_of_forms.append(current.form.lower())
                list_of_tags.append(current.upos)
            else:
                list_of_forms.append('X')
                list_of_tags.append('X')
        forms = ' '.join(list_of_forms)
        if self.only_forms:
            print(forms)
        else:
            tags = ' '.join(list_of_tags)
            print("%s / %s / %s" % (forms, tags, node.deprel))

    def _print_unannotated(self, node):
        """Report the shortest known expression that starts at `node`."""
        list_of_forms = [node.form.lower()]
        list_of_tags = [node.upos]
        current = node
        # Extend the candidate one word at a time, up to the length of the
        # longest known expression, and stop at the first match.
        for _ in range(self.max_length - 1):
            current = current.next_node
            if current is None:
                break
            ###!!! At present we cannot identify known expressions with gaps ('X').
            list_of_forms.append(current.form.lower())
            list_of_tags.append(current.upos)
            forms = ' '.join(list_of_forms)
            if forms in self.known_expressions:
                if self.only_forms:
                    print(forms)
                else:
                    tags = ' '.join(list_of_tags)
                    print("%s / %s / NOT FIXED" % (forms, tags))
                break
0 commit comments