diff --git a/.gitignore b/.gitignore
index 0bb4106a5..8efffba13 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,3 +55,7 @@
 docs/_build/
 # PyBuilder
 target/
+
+# Validator
+validator/logs/*.log
+validator/logs/*.err
diff --git a/test-cases/test.log b/test-cases/test.log
deleted file mode 100644
index 3aca228b7..000000000
--- a/test-cases/test.log
+++ /dev/null
@@ -1,489 +0,0 @@
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\ambiguous-feature.conllu
-[Line 4 Sent ambiguous-feature]: [L2 Morpho invalid-feature] Spurious morphological feature: 'A=B=C'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9].
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\cyclic-deps.conllu
-[Line 4 Sent cyclic-deps]: [L2 Syntax non-tree] Non-tree structure. Words 2,3 are not reachable from the root 0.
-Syntax errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\deprel-not-empty-in-empty.conllu
-[Line 5 Sent deprel-not-empty-in-empty]: [L2 Format mwt-nonempty-field] An empty node must have '_' in the column DEPREL. Now: 'orphan'.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\dos-newlines.conllu
-[Line 23 Sent dos-newlines3]: [L1 Format non-unix-newline] Only the unix-style LF line terminator is allowed.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\duplicate-feature.conllu
-[Line 4 Sent duplicate-feature]: [L2 Morpho repeated-feature] Repeated features are disallowed: 'Gen=M|Gen=M'.
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\duplicate-id.conllu
-[Line 4 Sent duplicate-id]: [L1 Format word-id-sequence] Words do not form a sequence. Got '1,1'. Expected '1,2'.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\duplicate-layered-feature.conllu
-[Line 4 Sent duplicate-layered-feature]: [L2 Morpho repeated-feature] Repeated features are disallowed: 'Number=Sing|Number[psor]=Plur|Number[psor]=Sing'.
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\duplicate-value.conllu
-[Line 4 Sent duplicate-value]: [L2 Morpho repeated-feature-value] Repeated feature values are disallowed: 'Gen=M,M'
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\empty-field.conllu
-[Line 4 Sent empty-field]: [L1 Format empty-column] Empty value in column FORM.
-[Line 4 Sent empty-field]: [L2 Metadata missing-text] Missing the text attribute.
-Format errors: 1
-Metadata errors: 1
-*** FAILED *** with 2 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\empty-head.conllu
-[Line 4 Sent empty-head]: [L1 Format empty-column] Empty value in column HEAD.
-[Line 5 Sent empty-head]: [L2 Format invalid-head] Invalid HEAD: ''.
-[Line 5 Sent empty-head]: [L2 Syntax unknown-head] Undefined HEAD (no such ID): ''.
-Format errors: 2
-Syntax errors: 1
-*** FAILED *** with 3 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\empty-sentence.conllu
-[Line 3 Sent None]: [L1 Format extra-empty-line] Spurious empty line. Only one empty line is expected after every sentence.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\extra-empty-line.conllu
-[Line 6 Sent extra-empty-line1]: [L1 Format extra-empty-line] Spurious empty line. Only one empty line is expected after every sentence.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\extra-field.conllu
-[Line 4 Sent extra-field]: [L1 Format number-of-columns] The line has 11 columns but 10 are expected. The line will be excluded from further tests.
-[Line 5 Sent extra-field]: [L1 Format extra-empty-line] Spurious empty line. Only one empty line is expected after every sentence.
-[Line 5 Sent extra-field]: [L1 Format misplaced-comment] Spurious comment line. Comments are only allowed before a sentence.
-Format errors: 3
-*** FAILED *** with 3 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\head-0-deprel-not-root.conllu
-[Line 4 Sent head-0-deprel-not-root]: [L2 Syntax 0-is-not-root] DEPREL must be 'root' if HEAD is 0.
-Syntax errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\head-not-0-deprel-root.conllu
-[Line 5 Sent head-not-0-deprel-root]: [L2 Syntax root-is-not-0] DEPREL cannot be 'root' if HEAD is not 0.
-Syntax errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\head-not-empty-in-empty.conllu
-[Line 5 Sent head-not-empty-in-empty]: [L2 Format mwt-nonempty-field] An empty node must have '_' in the column HEAD. Now: '1'.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\id-starting-from-2.conllu
-[Line 9 Sent id-starting-from-2b]: [L1 Format word-id-sequence] Words do not form a sequence. Got '2'. Expected '1'.
-[Line 10 Sent id-starting-from-2b]: [L1 Format word-interval-out] Spurious token interval 2-2 (out of range)
-Format errors: 2
-*** FAILED *** with 2 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\id-with-extra-0.conllu
-[Line 5 Sent id-with-extra-0]: [L1 Format invalid-word-id] Unexpected ID format '01'.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\invalid-deps-id.conllu
-[Line 6 Sent invalid-deps-id]: [L2 Enhanced unknown-ehead] Undefined enhanced head reference (no such ID): '3'.
-Enhanced errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\invalid-deps-order.conllu
-[Line 4 Sent invalid-deps-order]: [L2 Format unsorted-deps] DEPS not sorted by head index: '4:nsubj|2:csubj'
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\invalid-deps-syntax.conllu
-[Line 6 Sent invalid-deps-syntax]: [L2 Format invalid-deps] Failed to parse DEPS: '2'.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\invalid-head.conllu
-[Line 6 Sent invalid-head]: [L2 Syntax unknown-head] Undefined HEAD (no such ID): '3'.
-Syntax errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\invalid-range-format.conllu
-[Line 11 Sent invalid-range-format]: [L1 Format invalid-word-id] Unexpected ID format '2-X'.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\invalid-range.conllu
-[Line 11 Sent invalid-range]: [L1 Format reversed-word-interval] Spurious token interval 2-1
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\lowercase-feature-in-empty.conllu
-[Line 5 Sent lowercase-feature-in-empty]: [L2 Morpho invalid-feature] Spurious morphological feature: 'tense=Pres'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9].
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\lowercase-feature-value-in-empty.conllu
-[Line 5 Sent lowercase-feature-value-in-empty]: [L2 Morpho invalid-feature] Spurious morphological feature: 'Tense=pres'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9].
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\lowercase-feature.conllu
-[Line 5 Sent lowercase-feature]: [L2 Morpho invalid-feature] Spurious morphological feature: 'lower=Nonvalid'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9].
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\lowercase-postag-in-empty.conllu
-[Line 5 Sent lowercase-postag-in-empty]: [L2 Morpho unknown-upos] Unknown UPOS tag: 'verb'.
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\lowercase-postag.conllu
-[Line 4 Sent lowercase-postag]: [L2 Morpho unknown-upos] Unknown UPOS tag: 'noun'.
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\lowercase-value.conllu
-[Line 5 Sent lowercase-value]: [L2 Morpho invalid-feature] Spurious morphological feature: 'Lower=nonvalid'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9].
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\malformed_deps.conllu
-[Line 15 Sent malformed_deps2]: [L2 Format invalid-deps] Failed to parse DEPS: 'xxx'.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\misindexed-empty-node.conllu
-[Line 7 Sent misindexed-empty-node]: [L1 Format misplaced-empty-node] Empty node id 2.2, expected 1.1
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\misordered-feature.conllu
-[Line 5 Sent misordered-feature]: [L2 Morpho unsorted-features] Morphological features must be sorted: 'XB=True|Xa=True'.
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\misordered-layered-feature.conllu
-[Line 5 Sent misordered-layered-feature]: [L2 Morpho unsorted-features] Morphological features must be sorted: 'Number[psor]=Plur|Number=Sing'.
-Morpho errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\misordered-multiword.conllu
-[Line 12 Sent misordered-multiword]: [L1 Format misplaced-word-interval] Multiword range not before its first word.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\misplaced-comment-end.conllu
-[Line 12 Sent misplaced-comment-end]: [L1 Format misplaced-comment] Spurious comment line. Comments are only allowed before a sentence.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\misplaced-comment-mid.conllu
-[Line 6 Sent misplaced-comment-mid]: [L1 Format misplaced-comment] Spurious comment line. Comments are only allowed before a sentence.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\missing-final-line.conllu
-[Line 4 Sent missing-final-line]: [L1 Format missing-empty-line] Missing empty line after the last sentence.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\missing-space-after.conllu
-[Line 4 Sent missing-space-after]: [L2 Metadata missing-spaceafter] 'SpaceAfter=No' is missing in the MISC field of node 1 because the text is 'Dog.'.
-Metadata errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\multiple-sent_id.conllu
-[Line 19 Sent tanl2]: [L2 Metadata multiple-sent-id] Multiple sent_id attributes.
-Metadata errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\multiword-with-pos.conllu
-[Line 6 Sent multiword-with-pos]: [L2 Format mwt-nonempty-field] A multi-word token line must have '_' in the column UPOS. Now: 'VERB'.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\no-sent_id.conllu
-[Line 8 Sent tanl1]: [L2 Metadata missing-sent-id] Missing the sent_id attribute.
-Metadata errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\nonsequential-empty-node-id.conllu
-[Line 7 Sent nonsequential-empty-node-id]: [L1 Format misplaced-empty-node] Empty node id 1.2, expected 1.1
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\nonsequential-id.conllu
-[Line 4 Sent nonsequential-id]: [L1 Format word-id-sequence] Words do not form a sequence. Got '1,3'. Expected '1,2'.
-[Line 6 Sent nonsequential-id]: [L1 Format word-interval-out] Spurious token interval 3-3 (out of range)
-Format errors: 2
-*** FAILED *** with 2 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\overlapping-multiword.conllu
-[Line 12 Sent overlapping-multiword]: [L1 Format overlapping-word-intervals] Range overlaps with others: 3-4
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\seemingly-empty-line.conllu
-[Line 5 Sent 1]: [L1 Format pseudo-empty-line] Spurious line that appears empty but is not; there are whitespace characters.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\self-cycle-deps.conllu
-[Line 5 Sent self-cycle-deps]: [L2 Enhanced deps-self-loop] Self-loop in DEPS for '2'
-Enhanced errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\self-cycle-head.conllu
-[Line 5 Sent self-cycle-head]: [L2 Syntax head-self-loop] HEAD == ID for 2
-Syntax errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\space-in-field.conllu
-[Line 4 Sent space-in-field]: [L1 Format invalid-whitespace] White space not allowed in column XPOS: 'this is not valid'.
-Format errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\tanl-broken.conllu
-[Line 6 Sent tanl-broken1]: [L1 Format extra-empty-line] Spurious empty line. Only one empty line is expected after every sentence.
-[Line 8 Sent tanl-broken1]: [L1 Format extra-empty-line] Spurious empty line. Only one empty line is expected after every sentence.
-[Line 11 Sent tanl-broken2]: [L1 Format word-id-sequence] Words do not form a sequence. Got '1,2,3,4,6'. Expected '1,2,3,4,5'.
-[Line 17 Sent tanl-broken2]: [L1 Format word-interval-out] Spurious token interval 6-6 (out of range)
-[Line 24 Sent tanl-broken3]: [L1 Format missing-empty-line] Missing empty line after the last sentence.
-[Line 20 Sent tanl-broken3]: [L2 Morpho invalid-feature] Spurious morphological feature: 'gen=f'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9].
-[Line 20 Sent tanl-broken3]: [L2 Morpho invalid-feature] Spurious morphological feature: 'mod=p'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9].
-[Line 20 Sent tanl-broken3]: [L2 Morpho invalid-feature] Spurious morphological feature: 'num=s'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9].
-[Line 21 Sent tanl-broken3]: [L2 Morpho invalid-feature] Spurious morphological feature: 'gen=f'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9].
-[Line 21 Sent tanl-broken3]: [L2 Morpho invalid-feature] Spurious morphological feature: 'num=s'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9].
-Format errors: 5
-Morpho errors: 5
-*** FAILED *** with 10 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\token_with_cols_filled.conllu
-[Line 6 Sent token-with-cols-filled]: [L2 Format mwt-nonempty-field] A multi-word token line must have '_' in the column LEMMA. Now: 'dalla'.
-[Line 6 Sent token-with-cols-filled]: [L2 Format mwt-nonempty-field] A multi-word token line must have '_' in the column HEAD. Now: '0'.
-[Line 6 Sent token-with-cols-filled]: [L2 Format mwt-nonempty-field] A multi-word token line must have '_' in the column DEPREL. Now: 'root'.
-Format errors: 3
-*** FAILED *** with 3 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\trailing-tab.conllu
-[Line 4 Sent trailing-tab]: [L1 Format number-of-columns] The line has 11 columns but 10 are expected. The line will be excluded from further tests.
-[Line 5 Sent trailing-tab]: [L1 Format extra-empty-line] Spurious empty line. Only one empty line is expected after every sentence.
-[Line 5 Sent trailing-tab]: [L1 Format misplaced-comment] Spurious comment line. Comments are only allowed before a sentence.
-Format errors: 3
-*** FAILED *** with 3 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\unnormalized-unicode.conllu
-[Line 3 Sent None]: [L1 Unicode unicode-normalization] Unicode not normalized: character[28] is LATIN SMALL LETTER C, should be LATIN SMALL LETTER C WITH CARON.
-
-This error usually does not mean that LATIN SMALL LETTER C is an invalid character. Usually it means that this is a base character followed by combining diacritics, and you should replace them by a single combined character. In this case, your next character is COMBINING CARON. You can fix normalization errors using the normalize_unicode.pl script from the tools repository.
-
-[Line 8 Sent 1]: [L1 Unicode unicode-normalization] Unicode not normalized: LEMMA.character[3] is LATIN SMALL LETTER C, should be LATIN SMALL LETTER C WITH CARON.
-Unicode errors: 2
-*** FAILED *** with 2 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\uppercase-deprel.conllu
-[Line 5 Sent uppercase-deprel]: [L2 Syntax invalid-deprel] Invalid DEPREL value 'Punct'. Only lowercase English letters or a colon are expected.
-[Line 5 Sent uppercase-deprel Node 2]: [L2 Syntax unknown-deprel] Unknown DEPREL label: 'Punct'
-
-The following 37 relations are currently permitted in language [ud]:
-acl, advcl, advmod, amod, appos, aux, case, cc, ccomp, clf, compound, conj, cop, csubj, dep, det, discourse, dislocated, expl, fixed, flat, goeswith, iobj, list, mark, nmod, nsubj, nummod, obj, obl, orphan, parataxis, punct, reparandum, root, vocative, xcomp
-If a language needs a relation subtype that is not documented in the universal guidelines, the relation
-must have a language-specific documentation page in a prescribed format.
-See https://universaldependencies.org/contributing_language_specific.html for further guidelines.
-Documented dependency relations can be specifically turned on/off for each language in which they are used.
-See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_deprel.pl for details.
-
-
-Syntax errors: 2
-*** FAILED *** with 2 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level1-2\uppercase-deps-deprel.conllu
-[Line 5 Sent uppercase-deps-deprel]: [L2 Enhanced invalid-edeprel] Invalid enhanced relation type: '1:Dep'.
-[Line 5 Sent uppercase-deps-deprel Node 2]: [L2 Enhanced unknown-edeprel] Unknown enhanced relation type 'Dep' in '1:Dep'
-
-The following 40 enhanced relations are currently permitted in language [ud]:
-acl, advcl, advmod, amod, appos, aux, case, cc, ccomp, clf, compound, conj, cop, csubj, csubj:xsubj, dep, det, discourse, dislocated, expl, fixed, flat, goeswith, iobj, list, mark, nmod, nsubj, nsubj:xsubj, nummod, obj, obl, orphan, parataxis, punct, ref, reparandum, root, vocative, xcomp
-See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl for details.
-
-
-Enhanced errors: 2
-*** FAILED *** with 2 errors
-
---------------------------------------------------------------------------------
-LEVEL 3 TESTS
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level3\non-proj.conllu
-[Line 6 Sent non-proj Node 3]: [L3 Syntax punct-is-nonproj] Punctuation must not be attached non-projectively over nodes [Node]
-Syntax errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level3\rel-upos.conllu
-[Line 13 Sent WR-P-P-H-0000000041\WR-P-P-H-0000000041.p.2.s.4]: [L3 Warning fixed-without-extpos] Fixed expression '5 2/3' does not have the 'ExtPos' feature
-[Line 13 Sent WR-P-P-H-0000000041\WR-P-P-H-0000000041.p.2.s.4]: [L3 Syntax rel-upos-det] 'det' should be 'DET' or 'PRON' but it is 'NUM' ('5')
-[Line 42 Sent CP891-2]: [L3 Syntax rel-upos-nummod] 'nummod' should be 'NUM' but it is 'PROPN' ('X')
-[Line 52 Sent CP891-2]: [L3 Syntax rel-upos-nummod] 'nummod' should be 'NUM' but it is 'PROPN' ('VIII')
-[Line 63 Sent wikisample_Boarische-Grammatik-Konjunktiona_71]: [L3 Warning fixed-without-extpos] Fixed expression 'Fir des' does not have the 'ExtPos' feature
-[Line 63 Sent wikisample_Boarische-Grammatik-Konjunktiona_71]: [L3 Syntax rel-upos-advmod] 'advmod' should be 'ADV' but it is 'ADP' ('Fir')
-[Line 85 Sent 646]: [L3 Syntax rel-upos-expl] 'expl' should normally be 'PRON' but it is 'ADV' ('der')
-[Line 111 Sent rel-upos-aux]: [L3 Syntax rel-upos-aux] 'aux' should be 'AUX' but it is 'VERB' ('have')
-[Line 139 Sent 003966]: [L3 Syntax rel-upos-cop] 'cop' should be 'AUX' or 'PRON'/'DET' but it is 'VERB' ('vere')
-[Line 145 Sent CESS-CAT-A-20000925-15876-s8]: [L3 Warning fixed-without-extpos] Fixed expression 'Tot i' does not have the 'ExtPos' feature
-[Line 145 Sent CESS-CAT-A-20000925-15876-s8]: [L3 Syntax rel-upos-case] 'case' should not be 'PRON' ('Tot')
-[Line 159 Sent news.hr-s112]: [L3 Warning fixed-without-extpos] Fixed expression 'kao to' does not have the 'ExtPos' feature
-[Line 173 Sent news.hr-s112]: [L3 Warning fixed-without-extpos] Fixed expression 'obzirom da' does not have the 'ExtPos' feature
-[Line 173 Sent news.hr-s112]: [L3 Syntax rel-upos-mark] 'mark' should not be 'NOUN' ('obzirom')
-[Line 176 Sent news.hr-s112]: [L3 Warning fixed-without-extpos] Fixed expression 'kao i' does not have the 'ExtPos' feature
-[Line 206 Sent 2509]: [L3 Warning fixed-without-extpos] Fixed expression 'maraon le' does not have the 'ExtPos' feature
-[Line 206 Sent 2509]: [L3 Syntax rel-upos-cc] 'cc' should not be 'NOUN' ('maraon')
-[Line 233 Sent punct]: [L3 Syntax upos-rel-punct] 'PUNCT' must be 'punct' but it is 'obj' ('Mary')
-[Line 234 Sent punct]: [L3 Syntax rel-upos-punct] 'punct' must be 'PUNCT' but it is 'PROPN' ('.')
-[Line 246 Sent 14356]: [L3 Warning fixed-without-extpos] Fixed expression 'mag-Laos SEA Games' does not have the 'ExtPos' feature
-[Line 247 Sent 14356]: [L3 Syntax rel-upos-fixed] 'fixed' should not be used for proper nouns ('SEA').
-[Line 248 Sent 14356]: [L3 Syntax rel-upos-fixed] 'fixed' should not be used for proper nouns ('Games').
-Syntax errors: 14
-Warnings: 8
-*** FAILED *** with 14 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level3\right-to-left.conllu
-[Line 9 Sent 13147 Node 3]: [L3 Syntax right-to-left-conj] Parent of relation 'conj' must precede the child in the word order.
-[Line 11 Sent 13147 Node 5]: [L3 Syntax right-to-left-flat] Parent of relation 'flat' must precede the child in the word order.
-[Line 13 Sent 13147 Node 7]: [L3 Syntax right-to-left-flat] Parent of relation 'flat' must precede the child in the word order.
-[Line 30 Sent ASCA.YEAR0738.001.002 Node 5]: [L3 Syntax right-to-left-flat] Parent of relation 'flat' must precede the child in the word order.
-[Line 31 Sent ASCA.YEAR0738.001.002 Node 6]: [L3 Syntax right-to-left-appos] Parent of relation 'appos' must precede the child in the word order.
-[Line 48 Sent P334676 Node 10]: [L3 Syntax right-to-left-fixed] Parent of relation 'fixed' must precede the child in the word order.
-[Line 49 Sent P334676]: [L3 Warning fixed-without-extpos] Fixed expression 'b\u0113t ina' does not have the 'ExtPos' feature
-[Line 65 Sent goeswith Node 1]: [L3 Syntax right-to-left-goeswith] Parent of relation 'goeswith' must precede the child in the word order.
-[Line 66 Sent goeswith Node 2]: [L3 Syntax goeswith-gap] Gaps in goeswith group [Node, Node] != [].
-Syntax errors: 8
-Warnings: 1
-*** FAILED *** with 8 errors
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level3\too-many-subjects.conllu
-[Line 17 Sent dev-6 Node 12]: [L3 Syntax too-many-subjects] Multiple subjects [2, 11] ('situacins', 'persoas') not subtyped as ':outer'.
-
-Outer subjects are allowed if a clause acts as the predicate of another clause.
-
-Syntax errors: 1
-*** FAILED *** with 1 errors
-
---------------------------------------------------------------------------------
-LEVEL 4-5 TESTS
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\invalid-level4-5\cs_pud-ud-test.conllu
-[Line 9 Sent n01001011 Node 1]: [L4 Morpho feature-unknown] Feature NonExistent is not documented for language [cs].
-
-The following 95 feature values are currently permitted in language [cs]:
-Abbr=Yes, AdpType=Comprep, AdpType=Prep, AdpType=Voc, Animacy=Anim, Animacy=Inan, Aspect=Imp, Aspect=Perf, Case=Acc, Case=Dat, Case=Gen, Case=Ins, Case=Loc, Case=Nom, Case=Voc, ConjType=Oper, Degree=Cmp, Degree=Pos, Degree=Sup, Emph=Yes, ExtPos=ADJ, ExtPos=ADP, ExtPos=ADV, ExtPos=CCONJ, ExtPos=SCONJ, Foreign=Yes, Gender=Fem, Gender=Masc, Gender=Neut, Gender[psor]=Fem, Gender[psor]=Masc, Gender[psor]=Neut, Hyph=Yes, Mood=Cnd, Mood=Imp, Mood=Ind, NameType=Com, NameType=Geo, NameType=Giv, NameType=Nat, NameType=Oth, NameType=Pro, NameType=Sur, NumForm=Digit, NumForm=Roman, NumForm=Word, NumType=Card, NumType=Frac, NumType=Mult, NumType=Ord, NumType=Sets, Number=Dual, Number=Plur, Number=Sing, Number[psor]=Plur, Number[psor]=Sing, Person=1, Person=2, Person=3, Polarity=Neg, Polarity=Pos, Polite=Form, Poss=Yes, PrepCase=Npr, PrepCase=Pre, PronType=Dem, PronType=Emp, PronType=Ind, PronType=Int, PronType=Neg, PronType=Prs, PronType=Rel, PronType=Tot, Reflex=Yes, Style=Coll, Style=Expr, Style=Rare, Style=Slng, Style=Vrnc, Style=Vulg, Tense=Fut, Tense=Imp, Tense=Past, Tense=Pres, Typo=Yes, Variant=Long, Variant=Short, VerbForm=Conv, VerbForm=Fin, VerbForm=Inf, VerbForm=Part, VerbForm=Sup, VerbForm=Vnoun, Voice=Act, Voice=Pass
-If a language needs a feature that is not documented in the universal guidelines, the feature must
-have a language-specific documentation page in a prescribed format.
-See https://universaldependencies.org/contributing_language_specific.html for further guidelines.
-All features including universal must be specifically turned on for each language in which they are used.
-See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_feature.pl for details.
-
-
-[Line 10 Sent n01001011 Node 2]: [L4 Morpho feature-value-unknown] Value Ine is not documented for feature Case in language [cs].
-[Line 12 Sent n01001011 Node 4]: [L4 Morpho feature-upos-not-permitted] Feature PronType is not permitted with UPOS NOUN in language [cs].
-Morpho errors: 3
-*** FAILED *** with 3 errors
-
---------------------------------------------------------------------------------
-THE FOLLOWING FILES SHOULD BE VALID
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\valid\empty-file.conllu
-*** PASSED ***
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\valid\empty-nodes.conllu
-*** PASSED ***
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\valid\layered-features.conllu
-*** PASSED ***
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\valid\maximal-empty-node.conllu
-*** PASSED ***
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\valid\minimal-empty-node.conllu
-*** PASSED ***
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\valid\multiple-features.conllu
-*** PASSED ***
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\valid\tanl.conllu
-*** PASSED ***
-
---------------------------------------------------------------------------------
-Processing file: C:\Users\zeman\Documents\lingvistika-projekty\ud-repos\tools\test-cases\valid\whitespace.conllu
-*** PASSED ***
-
diff --git a/validate.py b/validate.py
index a1dea3d63..104ab2869 100755
--- a/validate.py
+++ b/validate.py
@@ -7,6 +7,7 @@
 import os.path
 import argparse
 import traceback
+from collections import defaultdict
 # According to https://stackoverflow.com/questions/1832893/python-regex-matching-unicode-properties,
 # the regex module has the same API as re but it can check Unicode character properties using \p{}
 # as in Perl.
@@ -57,6 +58,9 @@ def __init__(self):
         # Error counter by error type. Key: error type; value: error count.
         # Incremented in Incident.report().
         self.error_counter = {}
+        # Lists of errors for each type, up to --max_store
+        # Key: error type; value: a list of the errors
+        self.error_tracker = defaultdict(list)
         # Set of detailed error explanations that have been printed so far.
         # Each explanation will be printed only once. Typically, an explanation
         # can be identified by test id + language code. Nevertheless, we put
@@ -72,6 +76,12 @@ def __init__(self):
         # Remember all sentence ids seen in all input files (presumably one
         # corpus). We need it to check that each id is unique.
         self.known_sent_ids = set()
+        # Similarly, parallel ids should be unique in a corpus. (If multiple
+        # sentences are equivalents of the same virtual sentence in the
+        # parallel collection, they should be distinguished with 'altN'.)
+        self.known_parallel_ids = set()
+        self.parallel_id_lastalt = {}
+        self.parallel_id_lastpart = {}
         #----------------------------------------------------------------------
         # Various things that we may have seen earlier in the corpus. The value
        # is None if we have not seen it, otherwise it is the line number of
@@ -541,6 +551,8 @@ def __init__(self):
         self.newpar = re.compile(r"#\s*newpar(?:\s+(\S+))?")
         # Sentence id comment line. The actual id is bracketed.
         self.sentid = re.compile(r"#\s*sent_id\s*=\s*(\S+)")
+        # Parallel sentence id comment line. The actual id as well as its predefined parts are bracketed.
+        self.parallelid = re.compile(r"#\s*parallel_id\s*=\s*(([a-z]+)/([-0-9a-z]+)(?:/(alt[1-9][0-9]*|part[1-9][0-9]*|alt[1-9][0-9]*part[1-9][0-9]*))?)")
         # Sentence text comment line. The actual text is bracketed.
         self.text = re.compile(r"#\s*text\s*=\s*(.*\S)")
         # Global entity comment is a declaration of entity attributes in MISC.
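
A minimal sketch, not part of the patch above, of what the new parallelid pattern accepts. It reuses the same expression with the stdlib re module (validate.py itself uses the API-compatible regex module); the example ids are hypothetical.

import re

# Same pattern as self.parallelid above: corpus/sentence, optionally /altN, /partN or /altNpartN.
PARALLEL_ID = re.compile(r"#\s*parallel_id\s*=\s*(([a-z]+)/([-0-9a-z]+)(?:/(alt[1-9][0-9]*|part[1-9][0-9]*|alt[1-9][0-9]*part[1-9][0-9]*))?)")

for comment in ('# parallel_id = pud/n01001011',            # plain id: matches
                '# parallel_id = pud/n01001011/alt2',       # alternative translation: matches
                '# parallel_id = pud/n01001011/part3',      # part of a split sentence: matches
                '# parallel_id = pud/n01001011/alt2part1',  # both suffixes: matches
                '# parallel_id = PUD/n01001011',            # corpus must be lowercase: no match
                '# parallel_id = pud/n01001011/alt0'):      # alt counting starts at 1: no match
    m = PARALLEL_ID.fullmatch(comment)
    print(comment, '->', m.groups() if m else 'no match')
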
@@ -594,7 +606,10 @@ class Incident:
     default_testid = 'generic-error'
     default_message = 'No error description provided.'
     default_lineno = None
-    def __init__(self, state, level=None, testclass=None, testid=None, message=None, lineno=None, nodeid=None, explanation=''):
+    def __init__(self, state, args, level=None, testclass=None, testid=None, message=None, lineno=None, nodeid=None, explanation=''):
+        self.state = state
+        self.args = args
+
         # Validation level to which the incident belongs. Integer 1-5.
         self.level = self.default_level if level == None else level
         # Thematic area to which the incident belongs: Format, Meta, Morpho,
@@ -625,14 +640,16 @@ def __init__(self, state, level=None, testclass=None, testid=None, message=None,
         # ID of the node on which the error occurred (if it pertains to one node).
         self.nodeid = nodeid

-    def report(self, state, args):
+    def report(self):
         # Even if we should be quiet, at least count the error.
-        state.error_counter[self.testclass] = state.error_counter.get(self.testclass, 0)+1
-        if args.quiet:
+        self.state.error_counter[self.testclass] = self.state.error_counter.get(self.testclass, 0)+1
+        if self.args.max_store <= 0 or len(self.state.error_tracker[self.testclass]) < self.args.max_store:
+            self.state.error_tracker[self.testclass].append(self)
+        if self.args.quiet:
             return
         # Suppress error messages of a type of which we have seen too many.
-        if args.max_err > 0 and state.error_counter[self.testclass] > args.max_err:
-            if state.error_counter[self.testclass] == args.max_err + 1:
+        if self.args.max_err > 0 and self.state.error_counter[self.testclass] > self.args.max_err:
+            if self.state.error_counter[self.testclass] == self.args.max_err + 1:
                 print(f'...suppressing further errors regarding {self.testclass}', file=sys.stderr)
             return # suppressed
         # If we are here, the error message should really be printed.
@@ -641,15 +658,15 @@ def report(self, state, args):
         if self.nodeid:
             address += f' Node {self.nodeid}'
         # Insert file name if there are several input files.
-        if len(args.input) > 1:
+        if len(self.args.input) > 1:
             address = f'File {self.filename} ' + address
         # Classification of the incident.
         levelclassid = f'L{self.level} {self.testclass} {self.testid}'
         # Message (+ explanation, if this is the first error of its kind).
         message = self.message
-        if self.explanation and self.explanation not in state.explanation_printed:
+        if self.explanation and self.explanation not in self.state.explanation_printed:
             message += "\n\n" + self.explanation + "\n"
-            state.explanation_printed.add(self.explanation)
+            self.state.explanation_printed.add(self.explanation)
         print(f'[{address}]: [{levelclassid}] {message}', file=sys.stderr)
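
A toy sketch, not part of the patch, of the refactored reporting flow: an Incident now captures state and args at construction, so call sites build it and call report() with no arguments; report() always counts, stores up to --max_store instances per test class (a non-positive max_store keeps everything), and prints unless --quiet. The classes below are simplified stand-ins for the real State and argparse objects.

from collections import defaultdict
from types import SimpleNamespace

state = SimpleNamespace(error_counter={}, error_tracker=defaultdict(list))
args = SimpleNamespace(quiet=True, max_err=20, max_store=2)

class ToyIncident:
    def __init__(self, state, args, testclass, message):
        self.state, self.args = state, args
        self.testclass, self.message = testclass, message
    def report(self):
        # Count every error; keep at most max_store incident objects per class.
        self.state.error_counter[self.testclass] = self.state.error_counter.get(self.testclass, 0) + 1
        if self.args.max_store <= 0 or len(self.state.error_tracker[self.testclass]) < self.args.max_store:
            self.state.error_tracker[self.testclass].append(self)

for i in range(5):
    ToyIncident(state, args, 'Format', f'error {i}').report()
print(state.error_counter['Format'])       # 5: every error is counted
print(len(state.error_tracker['Format']))  # 2: only max_store incidents are stored
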
@@ -700,9 +717,10 @@ class Validator:
-    def __init__(self, args):
-        self.conllu_reader = udapi.block.read.conllu.Conllu()
+    def __init__(self, args=None):
+        args = parse_args(args)
         self.args = args
+        self.conllu_reader = udapi.block.read.conllu.Conllu()


     def next_sentence(self, state, inp):
@@ -742,10 +760,10 @@ def next_sentence(self, state, inp):
                 self.validate_unicode_normalization(state, line)
                 if is_whitespace(line):
                     Incident(
-                        state=state,
+                        state=state, args=self.args,
                         testid='pseudo-empty-line',
                         message='Spurious line that appears empty but is not; there are whitespace characters.'
-                    ).report(state, self.args)
+                    ).report()
                     # We will pretend that the line terminates a sentence in order to
                     # avoid subsequent misleading error messages.
                     if token_lines_fields:
@@ -767,10 +785,10 @@ def next_sentence(self, state, inp):
                     state.comment_start_line = None
                 else:
                     Incident(
-                        state=state,
+                        state=state, args=self.args,
                         testid='extra-empty-line',
                         message='Spurious empty line. Only one empty line is expected after every sentence.'
-                    ).report(state, self.args)
+                    ).report()
             elif line[0] == '#':
                 # We will really validate sentence ids later. But now we want to remember
                 # everything that looks like a sentence id and use it in the error messages.
@@ -784,10 +802,10 @@ def next_sentence(self, state, inp):
                     comment_lines.append(line)
                 else:
                     Incident(
-                        state=state,
+                        state=state, args=self.args,
                         testid='misplaced-comment',
                         message='Spurious comment line. Comments are only allowed before a sentence.'
-                    ).report(state, self.args)
+                    ).report()
             elif line[0].isdigit():
                 if not token_lines_fields: # new sentence
                     state.sentence_line = state.current_line
@@ -802,31 +820,31 @@ def next_sentence(self, state, inp):
                     self.validate_whitespace(state, cols)
                 else:
                     Incident(
-                        state=state,
+                        state=state, args=self.args,
                         testid='number-of-columns',
                         message=f'The line has {len(cols)} columns but {COLCOUNT} are expected. The line will be excluded from further tests.'
-                    ).report(state, self.args)
+                    ).report()
                     corrupted = True
             else: # A line which is neither a comment nor a token/word, nor empty. That's bad!
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='invalid-line',
                     message=f"Spurious line: '{line}'. All non-empty lines should start with a digit or the # character. The line will be excluded from further tests."
-                ).report(state, self.args)
+                ).report()
         else: # end of file
             if comment_lines and not token_lines_fields:
                 # Comments at the end of the file, no sentence follows them.
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='misplaced-comment',
                     message='Spurious comment line. Comments are only allowed before a sentence.'
-                ).report(state, self.args)
+                ).report()
            elif comment_lines or token_lines_fields:
                 # These should have been yielded on an empty line!
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='missing-empty-line',
                     message='Missing empty line after the last sentence.'
-                ).report(state, self.args)
+                ).report()
         if not corrupted:
             yield all_lines, comment_lines, token_lines_fields
@@ -913,13 +931,13 @@ def validate_unicode_normalization(self, state, text):
         testmessage = f"Unicode not normalized: character[{firstj}] is {inpfirst}, should be {nfcfirst}."
         explanation_second = f" In this case, your next character is {inpsecond}." if inpsecond else ''
         Incident(
-            state=state,
+            state=state, args=self.args,
             level=1,
             testclass='Unicode',
             testid='unicode-normalization',
            message=testmessage,
             explanation=f"This error usually does not mean that {inpfirst} is an invalid character. Usually it means that this is a base character followed by combining diacritics, and you should replace them by a single combined character.{explanation_second} You can fix normalization errors using the normalize_unicode.pl script from the tools repository."
-        ).report(state, self.args)
+        ).report()
@@ -938,61 +956,56 @@ def validate_whitespace(self, state, cols):
         Incident.default_testclass = 'Format'
         Incident.default_lineno = None # use the most recently read line
         # Some whitespace may be permitted in FORM, LEMMA and MISC but not elsewhere.
-        for col_idx in range(COLCOUNT):
-            # Must never be empty
-            if not cols[col_idx]:
-                Incident(
-                    state=state,
-                    testid='empty-column',
-                    message=f'Empty value in column {COLNAMES[col_idx]}.'
-                ).report(state, self.args)
-            else:
-                # Must never have leading/trailing whitespace
-                if cols[col_idx][0].isspace():
-                    Incident(
-                        state=state,
-                        testid='leading-whitespace',
-                        message=f'Leading whitespace not allowed in column {COLNAMES[col_idx]}.'
-                    ).report(state, self.args)
-                if cols[col_idx][-1].isspace():
-                    Incident(
-                        state=state,
-                        testid='trailing-whitespace',
-                        message=f'Trailing whitespace not allowed in column {COLNAMES[col_idx]}.'
-                    ).report(state, self.args)
-                # Must never contain two consecutive whitespace characters
-                if crex.ws2.search(cols[col_idx]):
-                    Incident(
-                        state=state,
-                        testid='repeated-whitespace',
-                        message=f'Two or more consecutive whitespace characters not allowed in column {COLNAMES[col_idx]}.'
-                    ).report(state, self.args)
         # Multi-word tokens may have whitespaces in MISC but not in FORM or LEMMA.
         # If it contains a space, it does not make sense to treat it as a MWT.
-        if is_multiword_token(cols):
-            for col_idx in (FORM, LEMMA):
-                if col_idx >= len(cols):
-                    break # this has been already reported in next_sentence()
-                if crex.ws.search(cols[col_idx]):
-                    Incident(
-                        state=state,
-                        testid='invalid-whitespace-mwt',
-                        message=f"White space not allowed in multi-word token '{cols[col_idx]}'. If it contains a space, it is not one surface token."
-                    ).report(state, self.args)
-        # These columns must not have whitespace.
-        for col_idx in (ID, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS):
+        ismwt = is_multiword_token(cols)
+        for col_idx in range(COLCOUNT):
             if col_idx >= len(cols):
                 break # this has been already reported in next_sentence()
-            if crex.ws.search(cols[col_idx]):
+            if ismwt and col_idx in (FORM, LEMMA) and crex.ws.search(cols[col_idx]):
+                Incident(
+                    state=state, args=self.args,
+                    testid='invalid-whitespace-mwt',
+                    message=f"White space not allowed in multi-word token '{cols[col_idx]}'. If it contains a space, it is not a single surface token."
+                ).report()
+            # These columns must not have whitespace.
+            elif col_idx in (ID, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS) and crex.ws.search(cols[col_idx]):
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='invalid-whitespace',
                     message=f"White space not allowed in column {COLNAMES[col_idx]}: '{cols[col_idx]}'."
-                ).report(state, self.args)
-        # We should also check the ID format (e.g., '1' is good, '01' is wrong).
-        # Although it is checking just a single column, we will do it in
-        # validate_id_sequence() because that function has the power to block
-        # further tests, which could choke up on this.
+                ).report()
+            # Only perform the following tests if we have not found and reported a space above.
+            else:
+                # Must never be empty
+                if not cols[col_idx]:
+                    Incident(
+                        state=state, args=self.args,
+                        testid='empty-column',
+                        message=f"Empty value in column {COLNAMES[col_idx]}: '{cols[col_idx]}'."
+                    ).report()
+                else:
+                    # Must never have leading/trailing/repeated whitespace.
+                    # This will be only reported for columns that allow whitespace in general.
+                    if cols[col_idx][0].isspace():
+                        Incident(
+                            state=state, args=self.args,
+                            testid='leading-whitespace',
+                            message=f"Leading whitespace not allowed in column {COLNAMES[col_idx]}: '{cols[col_idx]}'."
+                        ).report()
+                    if cols[col_idx][-1].isspace():
+                        Incident(
+                            state=state, args=self.args,
+                            testid='trailing-whitespace',
+                            message=f"Trailing whitespace not allowed in column {COLNAMES[col_idx]}: '{cols[col_idx]}'."
+                        ).report()
+                    # Must never contain two consecutive whitespace characters
+                    if crex.ws2.search(cols[col_idx]):
+                        Incident(
+                            state=state, args=self.args,
+                            testid='repeated-whitespace',
+                            message=f"Two or more consecutive whitespace characters not allowed in column {COLNAMES[col_idx]}: '{cols[col_idx]}'."
+                        ).report()
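
A toy version, not part of the patch, of the decision order in the rewritten loop: columns that must never contain whitespace get a single hard error, and only columns that may contain spaces fall through to the empty/leading/trailing/repeated checks.

import re

WS = re.compile(r"\s")
WS2 = re.compile(r"\s\s")

def whitespace_issues(value, allow_space):
    """Simplified per-column check (the real code reports Incidents instead)."""
    if not allow_space and WS.search(value):
        return ['invalid-whitespace']  # further checks are skipped, as above
    if not value:
        return ['empty-column']
    issues = []
    if value[0].isspace():
        issues.append('leading-whitespace')
    if value[-1].isspace():
        issues.append('trailing-whitespace')
    if WS2.search(value):
        issues.append('repeated-whitespace')
    return issues

print(whitespace_issues('1 2', allow_space=False))     # ['invalid-whitespace']
print(whitespace_issues(' ad hoc', allow_space=True))  # ['leading-whitespace']
print(whitespace_issues('', allow_space=True))         # ['empty-column']
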
@@ -1021,10 +1034,10 @@ def validate_id_sequence(self, state, sentence):
             # Check for the format of the ID value. (ID must not be empty.)
             if not (is_word(cols) or is_empty_node(cols) or is_multiword_token(cols)):
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='invalid-word-id',
                     message=f"Unexpected ID format '{cols[ID]}'."
-                ).report(state, self.args)
+                ).report()
                 ok = False
                 continue
             if not is_empty_node(cols):
@@ -1040,19 +1053,19 @@ def validate_id_sequence(self, state, sentence):
                 match = crex.mwtid.fullmatch(cols[ID]) # Check the interval against the regex
                 if not match: # This should not happen. The function is_multiword_token() would then not return True.
                     Incident(
-                        state=state,
+                        state=state, args=self.args,
                         testid='invalid-word-interval',
                         message=f"Spurious word interval definition: '{cols[ID]}'."
-                    ).report(state, self.args)
+                    ).report()
                     ok = False
                     continue
                 beg, end = int(match.group(1)), int(match.group(2))
                 if not ((not words and beg >= 1) or (words and beg >= words[-1] + 1)):
                     Incident(
-                        state=state,
+                        state=state, args=self.args,
                         testid='misplaced-word-interval',
                         message='Multiword range not before its first word.'
-                    ).report(state, self.args)
+                    ).report()
                     ok = False
                     continue
                 tokens.append((beg, end))
@@ -1060,10 +1073,10 @@ def validate_id_sequence(self, state, sentence):
                 word_id, empty_id = (int(i) for i in parse_empty_node_id(cols))
                 if word_id != current_word_id or empty_id != next_empty_id:
                     Incident(
-                        state=state,
+                        state=state, args=self.args,
                         testid='misplaced-empty-node',
                         message=f'Empty node id {cols[ID]}, expected {current_word_id}.{next_empty_id}'
-                    ).report(state, self.args)
+                    ).report()
                     ok = False
                 next_empty_id += 1
         # Interaction of multiword tokens and empty nodes if there is an empty
@@ -1072,10 +1085,10 @@ def validate_id_sequence(self, state, sentence):
         # This sequence is wrong: 4 5-6 4.1 5 6
             if word_id == current_word_id and tokens and word_id < tokens[-1][0]:
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='misplaced-empty-node',
                     message=f"Empty node id {cols[ID]} must occur before multiword token {tokens[-1][0]}-{tokens[-1][1]}."
-                ).report(state, self.args)
+                ).report()
                 ok = False
         # Now let's do some basic sanity checks on the sequences.
         # Expected sequence of word IDs is 1, 2, ...
@@ -1083,29 +1096,29 @@ def validate_id_sequence(self, state, sentence):
         wrdstrseq = ','.join(str(x) for x in words)
         if wrdstrseq != expstrseq:
             Incident(
-                state=state,
+                state=state, args=self.args,
                 lineno=-1,
                 testid='word-id-sequence',
                 message=f"Words do not form a sequence. Got '{wrdstrseq}'. Expected '{expstrseq}'."
-            ).report(state, self.args)
+            ).report()
             ok = False
         # Check elementary sanity of word intervals.
         # Remember that these are not just multi-word tokens. Here we have intervals even for single-word tokens (b=e)!
         for (b, e) in tokens:
             if e < b: # end before beginning
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='reversed-word-interval',
                     message=f'Spurious token interval {b}-{e}'
-                ).report(state, self.args)
+                ).report()
                 ok = False
                 continue
             if b < 1 or e > len(words): # out of range
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='word-interval-out',
                     message=f'Spurious token interval {b}-{e} (out of range)'
-                ).report(state, self.args)
+                ).report()
                 ok = False
                 continue
         return ok
@@ -1128,20 +1141,20 @@ def validate_token_ranges(self, state, sentence):
             m = crex.mwtid.fullmatch(cols[ID])
             if not m: # This should not happen. The function is_multiword_token() would then not return True.
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='invalid-word-interval',
                     message=f"Spurious word interval definition: '{cols[ID]}'."
-                ).report(state, self.args)
+                ).report()
                 continue
             start, end = m.groups()
             start, end = int(start), int(end)
             # Do not test if start >= end: This was already tested above in validate_id_sequence().
             if covered & set(range(start, end+1)):
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='overlapping-word-intervals',
                     message=f'Range overlaps with others: {cols[ID]}'
-                ).report(state, self.args)
+                ).report()
             covered |= set(range(start, end+1))
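
A toy version, not part of the patch, of the covered-set idea used by validate_token_ranges above: accumulate the word ids spanned so far and flag any interval that intersects them.

def overlapping_intervals(intervals):
    """Return the intervals that overlap an earlier one (same logic as above)."""
    covered = set()
    overlapping = []
    for start, end in intervals:
        span = set(range(start, end + 1))
        if covered & span:
            overlapping.append((start, end))
        covered |= span
    return overlapping

# 4-5 overlaps with 3-4; single-word "intervals" like (1, 1) are fine.
print(overlapping_intervals([(1, 1), (3, 4), (4, 5)]))  # [(4, 5)]
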
@@ -1154,13 +1167,13 @@ def validate_newlines(self, state, inp):
         """
         if inp.newlines and inp.newlines != '\n':
             Incident(
-                state=state,
+                state=state, args=self.args,
                 level=1,
                 testclass='Format',
                 lineno=state.current_line,
                 testid='non-unix-newline',
                 message='Only the unix-style LF line terminator is allowed.'
-            ).report(state, self.args)
+            ).report()
@@ -1194,42 +1207,129 @@ def validate_sent_id(self, state, comments, lcode):
         else:
             if c.startswith('# sent_id') or c.startswith('#sent_id'):
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='invalid-sent-id',
                     message=f"Spurious sent_id line: '{c}' should look like '# sent_id = xxxxx' where xxxxx is not whitespace. Forward slash reserved for special purposes."
-                ).report(state, self.args)
+                ).report()
         if not matched:
             Incident(
-                state=state,
+                state=state, args=self.args,
                 testid='missing-sent-id',
                 message='Missing the sent_id attribute.'
-            ).report(state, self.args)
+            ).report()
         elif len(matched) > 1:
             Incident(
-                state=state,
+                state=state, args=self.args,
                 testid='multiple-sent-id',
                 message='Multiple sent_id attributes.'
-            ).report(state, self.args)
+            ).report()
         else:
             # Uniqueness of sentence ids should be tested treebank-wide, not just file-wide.
             # For that to happen, all three files should be tested at once.
             sid = matched[0].group(1)
             if sid in state.known_sent_ids:
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                    testid='non-unique-sent-id',
                     message=f"Non-unique sent_id attribute '{sid}'."
-                ).report(state, self.args)
+                ).report()
             if sid.count('/') > 1 or (sid.count('/') == 1 and lcode != 'ud'):
                 Incident(
-                    state=state,
+                    state=state, args=self.args,
                     testid='slash-in-sent-id',
                     message=f"The forward slash is reserved for special use in parallel treebanks: '{sid}'"
-                ).report(state, self.args)
+                ).report()
             state.known_sent_ids.add(sid)

+    def validate_parallel_id(self, state, comments):
+        """
+        The parallel_id sentence-level comment is used after sent_id of
+        sentences that are parallel translations of sentences in other
+        treebanks. Like sent_id, it must be well-formed and unique. Unlike
+        sent_id, it is optional. Sentences that do not have it are not
+        parallel.
+        """
+        Incident.default_level = 2
+        Incident.default_testclass = 'Metadata'
+        Incident.default_lineno = -1 # use the first line after the comments
+        matched = []
+        for c in comments:
+            match = crex.parallelid.fullmatch(c)
+            if match:
+                matched.append(match)
+            else:
+                if c.startswith('# parallel_id') or c.startswith('#parallel_id'):
+                    Incident(
+                        state=state, args=self.args,
+                        testid='invalid-parallel-id',
+                        message=f"Spurious parallel_id line: '{c}' should look like '# parallel_id = corpus/sentence' where corpus is [a-z]+ and sentence is [-0-9a-z]+. Optionally, '/alt[1-9][0-9]*' and/or 'part[1-9][0-9]*' may follow."
+                    ).report()
+        if len(matched) > 1:
+            Incident(
+                state=state, args=self.args,
+                testid='multiple-parallel-id',
+                message='Multiple parallel_id attributes.'
+            ).report()
+        elif matched:
+            # Uniqueness of parallel ids should be tested treebank-wide, not just file-wide.
+            # For that to happen, all three files should be tested at once.
+            pid = matched[0].group(1)
+            if pid in state.known_parallel_ids:
+                Incident(
+                    state=state, args=self.args,
+                    testid='non-unique-parallel-id',
+                    message=f"Non-unique parallel_id attribute '{pid}'."
+                ).report()
+            else:
+                # Additional tests when pid has altN or partN.
+                # Do them only if the whole pid is unique.
+                sid = matched[0].group(2) + '/' + matched[0].group(3)
+                alt = None
+                part = None
+                altpart = matched[0].group(4)
+                if altpart:
+                    apmatch = re.fullmatch(r"(?:alt([0-9]+))?(?:part([0-9]+))?", altpart)
+                    if apmatch:
+                        alt = apmatch.group(1)
+                        part = apmatch.group(2)
+                        if alt:
+                            alt = int(alt)
+                        if part:
+                            part = int(part)
+                if sid in state.parallel_id_lastalt:
+                    if state.parallel_id_lastalt[sid] == None and alt != None or state.parallel_id_lastalt[sid] != None and alt == None:
+                        Incident(
+                            state=state, args=self.args,
+                            testid='parallel-id-alt',
+                            message=f"Some instances of parallel sentence '{sid}' have the 'alt' suffix while others do not."
+                        ).report()
+                    elif alt != None and alt != state.parallel_id_lastalt[sid] + 1:
+                        Incident(
+                            state=state, args=self.args,
+                            testid='parallel-id-alt',
+                            message=f"The alt suffix of parallel sentence '{sid}' should be {state.parallel_id_lastalt[sid]}+1 but it is {alt}."
+                        ).report()
+                state.parallel_id_lastalt[sid] = alt
+                if sid in state.parallel_id_lastpart:
+                    if state.parallel_id_lastpart[sid] == None and part != None or state.parallel_id_lastpart[sid] != None and part == None:
+                        Incident(
+                            state=state, args=self.args,
+                            testid='parallel-id-part',
+                            message=f"Some instances of parallel sentence '{sid}' have the 'part' suffix while others do not."
+                        ).report()
+                    elif part != None and part != state.parallel_id_lastpart[sid] + 1:
+                        Incident(
+                            state=state, args=self.args,
+                            testid='parallel-id-part',
+                            message=f"The part suffix of parallel sentence '{sid}' should be {state.parallel_id_lastpart[sid]}+1 but it is {part}."
+                        ).report()
+                state.parallel_id_lastpart[sid] = part
+            state.known_parallel_ids.add(pid)
+
+
+
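A toy version, not part of the patch, of the alt bookkeeping above: within one corpus/sentence id, either no instance carries an alt suffix, or consecutive instances count up by one (the part suffix is checked the same way). The ids are hypothetical.

def check_alt_sequence(observed):
    """observed: (sid, alt) pairs in corpus order; alt is an int or None."""
    lastalt = {}
    problems = []
    for sid, alt in observed:
        if sid in lastalt:
            if (lastalt[sid] is None) != (alt is None):
                problems.append(f"{sid}: mixed use of the alt suffix")
            elif alt is not None and alt != lastalt[sid] + 1:
                problems.append(f"{sid}: expected alt{lastalt[sid] + 1}, got alt{alt}")
        lastalt[sid] = alt
    return problems

# The second instance skips alt2, the third drops the suffix entirely.
print(check_alt_sequence([('pud/s1', 1), ('pud/s1', 3), ('pud/s1', None)]))
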
skip_words = set() mismatch_reported = 0 # do not report multiple mismatches in the same sentence; they usually have the same cause @@ -1301,25 +1401,25 @@ def validate_text_meta(self, state, comments, tree): iline += 1 if 'NoSpaceAfter=Yes' in cols[MISC]: # I leave this without the split("|") to catch all Incident( - state=state, + state=state, args=self.args, testid='nospaceafter-yes', message="'NoSpaceAfter=Yes' should be replaced with 'SpaceAfter=No'." - ).report(state, self.args) + ).report() if len([x for x in cols[MISC].split('|') if re.match(r"^SpaceAfter=", x) and x != 'SpaceAfter=No']) > 0: Incident( - state=state, + state=state, args=self.args, lineno=state.sentence_line+iline, testid='spaceafter-value', message="Unexpected value of the 'SpaceAfter' attribute in MISC. Did you mean 'SpacesAfter'?" - ).report(state, self.args) + ).report() if is_empty_node(cols): if 'SpaceAfter=No' in cols[MISC]: # I leave this without the split("|") to catch all Incident( - state=state, + state=state, args=self.args, lineno=state.sentence_line+iline, testid='spaceafter-empty-node', message="'SpaceAfter=No' cannot occur with empty nodes." - ).report(state, self.args) + ).report() continue elif is_multiword_token(cols): beg, end = cols[ID].split('-') @@ -1330,11 +1430,11 @@ def validate_text_meta(self, state, comments, tree): elif cols[ID] in skip_words: if 'SpaceAfter=No' in cols[MISC]: Incident( - state=state, + state=state, args=self.args, lineno=state.sentence_line+iline, testid='spaceafter-mwt-node', message="'SpaceAfter=No' cannot occur with words that are part of a multi-word token." - ).report(state, self.args) + ).report() continue else: # Err, I guess we have nothing to do here. :) @@ -1346,11 +1446,11 @@ def validate_text_meta(self, state, comments, tree): if len(stext) >= 1 and stext[0].isspace(): extra_message = ' (perhaps extra SpaceAfter=No at previous token?)' Incident( - state=state, + state=state, args=self.args, lineno=state.sentence_line+iline, testid='text-form-mismatch', message=f"Mismatch between the text attribute and the FORM field. Form[{cols[ID]}] is '{cols[FORM]}' but text is '{stext[:len(cols[FORM])+20]}...'"+extra_message - ).report(state, self.args) + ).report() mismatch_reported = 1 else: stext = stext[len(cols[FORM]):] # eat the form @@ -1362,18 +1462,18 @@ def validate_text_meta(self, state, comments, tree): state.spaceafterno_in_effect = False if (stext) and not stext[0].isspace(): Incident( - state=state, + state=state, args=self.args, lineno=state.sentence_line+iline, testid='missing-spaceafter', message=f"'SpaceAfter=No' is missing in the MISC field of node {cols[ID]} because the text is '{shorten(cols[FORM]+stext)}'." - ).report(state, self.args) + ).report() stext = stext.lstrip() if stext: Incident( - state=state, + state=state, args=self.args, testid='text-extra-chars', message=f"Extra characters at the end of the text attribute, not accounted for in the FORM fields: '{stext}'" - ).report(state, self.args) + ).report() @@ -1441,13 +1541,13 @@ def validate_mwt_empty_vals(self, state, cols, line): state.mwt_typo_span_end = m.group(2) elif cols[col_idx] != '_': Incident( - state=state, + state=state, args=self.args, lineno=line, level=2, testclass='Format', testid='mwt-nonempty-field', message=f"A multi-word token line must have '_' in the column {COLNAMES[col_idx]}. Now: '{cols[col_idx]}'." 
- ).report(state, self.args) + ).report() @@ -1469,13 +1569,13 @@ def validate_empty_node_empty_vals(self, state, cols, line): for col_idx in (HEAD, DEPREL): if cols[col_idx]!= '_': Incident( - state=state, + state=state, args=self.args, lineno=line, level=2, testclass='Format', testid='mwt-nonempty-field', message=f"An empty node must have '_' in the column {COLNAMES[col_idx]}. Now: '{cols[col_idx]}'." - ).report(state, self.args) + ).report() @@ -1499,29 +1599,29 @@ def validate_character_constraints(self, state, cols, line): # directly against the list of known tags. That is a level 2 test, too. if not (crex.deprel.fullmatch(cols[DEPREL]) or (is_empty_node(cols) and cols[DEPREL] == '_')): Incident( - state=state, + state=state, args=self.args, testclass='Syntax', testid='invalid-deprel', message=f"Invalid DEPREL value '{cols[DEPREL]}'. Only lowercase English letters or a colon are expected." - ).report(state, self.args) + ).report() try: self.deps_list(cols) except ValueError: Incident( - state=state, + state=state, args=self.args, testclass='Enhanced', testid='invalid-deps', message=f"Failed to parse DEPS: '{cols[DEPS]}'." - ).report(state, self.args) + ).report() return if any(deprel for head, deprel in self.deps_list(cols) if not crex.edeprel.fullmatch(deprel)): Incident( - state=state, + state=state, args=self.args, testclass='Enhanced', testid='invalid-edeprel', message=f"Invalid enhanced relation type: '{cols[DEPS]}'." - ).report(state, self.args) + ).report() @@ -1544,13 +1644,13 @@ def validate_upos(self, state, cols, line): # from a JSON file, should conform to the regular expression. if not crex.upos.fullmatch(cols[UPOS]) or cols[UPOS] not in data.upos: Incident( - state=state, + state=state, args=self.args, lineno=line, level=2, testclass='Morpho', testid='unknown-upos', message=f"Unknown UPOS tag: '{cols[UPOS]}'." - ).report(state, self.args) + ).report() @@ -1583,10 +1683,10 @@ def validate_features_level2(self, state, cols, line): feat_list = feats.split('|') if [f.lower() for f in feat_list] != sorted(f.lower() for f in feat_list): Incident( - state=state, + state=state, args=self.args, testid='unsorted-features', message=f"Morphological features must be sorted: '{feats}'." - ).report(state, self.args) + ).report() attr_set = set() # I'll gather the set of features here to check later that none is repeated. # Subsequent higher-level tests could fail if a feature is not in the # Feature=Value format. If that happens, we will return False and the caller @@ -1596,10 +1696,10 @@ def validate_features_level2(self, state, cols, line): match = crex.featval.fullmatch(f) if match is None: Incident( - state=state, + state=state, args=self.args, testid='invalid-feature', message=f"Spurious morphological feature: '{f}'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9]." 
- ).report(state, self.args) + ).report() attr_set.add(f) # to prevent misleading error "Repeated features are disallowed" safe = False else: @@ -1609,36 +1709,35 @@ def validate_features_level2(self, state, cols, line): values = match.group(2).split(',') if len(values) != len(set(values)): Incident( - state=state, + state=state, args=self.args, testid='repeated-feature-value', message=f"Repeated feature values are disallowed: '{feats}'" - ).report(state, self.args) + ).report() if [v.lower() for v in values] != sorted(v.lower() for v in values): Incident( - state=state, + state=state, args=self.args, testid='unsorted-feature-values', message=f"If a feature has multiple values, these must be sorted: '{f}'" - ).report(state, self.args) + ).report() for v in values: if not crex.val.fullmatch(v): Incident( - state=state, + state=state, args=self.args, testid='invalid-feature-value', message=f"Spurious value '{v}' in '{f}'. Must start with [A-Z0-9] and only contain [A-Za-z0-9]." - ).report(state, self.args) + ).report() # Level 2 tests character properties and canonical order but not that the f-v pair is known. if len(attr_set) != len(feat_list): Incident( - state=state, + state=state, args=self.args, testid='repeated-feature', message=f"Repeated features are disallowed: '{feats}'." - ).report(state, self.args) + ).report() return safe - @staticmethod - def features_present(state): + def features_present(self, state): """ In general, the annotation of morphological features is optional, although highly encouraged. However, if the treebank does have features, then certain @@ -1651,7 +1750,7 @@ def features_present(state): state.seen_morpho_feature = state.current_line for testid in state.delayed_feature_errors: for occurrence in state.delayed_feature_errors[testid]['occurrences']: - occurrence.report(state, self.args) + occurrence['incident'].report() @@ -1691,10 +1790,10 @@ def validate_deps(self, state, cols, line): heads = [float(h) for h, d in deps] if heads != sorted(heads): Incident( - state=state, + state=state, args=self.args, testid='unsorted-deps', message=f"DEPS not sorted by head index: '{cols[DEPS]}'" - ).report(state, self.args) + ).report() else: lasth = None lastd = None @@ -1702,16 +1801,16 @@ def validate_deps(self, state, cols, line): if h == lasth: if d < lastd: Incident( - state=state, + state=state, args=self.args, testid='unsorted-deps-2', message=f"DEPS pointing to head '{h}' not sorted by relation type: '{cols[DEPS]}'" - ).report(state, self.args) + ).report() elif d == lastd: Incident( - state=state, + state=state, args=self.args, testid='repeated-deps', message=f"DEPS contain multiple instances of the same relation '{h}:{d}'" - ).report(state, self.args) + ).report() lasth = h lastd = d try: @@ -1721,11 +1820,11 @@ def validate_deps(self, state, cols, line): return if id_ in heads: Incident( - state=state, + state=state, args=self.args, testclass='Enhanced', testid='deps-self-loop', message=f"Self-loop in DEPS for '{cols[ID]}'" - ).report(state, self.args) + ).report() @@ -1759,61 +1858,61 @@ def validate_misc(self, state, cols, line): if ma[0] == '': if len(ma) == 1: Incident( - state=state, + state=state, args=self.args, testid='empty-misc', message="Empty attribute in MISC; possible misinterpreted vertical bar?" - ).report(state, self.args) + ).report() else: Incident( - state=state, + state=state, args=self.args, testid='empty-misc-key', message=f"Empty MISC attribute name in '{ma[0]}={ma[1]}'." 
- ).report(state, self.args) + ).report() # We do not warn about MISC items that do not contain '='. # But the remaining error messages below assume that ma[1] exists. if len(ma) == 1: ma.append('') if re.match(r"^\s", ma[0]): Incident( - state=state, + state=state, args=self.args, testid='misc-extra-space', message=f"MISC attribute name starts with space in '{ma[0]}={ma[1]}'." - ).report(state, self.args) + ).report() elif re.search(r"\s$", ma[0]): Incident( - state=state, + state=state, args=self.args, testid='misc-extra-space', message=f"MISC attribute name ends with space in '{ma[0]}={ma[1]}'." - ).report(state, self.args) + ).report() elif re.match(r"^\s", ma[1]): Incident( - state=state, + state=state, args=self.args, testid='misc-extra-space', message=f"MISC attribute value starts with space in '{ma[0]}={ma[1]}'." - ).report(state, self.args) + ).report() elif re.search(r"\s$", ma[1]): Incident( - state=state, + state=state, args=self.args, testid='misc-extra-space', message=f"MISC attribute value ends with space in '{ma[0]}={ma[1]}'." - ).report(state, self.args) - if re.match(r"^(SpaceAfter|Lang|Translit|LTranslit|Gloss|LId|LDeriv)$", ma[0]): + ).report() + if re.match(r"^(SpaceAfter|Lang|Translit|LTranslit|Gloss|LId|LDeriv|Ref)$", ma[0]): mamap.setdefault(ma[0], 0) mamap[ma[0]] = mamap[ma[0]] + 1 - elif re.match(r"^\s*(spaceafter|lang|translit|ltranslit|gloss|lid|lderiv)\s*$", ma[0], re.IGNORECASE): + elif re.match(r"^\s*(spaceafter|lang|translit|ltranslit|gloss|lid|lderiv|ref)\s*$", ma[0], re.IGNORECASE): Incident( - state=state, + state=state, args=self.args, testid='misc-attr-typo', message=f"Possible typo (case or spaces) in MISC attribute '{ma[0]}={ma[1]}'." - ).report(state, self.args) + ).report() for a in list(mamap): if mamap[a] > 1: Incident( - state=state, + state=state, args=self.args, testclass='Format', # this one is real error testid='repeated-misc', message=f"MISC attribute '{a}' not supposed to occur twice" - ).report(state, self.args) + ).report() @@ -1850,46 +1949,46 @@ def validate_id_references(self, state, sentence): match = crex.head.fullmatch(cols[HEAD]) if match is None: Incident( - state=state, + state=state, args=self.args, testid='invalid-head', message=f"Invalid HEAD: '{cols[HEAD]}'." - ).report(state, self.args) + ).report() ok = False if not (cols[HEAD] in ids or cols[HEAD] == '0'): Incident( - state=state, + state=state, args=self.args, testclass='Syntax', testid='unknown-head', message=f"Undefined HEAD (no such ID): '{cols[HEAD]}'." - ).report(state, self.args) + ).report() ok = False try: deps = self.deps_list(cols) except ValueError: # Similar errors have probably been reported earlier. Incident( - state=state, + state=state, args=self.args, testid='invalid-deps', message=f"Failed to parse DEPS: '{cols[DEPS]}'." - ).report(state, self.args) + ).report() ok = False continue for head, deprel in deps: match = crex.ehead.fullmatch(head) if match is None: Incident( - state=state, + state=state, args=self.args, testid='invalid-ehead', message=f"Invalid enhanced head reference: '{head}'." - ).report(state, self.args) + ).report() ok = False if not (head in ids or head == '0'): Incident( - state=state, + state=state, args=self.args, testclass='Enhanced', testid='unknown-ehead', message=f"Undefined enhanced head reference (no such ID): '{head}'." 
- ).report(state, self.args) + ).report() ok = False return ok @@ -1940,11 +2039,11 @@ def validate_tree(self, state, sentence): head = int(cols[HEAD]) if head == id_: Incident( - state=state, + state=state, args=self.args, lineno=node_line, testid='head-self-loop', message=f'HEAD == ID for {cols[ID]}' - ).report(state, self.args) + ).report() return False # Incrementally build the set of children of every node. children.setdefault(head, set()).add(id_) @@ -1953,11 +2052,11 @@ children_0 = sorted(children.get(0, [])) if len(children_0) > 1 and self.args.single_root: Incident( - state=state, + state=state, args=self.args, lineno=-1, testid='multiple-roots', message=f"Multiple root words: {children_0}" - ).report(state, self.args) + ).report() return False # Return None if there are any cycles. Otherwise we could not later ask # Udapi to build a data structure representing the tree. @@ -1977,11 +2076,11 @@ if unreachable: str_unreachable = ','.join(str(w) for w in sorted(unreachable)) Incident( - state=state, + state=state, args=self.args, lineno=-1, testid='non-tree', message=f'Non-tree structure. Words {str_unreachable} are not reachable from the root 0.' - ).report(state, self.args) + ).report() return False return True @@ -2006,32 +2105,32 @@ def validate_root(self, state, node, line): if not node.is_empty(): if node.parent.ord == 0 and node.udeprel != 'root': Incident( - state=state, + state=state, args=self.args, testid='0-is-not-root', message="DEPREL must be 'root' if HEAD is 0." - ).report(state, self.args) + ).report() if node.parent.ord != 0 and node.udeprel == 'root': Incident( - state=state, + state=state, args=self.args, testid='root-is-not-0', message="DEPREL cannot be 'root' if HEAD is not 0." - ).report(state, self.args) + ).report() # In the enhanced graph, test both regular and empty roots. for edep in node.deps: if edep['parent'].ord == 0 and lspec2ud(edep['deprel']) != 'root': Incident( - state=state, + state=state, args=self.args, testclass='Enhanced', testid='enhanced-0-is-not-root', message="Enhanced relation type must be 'root' if head is 0." - ).report(state, self.args) + ).report() if edep['parent'].ord != 0 and lspec2ud(edep['deprel']) == 'root': Incident( - state=state, + state=state, args=self.args, testclass='Enhanced', testid='enhanced-root-is-not-0', message="Enhanced relation type cannot be 'root' if head is not 0."
- ).report(state, self.args) + ).report() @@ -2061,19 +2160,19 @@ def validate_deps_all_or_none(self, state, sentence): state.seen_enhanced_graph = state.sentence_line if state.seen_tree_without_enhanced_graph: Incident( - state=state, + state=state, args=self.args, testid='edeps-only-sometimes', message=f"Enhanced graph must be empty because we saw empty DEPS on line {state.seen_tree_without_enhanced_graph}" - ).report(state, self.args) + ).report() else: if not state.seen_tree_without_enhanced_graph: state.seen_tree_without_enhanced_graph = state.sentence_line if state.seen_enhanced_graph: Incident( - state=state, + state=state, args=self.args, testid='edeps-only-sometimes', message=f"Enhanced graph cannot be empty because we saw non-empty DEPS on line {state.seen_enhanced_graph}" - ).report(state, self.args) + ).report() @@ -2129,13 +2228,13 @@ def validate_egraph_connected(self, state, nodes, linenos): if unreachable: sur = sorted(unreachable) Incident( - state=state, + state=state, args=self.args, lineno=linenos[sur[0]], level=2, testclass='Enhanced', testid='unconnected-egraph', message=f"Enhanced graph is not connected. Nodes {sur} are not reachable from any root" - ).report(state, self.args) + ).report() return None @@ -2177,7 +2276,7 @@ def validate_required_feature(self, state, feats, required_feature, required_val ok = False if not ok: if state.seen_morpho_feature: - incident.report(state, self.args) + incident.report() else: if not incident.testid in state.delayed_feature_errors: state.delayed_feature_errors[incident.testid] = {'occurrences': []} @@ -2206,22 +2305,22 @@ def validate_expected_features(self, state, node, lineno): Incident.default_testclass = 'Warning' if node.upos in ['PRON', 'DET']: self.validate_required_feature(state, node.feats, 'PronType', None, Incident( - state=state, + state=state, args=self.args, testid='pron-det-without-prontype', message=f"The word '{formtl(node)}' is tagged '{node.upos}' but it lacks the 'PronType' feature" )) if node.feats['VerbForm'] == 'Fin' and node.feats['Mood'] == '': Incident( - state=state, + state=state, args=self.args, testid='verbform-fin-without-mood', message=f"Finite verb '{formtl(node)}' lacks the 'Mood' feature" - ).report(state, self.args) + ).report() elif node.feats['Mood'] != '' and node.feats['VerbForm'] != 'Fin': Incident( - state=state, + state=state, args=self.args, testid='mood-without-verbform-fin', message=f"Non-empty 'Mood' feature at a word that is not finite verb ('{formtl(node)}')" - ).report(state, self.args) + ).report() @@ -2260,11 +2359,11 @@ def validate_upos_vs_deprel(self, state, node, lineno): fixed_forms = [node.form] + [x.form for x in node.children if x.udeprel == 'fixed'] str_fixed_forms = ' '.join(fixed_forms) Incident( - state=state, + state=state, args=self.args, testclass='Warning', testid='fixed-without-extpos', message=f"Fixed expression '{str_fixed_forms}' does not have the 'ExtPos' feature" - ).report(state, self.args) + ).report() # Certain relations are reserved for nominals and cannot be used for verbs. # Nevertheless, they can appear with adjectives or adpositions if they are promoted due to ellipsis. # Unfortunately, we cannot enforce this test because a word can be cited @@ -2274,20 +2373,20 @@ def validate_upos_vs_deprel(self, state, node, lineno): # Determiner can alternate with a pronoun. 
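# The per-relation checks below (det, nummod, advmod, expl, aux, cop, ...) are
# in effect one compatibility table between a universal relation and the UPOS
# tags it tolerates. A table-driven sketch of the same idea; the table is an
# abridged illustration copied from the patterns below, not an authoritative list.
import re

TOLERATED_UPOS = {
    'det':    r"^(DET|PRON)",
    'nummod': r"^(NUM|NOUN|SYM)$",
    'advmod': r"^(ADV|ADJ|CCONJ|DET|PART|SYM)",
    'expl':   r"^(PRON|DET|PART)$",
    'aux':    r"^(AUX)",
    'cop':    r"^(AUX|PRON|DET|SYM)",
}

def rel_upos_violation(deprel, upos):
    """Return an error string if upos is not tolerated for deprel, else None."""
    pattern = TOLERATED_UPOS.get(deprel)
    if pattern and not re.match(pattern, upos):
        return f"'{deprel}' should not be used with UPOS '{upos}'"
    return None

print(rel_upos_violation('det', 'NOUN'))  # error: det wants DET or PRON
print(rel_upos_violation('aux', 'AUX'))   # None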
if deprel == 'det' and not re.match(r"^(DET|PRON)", upos): Incident( - state=state, + state=state, args=self.args, testid='rel-upos-det', message=f"'det' should be 'DET' or 'PRON' but it is '{upos}' ('{formtl(node)}')" - ).report(state, self.args) + ).report() # Nummod is for "number phrases" only. This could be interpreted as NUM only, # but some languages treat some cardinal numbers as NOUNs, and in # https://github.com/UniversalDependencies/docs/issues/596, # we concluded that the validator will tolerate them. if deprel == 'nummod' and not re.match(r"^(NUM|NOUN|SYM)$", upos): Incident( - state=state, + state=state, args=self.args, testid='rel-upos-nummod', message=f"'nummod' should be 'NUM' but it is '{upos}' ('{formtl(node)}')" - ).report(state, self.args) + ).report() # Advmod is for adverbs, perhaps particles but not for prepositional phrases or clauses. # Nevertheless, we should allow adjectives because they can be used as adverbs in some languages. # https://github.com/UniversalDependencies/docs/issues/617#issuecomment-488261396 @@ -2296,31 +2395,31 @@ def validate_upos_vs_deprel(self, state, node, lineno): # det is not much better, so maybe we should not enforce it. Adding DET to the tolerated UPOS tags. if deprel == 'advmod' and not re.match(r"^(ADV|ADJ|CCONJ|DET|PART|SYM)", upos) and not 'goeswith' in childrels: Incident( - state=state, + state=state, args=self.args, testid='rel-upos-advmod', message=f"'advmod' should be 'ADV' but it is '{upos}' ('{formtl(node)}')" - ).report(state, self.args) + ).report() # Known expletives are pronouns. Determiners and particles are probably acceptable, too. if deprel == 'expl' and not re.match(r"^(PRON|DET|PART)$", upos): Incident( - state=state, + state=state, args=self.args, testid='rel-upos-expl', message=f"'expl' should normally be 'PRON' but it is '{upos}' ('{formtl(node)}')" - ).report(state, self.args) + ).report() # Auxiliary verb/particle must be AUX. if deprel == 'aux' and not re.match(r"^(AUX)", upos): Incident( - state=state, + state=state, args=self.args, testid='rel-upos-aux', message=f"'aux' should be 'AUX' but it is '{upos}' ('{formtl(node)}')" - ).report(state, self.args) + ).report() # Copula is an auxiliary verb/particle (AUX) or a pronoun (PRON|DET). if deprel == 'cop' and not re.match(r"^(AUX|PRON|DET|SYM)", upos): Incident( - state=state, + state=state, args=self.args, testid='rel-upos-cop', message=f"'cop' should be 'AUX' or 'PRON'/'DET' but it is '{upos}' ('{formtl(node)}')" - ).report(state, self.args) + ).report() # Case is normally an adposition, maybe particle. # However, there are also secondary adpositions and they may have the original POS tag: # NOUN: [cs] pomocí, prostřednictvím @@ -2328,10 +2427,10 @@ def validate_upos_vs_deprel(self, state, node, lineno): # Interjection can also act as case marker for vocative, as in Sanskrit: भोः भगवन् / bhoḥ bhagavan / oh sir. if deprel == 'case' and re.match(r"^(PROPN|ADJ|PRON|DET|NUM|AUX)", upos): Incident( - state=state, + state=state, args=self.args, testid='rel-upos-case', message=f"'case' should not be '{upos}' ('{formtl(node)}')" - ).report(state, self.args) + ).report() # Mark is normally a conjunction or adposition, maybe particle but definitely not a pronoun. ###!!! February 2022: Temporarily allow mark+VERB ("regarding"). In the future, it should be banned again ###!!! by default (and case+VERB too), but there should be a language-specific list of exceptions. @@ -2341,35 +2440,35 @@ def validate_upos_vs_deprel(self, state, node, lineno): ###!!! 
now be required also for single-word expressions. if deprel == 'mark' and re.match(r"^(NOUN|PROPN|ADJ|PRON|DET|NUM|AUX|INTJ)", upos): Incident( - state=state, + state=state, args=self.args, testid='rel-upos-mark', message=f"'mark' should not be '{upos}' ('{formtl(node)}')" - ).report(state, self.args) + ).report() # Cc is a conjunction, possibly an adverb or particle. if deprel == 'cc' and re.match(r"^(NOUN|PROPN|ADJ|PRON|DET|NUM|VERB|AUX|INTJ)", upos): Incident( - state=state, + state=state, args=self.args, testid='rel-upos-cc', message=f"'cc' should not be '{upos}' ('{formtl(node)}')" - ).report(state, self.args) + ).report() if deprel == 'punct' and upos != 'PUNCT': Incident( - state=state, + state=state, args=self.args, testid='rel-upos-punct', message=f"'punct' must be 'PUNCT' but it is '{upos}' ('{formtl(node)}')" - ).report(state, self.args) + ).report() if upos == 'PUNCT' and not re.match(r"^(punct|root)", deprel): Incident( - state=state, + state=state, args=self.args, testid='upos-rel-punct', message=f"'PUNCT' must be 'punct' but it is '{node.deprel}' ('{formtl(node)}')" - ).report(state, self.args) + ).report() if upos == 'PROPN' and (deprel == 'fixed' or 'fixed' in childrels): Incident( - state=state, + state=state, args=self.args, testid='rel-upos-fixed', message=f"'fixed' should not be used for proper nouns ('{formtl(node)}')." - ).report(state, self.args) + ).report() @@ -2398,20 +2497,20 @@ def validate_flat_foreign(self, state, node, lineno, linenos): parent = node.parent if node.upos != 'X' or str(node.feats) != 'Foreign=Yes': Incident( - state=state, + state=state, args=self.args, lineno=lineno, nodeid=node.ord, testid='flat-foreign-upos-feats', message="The child of a flat:foreign relation should have UPOS X and Foreign=Yes (but no other features)." - ).report(state, self.args) + ).report() if parent.upos != 'X' or str(parent.feats) != 'Foreign=Yes': Incident( - state=state, + state=state, args=self.args, lineno=linenos[str(parent.ord)], nodeid=parent.ord, testid='flat-foreign-upos-feats', message="The parent of a flat:foreign relation should have UPOS X and Foreign=Yes (but no other features)." - ).report(state, self.args) + ).report() @@ -2431,7 +2530,7 @@ def validate_left_to_right_relations(self, state, node, lineno): The 1-based index of the line where the node occurs. """ # According to the v2 guidelines, apposition should also be left-headed, although the definition of apposition may need to be improved. - if re.match(r"^(conj|fixed|flat|goeswith|appos)", node.deprel): + if node.udeprel in ['conj', 'fixed', 'flat', 'goeswith', 'appos']: ichild = node.ord iparent = node.parent.ord if ichild < iparent: @@ -2441,14 +2540,14 @@ def validate_left_to_right_relations(self, state, node, lineno): # The designation "right-to-left" is confusing in languages with right-to-left writing systems. # We keep it in the testid but we make the testmessage more neutral. Incident( - state=state, + state=state, args=self.args, lineno=lineno, nodeid=node.ord, level=3, testclass='Syntax', testid=f"right-to-left-{node.udeprel}", message=f"Parent of relation '{node.deprel}' must precede the child in the word order." 
- ).report(state, self.args) + ).report() @@ -2505,7 +2604,7 @@ def is_inner_subject(node): subject_forms = [formtl(x) for x in subjects] if len(subjects) > 1: Incident( - state=state, + state=state, args=self.args, lineno=lineno, nodeid=node.ord, level=3, @@ -2513,7 +2612,7 @@ def is_inner_subject(node): testid='too-many-subjects', message=f"Multiple subjects {str(subject_ids)} ({str(subject_forms)[1:-1]}) not subtyped as ':outer'.", explanation="Outer subjects are allowed if a clause acts as the predicate of another clause." - ).report(state, self.args) + ).report() @@ -2536,14 +2635,14 @@ def validate_single_object(self, state, node, lineno): object_forms = [formtl(x) for x in objects] if len(objects) > 1: Incident( - state=state, + state=state, args=self.args, lineno=lineno, nodeid=node.ord, level=3, testclass='Syntax', testid='too-many-objects', message=f"Multiple direct objects {str(object_ids)} ({str(object_forms)[1:-1]}) under one predicate." - ).report(state, self.args) + ).report() @@ -2576,14 +2675,14 @@ def validate_orphan(self, state, node, lineno): # for details and a Latin example. if not re.match(r"^(conj|parataxis|root|csubj|ccomp|advcl|acl|reparandum)$", node.parent.udeprel): Incident( - state=state, + state=state, args=self.args, lineno=lineno, nodeid=node.ord, level=3, testclass='Warning', testid='orphan-parent', message=f"The parent of 'orphan' should normally be 'conj' but it is '{node.parent.udeprel}'." - ).report(state, self.args) + ).report() @@ -2605,7 +2704,7 @@ def validate_functional_leaves(self, state, node, lineno, linenos): """ # This is a level 3 test, we will check only the universal part of the relation. deprel = node.udeprel - if re.match(r"^(case|mark|cc|aux|cop|det|clf|fixed|goeswith|punct)$", deprel): + if deprel in ['case', 'mark', 'cc', 'aux', 'cop', 'det', 'clf', 'fixed', 'goeswith', 'punct']: idparent = node.ord pdeprel = deprel pfeats = node.feats @@ -2659,18 +2758,18 @@ def validate_functional_leaves(self, state, node, lineno, linenos): # a 'conj' dependent. In "and/or", "or" will depend on "and" as 'conj'.) if re.match(r"^(mark|case)$", pdeprel) and not re.match(r"^(advmod|obl|goeswith|fixed|reparandum|conj|cc|punct)$", cdeprel): Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='leaf-mark-case', message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" - ).report(state, self.args) + ).report() if re.match(r"^(aux|cop)$", pdeprel) and not re.match(r"^(goeswith|fixed|reparandum|conj|cc|punct)$", cdeprel): Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='leaf-aux-cop', message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" - ).report(state, self.args) + ).report() # Classifiers must be allowed under demonstrative determiners according to the clf guidelines. # People have identified various constructions where the restriction # on children of det dependents may have to be relaxed even if not @@ -2707,25 +2806,25 @@ def validate_functional_leaves(self, state, node, lineno, linenos): # be another exception. 
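# The branches below all apply one principle: function-word dependents (case,
# mark, cc, aux, cop, det, clf, fixed, goeswith, punct) should be leaves, with
# a small per-relation allowlist of tolerated children. A condensed sketch;
# the allowlists are abridged from the conditions below and are illustrative,
# not exhaustive (e.g. the det case has further feature-based exceptions).
LEAF_EXCEPTIONS = {
    'case':     {'advmod', 'obl', 'goeswith', 'fixed', 'reparandum', 'conj', 'cc', 'punct'},
    'mark':     {'advmod', 'obl', 'goeswith', 'fixed', 'reparandum', 'conj', 'cc', 'punct'},
    'aux':      {'goeswith', 'fixed', 'reparandum', 'conj', 'cc', 'punct'},
    'cop':      {'goeswith', 'fixed', 'reparandum', 'conj', 'cc', 'punct'},
    'cc':       {'goeswith', 'fixed', 'reparandum', 'conj', 'punct'},
    'fixed':    {'goeswith', 'reparandum', 'conj', 'punct'},
    'goeswith': set(),        # goeswith may have no children at all
    'punct':    {'punct'},    # only nested punctuation (brackets, quotes)
}

def leaf_violation(parent_deprel, child_deprel):
    """True if a child with child_deprel is not tolerated under parent_deprel."""
    allowed = LEAF_EXCEPTIONS.get(parent_deprel)
    return allowed is not None and child_deprel not in allowed

print(leaf_violation('aux', 'nsubj'))    # True: auxiliaries must be leaves
print(leaf_violation('punct', 'punct'))  # False: tolerated exception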
if re.match(r"^(det)$", pdeprel) and not re.match(r"^(det|case|advmod|obl|clf|goeswith|fixed|flat|compound|reparandum|discourse|parataxis|conj|cc|punct)$", cdeprel) and not (pfeats['Poss'] == 'Yes' and re.match(r"^(appos|acl|nmod)$", cdeprel)): Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='leaf-det', message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" - ).report(state, self.args) + ).report() if re.match(r"^(clf)$", pdeprel) and not re.match(r"^(advmod|obl|goeswith|fixed|reparandum|conj|cc|punct)$", cdeprel): Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='leaf-clf', message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" - ).report(state, self.args) + ).report() if re.match(r"^(cc)$", pdeprel) and not re.match(r"^(goeswith|fixed|reparandum|conj|punct)$", cdeprel): Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='leaf-cc', message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" - ).report(state, self.args) + ).report() # Fixed expressions should not be nested, i.e., no chains of fixed relations. # As they are supposed to represent functional elements, they should not have # other dependents either, with the possible exception of conj. @@ -2736,28 +2835,28 @@ def validate_functional_leaves(self, state, node, lineno, linenos): # practical to retokenize. elif pdeprel == 'fixed' and not re.match(r"^(goeswith|reparandum|conj|punct)$", cdeprel): Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='leaf-fixed', message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" - ).report(state, self.args) + ).report() # Goeswith cannot have any children, not even another goeswith. elif pdeprel == 'goeswith': Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='leaf-goeswith', message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" - ).report(state, self.args) + ).report() # Punctuation can exceptionally have other punct children if an exclamation # mark is in brackets or quotes. It cannot have other children. elif pdeprel == 'punct' and cdeprel != 'punct': Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='leaf-punct', message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" - ).report(state, self.args) + ).report() @@ -2788,14 +2887,14 @@ def validate_fixed_span(self, state, node, lineno): fxordlist = [n.ord for n in fxlist] fxexpr = ' '.join([(n.form if n in fxlist else '*') for n in fxrange]) Incident( - state=state, + state=state, args=self.args, lineno=lineno, nodeid=node.ord, level=3, testclass='Warning', testid='fixed-gap', message=f"Gaps in fixed expression {str(fxordlist)} '{fxexpr}'" - ).report(state, self.args) + ).report() def validate_goeswith_span(self, state, node, lineno): @@ -2826,26 +2925,26 @@ def validate_goeswith_span(self, state, node, lineno): gwordlist = [n.ord for n in gwlist] gwordrange = [n.ord for n in gwrange] Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='goeswith-gap', message=f"Gaps in goeswith group {str(gwordlist)} != {str(gwordrange)}." 
- ).report(state, self.args) + ).report() # Non-last node in a goeswith range must have a space after itself. nospaceafter = [x for x in gwlist[:-1] if x.misc['SpaceAfter'] == 'No'] if nospaceafter: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='goeswith-nospace', message="'goeswith' cannot connect nodes that are not separated by whitespace." - ).report(state, self.args) + ).report() # This is not about the span of the interrupted word, but since we already # know that we are at the head of a goeswith word, let's do it here, too. # Every goeswith parent should also have Typo=Yes. However, this is not # required if the treebank does not have features at all. incident = Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testclass='Morpho', testid='goeswith-missing-typo', @@ -2874,33 +2973,33 @@ def validate_goeswith_morphology_and_edeps(self, state, node, lineno): if node.udeprel == 'goeswith': if node.lemma != '_': Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='goeswith-lemma', message="The lemma of a 'goeswith'-connected word must be annotated only at the first part." - ).report(state, self.args) + ).report() if node.upos != 'X': Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='goeswith-upos', message="The UPOS tag of a 'goeswith'-connected word must be annotated only at the first part; the other parts must be tagged 'X'." - ).report(state, self.args) + ).report() if str(node.feats) != '_': Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='goeswith-feats', message="The morphological features of a 'goeswith'-connected word must be annotated only at the first part." - ).report(state, self.args) + ).report() if str(node.raw_deps) != '_' and str(node.raw_deps) != str(node.parent.ord)+':'+node.deprel: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testclass='Enhanced', testid='goeswith-edeps', message="A 'goeswith' dependent cannot have any additional dependencies in the enhanced graph." - ).report(state, self.args) + ).report() def get_caused_nonprojectivities(self, node): @@ -2991,7 +3090,7 @@ def get_gap(node): gap = [] if rangebetween: gap = [n for n in node.root.descendants if n.ord in rangebetween and not n in node.parent.descendants] - return gap + return sorted(gap) @@ -3013,20 +3112,22 @@ def validate_projective_punctuation(self, state, node, lineno): if node.udeprel == 'punct': nonprojnodes = self.get_caused_nonprojectivities(node) if nonprojnodes: + nonprojids = [x.ord for x in nonprojnodes] Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='punct-causes-nonproj', - message=f"Punctuation must not cause non-projectivity of nodes {nonprojnodes}" - ).report(state, self.args) - gap = self.get_gap(node) - if gap: + message=f"Punctuation must not cause non-projectivity of nodes {nonprojids}" + ).report() + gapnodes = self.get_gap(node) + if gapnodes: + gapids = [x.ord for x in gapnodes] Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='punct-is-nonproj', - message=f"Punctuation must not be attached non-projectively over nodes {sorted(gap)}" - ).report(state, self.args) + message=f"Punctuation must not be attached non-projectively over nodes {gapids}" + ).report() @@ -3094,11 +3195,11 @@ def validate_enhanced_orphan(self, state, node, line): # and only if an orphan occurred before it. 
if state.seen_enhanced_orphan: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='empty-node-after-eorphan', message=f"Empty node means that we address gapping and there should be no orphans in the enhanced graph; but we saw one on line {state.seen_enhanced_orphan}" - ).report(state, self.args) + ).report() udeprels = set([lspec2ud(edep['deprel']) for edep in node.deps]) if 'orphan' in udeprels: if not state.seen_enhanced_orphan: @@ -3106,11 +3207,11 @@ def validate_enhanced_orphan(self, state, node, line): # If we have seen an empty node, then the orphan is an error. if state.seen_empty_node: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='eorphan-after-empty-node', message=f"'orphan' not allowed in enhanced graph because we saw an empty node on line {state.seen_empty_node}" - ).report(state, self.args) + ).report() @@ -3158,20 +3259,20 @@ def validate_words_with_spaces(self, state, node, line, lang): string_to_test = re.sub(r'\xA0', ' ', word) if not tospacedata[1].fullmatch(string_to_test): Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='invalid-word-with-space', message=f"'{word}' in column {column} is not on the list of exceptions allowed to contain whitespace.", explanation=data.explain_tospace(lang) - ).report(state, self.args) + ).report() else: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='invalid-word-with-space', message=f"'{word}' in column {column} is not on the list of exceptions allowed to contain whitespace.", explanation=data.explain_tospace(lang) - ).report(state, self.args) + ).report() @@ -3214,11 +3315,11 @@ def validate_features_level4(self, state, node, line, lang): # If it occurs there, it cannot be duplicated on the lines of the component words. if f == 'Typo' and state.mwt_typo_span_end and node.ord <= state.mwt_typo_span_end: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='mwt-typo-repeated-at-word', message="Feature Typo cannot occur at a word if it already occurred at the corresponding multi-word token." - ).report(state, self.args) + ).report() # In case of code switching, the current token may not be in the default language # and then its features are checked against a different feature set. 
An exception # is the feature Foreign, which always relates to the default language of the @@ -3233,48 +3334,48 @@ def validate_features_level4(self, state, node, line, lang): if effective_featset is not None: if f not in effective_featset: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='feature-unknown', message=f"Feature {f} is not documented for language [{effective_lang}] ('{formtl(node)}').", explanation=data.explain_feats(effective_lang) - ).report(state, self.args) + ).report() else: lfrecord = effective_featset[f] if lfrecord['permitted'] == 0: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='feature-not-permitted', message=f"Feature {f} is not permitted in language [{effective_lang}] ('{formtl(node)}').", explanation=data.explain_feats(effective_lang) - ).report(state, self.args) + ).report() else: values = lfrecord['uvalues'] + lfrecord['lvalues'] + lfrecord['unused_uvalues'] + lfrecord['unused_lvalues'] if not v in values: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='feature-value-unknown', message=f"Value {v} is not documented for feature {f} in language [{effective_lang}] ('{formtl(node)}').", explanation=data.explain_feats(effective_lang) - ).report(state, self.args) + ).report() elif not node.upos in lfrecord['byupos']: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='feature-upos-not-permitted', message=f"Feature {f} is not permitted with UPOS {node.upos} in language [{effective_lang}] ('{formtl(node)}').", explanation=data.explain_feats(effective_lang) - ).report(state, self.args) + ).report() elif not v in lfrecord['byupos'][node.upos] or lfrecord['byupos'][node.upos][v]==0: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='feature-value-upos-not-permitted', message=f"Value {v} of feature {f} is not permitted with UPOS {node.upos} in language [{effective_lang}] ('{formtl(node)}').", explanation=data.explain_feats(effective_lang) - ).report(state, self.args) + ).report() if state.mwt_typo_span_end and int(state.mwt_typo_span_end) <= int(node.ord): state.mwt_typo_span_end = None @@ -3325,12 +3426,12 @@ def validate_deprels(self, state, node, line): Incident.default_level = 2 if deprel not in main_deprelset and deprel not in alt_deprelset: Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='unknown-deprel', message=f"Unknown DEPREL label: '{deprel}'", explanation=data.explain_deprel(mainlang) - ).report(state, self.args) + ).report() # If there are enhanced dependencies, test their deprels, too. # We already know that the contents of DEPS is parsable (deps_list() was # first called from validate_id_references() and the head indices are OK). 
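# The nested lookups above walk a per-language feature registry in a fixed
# cascade: is the feature known, is it permitted, is the value documented, and
# is the feature-value pair allowed with this UPOS? A stripped-down sketch of
# that cascade with a toy registry; the real data comes from the validator's
# JSON resources, and the shapes here (FEATSET, 'uvalues', 'byupos') only
# mimic it, omitting the lvalues/unused_* value lists.
FEATSET = {
    'Gender': {
        'permitted': 1,
        'uvalues': ['Fem', 'Masc', 'Neut'],
        'byupos': {'NOUN': {'Fem': 1, 'Masc': 1, 'Neut': 1}, 'ADV': {}},
    },
}

def check_feature(f, v, upos, featset=FEATSET):
    """Return the first failing testid in the cascade, or None if OK."""
    if f not in featset:
        return 'feature-unknown'
    rec = featset[f]
    if rec['permitted'] == 0:
        return 'feature-not-permitted'
    if v not in rec['uvalues']:
        return 'feature-value-unknown'
    if upos not in rec['byupos']:
        return 'feature-upos-not-permitted'
    if rec['byupos'][upos].get(v, 0) == 0:
        return 'feature-value-upos-not-permitted'
    return None

print(check_feature('Gender', 'Fem', 'NOUN'))  # None: fully documented
print(check_feature('Gender', 'Fem', 'ADV'))   # 'feature-value-upos-not-permitted'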
@@ -3348,12 +3449,12 @@ def validate_deprels(self, state, node, line): Incident.default_level = 2 if not (deprel in main_edeprelset or naltlang != None and naltlang != mainlang and naltlang == paltlang and deprel in alt_edeprelset): Incident( - state=state, + state=state, args=self.args, nodeid=node.ord, testid='unknown-edeprel', message=f"Unknown enhanced relation type '{deprel}' in '{parent.ord}:{deprel}'", explanation=data.explain_edeprel(mainlang) - ).report(state, self.args) + ).report() @@ -3385,7 +3486,7 @@ def validate_auxiliary_verbs(self, state, node, line, lang): auxlist = data.get_aux_for_language(lang) if not auxlist or not node.lemma in auxlist: Incident( - state=state, + state=state, args=self.args, lineno=line, nodeid=node.ord, level=5, @@ -3393,7 +3494,7 @@ def validate_auxiliary_verbs(self, state, node, line, lang): testid='aux-lemma', message=f"'{node.lemma}' is not an auxiliary in language [{lang}]", explanation=data.explain_aux(lang) - ).report(state, self.args) + ).report() @@ -3419,7 +3520,7 @@ def validate_copula_lemmas(self, state, node, line, lang): coplist = data.get_cop_for_language(lang) if not coplist or not node.lemma in coplist: Incident( - state=state, + state=state, args=self.args, lineno=line, nodeid=node.ord, level=5, @@ -3427,7 +3528,7 @@ def validate_copula_lemmas(self, state, node, line, lang): testid='cop-lemma', message=f"'{node.lemma}' is not a copula in language [{lang}]", explanation=data.explain_cop(lang) - ).report(state, self.args) + ).report() @@ -3461,72 +3562,72 @@ def validate_misc_entity(self, state, comments, sentence): if state.seen_global_entity: if global_entity_match.group(1) != state.global_entity_attribute_string: Incident( - state=state, + state=state, args=self.args, testid='global-entity-mismatch', message=f"New declaration of global.Entity '{global_entity_match.group(1)}' does not match the first declaration '{state.global_entity_attribute_string}' on line {state.seen_global_entity}." - ).report(state, self.args) + ).report() else: state.seen_global_entity = state.comment_start_line + iline state.global_entity_attribute_string = global_entity_match.group(1) if not re.match(r"^[a-z]+(-[a-z]+)*$", state.global_entity_attribute_string): Incident( - state=state, + state=state, args=self.args, testid='spurious-global-entity', message=f"Cannot parse global.Entity attribute declaration '{state.global_entity_attribute_string}'." - ).report(state, self.args) + ).report() else: global_entity_attributes = state.global_entity_attribute_string.split('-') if not 'eid' in global_entity_attributes: Incident( - state=state, + state=state, args=self.args, testid='spurious-global-entity', message=f"Global.Entity attribute declaration '{state.global_entity_attribute_string}' does not include 'eid'." - ).report(state, self.args) + ).report() elif global_entity_attributes[0] != 'eid': Incident( - state=state, + state=state, args=self.args, testid='spurious-global-entity', message=f"Attribute 'eid' must come first in global.Entity attribute declaration '{state.global_entity_attribute_string}'." - ).report(state, self.args) + ).report() if not 'etype' in global_entity_attributes: Incident( - state=state, + state=state, args=self.args, testid='spurious-global-entity', message=f"Global.Entity attribute declaration '{state.global_entity_attribute_string}' does not include 'etype'." 
- ).report(state, self.args) + ).report() elif global_entity_attributes[1] != 'etype': Incident( - state=state, + state=state, args=self.args, testid='spurious-global-entity', message=f"Attribute 'etype' must come second in global.Entity attribute declaration '{state.global_entity_attribute_string}'." - ).report(state, self.args) + ).report() if not 'head' in global_entity_attributes: Incident( - state=state, + state=state, args=self.args, testid='spurious-global-entity', message=f"Global.Entity attribute declaration '{state.global_entity_attribute_string}' does not include 'head'." - ).report(state, self.args) + ).report() elif global_entity_attributes[2] != 'head': Incident( - state=state, + state=state, args=self.args, testid='spurious-global-entity', message=f"Attribute 'head' must come third in global.Entity attribute declaration '{state.global_entity_attribute_string}'." - ).report(state, self.args) + ).report() if 'other' in global_entity_attributes and global_entity_attributes[3] != 'other': Incident( - state=state, + state=state, args=self.args, testid='spurious-global-entity', message=f"Attribute 'other', if present, must come fourth in global.Entity attribute declaration '{state.global_entity_attribute_string}'." - ).report(state, self.args) + ).report() # Fill the global dictionary that maps attribute names to list indices. i = 0 for a in global_entity_attributes: if a in state.entity_attribute_index: Incident( - state=state, + state=state, args=self.args, testid='spurious-global-entity', message=f"Attribute '{a}' occurs more than once in global.Entity attribute declaration '{state.global_entity_attribute_string}'." - ).report(state, self.args) + ).report() else: state.entity_attribute_index[a] = i i += 1 @@ -3554,62 +3655,62 @@ def validate_misc_entity(self, state, comments, sentence): splitante = [x for x in misc if re.match(r"^SplitAnte=", x)] if is_multiword_token(cols) and (len(entity)>0 or len(bridge)>0 or len(splitante)>0): Incident( - state=state, + state=state, args=self.args, testid='entity-mwt', message="Entity or coreference annotation must not occur at a multiword-token line." - ).report(state, self.args) + ).report() continue if len(entity)>1: Incident( - state=state, + state=state, args=self.args, testid='multiple-entity-statements', message=f"There can be at most one 'Entity=' statement in MISC but we have {str(misc)}." - ).report(state, self.args) + ).report() continue if len(bridge)>1: Incident( - state=state, + state=state, args=self.args, testid='multiple-bridge-statements', message=f"There can be at most one 'Bridge=' statement in MISC but we have {str(misc)}." - ).report(state, self.args) + ).report() continue if len(splitante)>1: Incident( - state=state, + state=state, args=self.args, testid='multiple-splitante-statements', message=f"There can be at most one 'SplitAnte=' statement in MISC but we have {str(misc)}." - ).report(state, self.args) + ).report() continue if len(bridge)>0 and len(entity)==0: Incident( - state=state, + state=state, args=self.args, testid='bridge-without-entity', message=f"The 'Bridge=' statement can only occur together with 'Entity=' in MISC but we have {str(misc)}." - ).report(state, self.args) + ).report() continue if len(splitante)>0 and len(entity)==0: Incident( - state=state, + state=state, args=self.args, testid='splitante-without-entity', message=f"The 'SplitAnte=' statement can only occur together with 'Entity=' in MISC but we have {str(misc)}." 
- ).report(state, self.args) + ).report() continue # There is at most one Entity (and only if it is there, there may be also one Bridge and/or one SplitAnte). if len(entity)>0: if not state.seen_global_entity: Incident( - state=state, + state=state, args=self.args, testid='entity-without-global-entity', message="No global.Entity comment was found before the first 'Entity' in MISC." - ).report(state, self.args) + ).report() continue match = re.match(r"^Entity=((?:\([^( )]+(?:-[^( )]+)*\)?|[^( )]+\))+)$", entity[0]) if not match: Incident( - state=state, + state=state, args=self.args, testid='spurious-entity-statement', message=f"Cannot parse the Entity statement '{entity[0]}'." - ).report(state, self.args) + ).report() else: entity_string = match.group(1) # We cannot check the rest if we cannot identify the 'eid' attribute. @@ -3636,10 +3737,10 @@ def validate_misc_entity(self, state, comments, sentence): continue # If we pre-checked the string well, we should never arrive here! Incident( - state=state, + state=state, args=self.args, testid='internal-error', message='INTERNAL ERROR' - ).report(state, self.args) + ).report() # All 1 cases should precede all 0 cases. # The 2 cases can be either before the first 1 case, or after the last 0 case. seen0 = False @@ -3657,10 +3758,10 @@ def validate_misc_entity(self, state, comments, sentence): # More attributes are not allowed. if len(attributes) > state.entity_attribute_number: Incident( - state=state, + state=state, args=self.args, testid='too-many-entity-attributes', message=f"Entity '{e}' has {len(attributes)} attributes while only {state.entity_attribute_number} attributes are globally declared." - ).report(state, self.args) + ).report() # The raw eid (bracket eid) may include an identification of a part of a discontinuous mention, # as in 'e155[1/2]'. This is fine for matching opening and closing brackets # because the closing bracket must contain it too. However, to identify the @@ -3670,10 +3771,10 @@ def validate_misc_entity(self, state, comments, sentence): # No attributes other than eid are expected at the closing bracket. if len(attributes) > 1: Incident( - state=state, + state=state, args=self.args, testid='too-many-entity-attributes', message=f"Entity '{e}' has {len(attributes)} attributes while only eid is expected at the closing bracket." - ).report(state, self.args) + ).report() beid = attributes[0] eid = beid ipart = 1 @@ -3688,23 +3789,23 @@ def validate_misc_entity(self, state, comments, sentence): # We should omit the square brackets if they would be [1/1]. if ipart == 1 and npart == 1: Incident( - state=state, + state=state, args=self.args, testid='spurious-entity-id', message=f"Discontinuous mention must have at least two parts but it has one in '{beid}'." - ).report(state, self.args) + ).report() if ipart > npart: Incident( - state=state, + state=state, args=self.args, testid='spurious-entity-id', message=f"Entity id '{beid}' of discontinuous mention says the current part is higher than total number of parts." - ).report(state, self.args) + ).report() else: if re.match(r"[\[\]]", beid): Incident( - state=state, + state=state, args=self.args, testid='spurious-entity-id', message=f"Entity id '{beid}' contains square brackets but does not have the form used in discontinuous mentions." 
- ).report(state, self.args) + ).report() #-------------------------------------------------------------------------------------------------------------------------------- # The code that we will have to execute at single-node continuous parts and at the opening brackets of multi-node continuous parts. @@ -3736,24 +3837,24 @@ def opening_bracket(): discontinuous_mention = state.open_discontinuous_mentions[eidnpart][-1] if ipart != discontinuous_mention['last_ipart']+1: Incident( - state=state, + state=state, args=self.args, testid='misplaced-mention-part', message=f"Unexpected part of discontinuous mention '{beid}': last part was '{discontinuous_mention['last_ipart']}/{discontinuous_mention['npart']}' on line {discontinuous_mention['last_part_line']}." - ).report(state, self.args) + ).report() # We will update last_ipart at closing bracket, i.e., after the current part has been entirely processed. # Otherwise nested discontinuous mentions might wrongly assess where they belong. elif attrstring_to_match != discontinuous_mention['attributes']: Incident( - state=state, + state=state, args=self.args, testid='mention-attribute-mismatch', message=f"Attribute mismatch of discontinuous mention: current part has '{attrstring_to_match}', first part '{discontinuous_mention['attributes']}' was at line {discontinuous_mention['first_part_line']}." - ).report(state, self.args) + ).report() else: Incident( - state=state, + state=state, args=self.args, testid='misplaced-mention-part', message=f"Unexpected part of discontinuous mention '{beid}': this is part {ipart} but we do not have information about the previous parts." - ).report(state, self.args) + ).report() discontinuous_mention = {'last_ipart': ipart, 'npart': npart, 'first_part_line': state.sentence_line+iline, 'last_part_line': state.sentence_line+iline, @@ -3763,10 +3864,10 @@ def opening_bracket(): # Check all attributes of the entity, except those that must be examined at the closing bracket. if eid in state.entity_ids_other_documents: Incident( - state=state, + state=state, args=self.args, testid='entity-across-newdoc', message=f"Same entity id should not occur in multiple documents; '{eid}' first seen on line {state.entity_ids_other_documents[eid]}, before the last newdoc." - ).report(state, self.args) + ).report() elif not eid in state.entity_ids_this_document: state.entity_ids_this_document[eid] = state.sentence_line+iline etype = '' @@ -3777,10 +3878,10 @@ def opening_bracket(): # https://github.com/ufal/corefUD/issues/13#issuecomment-1008447464 if not re.match(r"^(person|place|organization|animal|plant|object|substance|time|number|abstract|event|other)?$", etype): Incident( - state=state, + state=state, args=self.args, testid='spurious-entity-type', message=f"Spurious entity type '{etype}'." - ).report(state, self.args) + ).report() if 'identity' in state.entity_attribute_index and len(attributes) >= state.entity_attribute_index['identity']+1: identity = attributes[state.entity_attribute_index['identity']] # Check the form of the head index now. @@ -3790,10 +3891,10 @@ def opening_bracket(): if 'head' in state.entity_attribute_index and len(attributes) >= state.entity_attribute_index['head']+1: if not re.match(r"^[1-9][0-9]*$", attributes[state.entity_attribute_index['head']]): Incident( - state=state, + state=state, args=self.args, testid='spurious-mention-head', message=f"Entity head index '{attributes[state.entity_attribute_index['head']]}' must be a non-zero-starting integer." 
- ).report(state, self.args) + ).report() else: head = int(attributes[state.entity_attribute_index['head']]) # If this is the first mention of the entity, remember the values @@ -3804,17 +3905,17 @@ def opening_bracket(): # All mentions of one entity (cluster) must have the same entity type. if etype != state.entity_types[eid][0]: Incident( - state=state, + state=state, args=self.args, testid='entity-type-mismatch', message=f"Entity '{eid}' cannot have type '{etype}' that does not match '{state.entity_types[eid][0]}' from the first mention on line {state.entity_types[eid][2]}." - ).report(state, self.args) + ).report() # All mentions of one entity (cluster) must have the same identity (Wikipedia link or similar). if identity != state.entity_types[eid][1]: Incident( - state=state, + state=state, args=self.args, testid='entity-identity-mismatch', message=f"Entity '{eid}' cannot have identity '{identity}' that does not match '{state.entity_types[eid][1]}' from the first mention on line {state.entity_types[eid][2]}." - ).report(state, self.args) + ).report() # Remember the line where (the current part of) the entity mention starts. mention = {'beid': beid, 'line': state.sentence_line+iline, 'span': [cols[ID]], 'text': cols[FORM], @@ -3834,10 +3935,10 @@ def closing_bracket(): opening_line = 0 if len(state.open_entity_mentions)==0: Incident( - state=state, + state=state, args=self.args, testid='ill-nested-entities', message=f"Cannot close entity '{beid}' because there are no open entities." - ).report(state, self.args) + ).report() return else: # If the closing bracket does not occur where expected, it is currently only a warning. @@ -3846,11 +3947,11 @@ def closing_bracket(): ###!!! not be a problem in such cases because one mention will be closed first, then the other will be opened. if beid != state.open_entity_mentions[-1]['beid']: Incident( - state=state, + state=state, args=self.args, testclass='Warning', testid='ill-nested-entities-warning', message=f"Entity mentions are not well nested: closing '{beid}' while the innermost open entity is '{state.open_entity_mentions[-1]['beid']}' from line {state.open_entity_mentions[-1]['line']}: {str(state.open_entity_mentions)}." - ).report(state, self.args) + ).report() # Try to find and close the entity whether or not it was well-nested. for i in reversed(range(len(state.open_entity_mentions))): if state.open_entity_mentions[i]['beid'] == beid: @@ -3863,10 +3964,10 @@ def closing_bracket(): else: # If we did not find the entity to close, then the warning above was not enough and we have to make it a validation error. Incident( - state=state, + state=state, args=self.args, testid='ill-nested-entities', message=f"Cannot close entity '{beid}' because it was not found among open entities: {str(state.open_entity_mentions)}" - ).report(state, self.args) + ).report() return # If this is a part of a discontinuous mention, update the information about the whole mention. # We do this after reading the new part (and not when we see its opening bracket) so that nested @@ -3882,11 +3983,11 @@ def closing_bracket(): else: # This should have been taken care of at the opening bracket. Incident( - state=state, + state=state, args=self.args, testclass='Internal', testid='internal-error', message="INTERNAL ERROR: at the closing bracket of a part of a discontinuous mention, still no record in state.open_discontinuous_mentions." 
- ).report(state, self.args) + ).report() discontinuous_mention = {'last_ipart': ipart, 'npart': npart, 'first_part_line': opening_line, 'last_part_line': opening_line, @@ -3902,18 +4003,18 @@ def closing_bracket(): if ipart == npart: if mention_length < head: Incident( - state=state, + state=state, args=self.args, testid='mention-head-out-of-range', message=f"Entity mention head was specified as {head} on line {opening_line} but the mention has only {mention_length} nodes." - ).report(state, self.args) + ).report() # Check that no two mentions have identical spans (only if this is the last part of a mention). ending_mention_key = str(opening_line)+str(mention_span) if ending_mention_key in ending_mentions: Incident( - state=state, + state=state, args=self.args, testid='same-span-entity-mentions', message=f"Entity mentions '{ending_mentions[ending_mention_key]}' and '{beid}' from line {opening_line} have the same span {str(mention_span)}." - ).report(state, self.args) + ).report() else: ending_mentions[ending_mention_key] = beid # Remember the span of the current mention so that we can later check whether it crosses the span of another mention. @@ -3927,10 +4028,10 @@ def closing_bracket(): ms = state.entity_mention_spans[eid][sentid][m] if ms.intersection(myset) and not ms.issubset(myset) and not myset.issubset(ms): Incident( - state=state, + state=state, args=self.args, testid='crossing-mentions-same-entity', message=f"Mentions of entity '{eid}' have crossing spans: {m} vs. {str(mention_span)}." - ).report(state, self.args) + ).report() else: state.entity_mention_spans[eid][sentid] = {} else: @@ -3951,36 +4052,36 @@ def closing_bracket(): if b==0: if seen2 and not seen1: Incident( - state=state, + state=state, args=self.args, testid='spurious-entity-statement', message=f"If there are no closing entity brackets, single-node entity must follow all opening entity brackets in '{entity[0]}'." - ).report(state, self.args) + ).report() if seen0 and seen2: Incident( - state=state, + state=state, args=self.args, testid='spurious-entity-statement', message=f"Single-node entity must either precede all closing entity brackets or follow all opening entity brackets in '{entity[0]}'." - ).report(state, self.args) + ).report() seen0 = True seen2 = False opening_bracket() elif b==2: if seen1 and not seen0: Incident( - state=state, + state=state, args=self.args, testid='spurious-entity-statement', message=f"If there are no opening entity brackets, single-node entity must precede all closing entity brackets in '{entity[0]}'." - ).report(state, self.args) + ).report() seen2 = True opening_bracket() closing_bracket() else: # b==1 if seen0: Incident( - state=state, + state=state, args=self.args, testid='spurious-entity-statement', message=f"All closing entity brackets must precede all opening entity brackets in '{entity[0]}'." - ).report(state, self.args) + ).report() seen1 = True closing_bracket() # Now we are done with checking the 'Entity=' statement. @@ -3989,10 +4090,10 @@ def closing_bracket(): match = re.match(r"^Bridge=([^(< :>)]+<[^(< :>)]+(:[a-z]+)?(,[^(< :>)]+<[^(< :>)]+(:[a-z]+)?)*)$", bridge[0]) if not match: Incident( - state=state, + state=state, args=self.args, testid='spurious-bridge-statement', message=f"Cannot parse the Bridge statement '{bridge[0]}'." 
- ).report(state, self.args) + ).report() else: bridges = match.group(1).split(',') # Hash src 0: match = re.match(r"^SplitAnte=([^(< :>)]+<[^(< :>)]+(,[^(< :>)]+<[^(< :>)]+)*)$", splitante[0]) if not match: Incident( - state=state, + state=state, args=self.args, testid='spurious-splitante-statement', message=f"Cannot parse the SplitAnte statement '{splitante[0]}'." - ).report(state, self.args) + ).report() else: antecedents = match.group(1).split(',') # Hash src0: Incident( - state=state, + state=state, args=self.args, testid='cross-sentence-mention', message=f"Entity mentions must not cross sentence boundaries; still open at sentence end: {str(state.open_entity_mentions)}." - ).report(state, self.args) + ).report() # Close the mentions forcibly. Otherwise one omitted closing bracket would cause the error messages to explode because the words would be collected from the remainder of the file. state.open_entity_mentions = [] if len(state.open_discontinuous_mentions)>0: Incident( - state=state, + state=state, args=self.args, testid='cross-sentence-mention', message=f"Entity mentions must not cross sentence boundaries; still open at sentence end: {str(state.open_discontinuous_mentions)}." - ).report(state, self.args) + ).report() # Close the mentions forcibly. Otherwise one omission would cause the error messages to explode because the words would be collected from the remainder of the file. state.open_discontinuous_mentions = {} # Since we only test mentions within one sentence at present, we do not have to carry all mention spans until the end of the corpus. @@ -4170,7 +4271,7 @@ def validate_file(self, state, inp): if is_word(cols) or is_empty_node(cols): self.validate_character_constraints(state, cols, line) # level 2 self.validate_upos(state, cols, line) # level 2 - colssafe = colssafe and self.validate_features_level2(state, cols, line) # level 2 (level 4 tests will be called later) + colssafe = self.validate_features_level2(state, cols, line) and colssafe # level 2 (level 4 tests will be called later) self.validate_deps(state, cols, line) # level 2; must operate on pre-Udapi DEPS (to see order of relations) self.validate_misc(state, cols, line) # level 2; must operate on pre-Udapi MISC if not colssafe: @@ -4180,6 +4281,7 @@ def validate_file(self, state, inp): # structure for us. tree = self.build_tree_udapi(all_lines) self.validate_sent_id(state, comments, self.args.lang) # level 2 + self.validate_parallel_id(state, comments) # level 2 self.validate_text_meta(state, comments, sentence) # level 2 # Test that enhanced graphs exist either for all sentences or for # none. As a side effect, get line numbers for all nodes including @@ -4221,12 +4323,12 @@ def validate_end(self, state): # the DEPS annotation was not a mere copy of the basic trees. if self.args.level>2 and state.seen_enhanced_graph and not state.seen_enhancement: Incident( - state=state, + state=state, args=self.args, level=3, testclass='Enhanced', testid='edeps-identical-to-basic-trees', message="Enhanced graphs are copies of basic trees in the entire dataset. This can happen for some simple sentences where there is nothing to enhance, but not for all sentences.
If none of the enhancements from the guidelines (https://universaldependencies.org/u/overview/enhanced-syntax.html) are annotated, the DEPS should be left unspecified." - ).report(state, self.args) + ).report() def validate_files(self, filenames): @@ -4246,12 +4348,12 @@ def validate_files(self, filenames): self.validate_end(state) except: Incident( - state=state, + state=state, args=self.args, level=0, testclass='Internal', testid='exception', message="Exception caught!" - ).report(state, self.args) + ).report() # If the output is used in an HTML page, it must be properly escaped # because the traceback can contain e.g. "<module>". However, escaping # is beyond the goal of validation, which can also be run in a console. @@ -4293,6 +4395,10 @@ def build_argparse(): action="store", type=int, default=20, help="""How many errors to output before exiting? 0 for all. Default: %(default)d.""") + io_group.add_argument('--max-store', + action="store", type=int, default=20, + help="""How many errors to save when collecting errors. 0 for all. + Default: %(default)d.""") io_group.add_argument('input', nargs='*', help="""Input file name(s), or "-" or nothing for standard input.""") @@ -4326,9 +4432,9 @@ def build_argparse(): help='Test coreference and entity-related annotation in MISC.') return opt_parser -def parse_args(): +def parse_args(args=None): opt_parser = build_argparse() - args = opt_parser.parse_args() #Parsed command-line arguments + args = opt_parser.parse_args(args=args) #Parsed command-line arguments # Level of validation if args.level < 1: @@ -4346,8 +4452,8 @@ def parse_args(): return args def main(): - args = parse_args() - validator = Validator(args) + validator = Validator() + args = validator.args state = validator.validate_files(args.input) # Summarize the warnings and errors.
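The hunks above all apply one mechanical refactoring: `state` and `args` move into the `Incident` constructor, so every call site shrinks from `.report(state, self.args)` to `.report()`. A minimal sketch of the resulting pattern (simplified, not the real class; attribute names such as `lineno` and `sent_id` are illustrative, while the defaults mirror those listed for `Incident` in the notes further below):

```python
# Sketch only: simplified stand-ins for the real State and args objects.
class Incident:
    def __init__(self, state, args, level=1, testclass='Format',
                 testid='generic-error',
                 message='No error description provided.'):
        self.state = state        # current position in the input file
        self.args = args          # parsed command-line options
        self.level = level
        self.testclass = testclass
        self.testid = testid
        self.message = message

    def report(self):
        # state and args were captured at construction time, so callers
        # no longer have to thread them through every report() call.
        print(f"[Line {self.state.lineno} Sent {self.state.sent_id}]: "
              f"[L{self.level} {self.testclass} {self.testid}] {self.message}")
```

Call sites then read `Incident(state=state, args=self.args, testid='...', message='...').report()`, exactly as in the diff.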
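The `parse_args(args=None)` change keeps the command-line behaviour intact (argparse falls back to `sys.argv` when `args` is `None`) while making the parser callable from tests or other code. A hedged usage sketch; the file name and import path are illustrative:

```python
# Hypothetical programmatic invocation of the refactored argument parser.
from validate import parse_args  # import path is illustrative

args = parse_args(['--lang', 'en', '--level', '2', 'sample.conllu'])
assert args.level == 2
assert args.input == ['sample.conllu']
```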
@@ -4376,4 +4482,3 @@ def main(): if __name__=="__main__": errcode = main() sys.exit(errcode) - diff --git a/test-cases/valid/empty-file.conllu b/validator/README.md similarity index 100% rename from test-cases/valid/empty-file.conllu rename to validator/README.md diff --git a/validator/docs/checks_table.md b/validator/docs/checks_table.md new file mode 100644 index 000000000..30a7833e7 --- /dev/null +++ b/validator/docs/checks_table.md @@ -0,0 +1,32 @@ +| check_xxx | acts on | Purpose | Errors | Warnings | requires | depends_on | Test | Old function | TODO | +| --------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------- | ------------------------------- | ---------------------------------------------------------------------------------------------------------------------------- | ---------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| invalid_lines | line | checks for lines that are not empty, not comments and not tokens | invalid-line | | | | invalid-lines | next_sentence | - | +| columns_format | line | checks that token lines have the right number of columns and do not contain spurious whitespaces | number-of-columns, empty-column, leading-whitespace, trailing-whitespace, repeated-whitespace, invalid-whitespace-mwt, invalid-whitespace | | | invalid-lines | columns-format | next_sentence, validate_whitespace | - | +| misplaced_comment | block of lines | checks that comment lines always precede token lines | misplaced-comment | | | | misplaced-comment | next_sentence | - | +| extra_empty_line | block of lines | checks that only one empty line follows each sentence | extra-empty-line | | | | extra-empty-line | next_sentence | - | +| pseudo_empty_line | line | checks if a line that appears empty contains whitespaces | pseudo-empty-line | | | | pseudo-empty-line | next_sentence | - | +| unicode_normalization | line | checks that text is NFC | unicode-normalization | | | | unicode-normalization | validate_unicode_normalization | fix explanation_second | +| id_sequence | block of columns | Validates that the ID sequence is correctly formed. | invalid-word-id, invalid-word-interval?, misplaced-word-interval, misplaced-empty-node, word-id-sequence, reversed-word-interval, word-interval-out | | | | invalid-word-id, misplaced-empty-node, misplaced-empty-node-2, misplaced-word-interval, word-id-sequence, word-id-sequence-2 | validate_id_sequence | If this function returns a nonempty list, subsequent tests should not be run. | +| token_ranges | block of columns | Checks that the word ranges for multiword tokens are valid. | invalid-word-interval, overlapping-word-intervals | | | | invalid-word-interval, overlapping-word-interval | validate_token_ranges | - | +| newlines | file | Checks that the input file consistently uses linux-style newlines (LF only, not CR LF like in Windows).
| non-unix-newline | | | | non-unix-newlines | validate_newlines | - | +| sent_id | blocks of comments | Checks that sentence id exists, is well-formed and unique. | invalid-sent-id, missing-sent-id, multiple-sent-id, non-unique-sent-id, slash-in-sent-id | | allow_slash:bool, known_sent_ids:set | | multiple-sent-id | validate_sent_id | - | +| parallel_id | blocks of comments | Checks that the parallel id is well-formed and unique. | invalid-parallel-id, multiple-parallel-id, non-unique-parallel-id, parallel-id-alt, parallel-id-part | | allow_slash:bool, known_sent_ids:set, parallel_id_lastalt:?, parallel_id_lastpart:? | | TODO | validate_parallel_id | add correct types, add parentheses, define tests | +| text_meta | block of comments | Checks metadata other than sentence id | multiple-newdoc, multiple-newpar, spaceafter-newdocpar, missing-text, multiple-text, text-trailing-whitespace, nospaceafter-yes, spaceafter-value, spaceafter-empty-node, spaceafter-mwt-node, text-form-mismatch, missing-spaceafter, text-extra-chars | | spaceafterno_in_effect:bool | | | validate_text_meta | it would probably be easier to refactor this function into multiple checks + it might not work with our current next_block() | +| mwt_empty_vals | columns | Checks that a multi-word token has _ empty values in all fields except MISC | mwt-nonempty-field | | columns represent multiword token | | mwt-nonempty-field | validate_mwt_empty_vals | If a multi-word token has Typo=Yes, its component words must not have it. We must remember the span of the MWT and check it in validate_features_level4() | +| empty_node_empty_vals | columns | Checks that an empty node has _ empty values in HEAD and DEPREL. | mwt-nonempty-field | | columns represent empty token | | mwt-nonempty-2 | validate_empty_node_empty_vals | possible copy-paste error in testid | +| character_constraints | columns | Checks general constraints on valid characters | invalid-deprel, invalid-deps, invalid-edeprel | | | | | validate_character_constraints | rename into check_deps_deprel_constraints | +| upos | columns | Checks that the UPOS field contains one of the 17 known tags. | unknown-upos | | | | | validate_upos | | +| features_level2 | columns | Checks general constraints on feature-value format | unsorted-features, invalid-feature, repeated-feature-value, unsorted-feature-values, invalid-feature-value, repeated-feature | | | | | validate_features_level2 | the engine has to know that 'invalid-feature' is a testid that prevents further testing, the original function also called features_present() | +| deps | columns | Checks that DEPS is correctly formatted and that there are no self-loops in DEPS | unsorted-deps, unsorted-deps-2, deps-self-loop | | | | | validate_deps | the engine must assume that it is run after check_id_references() and only if DEPS is parsable and the head indices in it are OK. + removed check that there is at least one difference with deprels | +| misc | columns | Checks raw MISC to allow Udapi parsing | repeated-misc, misc-extra-space | empty-misc, empty-misc-key, misc-attr-typo | seen_enhanced_graph:bool | | | validate_misc | | +| deps_all_or_none | block of tokens | Checks that enhanced dependencies are present if they were present at another sentence, and absent if they were absent at another sentence. | edeps-only-sometimes | | | | | validate_deps_all_or_none | update state | +| id_references | blocks of tokens | Verifies that HEAD and DEPS reference existing IDs.
| invalid-head, unknown-head, invalid-deps, invalid-ehead, unknown-ehead | | | | | validate_id_references | If this function returns a nonempty list, most of the other tests should be skipped for the current sentence (in particular anything that considers the tree structure). | +| tree | blocks of tokens | Performs basic validation of the tree structure (without Udapi). | head-self-loop, multiple-roots, non-tree | | node_line:int, single_root:bool | | | validate_tree | should be called only if both ID and HEAD values have been found valid for all tree nodes, including the sequence of IDs and the references from HEAD to existing IDs. | +| deprels_level2 | node | Checks that a dependency relation (main) label is listed as approved in the given language. | unknown-deprel, unknown-edeprel | | deprels:Dict, lang:str | | | validate_deprels | | +| deprels_level4 | node | Checks that a dependency relation label is listed as approved in the given language. | unknown-deprel, unknown-edeprel | | deprels:Dict, lang:str | unknown-deprel, unknown-edeprel | | validate_deprels | | +| root | node | Checks that DEPREL is "root" iff HEAD is 0. | 0-is-not-root, root-is-not-0, enhanced-0-is-not-root, enhanced-root-is-not-0 | | | | | validate_root | | +| enhanced_orphan | node | Checks universally valid consequences of the annotation guidelines in the enhanced representation. | empty-node-after-eorphan, eorphan-after-empty-node | | seen_empty_node:bool, seen_enhanced_orphan:bool | | | validate_enhanced_orphan | | +| words_with_spaces | node | Checks a single line for disallowed whitespace. | invalid-word-with-space | | lang:str, specs:UDSpecs | | | validate_words_with_spaces | we assume that all language-independent whitespace-related tests have already been done on level 1, so we only check for words with spaces that are explicitly allowed in a given language. | +| features_level4 | node | Checks that a feature-value pair is listed as approved. | mwt-typo-repeated-at-word, feature-unknown, feature-not-permitted, feature-value-unknown, feature-upos-not-permitted, feature-value-upos-not-permitted | | lang:str, specs:UDSpecs, mwt_typo_span_end:bool? | | | validate_features_level4 | deal with mwt_span_end | +| auxiliary_verbs | node | Checks that the UPOS tag AUX is used only with lemmas that are known to act as auxiliary verbs or particles in the given language. | aux-lemma | | lang:str, specs:UDSpecs | | | validate_auxiliary_verbs | | +| copula_lemmas | node | Checks that the relation cop is used only with lemmas that are known to act as copulas in the given language.
| cop-lemma | | lang:str, specs:UDSpecs | | | validate_copula_lemmas | | \ No newline at end of file diff --git a/validator/docs/example_config.yaml b/validator/docs/example_config.yaml new file mode 100644 index 000000000..47afe21cf --- /dev/null +++ b/validator/docs/example_config.yaml @@ -0,0 +1,48 @@ +file: + check_newlines: + level: 1 + +block: + check_extra_empty_line: + level: 1 + check_misplaced_comment: + level: 1 + check_invalid_lines: + level: 1 + +line: + check_unicode_normalization: + level: 1 + check_pseudo_empty_line: + level: 1 + +token_lines: + check_columns_format: + level: 1 + +comment_lines: + +cols: + check_id_sequence: + level: 1 + check_token_ranges: + level: 1 + check_tree: + level: 2 + depends_on: + - invalid-word-id + - invalid-word-interval + - misplaced-word-interval + - misplaced-empty-node + - word-id-sequence + - reversed-word-interval + - word-interval-out + - invalid-head + - unknown-head + - invalid-deps + - invalid-ehead + - unknown-ehead + +tree: + +node: \ No newline at end of file diff --git a/validator/docs/example_output.json b/validator/docs/example_output.json new file mode 100644 index 000000000..7b84e856d --- /dev/null +++ b/validator/docs/example_output.json @@ -0,0 +1,36 @@ +{ + "filename_1.conllu": [ + { + "level": 1, + "testclass": "FORMAT", + "testid": "trailing-whitespace", + "message": "short error description", + "sent_id": "3456", + "line_no": 1245, + "line": "CONLL-U LINE CONTENT" + }, + { + "level": 3, + "testclass": "MORPHO", + "testid": "unknown-feature", + "message": "short error description", + "sent_id": "234", + "line_no": 500 + }, + { + ... + } + + ], + "filename_2.conllu": [ + { + ... + }, + { + ... + }, + { + ... + } + ] +} \ No newline at end of file diff --git a/validator/docs/example_working.yaml b/validator/docs/example_working.yaml new file mode 100644 index 000000000..5eb8cb21e --- /dev/null +++ b/validator/docs/example_working.yaml @@ -0,0 +1,29 @@ +file: + check_newlines: + level: 1 + +block: + check_extra_empty_line: + level: 1 + check_misplaced_comment: + level: 1 + check_invalid_lines: + level: 1 + +line: + check_unicode_normalization: + level: 1 + check_pseudo_empty_line: + level: 1 + +token_lines: + check_columns_format: + level: 1 + +comment_lines: + +cols: + +tree: + +node: \ No newline at end of file diff --git a/validator/docs/explanations.txt b/validator/docs/explanations.txt new file mode 100644 index 000000000..454741ab8 --- /dev/null +++ b/validator/docs/explanations.txt @@ -0,0 +1,2 @@ +explain_deprel > unknown-deprel +explain_edeprel > unknown-edeprel \ No newline at end of file diff --git a/validator/docs/extra.py b/validator/docs/extra.py new file mode 100644 index 000000000..d5970587a --- /dev/null +++ b/validator/docs/extra.py @@ -0,0 +1,153 @@ + +# def load_set(f_name_ud, lang, validate_langspec=False, validate_enhanced=False): +# """ +# Loads a list of values from the two files, and returns their +# set. If lang doesn't exist, loads nothing and returns +# None (ie this taglist is not checked for the given language). If lang +# is None, only loads the UD one. This is probably only useful for CPOS which doesn't +# allow language-specific extensions. Set validate_langspec=True when loading basic dependencies. +# That way the language specific deps will be checked to be truly extensions of UD ones. +# Set validate_enhanced=True when loading enhanced dependencies. 
They will be checked to be +# truly extensions of universal relations, too; but a more relaxed regular expression will +# be checked because enhanced relations may contain stuff that is forbidden in the basic ones. +# """ +# res = load_file(os.path.join(g.THISDIR, 'data', f_name_ud)) +# # Now res holds UD +# # Next load and optionally check the langspec extensions +# if lang is not None and lang != f_name_ud: +# l_spec = load_file(os.path.join(g.THISDIR,"data","tokens_w_space.json"), lang) +# for v in l_spec: +# if validate_enhanced: +# # We are reading the list of language-specific dependency relations in the enhanced representation +# # (i.e., the DEPS column, not DEPREL). Make sure that they match the regular expression that +# # restricts enhanced dependencies. +# if not g.edeprel_re.match(v): +# testlevel = 4 +# testclass = 'Enhanced' +# testid = 'edeprel-def-regex' +# testmessage = f"Spurious language-specific enhanced relation '{v}' - it does not match the regular expression that restricts enhanced relations." +# warn(testmessage, testclass, testlevel, testid, lineno=-1) +# continue +# elif validate_langspec: +# # We are reading the list of language-specific dependency relations in the basic representation +# # (i.e., the DEPREL column, not DEPS). Make sure that they match the regular expression that +# # restricts basic dependencies. (In particular, that they do not contain extensions allowed in +# # enhanced dependencies, which should be listed in a separate file.) +# if not re.match(r"^[a-z]+(:[a-z]+)?$", v): +# testlevel = 4 +# testclass = 'Syntax' +# testid = 'deprel-def-regex' +# testmessage = f"Spurious language-specific relation '{v}' - in basic UD, it must match '^[a-z]+(:[a-z]+)?'." +# warn(testmessage, testclass, testlevel, testid, lineno=-1) +# continue +# if validate_langspec or validate_enhanced: +# try: +# parts=v.split(':') +# if parts[0] not in res and parts[0] != 'ref': +# testlevel = 4 +# testclass = 'Syntax' +# testid = 'deprel-def-universal-part' +# testmessage = f"Spurious language-specific relation '{v}' - not an extension of any UD relation." +# warn(testmessage, testclass, testlevel, testid, lineno=-1) +# continue +# except: +# testlevel = 4 +# testclass = 'Syntax' +# testid = 'deprel-def-universal-part' +# testmessage = f"Spurious language-specific relation '{v}' - not an extension of any UD relation." +# warn(testmessage, testclass, testlevel, testid, lineno=-1) +# continue +# res.add(v) +# return res + + + +# def load_feat_set(filename_langspec, lcode): +# """ +# Loads the list of permitted feature-value pairs and returns it as a set. +# """ +# with open(os.path.join(g.THISDIR, 'data', filename_langspec), 'r', encoding='utf-8') as f: +# all_features_0 = json.load(f) +# g.featdata = all_features_0['features'] +# featset = get_featdata_for_language(lcode) +# # Prepare a global message about permitted features and values. We will add +# # it to the first error message about an unknown feature. Note that this +# # global information pertains to the default validation language and it +# # should not be used with code-switched segments in alternative languages. 
+# msg = '' +# if not lcode in g.featdata: +# msg += f"No feature-value pairs have been permitted for language [{lcode}].\n" +# msg += "They can be permitted at the address below (if the language has an ISO code and is registered with UD):\n" +# msg += "https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_feature.pl\n" +# g.warn_on_undoc_feats = msg +# else: +# # Identify feature values that are permitted in the current language. +# for f in featset: +# for e in featset[f]['errors']: +# msg += f"ERROR in _{lcode}/feat/{f}.md: {e}\n" +# res = set() +# for f in featset: +# if featset[f]['permitted'] > 0: +# for v in featset[f]['uvalues']: +# res.add(f+'='+v) +# for v in featset[f]['lvalues']: +# res.add(f+'='+v) +# sorted_documented_features = sorted(res) +# msg += f"The following {len(sorted_documented_features)} feature values are currently permitted in language [{lcode}]:\n" +# msg += ', '.join(sorted_documented_features) + "\n" +# msg += "If a language needs a feature that is not documented in the universal guidelines, the feature must\n" +# msg += "have a language-specific documentation page in a prescribed format.\n" +# msg += "See https://universaldependencies.org/contributing_language_specific.html for further guidelines.\n" +# msg += "All features including universal must be specifically turned on for each language in which they are used.\n" +# msg += "See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_feature.pl for details.\n" +# warn_on_undoc_feats = msg +# return featset + +# def get_featdata_for_language(lcode): +# """ +# Searches the previously loaded database of feature-value combinations. +# Returns the lists for a given language code. For most CoNLL-U files, +# this function is called only once at the beginning. However, some files +# contain code-switched data and we may temporarily need to validate +# another language. +# """ +# ###!!! If lcode is 'ud', we should permit all universal feature-value pairs, +# ###!!! regardless of language-specific documentation. +# # Do not crash if the user asks for an unknown language. +# if not lcode in g.featdata: +# return {} ###!!! or None? +# return g.featdata[lcode] + +# def get_auxdata_for_language(lcode): +# """ +# Searches the previously loaded database of auxiliary/copula lemmas. Returns +# the AUX and COP lists for a given language code. For most CoNLL-U files, +# this function is called only once at the beginning. However, some files +# contain code-switched data and we may temporarily need to validate +# another language. +# """ +# auxdata = g.auxdata +# # If any of the functions of the lemma is other than cop.PRON, it counts as an auxiliary. +# # If any of the functions of the lemma is cop.*, it counts as a copula. 
+# auxlist = [] +# coplist = [] +# if lcode == 'shopen': +# for lcode1 in auxdata.keys(): +# lemmalist = auxdata[lcode1].keys() +# auxlist = auxlist + [x for x in lemmalist +# if len([y for y in auxdata[lcode1][x]['functions'] +# if y['function'] != 'cop.PRON']) > 0] +# coplist = coplist + [x for x in lemmalist +# if len([y for y in auxdata[lcode1][x]['functions'] +# if re.match(r"^cop\.", y['function'])]) > 0] +# else: +# lemmalist = auxdata.get(lcode, {}).keys() +# auxlist = [x for x in lemmalist +# if len([y for y in auxdata[lcode][x]['functions'] +# if y['function'] != 'cop.PRON']) > 0] +# coplist = [x for x in lemmalist +# if len([y for y in auxdata[lcode][x]['functions'] +# if re.match(r"^cop\.", y['function'])]) > 0] +# return auxlist, coplist + + diff --git a/validator/docs/full-validation.yaml b/validator/docs/full-validation.yaml new file mode 100644 index 000000000..517723c63 --- /dev/null +++ b/validator/docs/full-validation.yaml @@ -0,0 +1,46 @@ +file: + check_newlines: + level: 1 + +block: + check_extra_empty_line: + level: 1 + check_misplaced_comment: + level: 1 + +line: + check_invalid_lines: + level: 1 + check_columns_format: + level: 1 + depends_on: + - invalid-line + check_pseudo_empty_line: + level: 1 + check_unicode_normalization: + level: 1 + + +token_lines: + +comment_lines: + check_sent_id: + level: 2 + +cols: + check_mwt_empty_vals: + level: 2 + check_empty_node_empty_vals: + level: 2 + +tokens_cols: + check_id_sequence: + level: 1 + check_token_ranges: + level: 1 + depends_on: + - invalid-word-id + +tree: + +node: \ No newline at end of file diff --git a/validator/docs/invalid-line.yaml b/validator/docs/invalid-line.yaml new file mode 100644 index 000000000..2401a628c --- /dev/null +++ b/validator/docs/invalid-line.yaml @@ -0,0 +1,24 @@ +file: + +block: + check_misplaced_comment: + level: 1 + +line: + check_invalid_lines: + level: 1 + check_columns_format: + level: 1 + depends_on: + - 'invalid-line' + +token_lines: + +comment_lines: + +cols: + + +tree: + +node: \ No newline at end of file diff --git a/validator/docs/long_term_todo.md b/validator/docs/long_term_todo.md new file mode 100644 index 000000000..e261c17c8 --- /dev/null +++ b/validator/docs/long_term_todo.md @@ -0,0 +1,3 @@ +- organize test cases so that they match `testid`s + - add as a comment which tests should fail as a metadata +- try to have a 1:1 mapping between test functions (`validate_xxx`) and `testid`s/incidents, or at least modularize test functions further (e.g. `validate_tree` is conceptually composed of 3 tests) \ No newline at end of file diff --git a/validator/docs/notes.txt b/validator/docs/notes.txt new file mode 100644 index 000000000..30c559f7a --- /dev/null +++ b/validator/docs/notes.txt @@ -0,0 +1,731 @@ +CLASSES + builtins.object + Incident + State + Validator + | TODO: find out where to put these functions, if needed + | get_aux_for_language(self, lcode) + | An entry point for get_auxcop_for_language() that returns only the aux + | list. It either takes the cached list (if available), or calls + | get_auxcop_for_language(). + | + | get_auxcop_for_language(self, lcode) + | Searches the previously loaded database of auxiliary/copula lemmas. + | Returns the AUX and COP lists for a given language code. Also saves + | the result in self so that next time it can be fetched quickly (once + | we loaded the data, we do not expect them to change). + | + | get_cop_for_language(self, lcode) + | An entry point for get_auxcop_for_language() that returns only the cop + | list. 
It either takes the cached list (if available), or calls + | get_auxcop_for_language(). + | + | get_deprel_for_language(self, lcode) + | Searches the previously loaded database of dependency relation labels. + | Returns the set of permitted deprels for a given language code. Also + | saves the result in self so that next time it can be fetched quickly + | (once we loaded the data, we do not expect them to change). + | + | get_edeprel_for_language(self, lcode) + | Searches the previously loaded database of enhanced case markers. + | Returns the set of permitted edeprels for a given language code. Also + | saves the result in self so that next time it can be fetched quickly + | (once we loaded the data, we do not expect them to change). + | + | get_feats_for_language(self, lcode) + | Searches the previously loaded database of feature-value-UPOS combinations. + | Returns the data for a given language code, organized in dictionaries. + | Returns an empty dict if there are no data for the given language code. + | + | get_tospace_for_language(self, lcode) + | Searches the previously loaded database of regular expressions describing + | permitted tokens with spaces. Returns the expressions for a given language code. + + + WHAT: class for object describing specific errors + class Incident(builtins.object) + | Incident( + | state, + | level=None, + | testclass=None, + | testid=None, + | message=None, + | lineno=None, + | nodeid=None, + | explanation='' + | ) + | + | Instances of this class describe individual errors or warnings in the input + | file. + | + | Methods defined here: + | + | __init__( + | self, + | state, + | level=None, + | testclass=None, + | testid=None, + | message=None, + | lineno=None, + | nodeid=None, + | explanation='' + | ) + | Initialize self. See help(type(self)) for accurate signature. + | + | report(self, state, args) + | + | Data and other attributes defined here: + | + | TODO: understand why these are not the default params for the class + | default_level = 1 + | + | default_lineno = None + | + | default_message = 'No error description provided.' + | + | default_testclass = 'Format' + | + | default_testid = 'generic-error' + + TODO: transform into data class + class State(builtins.object) + | The State class holds various global data about where we are in the file + | and what we have seen so far. Typically there will be just one instance of + | this class. + | + | Methods defined here: + | + | __init__(self) + | Initialize self. See help(type(self)) for accurate signature. + | + + class Validator(builtins.object) + | Validator(args) + | + | Methods defined here: + | + | WHAT: init CoNLL-U reader + | __init__(self, args) + | Initialize self. See help(type(self)) for accurate signature. + | + | build_tree_udapi(self, lines) + | + | TODO: mv to module for tree ops? + | get_caused_nonprojectivities(self, node) + | Checks whether a node is in a gap of a nonprojective edge. Report true only + | if the node's parent is not in the same gap. (We use this function to check + | that a punctuation node does not cause nonprojectivity. But if it has been + | dragged to the gap with a larger subtree, then we do not blame it.) This + | extra condition makes this function different from node.is_nonprojective_gap(); + | another difference is that instead of just detecting the nonprojectivity, + | we return the nonprojective nodes so we can report them. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. 
+ | + | Returns + | ------- + | cross : list of udapi.core.node.Node objects + | The nodes whose attachment is nonprojective because of the current node. + | + | next_sentence(self, state, inp) + | `inp` a file-like object yielding lines as unicode + | + | This function does elementary checking of the input and yields one + | sentence at a time from the input stream. The function guarantees + | elementary integrity of its yields. Some lines may be skipped (e.g., + | extra empty lines or misplaced comments), and a whole sentence will be + | skipped if one of its token lines has unexpected number of columns. + | + | However, some low-level errors currently do not lead to excluding the + | sentence from being yielded and put to subsequent tests. Specifically, + | character constraints on individual fields are tested here but errors + | are not considered fatal. + | + | This function is a generator. The caller can call it in a 'for x in ...' + | loop. In each iteration of the caller's loop, the generator will generate + | the next sentence, that is, it will read the next sentence from the input + | stream. (Technically, the function returns an object, and the object will + | then read the sentences within the caller's loop.) + | + | WHAT: entry points for validation + | validate_files(self, filenames) + | + | validate_file(self, state, inp) + | The main entry point for all validation tests applied to one input file. + | It reads sentences from the input stream one by one, each sentence is + | immediately tested. + | + | Parameters + | ---------- + | inp : open file handle + | The CoNLL-U-formatted input stream. + | | + | validate_tree(self, state, sentence) + | Takes the list of non-comment lines (line = list of columns) describing + | a sentence. Returns an array with line number corresponding to each tree + | node. In case of fatal problems (missing HEAD etc.) returns None + | (and reports the error, unless it is something that should have been + | reported earlier). + | + | We will assume that this function is called only if both ID and HEAD values + | have been found valid for all tree nodes, including the sequence of IDs + | and the references from HEAD to existing IDs. + | + | This function originally served to build a data structure that would + | describe the tree and make it accessible during subsequent tests. Now we + | use the Udapi data structures instead but we still have to call this + | function first because it will survive and report ill-formed input. In + | such a case, the Udapi data structure will not be built and Udapi-based + | tests will be skipped. + | + | Parameters + | ---------- + | sentence : list + | Lines (arrays of columns): words, mwt tokens, empty nodes. + | + | Returns + | ------- + | ok : bool + | + | ACTUAL VALIDATION FUNCTIONS + | TODO: add returns + | + | validate_annotation(self, state, tree, linenos) + | Checks universally valid consequences of the annotation guidelines. Looks + | at regular nodes and basic tree, not at enhanced graph (which is checked + | elsewhere). + | + | Parameters + | ---------- + | tree : udapi.core.root.Root object + | linenos : dict + | Key is node ID (string, not int or float!) Value is the 1-based index + | of the line where the node occurs (int). + | + | validate_auxiliary_verbs(self, state, node, line, lang) + | Verifies that the UPOS tag AUX is used only with lemmas that are known to + | act as auxiliary verbs or particles in the given language. 
+ | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The node to be validated. + | line : int + | Number of the line where the node occurs in the file. + | lang : str + | Code of the main language of the corpus. + | + | validate_character_constraints(self, state, cols, line) + | Checks general constraints on valid characters, e.g. that UPOS + | only contains [A-Z]. + | + | Parameters + | ---------- + | cols : list + | The values of the columns on the current node / token line. + | line : int + | Number of the line where the node occurs in the file. + | + | validate_copula_lemmas(self, state, node, line, lang) + | Verifies that the relation cop is used only with lemmas that are known to + | act as copulas in the given language. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The node to be validated. + | line : int + | Number of the line where the node occurs in the file. + | lang : str + | Code of the main language of the corpus. + | + | validate_deprels(self, state, node, line) + | Checks that a dependency relation label is listed as approved in the given + | language. As a language-specific test, this function generally belongs to + | level 4, but it can be also used on levels 2 and 3, in which case it will + | check only the main dependency type and ignore any subtypes. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The node whose incoming relation will be validated. + | line : int + | Number of the line where the node occurs in the file. + | + | validate_deps(self, state, cols, line) + | Validates that DEPS is correctly formatted and that there are no + | self-loops in DEPS (longer cycles are allowed in enhanced graphs but + | self-loops are not). + | + | This function must be run on raw DEPS before it is fed into Udapi because + | it checks the order of relations, which is not guaranteed to be preserved + | in Udapi. On the other hand, we assume that it is run after + | validate_id_references() and only if DEPS is parsable and the head indices + | in it are OK. + | + | Parameters + | ---------- + | cols : list + | The values of the columns on the current node / token line. + | line : int + | Number of the line where the node occurs in the file. + | + | validate_deps_all_or_none(self, state, sentence) + | Takes the list of non-comment lines (line = list of columns) describing + | a sentence. Checks that enhanced dependencies are present if they were + | present at another sentence, and absent if they were absent at another + | sentence. + | + | validate_egraph_connected(self, state, nodes, linenos) + | Takes the list of nodes (including empty nodes). If there are enhanced + | dependencies in DEPS, builds the enhanced graph and checks that it is + | rooted and connected. + | + | Parameters + | ---------- + | nodes : list of udapi.core.node.Node objects + | List of nodes in the sentence, including empty nodes, sorted by word + | order. + | linenos : dict + | Indexed by node ID (string), contains the line number on which the node + | occurs. + | + | validate_empty_node_empty_vals(self, state, cols, line) + | Checks that an empty node has _ empty values in HEAD and DEPREL. This is + | required by UD guidelines but not necessarily by CoNLL-U, therefore + | a level 2 test. + | + | Parameters + | ---------- + | cols : list + | The values of the columns on the current node / token line. + | line : int + | Number of the line where the node occurs in the file. 
+ | + | validate_end(self, state) + | Final tests after processing the entire treebank (possibly multiple files). + | + | validate_enhanced_orphan(self, state, node, line) + | Checks universally valid consequences of the annotation guidelines in the + | enhanced representation. Currently tests only phenomena specific to the + | enhanced dependencies; however, we should also test things that are + | required in the basic dependencies (such as left-to-right coordination), + | unless it is obvious that in enhanced dependencies such things are legal. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The node whose incoming relations will be validated. This function + | operates on both regular and empty nodes. Make sure to call it for + | empty nodes, too! + | line : int + | Number of the line where the node occurs in the file. + | + | validate_expected_features(self, state, node, lineno) + | Certain features are expected to occur with certain UPOS or certain values + | of other features. This function issues warnings instead of errors, as + | features are in general optional and language-specific. Even the warnings + | are issued only if the treebank has features. Note that the expectations + | tested here are considered (more or less) universal. Checking that a given + | feature-value pair is compatible with a particular UPOS is done using + | language-specific lists at level 4. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs. + | + | validate_features_level2(self, state, cols, line) + | Checks general constraints on feature-value format: Permitted characters in + | feature name and value, features must be sorted alphabetically, features + | cannot be repeated etc. + | + | Parameters + | ---------- + | cols : list + | The values of the columns on the current node / token line. + | line : int + | Number of the line where the node occurs in the file. + | + | Returns + | ------- + | safe : bool + | There were no errors or the errors are not so severe that we should + | refrain from loading the sentence into Udapi. + | + | validate_features_level4(self, state, node, line, lang) + | Checks that a feature-value pair is listed as approved. Feature lists are + | language-specific. To disallow non-universal features, test on level 4 with + | language 'ud'. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The node to be validated. + | line : int + | Number of the line where the node occurs in the file. + | lang : str + | Code of the main language of the corpus. + | + | validate_fixed_span(self, state, node, lineno) + | Like with goeswith, the fixed relation should not in general skip words that + | are not part of the fixed expression. Unlike goeswith however, there can be + | an intervening punctuation symbol. Moreover, the rule that fixed expressions + | cannot be discontiguous has been challenged with examples from Swedish and + | Coptic, see https://github.com/UniversalDependencies/docs/issues/623. + | Hence, the test was turned off 2019-04-13. I am re-activating it 2023-09-03 + | as just a warning. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs. + | + | validate_flat_foreign(self, state, node, lineno, linenos) + | flat:foreign is an optional subtype of flat. 
It is used to connect two words + | in a code-switched segment of foreign words if the annotators did not want + | to provide the analysis according to the source language. If flat:foreign + | is used, both the parent and the child should have the Foreign=Yes feature + | and their UPOS tag should be X. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs. + | linenos : dict + | Key is node ID (string, not int or float!) Value is the 1-based index + | of the line where the node occurs (int). + | + | validate_functional_leaves(self, state, node, lineno, linenos) + | Most of the time, function-word nodes should be leaves. This function + | checks for known exceptions and warns in the other cases. + | (https://universaldependencies.org/u/overview/syntax.html#function-word-modifiers) + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs. + | linenos : dict + | Key is node ID (string, not int or float!) Value is the 1-based index + | of the line where the node occurs (int). + | + | validate_goeswith_morphology_and_edeps(self, state, node, lineno) + | If a node has the 'goeswith' incoming relation, it is a non-first part of + | a mistakenly interrupted word. The lemma, upos tag and morphological features + | of the word should be annotated at the first part, not here. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs. + | + | validate_goeswith_span(self, state, node, lineno) + | The relation 'goeswith' is used to connect word parts that are separated + | by whitespace and should be one word instead. We assume that the relation + | goes left-to-right, which is checked elsewhere. Here we check that the + | nodes really were separated by whitespace. If there is another node in the + | middle, it must be also attached via 'goeswith'. The parameter id refers to + | the node whose goeswith children we test. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs. + | + | validate_id_references(self, state, sentence) + | Verifies that HEAD and DEPS reference existing IDs. If this function does + | not return True, most of the other tests should be skipped for the current + | sentence (in particular anything that considers the tree structure). + | + | Parameters + | ---------- + | sentence : list + | Lines (arrays of columns): words, mwt tokens, empty nodes. + | + | Returns + | ------- + | ok : bool + | + | validate_id_sequence(self, state, sentence) + | Validates that the ID sequence is correctly formed. + | Besides reporting the errors, it also returns False to the caller so it can + | avoid building a tree from corrupt IDs. + | + | sentence ... array of arrays, each inner array contains columns of one line + | + | validate_left_to_right_relations(self, state, node, lineno) + | Certain UD relations must always go left-to-right (in the logical order, + | meaning that parent precedes child, disregarding that some languages have + | right-to-left writing systems). + | Here we currently check the rule for the basic dependencies. + | The same should also be tested for the enhanced dependencies! 
+ | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs. + | + | validate_misc(self, state, cols, line) + | In general, the MISC column can contain almost anything. However, if there + | is a vertical bar character, it is interpreted as the separator of two + | MISC attributes, which may or may not have the form of attribute=value pair. + | In general it is not forbidden that the same attribute appears several times + | with different values, but this should not happen for selected attributes + | that are described in the UD documentation. + | + | This function must be run on raw MISC before it is fed into Udapi because + | Udapi is not prepared for some of the less recommended usages of MISC. + | + | Parameters + | ---------- + | cols : list + | The values of the columns on the current node / token line. + | line : int + | Number of the line where the node occurs in the file. + | + | validate_misc_entity(self, state, comments, sentence) + | Optionally checks the well-formedness of the MISC attributes that pertain + | to coreference and named entities. + | + | validate_mwt_empty_vals(self, state, cols, line) + | Checks that a multi-word token has _ empty values in all fields except MISC. + | This is required by UD guidelines although it is not a problem in general, + | therefore a level 2 test. + | + | Parameters + | ---------- + | cols : list + | The values of the columns on the current node / token line. + | line : int + | Number of the line where the node occurs in the file. + | + | validate_newlines(self, state, inp) + | Checks that the input file consistently uses linux-style newlines (LF only, + | not CR LF like in Windows). To be run on the input file handle after the + | whole input has been read. + | + | validate_orphan(self, state, node, lineno) + | The orphan relation is used to attach an unpromoted orphan to the promoted + | orphan in gapping constructions. A common error is that the promoted orphan + | gets the orphan relation too. The parent of orphan is typically attached + | via a conj relation, although some other relations are plausible too. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs. + | + | validate_projective_punctuation(self, state, node, lineno) + | Punctuation is not supposed to cause nonprojectivity or to be attached + | nonprojectively. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs. + | + | validate_required_feature( + | self, + | state, + | feats, + | required_feature, + | required_value, + | incident + | ) + | In general, the annotation of morphological features is optional, although + | highly encouraged. However, if the treebank does have features, then certain + | features become required. This function will check the presence of a feature + | and if it is missing, an error will be reported only if at least one feature + | has been already encountered. Otherwise the error will be remembered and it + | may be reported afterwards if any feature is encountered later. + | + | Parameters + | ---------- + | feats : udapi.core.dualdict.DualDict object + | The feature-value set to be tested whether they contain the required one. 
| required_feature : str + | The name of the required feature. + | required_value : str + | The required value of the feature. Multivalues are not supported (they + | are just a string value containing one or more commas). If + | required_value is None or an empty string, it means that we require any + | non-empty value of required_feature. + | incident : Incident object + | The message that should be printed if the error is confirmed. + | + | validate_root(self, state, node, line) + | Checks that DEPREL is "root" iff HEAD is 0. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The node whose incoming relation will be validated. This function + | operates on both regular and empty nodes. Make sure to call it for + | empty nodes, too! + | line : int + | Number of the line where the node occurs in the file. + | + | validate_sent_id(self, state, comments, lcode) + | Checks that sentence id exists, is well-formed and unique. + | + | validate_single_object(self, state, node, lineno) + | No predicate should have more than one direct object (number of indirect + | objects is unlimited). Theoretically, ccomp should be understood as a + | clausal equivalent of a direct object, but we do not have an indirect + | equivalent, so it seems better to tolerate additional ccomp at present. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs. + | + | validate_single_subject(self, state, node, lineno) + | No predicate should have more than one subject. + | An xcomp dependent normally has no subject, but in some languages the + | requirement may be weaker: it could have an overt subject if it is + | coreferential with a particular argument of the matrix verb. Hence we do + | not check zero subjects of xcomp dependents at present. + | Furthermore, in some situations we must allow multiple subjects. If a clause + | acts as a nonverbal predicate of another clause, then we must attach two + | subjects to the predicate of the inner clause: one is the subject of the + | inner clause, the other is the subject of the outer clause. This could in + | theory be recursive but in practice it isn't. As of UD 2.10, an amendment + | of the guidelines says that the inner predicate of the predicate clause + | should govern both subjects even if there is a copula (previously such + | cases were an exception from the UD approach that copulas should not be + | heads); however, the outer subjects should be attached as [nc]subj:outer. + | See https://universaldependencies.org/changes.html#multiple-subjects. + | See also issue 34 (https://github.com/UniversalDependencies/tools/issues/34). + | Strictly speaking, :outer is optional because it is a subtype, and some + | treebanks may want to avoid it. For example, in Coptic Scriptorium, there + | is only one occurrence in dev, one in test, and none in train, so it would + | be impossible to train a parser that gets it right. For that reason, it is + | possible to replace the :outer subtype with Subject=Outer in MISC. The MISC + | attribute is just a directive for the validator and no parser is expected + | to predict it. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs.
| validate_text_meta(self, state, comments, tree) + | Checks metadata other than sentence id, that is, document breaks, paragraph + | breaks and sentence text (which is also compared to the sequence of the + | forms of individual tokens, and the spaces vs. SpaceAfter=No in MISC). + | + | validate_token_ranges(self, state, sentence) + | Checks that the word ranges for multiword tokens are valid. + | + | sentence ... array of arrays, each inner array contains columns of one line + | + | + | validate_unicode_normalization(self, state, text) + | Tests that letters composed of multiple Unicode characters (such as a base + | letter plus combining diacritics) conform to NFC normalization (canonical + | decomposition followed by canonical composition). + | + | Parameters + | ---------- + | text : str + | The input line to be tested. If the line consists of TAB-separated + | fields (token line), error reports will specify the field where the + | error occurred. Otherwise (comment line), the error report will not be + | localized. + | + | validate_upos(self, state, cols, line) + | Checks that the UPOS field contains one of the 17 known tags. + | + | Parameters + | ---------- + | cols : list + | The values of the columns on the current node / token line. + | line : int + | Number of the line where the node occurs in the file. + | + | validate_upos_vs_deprel(self, state, node, lineno) + | For certain relations checks that the dependent word belongs to an expected + | part-of-speech category. Occasionally we may have to check the children of + | the node, too. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The tree node to be tested. + | lineno : int + | The 1-based index of the line where the node occurs. + | + | validate_whitespace(self, state, cols) + | Checks that columns are not empty and do not contain whitespace characters + | except for patterns that could be allowed at level 4. Applies to all types + | of TAB-containing lines: nodes / words, mwt ranges, empty nodes. + | + | Parameters + | ---------- + | cols : list + | The values of the columns on the current node / token line. + | + | validate_words_with_spaces(self, state, node, line, lang) + | Checks a single line for disallowed whitespace. + | Here we assume that all language-independent whitespace-related tests have + | already been done on level 1, so we only check for words with spaces that + | are explicitly allowed in a given language. + | + | Parameters + | ---------- + | node : udapi.core.node.Node object + | The node to be validated. + | line : int + | Number of the line where the node occurs in the file. + | lang : str + | Code of the main language of the corpus. + | + +FUNCTIONS + TODO: ditch + build_argparse() + parse_args() + main() + +WHAT: CoNLL-U spec +TODO: convert into config + loader +DATA + n_cols = 10 + COLNAMES = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'D... + DEPREL = 7 + DEPS = 8 + FEATS = 5 + FORM = 1 + HEAD = 6 + ID = 0 + LEMMA = 2 + MISC = 9 + THISDIR = '/home/harisont/Repos/UniversalDependencies/tools/validator/...
+    UPOS = 3
+    XPOS = 4
\ No newline at end of file
diff --git a/validator/docs/pycallgraph.png b/validator/docs/pycallgraph.png
new file mode 100644
index 000000000..b842c3071
Binary files /dev/null and b/validator/docs/pycallgraph.png differ
diff --git a/validator/logs/.gitkeep b/validator/logs/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/validator/pyproject.toml b/validator/pyproject.toml
new file mode 100644
index 000000000..4c624732c
--- /dev/null
+++ b/validator/pyproject.toml
@@ -0,0 +1,38 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.envs.static]
+dev-mode = true
+dev-mode-dirs = ["."]
+
+[project]
+name = "validator"
+version = "0.0.1"
+authors = [
+  { name="Filip Ginter"},
+  { name="Sampo Pyysalo"},
+  { name="Daniel Zeman", email="zeman@ufal.mff.cuni.cz"},
+  { name="Arianna Masciolini", email="arianna.masciolini@gu.se"},
+  { name="Ludovica Pannitto", email="ellepannitto@gmail.com"},
+  { name="John Bauer", email="horatio@gmail.com"}
]
+readme = "README.md"
+requires-python = ">=3.8"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+
+# NB: 'unicodedata' is part of the Python standard library and must not be
+# listed as a dependency; the 'dotenv' module is distributed on PyPI as
+# 'python-dotenv', and the 'yaml' module as 'pyyaml'.
+dependencies = ["python-dotenv",
+ "regex",
+ "pyyaml"]
+
+[tool.hatch.build]
+# Tell hatchling where your source code is
+sources = ["src"]
+
+[project.scripts]
+validate = "validator.validate:main"
+
diff --git a/validator/src/__init__.py b/validator/src/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/validator/src/validator/__init__.py b/validator/src/validator/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/validator/src/validator/cli.py b/validator/src/validator/cli.py
new file mode 100644
index 000000000..b0e721f33
--- /dev/null
+++ b/validator/src/validator/cli.py
@@ -0,0 +1,172 @@
+#! /usr/bin/env python3
+# Original code (2015) by Filip Ginter and Sampo Pyysalo.
+# DZ 2018-11-04: Porting the validator to Python 3.
+# DZ: Many subsequent changes. See the git history.
+# 2025-08-31: Refactoring by @AngledLuffa
+# 2025-09: Refactoring by @harisont and @ellepannitto
+
+import sys
+import argparse
+import os
+import yaml
+import functools
+
+import logging
+
+# According to https://stackoverflow.com/questions/1832893/python-regex-matching-unicode-properties,
+# the regex module has the same API as re but it can check Unicode character properties using \p{}
+# as in Perl.
+import regex as re
+
+import validator.logging_utils as logging_utils
+
+import validator.validate_lib as VLib
+import validator.specifications as specifications
+import validator.utils as utils
+import validator.output_utils as outils
+import validator.validate as vlib
+
+logger = logging.getLogger(__name__)
+logging_utils.setup_logging(logger)
+
+def _validate(args):
+    out_format = args.format
+    dest = args.dest
+    explanations = args.explanations
+    lines_content = args.lines_content
+
+    if dest == "-":
+        dest = sys.stdout
+    elif dest == "stderr":
+        dest = sys.stderr
+    else:
+        dest = open(dest, 'w')
+
+    ud_specs = specifications.UDSpecs(args.data_folder)
+    with open(args.config_file) as cfg_file:
+        cfg = yaml.safe_load(cfg_file)
+
+    # Do not return from inside the loop: an early return would silently
+    # skip every input file after the first one.
+    exit_code = 0
+    for incidents in vlib.validate(args.input, cfg_obj=cfg):
+        if out_format == "json":
+            outils.dump_json(incidents, dest, explanations, lines_content)
+        else:
+            outils.serialize_output(incidents, dest, explanations, lines_content)
+        if len(incidents):
+            exit_code = 1
+
+    # Summarize the warnings and errors.
+    #if state.error_counter:
+    #    for k, v in sorted(state.error_counter.items()):
+    #        if k == 'Warning':
+    #            errors = 'Warnings'
+    #        else:
+    #            errors = k+' errors'
+    #        nerror += v
+    #        passed = False
+    #        if not args.quiet:
+    #            print(f'{errors}: {v}', file=sys.stderr)
+    ## Print the final verdict and exit.
+    #if passed:
+    #    if not args.quiet:
+    #        print('*** PASSED ***', file=sys.stderr)
+    #    return 0
+    #else:
+    #    if not args.quiet:
+    #        print(f'*** FAILED *** with {nerror} errors', file=sys.stderr)
+    #    return 1
+
+    return exit_code
+
+def main():
+
+    opt_parser = argparse.ArgumentParser(description="CoNLL-U validation script. Python 3 is needed to run it!")
+
+    io_group = opt_parser.add_argument_group("Input / output options")
+    io_group.add_argument('--quiet',
+                          dest="quiet", action="store_true", default=False,
+                          help="""Do not print any error messages.
+                          Exit with 0 on pass, non-zero on fail.""")
+    io_group.add_argument('--max-err',
+                          action="store", type=int, default=20,
+                          help="""How many errors to output before exiting? 0 for all.
+                          Default: %(default)d.""")
+    io_group.add_argument('input',
+                          nargs='*',
+                          help="""Input file name(s), or "-" or nothing for standard input.""")
+
+    list_group = opt_parser.add_argument_group("Tag sets", "Options relevant to checking tag sets.")
+    list_group.add_argument("--lang",
+                            action="store", required=True, default=None,
+                            help="""Which language are we checking?
+                            If you specify this (as a two-letter code), the tags will be checked
+                            using the language-specific files in the
+                            data/ directory of the validator.""")
+    list_group.add_argument("--level",
+                            action="store", type=int, default=5, dest="level",
+                            help="""Level 1: Test only CoNLL-U backbone.
+                            Level 2: UD format.
+                            Level 3: UD contents.
+                            Level 4: Language-specific labels.
+                            Level 5: Language-specific contents.""")
+
+    tree_group = opt_parser.add_argument_group("Tree constraints",
+                                               "Options for checking the validity of the tree.")
+    tree_group.add_argument("--multiple-roots",
+                            action="store_false", default=True, dest="single_root",
+                            help="""Allow trees with several root words
+                            (single root required by default).""")
+
+    meta_group = opt_parser.add_argument_group("Metadata constraints",
+                                               "Options for checking the validity of tree metadata.")
+    meta_group.add_argument("--no-tree-text",
+                            action="store_false", default=True, dest="check_tree_text",
+                            help="""Do not test tree text.
+ For internal use only, this test is required and on by default.""") + meta_group.add_argument("--no-space-after", + action="store_false", default=True, dest="check_space_after", + help="Do not test presence of SpaceAfter=No.") + + coref_group = opt_parser.add_argument_group("Coreference / entity constraints", + "Options for checking coreference and entity annotation.") + coref_group.add_argument('--coref', + action='store_true', default=False, dest='check_coref', + help='Test coreference and entity-related annotation in MISC.') + + config_group = opt_parser.add_argument_group("Directories and paths", "TBD") # TODO better helper + config_group.add_argument('--data-folder', default=os.path.normpath(os.path.join(utils.THIS_DIR,"../../../data"))) + config_group.add_argument('--config-file', type=str) + + + out_format = opt_parser.add_argument_group("Choices of output formats", "TBD") + out_format.add_argument('--format', default='LOG', choices=['json', 'LOG'], + help='Produce output in desired format') + out_format.add_argument('--dest', default='-', type=str, + help="Output destination") + out_format.add_argument( + '--explanations', + action='store_true', + default=False, + help="Include longer explanations.") + out_format.add_argument( + '--lines-content', # TODO: better names + action='store_true', + default=False, + help="Include the content of the errored lines in the output.") + + opt_parser.set_defaults(func=_validate) + args = opt_parser.parse_args() #Parsed command-line arguments + + if "func" not in args: + opt_parser.print_usage() + exit() + + logger.info("Arguments: \n%s", logging_utils.pprint(vars(args))) + + args.func(args) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/validator/src/validator/compiled_regex.py b/validator/src/validator/compiled_regex.py new file mode 100644 index 000000000..8259fb31a --- /dev/null +++ b/validator/src/validator/compiled_regex.py @@ -0,0 +1,98 @@ +""" +The CompiledRegexes module holds various regular expressions needed to +recognize individual elements of the CoNLL-U format, precompiled to speed +up parsing. Individual expressions are typically not enclosed in ^...$ +because one can use re.fullmatch() if it is desired that the whole string +matches the expression. +""" + +import regex as re + +# Whitespace(s) +ws = re.compile(r"\s+") + +# TODO: rename in 'two_ws' +# Exactly two whitespaces +ws2 = re.compile(r"\s\s") + +# TODO: rename in 'integer' +# Integer +wordid = re.compile(r"[1-9][0-9]*") + +# TODO: rename in 'integer_range' +# Multiword token id: range of integers. +# The two parts are bracketed so they can be captured and processed separately. +mwtid = re.compile(r"([1-9][0-9]*)-([1-9][0-9]*)") # range of integers. + +# TODO: rename in 'decimal' +# Empty node id: "decimal" number (but 1.10 != 1.1). +# The two parts are bracketed so they can be captured and processed separately. +enodeid = re.compile(r"([0-9]+)\.([1-9][0-9]*)") + +# New document comment line. Document id, if present, is bracketed. +# ! Why not replacing \s with " "? +# ! proposal for new regex: "# newdoc(?: = (\S+))?" +newdoc = re.compile(r"#\s*newdoc(?:\s+(\S+))?") + +# New paragraph comment line. Paragraph id, if present, is bracketed. +# ! proposal for new regex: "# newpar(?: = (\S+))?" +newpar = re.compile(r"#\s*newpar(?:\s+(\S+))?") + +# Sentence id comment line. The actual id is bracketed. +# ! proposal for new regex: "# sent_id = (\S+)" +sentid = re.compile(r"#\s*sent_id\s*=\s*(\S+)") + +# Parallel sentence id comment line. 
The actual id as well as its predefined parts are bracketed. +# TODO: add test +parallelid = re.compile(r"#\s*parallel_id\s*=\s*(([a-z]+)/([-0-9a-z]+)(?:/(alt[1-9][0-9]*|part[1-9][0-9]*|alt[1-9][0-9]*part[1-9][0-9]*))?)") + +# Sentence text comment line. The actual text is bracketed. +# ! proposal for new regex: "# text = (.*\S)" +text = re.compile(r"#\s*text\s*=\s*(.*\S)") + +# Global entity comment is a declaration of entity attributes in MISC. +# It occurs once per document and it is optional (only CorefUD data). +# The actual attribute declaration is bracketed so it can be captured in the match. +# ! proposal for new regex: "# global\.Entity = (.+)" +# TODO: write test +global_entity = re.compile(r"#\s*global\.Entity\s*=\s*(.+)") + +# TODO: rename in 'uppercase_string' +# UPOS tag. +upos = re.compile(r"[A-Z]+") + +# Feature=value pair. +# Feature name and feature value are bracketed so that each can be captured separately in the match. +featval = re.compile( + r"([A-Z][A-Za-z0-9]*(?:\[[a-z0-9]+\])?)=(([A-Z0-9][A-Z0-9a-z]*)(,([A-Z0-9][A-Z0-9a-z]*))*)" + ) +val = re.compile(r"[A-Z0-9][A-Za-z0-9]*") # ! why do we need this? + # TODO: test? + +# Basic parent reference (HEAD). +# TODO: rename in 'natural_number' +head = re.compile(r"(0|[1-9][0-9]*)") + +# Enhanced parent reference (head). +# TODO: rename in 'decimal_withzero' +ehead = re.compile(r"(0|[1-9][0-9]*)(\.[1-9][0-9]*)?") + +# Basic dependency relation (including optional subtype). +deprel = re.compile(r"[a-z]+(:[a-z]+)?") + +# TODO: write test +# Enhanced dependency relation (possibly with Unicode subtypes). +# Ll ... lowercase Unicode letters +# Lm ... modifier Unicode letters (e.g., superscript h) +# Lo ... other Unicode letters (all caseless scripts, e.g., Arabic) +# M .... combining diacritical marks +# Underscore is allowed between letters but not at beginning, end, or next to another underscore. +edeprelpart_resrc = r'[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(_[\p{Ll}\p{Lm}\p{Lo}\p{M}]+)*' + +# There must be always the universal part, consisting only of ASCII letters. +# There can be up to three additional, colon-separated parts: subtype, preposition and case. +# One of them, the preposition, may contain Unicode letters. We do not know which one it is +# (only if there are all four parts, we know it is the third one). 
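+# A few illustrative examples (hypothetical, not taken from the test data) of
+# what the expression compiled below accepts when used with re.fullmatch():
+#   'nmod'                -> match (universal part only)
+#   'nmod:poss'           -> match (universal part plus ASCII subtype)
+#   'obl:arg:благодаря'   -> match (the case-marker part may contain
+#                            non-ASCII lowercase letters)
+#   'nmod:'               -> no match (trailing colon)
+#   'NMOD'                -> no match (uppercase letters are not allowed)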
+# ^[a-z]+(:[a-z]+)?(:[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(_[\p{Ll}\p{Lm}\p{Lo}\p{M}]+)*)?(:[a-z]+)?$ +edeprel_resrc = '^[a-z]+(:[a-z]+)?(:' + edeprelpart_resrc + ')?(:[a-z]+)?$' +edeprel = re.compile(edeprel_resrc) \ No newline at end of file diff --git a/validator/src/validator/conllu_spec.yaml b/validator/src/validator/conllu_spec.yaml new file mode 100644 index 000000000..e87ceba69 --- /dev/null +++ b/validator/src/validator/conllu_spec.yaml @@ -0,0 +1,11 @@ +columns: + - ID + - FORM + - LEMMA + - UPOS + - XPOS + - FEATS + - HEAD + - DEPREL + - DEPS + - MISC \ No newline at end of file diff --git a/validator/src/validator/incident.py b/validator/src/validator/incident.py new file mode 100644 index 000000000..b67bef397 --- /dev/null +++ b/validator/src/validator/incident.py @@ -0,0 +1,108 @@ +from dataclasses import dataclass, field +from enum import Enum +import os + +from validator.validate_lib import State + +class TestClass(Enum): + INTERNAL = 0 + UNICODE = 1 + FORMAT = 2 + MORPHO = 3 + SYNTAX = 4 + ENHANCED = 5 + COREF = 6 + METADATA = 7 + + def __str__(self): + return self.name + +class IncidentType(Enum): + ERROR = 1 + WARNING = 0 + +# TODO: make abstract +@dataclass +class Incident: + state: State = field(init=False) # TODO: check if this is actually necessary + level: int = 1 + testclass: TestClass = TestClass.FORMAT + testid: str = 'generic-error' + message: str = 'No error description provided.' + # Line number. The default is the most recently read line as recorded + # in the state; but in most cases we need to get the number + # during instantiation, as the most recently read line is the last line + # of the sentence, and the error was found on one of the words of the + # sentence. + lineno: int = 0 # ? + # File name. The default is the file from which we are reading right + # now ('-' if reading from STDIN). + filename: str = 'STDIN' + # Current (most recently read) sentence id. + sentid: str = None + # ID of the node on which the error occurred (if it pertains to one node). + nodeid: str = None + + def set_state(self, state): + + #self.state = state + # self.lineno += state.current_line + self.sentid = state.sentence_id + + if not state.current_file_name == '-': + self.filename = os.path.basename(state.current_file_name) + return self # ! + + #self.sentid = self.state.sentence_id + #self.nodeid = self.state.nodeid + + def __repr__(self): + out_str = f"[Line {self.lineno} Sent {self.sentid}]: [L{self.level} {self.testclass} {self.testid}] {self.message}" + return out_str + + # TODO: overwrite __str__ or __repr__ + # def report(self, state, args): + # # Even if we should be quiet, at least count the error. + # state.error_counter[self.testclass] = state.error_counter.get(self.testclass, 0)+1 + # if args.quiet: + # return + # # Suppress error messages of a type of which we have seen too many. + # if args.max_err > 0 and state.error_counter[self.testclass] > args.max_err: + # if state.error_counter[self.testclass] == args.max_err + 1: + # print(f'...suppressing further errors regarding {self.testclass}', file=sys.stderr) + # return # suppressed + # # If we are here, the error message should really be printed. + # # Address of the incident. + # address = f'Line {self.lineno} Sent {self.sentid}' + # if self.nodeid: + # address += f' Node {self.nodeid}' + # # Insert file name if there are several input files. + # if len(args.input) > 1: + # address = f'File {self.filename} ' + address + # # Classification of the incident. 
+ # levelclassid = f'L{self.level} {self.testclass} {self.testid}' + # # Message (+ explanation, if this is the first error of its kind). + # message = self.message + # if self.explanation and self.explanation not in state.explanation_printed: + # message += "\n\n" + self.explanation + "\n" + # state.explanation_printed.add(self.explanation) + # print(f'[{address}]: [{levelclassid}] {message}', file=sys.stderr) + + +@dataclass +class Error(Incident): + def get_type(self): + return IncidentType.ERROR + + def __repr__(self): + out_str = f"[Line {self.lineno+1} Sent {self.sentid}]: [L{self.level} {self.testclass} {self.testid}] {self.message}" + return out_str + +@dataclass +class Warning(Incident): + def get_type(self): + return IncidentType.WARNING + + def __repr__(self): + out_str = f"[Line {self.lineno+1} Sent {self.sentid}]: [L{self.level} {self.testclass} {self.testid}] {self.message}" + return out_str \ No newline at end of file diff --git a/validator/src/validator/loaders.py b/validator/src/validator/loaders.py new file mode 100644 index 000000000..b0c251235 --- /dev/null +++ b/validator/src/validator/loaders.py @@ -0,0 +1,49 @@ +import os +import json + +import yaml +import regex as re + +def load_conllu_spec(spec_path): + with open(spec_path, encoding="utf-8") as spec_handle: + return yaml.safe_load(spec_handle) + + +def load_json_data_set(filename, key=None): + """ + Loads the set of permitted tags from a json file. + If a key is specified, it returns only the selected key. + """ + with open(filename, encoding="utf-8") as fin: + res = json.load(fin) + + if key: + return set(res[key]) + else: + return res + + +def load_json_data(filename, key=None): + """ + Loads permitted tags from a json file. + If a key is specified, it returns only the selected key. 
+ """ + with open(filename, encoding="utf-8") as fin: + res = json.load(fin) + if key: + return res[key] + else: + return res + + +def load_combinations(filename): + + res = {} + content = load_json_data(filename, "expressions") + for lang_code, lang_dicts in content.items(): + lang_regexes = list(sorted(lang_dicts.keys())) + combination = '('+'|'.join(lang_regexes)+')' + compiled_regex = re.compile(combination) + res[lang_code] = (combination, compiled_regex) + + return res \ No newline at end of file diff --git a/validator/src/validator/logging_utils.py b/validator/src/validator/logging_utils.py new file mode 100644 index 000000000..d3492b336 --- /dev/null +++ b/validator/src/validator/logging_utils.py @@ -0,0 +1,42 @@ +import logging +from logging.handlers import RotatingFileHandler +import os + +CONSOLE_FORMAT = "[%(filename)s:%(lineno)s - %(funcName)20s()] %(message)s" + +def setup_logging(logger): + + log_file = os.getenv("LOG_FILE", "logs/validate.log") + error_file = os.getenv("ERROR_FILE", "logs/validate.err") + + log_level = os.getenv("LOG_LEVEL", "DEBUG").upper() + logger.setLevel(getattr(logging, log_level, logging.DEBUG)) + + file_handler = RotatingFileHandler(log_file, maxBytes=5 * 1024 * 1024, backupCount=3) + file_handler.setLevel(logging.DEBUG) + + error_handler = logging.FileHandler(error_file, "w", encoding="utf-8") + error_handler.setLevel(logging.ERROR) + + console_handler = logging.StreamHandler() + console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) + console_handler.setFormatter(logging.Formatter(CONSOLE_FORMAT)) + console_handler.setLevel(logging.DEBUG) + + logger.addHandler(console_handler) + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + file_handler.setFormatter(formatter) + error_handler.setFormatter(formatter) + + logger.addHandler(file_handler) + logger.addHandler(error_handler) + + +def pprint(args): + + ret_str = "" + for key, value in args.items(): + ret_str += f"{key:40} - {str(value):80}\n" + + return ret_str \ No newline at end of file diff --git a/validator/src/validator/output_utils.py b/validator/src/validator/output_utils.py new file mode 100644 index 000000000..cef6996ff --- /dev/null +++ b/validator/src/validator/output_utils.py @@ -0,0 +1,278 @@ +import regex as re +import json + +def explain_feats(specs, lcode): + """ + Returns explanation message for features of a particular language. + To be called after language-specific features have been loaded. + + Parameters + ---------- + specs : UDSpecs object + The UD specification. + lcode : str + The language code. + + Returns + ------- + name : str + The explanation message for the features of the given language. + """ + featset = specs.get_feats_for_language(lcode) + # Prepare a global message about permitted features and values. We will add + # it to the first error message about an unknown feature. Note that this + # global information pertains to the default validation language and it + # should not be used with code-switched segments in alternative languages. + msg = '' + if not lcode in specs.feats: + msg = ( + f"No feature-value pairs have been permitted for language [{lcode}].\n" + "They can be permitted at the address below (if the language has an ISO code and is registered with UD):\n" + "https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_feature.pl\n") + else: + # Identify feature values that are permitted in the current language. 
+ for f in featset: + for e in featset[f]['errors']: + msg += f"ERROR in _{lcode}/feat/{f}.md: {e}\n" + res = set() + for f in featset: + if featset[f]['permitted'] > 0: + for v in featset[f]['uvalues']: + res.add(f+'='+v) + for v in featset[f]['lvalues']: + res.add(f+'='+v) + sorted_documented_features = sorted(res) + msg += ( + f"The following {len(sorted_documented_features)} feature values are currently permitted in language [{lcode}]:\n" + f"{', '.join(sorted_documented_features)}\n" + "If a language needs a feature that is not documented in the universal guidelines, the feature must\n" + "have a language-specific documentation page in a prescribed format\n" + "See https://universaldependencies.org/contributing_language_specific.html for further guidelines.\n" + "All features including universal must be specifically turned on for each language in which they are used.\n" + "See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_feature.pl for details.\n") + return msg + +def explain_deprel(specs, lcode): + """ + Returns explanation message for deprels of a particular language. + To be called after language-specific deprels have been loaded. + + Parameters + ---------- + specs : UDSpecs object + The UD specification. + lcode : str + The language code. + + Returns + ------- + name : str + The explanation message for the deprels of the given language. + """ + deprelset = specs.get_deprel_for_language(lcode) + # Prepare a global message about permitted relation labels. We will add + # it to the first error message about an unknown relation. Note that this + # global information pertains to the default validation language and it + # should not be used with code-switched segments in alternative languages. + msg = '' + if len(deprelset) == 0: + msg = ( + f"No dependency relation types have been permitted for language [{lcode}].\n" + "They can be permitted at the address below (if the language has an ISO code and is registered with UD):\n" + "https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_deprel.pl\n") + else: + # Identify dependency relations that are permitted in the current language. + # If there are errors in documentation, identify the erroneous doc file. + # Note that specs.deprel[lcode] may not exist even though we have a non-empty + # set of relations, if lcode is 'ud'. + if lcode in specs.deprel: + for r in specs.deprel[lcode]: + file = re.sub(r':', r'-', r) + if file == 'aux': + file = 'aux_' + for e in specs.deprel[lcode][r]['errors']: + msg += f"ERROR in _{lcode}/dep/{file}.md: {e}\n" + sorted_documented_relations = sorted(deprelset) + msg += ( + f"The following {len(sorted_documented_relations)} relations are currently permitted in language [{lcode}]:\n" + f"{', '.join(sorted_documented_relations)}\n" + "If a language needs a relation subtype that is not documented in the universal guidelines, the relation\n" + "must have a language-specific documentation page in a prescribed format.\n" + "See https://universaldependencies.org/contributing_language_specific.html for further guidelines.\n" + "Documented dependency relations can be specifically turned on/off for each language in which they are used.\n" + "See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_deprel.pl for details.\n") + return msg + +def explain_edeprel(specs, lcode): + """ + Returns explanation message for enhanced deprels of a particular language. + To be called after language-specific enhanced deprels have been loaded. 
+
+    Parameters
+    ----------
+    specs : UDSpecs object
+        The UD specification.
+    lcode : str
+        The language code.
+
+    Returns
+    -------
+    name : str
+        The explanation message for the enhanced deprels of the given language.
+    """
+    if lcode in specs._explanation_edeprel:
+        return specs._explanation_edeprel[lcode]
+    edeprelset = specs.get_edeprel_for_language(lcode)
+    # Prepare a global message about permitted relation labels. We will add
+    # it to the first error message about an unknown relation. Note that this
+    # global information pertains to the default validation language and it
+    # should not be used with code-switched segments in alternative languages.
+    msg = ''
+    if len(edeprelset) == 0:
+        msg = (
+            f"No enhanced dependency relation types (case markers) have been permitted for language [{lcode}].\n"
+            "They can be permitted at the address below (if the language has an ISO code and is registered with UD):\n"
+            "https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl\n")
+    else:
+        # Identify enhanced relations (case markers) that are permitted in the
+        # current language.
+        sorted_case_markers = sorted(edeprelset)
+        msg += (
+            f"The following {len(sorted_case_markers)} enhanced relations are currently permitted in language [{lcode}]:\n"
+            f"{', '.join(sorted_case_markers)}\n"
+            "See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl for details.\n")
+    # Cache under the same attribute that is checked above.
+    specs._explanation_edeprel[lcode] = msg
+    return msg
+
+def explain_aux(specs, lcode):
+    """
+    Returns explanation message for auxiliaries of a particular language.
+    To be called after language-specific auxiliaries have been loaded.
+
+    Parameters
+    ----------
+    specs : UDSpecs object
+        The UD specification.
+    lcode : str
+        The language code.
+
+    Returns
+    -------
+    name : str
+        The explanation message for the auxiliaries of the given language.
+    """
+    auxspec = specs.get_aux_for_language(lcode)
+    # Prepare a global message about permitted auxiliary lemmas. We will add
+    # it to the first error message about an unknown auxiliary. Note that this
+    # global information pertains to the default validation language and it
+    # should not be used with code-switched segments in alternative languages.
+    if len(auxspec) == 0:
+        return (
+            f"No auxiliaries have been documented at the address below for language [{lcode}].\n"
+            f"https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_auxiliary.pl?lcode={lcode}\n")
+    else:
+        # Identify auxiliaries that are permitted in the current language.
+        return (
+            f"The following {len(auxspec)} auxiliaries are currently documented in language [{lcode}]:\n"
+            f"{', '.join(auxspec)}\n"
+            f"See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_auxiliary.pl?lcode={lcode} for details.\n")
+
+def explain_cop(specs, lcode):
+    """
+    Returns explanation message for copulas of a particular language.
+    To be called after language-specific copulas have been loaded.
+
+    Parameters
+    ----------
+    specs : UDSpecs object
+        The UD specification.
+    lcode : str
+        The language code.
+
+    Returns
+    -------
+    name : str
+        The explanation message for the copulas of the given language.
+    """
+    copspec = specs.get_cop_for_language(lcode)
+    # Prepare a global message about permitted copula lemmas. We will add
+    # it to the first error message about an unknown copula. Note that this
+    # global information pertains to the default validation language and it
+    # should not be used with code-switched segments in alternative languages.
+    if len(copspec) == 0:
+        return (
+            f"No copulas have been documented at the address below for language [{lcode}].\n"
+            f"https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_auxiliary.pl?lcode={lcode}\n")
+    else:
+        # Identify copulas that are permitted in the current language.
+        return (
+            f"The following {len(copspec)} copulas are currently documented in language [{lcode}]:\n"
+            f"{', '.join(copspec)}\n"
+            f"See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_auxiliary.pl?lcode={lcode} for details.\n")
+
+def explain_tospace(specs, lcode):
+    """
+    Returns explanation message for tokens with spaces of a particular language.
+    To be called after language-specific tokens with spaces have been loaded.
+
+    Parameters
+    ----------
+    specs : UDSpecs object
+        The UD specification.
+    lcode : str
+        The language code.
+
+    Returns
+    -------
+    name : str
+        The explanation message for the tokens with spaces of the given language.
+    """
+    # Prepare a global message about permitted tokens with spaces. We will add
+    # it to the first error message about an unknown token with space. Note that
+    # this global information pertains to the default validation language and it
+    # should not be used with code-switched segments in alternative languages.
+    if not lcode in specs.tospace:
+        return (
+            f"No tokens with spaces have been permitted for language [{lcode}].\n"
+            "They can be permitted at the address below (if the language has an ISO code and is registered with UD):\n"
+            "https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_token_with_space.pl\n")
+    else:
+        return (
+            f"Only tokens and lemmas matching the following regular expression are currently permitted to contain spaces in language [{lcode}]:\n"
+            f"{specs.tospace[lcode][0]}\n"
+            "\nOthers can be permitted at the address below (if the language has an ISO code and is registered with UD):\n"
+            "https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_token_with_space.pl\n")
+
+def serialize_output(incidents, output_fhandle, explanations, lines_content):
+
+    # Write to the destination handle chosen by the caller, not
+    # unconditionally to stdout.
+    for incident in incidents:
+        print(incident, file=output_fhandle)
+
+    if not incidents:
+        print("*** PASSED ***", file=output_fhandle)
+    else:
+        print(f"*** FAILED *** with {len(incidents)} error(s)", file=output_fhandle)
+
+def dump_json(incidents, dest, explanations, lines_content):
+    json_dict = {}
+    for incident in incidents:
+        filename = incident.filename
+        if filename not in json_dict:
+            json_dict[filename] = []
+        incidict = {
+            "level": incident.level,
+            "testclass": str(incident.testclass),
+            "testid": incident.testid,
+            "message": incident.message,
+            "sentid": incident.sentid,
+            "lineno": incident.lineno
+        }
+        if explanations:
+            pass # TODO: extend incidict
+        if lines_content:
+            pass # TODO: extend incidict
+        json_dict[filename].append(incidict)
+    json.dump(json_dict, dest, indent=4)
diff --git a/validator/src/validator/specifications.py b/validator/src/validator/specifications.py
new file mode 100644
index 000000000..99509adbe
--- /dev/null
+++ b/validator/src/validator/specifications.py
@@ -0,0 +1,171 @@
+import os
+from dataclasses import dataclass, field
+from typing import Set, Dict
+import regex as re
+
+import validator.loaders as loaders
+
+@dataclass
+class UDSpecs:
+    """
+    The UDSpecs class holds various dictionaries of tags, auxiliaries, regular
+    expressions etc. needed for detailed testing, especially for language-
+    specific constraints.
+    """
+    data_folder: str
+    # Universal part of speech tags in the UPOS column. Just a set.
+    upos: Set = field(init=False)
+    # Morphological features in the FEATS column.
+    # Key: language code; value: feature-value-UPOS data from feats.json.
+    feats: Dict = field(init=False)
+    # Universal dependency relation types (without subtypes) in the DEPREL
+    # column.
+    udeprel: Set = field(init=False)
+    # Dependency relation types in the DEPREL column.
+    # Key: language code; value: deprel data from deprels.json.
+    # Cached processed version: key: language code; value: set of deprels. #! what is this?
+    deprel: Dict = field(init=False)
+    cached_deprel_for_language: Dict = field(init=False)
+    # Enhanced dependency relation types in the DEPS column.
+    # Key: language code; value: edeprel data from edeprels.json.
+    # Cached processed version: key: language code; value: set of edeprels. #! what is this?
+    edeprel: Dict = field(init=False)
+    cached_edeprel_for_language: Dict = field(init=False)
+    # Auxiliary (and copula) lemmas in the LEMMA column.
+    # Key: language code; value: auxiliary/copula data from data.json.
+    # Cached processed versions: key: language code; value: list of lemmas. #! what is this?
+    auxcop: Dict = field(init=False)
+    cached_aux_for_language: Dict = field(init=False)
+    cached_cop_for_language: Dict = field(init=False)
+    # Tokens with spaces in the FORM and LEMMA columns.
+    # Key: language code; value: data from tospace.json.
+    # There is one or more regular expressions for each language in the file.
+    # If there are multiple expressions, combine them in one and compile it.
+    tospace: Dict = field(init=False)
+
+    def __post_init__(self):
+        self.upos = loaders.load_json_data_set(os.path.join(self.data_folder, "upos.json"), "upos")
+        self.feats = loaders.load_json_data(os.path.join(self.data_folder, "feats.json"), "features")
+        self.udeprel = loaders.load_json_data_set(os.path.join(self.data_folder, "udeprels.json"), "udeprels") #! change to plural
+        # Must stay a dict (get_deprel_for_language() indexes it by language and
+        # relation), so load it with load_json_data(), not load_json_data_set().
+        self.deprel = loaders.load_json_data(os.path.join(self.data_folder, "deprels.json"), "deprels") #! change to plural
+        self.cached_deprel_for_language = {}
+        self.edeprel = loaders.load_json_data(os.path.join(self.data_folder,"edeprels.json"), "edeprels")#! change to plural
+        self.cached_edeprel_for_language = {}
+        self.auxcop = loaders.load_json_data(os.path.join(self.data_folder,"data.json"), "auxiliaries") #! change to plural
+        self.cached_aux_for_language = {}
+        self.cached_cop_for_language = {}
+        self.tospace = loaders.load_combinations(os.path.join(self.data_folder,"tospace.json"))
+
+    # TODO: understand what these functions do
+    def get_feats_for_language(self, lcode):
+        """
+        Searches the previously loaded database of feature-value-UPOS combinations.
+        Returns the data for a given language code, organized in dictionaries.
+        Returns an empty dict if there are no data for the given language code.
+        """
+        ###!!! If lcode is 'ud', we should permit all universal feature-value pairs,
+        ###!!! regardless of language-specific documentation.
+        # Do not crash if the user asks for an unknown language.
+        if not lcode in self.feats:
+            return {}
+        return self.feats[lcode]
+
+    def get_deprel_for_language(self, lcode):
+        """
+        Searches the previously loaded database of dependency relation labels.
+        Returns the set of permitted deprels for a given language code.
Also + saves the result in self so that next time it can be fetched quickly + (once we loaded the data, we do not expect them to change). + """ + if lcode in self.cached_deprel_for_language: + return self.cached_deprel_for_language[lcode] + deprelset = set() + # If lcode is 'ud', we should permit all universal dependency relations, + # regardless of language-specific documentation. + if lcode == 'ud': + deprelset = self.udeprel + elif lcode in self.deprel: + for r in self.deprel[lcode]: + if self.deprel[lcode][r]['permitted'] > 0: + deprelset.add(r) + self.cached_deprel_for_language[lcode] = deprelset + return deprelset + + def get_edeprel_for_language(self, lcode): + """ + Searches the previously loaded database of enhanced case markers. + Returns the set of permitted edeprels for a given language code. Also + saves the result in self so that next time it can be fetched quickly + (once we loaded the data, we do not expect them to change). + """ + if lcode in self.cached_edeprel_for_language: + return self.cached_edeprel_for_language[lcode] + basic_deprels = self.get_deprel_for_language(lcode) + edeprelset = basic_deprels | {'ref'} + for bdeprel in basic_deprels: + if re.match(r"^[nc]subj(:|$)", bdeprel): + edeprelset.add(bdeprel+':xsubj') + if lcode in self.edeprel: + for c in self.edeprel[lcode]: + for deprel in self.edeprel[lcode][c]['extends']: + for bdeprel in basic_deprels: + if bdeprel == deprel or re.match(r"^"+deprel+':', bdeprel): + edeprelset.add(bdeprel+':'+c) + self.cached_edeprel_for_language[lcode] = edeprelset + return edeprelset + + def get_auxcop_for_language(self, lcode): + """ + Searches the previously loaded database of auxiliary/copula lemmas. + Returns the AUX and COP lists for a given language code. Also saves + the result in self so that next time it can be fetched quickly (once + we loaded the data, we do not expect them to change). + """ + if lcode in self.cached_aux_for_language and lcode in self.cached_cop_for_language: + return self.cached_aux_for_language[lcode], self.cached_cop_for_language[lcode] + # If any of the functions of the lemma is other than cop.PRON, it counts as an auxiliary. + # If any of the functions of the lemma is cop.*, it counts as a copula. + auxlist = [] + coplist = [] + lemmalist = self.auxcop.get(lcode, {}).keys() + auxlist = [x for x in lemmalist + if len([y for y in self.auxcop[lcode][x]['functions'] + if y['function'] != 'cop.PRON']) > 0] + coplist = [x for x in lemmalist + if len([y for y in self.auxcop[lcode][x]['functions'] + if re.match(r"^cop\.", y['function'])]) > 0] + self.cached_aux_for_language[lcode] = auxlist + self.cached_cop_for_language[lcode] = coplist + return auxlist, coplist + + def get_aux_for_language(self, lcode): + """ + An entry point for get_auxcop_for_language() that returns only the aux + list. It either takes the cached list (if available), or calls + get_auxcop_for_language(). + """ + if lcode in self.cached_aux_for_language: + return self.cached_aux_for_language[lcode] + auxlist, coplist = self.get_auxcop_for_language(lcode) + return auxlist + + def get_cop_for_language(self, lcode): + """ + An entry point for get_auxcop_for_language() that returns only the cop + list. It either takes the cached list (if available), or calls + get_auxcop_for_language(). 
+ """ + if lcode in self.cached_cop_for_language: + return self.cached_cop_for_language[lcode] + auxlist, coplist = self.get_auxcop_for_language(lcode) + return coplist + + def get_tospace_for_language(self, lcode): + """ + Searches the previously loaded database of regular expressions describing + permitted tokens with spaces. Returns the expressions for a given language code. + """ + # Do not crash if the user asks for an unknown language. + if not lcode in self.tospace: + return None + return self.tospace[lcode] \ No newline at end of file diff --git a/validator/src/validator/utils.py b/validator/src/validator/utils.py new file mode 100644 index 000000000..637f1e77c --- /dev/null +++ b/validator/src/validator/utils.py @@ -0,0 +1,257 @@ +import os +import regex as re + +from validator.loaders import load_conllu_spec + +import validator.compiled_regex as crex + +THIS_DIR = os.path.dirname(os.path.realpath(os.path.abspath(__file__))) + +CONLLU_SPEC = load_conllu_spec(os.path.join(THIS_DIR, "conllu_spec.yaml")) +COLCOUNT = len(CONLLU_SPEC["columns"]) +ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(COLCOUNT) +COLNAMES = CONLLU_SPEC["columns"] + +def is_whitespace(line): + """ + Checks whether a given line of text consists exclusively of whitespace. + + Parameters + ---------- + line : str + A line of text. + + Returns + ------- + _ : bool + """ + return crex.ws.fullmatch(line) + +def is_word(cols): + """ + Checks whether a CoNLL-U line represents a syntactic word by checking that + its ID field is an integer. + + Parameters + ---------- + cols : list + A CoNLL-U line, represented as a list of strings. + + Returns + ------- + _ : bool + """ + return crex.wordid.fullmatch(cols[ID]) + +def is_multiword_token(cols): + """ + Checks whether a CoNLL-U line represents a MWT by checking that its ID + field is a range, e.g. "3-5". + + Parameters + ---------- + cols : list + A CoNLL-U line, represented as a list of strings. + + Returns + ------- + _ : Match|None + """ + return crex.mwtid.fullmatch(cols[ID]) + +def is_empty_node(cols): + """ + Checks whether a CoNLL-U line represents an empty node by checking + that its ID field is a floating-point number. + + Parameters + ---------- + cols : list + A CoNLL-U line, represented as a list of strings. + + Returns + ------- + _ : bool + """ + return crex.enodeid.fullmatch(cols[ID]) + +def parse_empty_node_id(cols): + """ + Parses the ID of an empty node into a 2-uple that separates it into its + integer and decimal part (e.g. "1.2" -> ("1", "2")). + + Parameters + ---------- + cols : list + A CoNLL-U line, represented as a list of strings. + + Returns + ------- + _ : tuple + A 2-uple of strings, e.g. ("1", "2"). + """ + m = crex.enodeid.fullmatch(cols[ID]) + # ! REMOVE/CHANGE + assert m, 'parse_empty_node_id with non-empty node' + return m.groups() + +def shorten(string): + """ + Truncates a string to 25 characters. + + Parameters + ---------- + cols : str + + Returns + ------- + _ : str + """ + return string if len(string) < 25 else string[:20]+'[...]' + +# ! proposal: rename to drop_subtype +def lspec2ud(deprel): + """ + Drops the relation subtype from the given DEPREL (e.g. "nmod" -> "nmod"; + "nmod:poss" -> "nmod"). + + Parameters + ---------- + deprel : str + A DEPREL (possibly with subtypes, such as "nmod:poss"). + + Returns + ------- + _ : str + A DEPREL without subtypes, such as "nmod". 
+ """ + return deprel.split(':', 1)[0] + +def formtl(node): + """ + Joins a node's form and transliteration together in a space-separated + string, e.g. "ኧሁ 'ăhu". + + Parameters + ---------- + node : udapi.core.node.Node + A word node. + + Returns + ------- + _ : str + A string in "FORM Translit" format, e.g. "ኧሁ 'ăhu". + """ + x = node.form + if node.misc['Translit'] != '': + x += ' ' + node.misc['Translit'] + return x + +def lemmatl(node): + """ + Joins a node's lemma and its transliteration together in a space-separated + string, e.g. "እኔ 'əne". + + Parameters + ---------- + node : udapi.core.node.Node + A word node. + + Returns + ------- + _ : str + A string in "LEMMA LTranslit" format, e.g. "እኔ 'əne". + """ + x = node.lemma + if node.misc['LTranslit'] != '': + x += ' ' + node.misc['LTranslit'] + return x + +def get_alt_language(node): + """ + In code-switching analysis of foreign words, an attribute in the MISC column + will hold the code of the language of the current word. Certain tests will + then use language-specific lists from that language instead of the main + language of the document. This function returns the alternative language + code if present, otherwise it returns None. + + Parameters + ---------- + node : udapi.core.node.Node object + The node (word) whose language is being queried. + """ + if node.misc['Lang'] != '': + return node.misc['Lang'] + return None + +def deps_list(cols): + """ + Parses the contents of the DEPS column and returns a list of incoming + enhanced dependencies. This is needed in early tests, before the sentence + has been fed to Udapi. + + Parameters + ---------- + cols : list + The values of the columns on the current node / token line. + + Raises + ------ + ValueError + If the contents of DEPS cannot be parsed. Note that this does not catch + all possible violations of the format, e.g., bad order of the relations + will not raise an exception. + + Returns + ------- + deps : list + Each list item is a two-member list, containing the parent index (head) + and the relation type (deprel). + """ + if cols[DEPS] == '_': + deps = [] + else: + deps = [hd.split(':', 1) for hd in cols[DEPS].split('|')] + if any(hd for hd in deps if len(hd) != 2): + # ! should be an error/incident + raise ValueError(f'malformed DEPS: {cols[DEPS]}') + return deps + +def get_line_numbers_for_ids(state, sentence): + """ + Takes a list of sentence lines (mwt ranges, word nodes, empty nodes). + For each mwt / node / word, gets the number of the line in the input + file where the mwt / node / word occurs. We will need this in other + functions to be able to report the line on which an error occurred. + + Parameters + ---------- + sentence : list + List of mwt / words / nodes, each represented as a list of columns. + + Returns + ------- + linenos : dict + Key: word ID (string, not int; decimal for empty nodes and range for + mwt lines). Value: 1-based index of the line in the file (int). + """ + linenos = {} + node_line = state.sentence_line - 1 + for cols in sentence: + node_line += 1 + linenos[cols[ID]] = node_line + # For normal words, add them also under integer keys, just in case + # we later forget to convert node.ord to string. But we cannot do the + # same for empty nodes and multiword tokens. 
+        if is_word(cols):
+            linenos[int(cols[ID])] = node_line
+    return linenos
+
+def next_block(fin):
+    block = []
+    for counter, line in enumerate(fin):
+        block.append((counter, line.rstrip("\n\r")))
+        if re.fullmatch(r"^\s*$", line):
+            yield block
+            block = []
+    if len(block): yield block
diff --git a/validator/src/validator/validate.py b/validator/src/validator/validate.py
new file mode 100644
index 000000000..eabb7cec5
--- /dev/null
+++ b/validator/src/validator/validate.py
@@ -0,0 +1,2538 @@
+import os
+import collections
+import regex as re
+import unicodedata
+import logging
+import inspect
+
+from typing import List, Tuple, TextIO, Set, Any, Dict
+from dataclasses import dataclass, field
+
+from validator.incident import Incident, Error, Warning, TestClass, IncidentType
+import validator.utils as utils
+import validator.compiled_regex as crex
+# from validator.validate_lib import State
+from validator.logging_utils import setup_logging
+
+logger = logging.getLogger(__name__)
+setup_logging(logger)
+
+@dataclass
+class State:
+    current_file_name:str
+    sentence_id:str = ''
+    parallel_id:str = ''
+    known_sent_ids:Set = field(default_factory=set)
+    known_parallel_ids:Set = field(default_factory=set)
+    parallel_id_lastalt: collections.defaultdict[None] = field(default_factory=lambda: collections.defaultdict(None))
+
+def validate(paths, cfg_obj):
+    '''
+    Validates the input files.
+    '''
+    # TODO: complete docstring
+    for path in paths:
+        yield validate_file(path, cfg_obj)
+
+def run_checks(checks, parameters, incidents, state):
+
+    current_incidents = []
+
+    for check, check_info in checks.items():
+
+        dependencies = []
+        if 'depends_on' in check_info:
+            dependencies = check_info['depends_on']
+
+        fun = globals()[check]
+        if not any(err.testid in dependencies for err in current_incidents):
+            current_incidents.extend([err.set_state(state) for err in fun(**parameters)])
+        else:
+            pass
+            # incidents.append(
+            #     Warning(
+            #         level=0,
+            #         testclass=TestClass.INTERNAL,
+            #         testid='skipped-check',
+            #         message=f"Check {check} not performed because of previous failures",
+            #     )
+            # )
+
+    incidents.extend(current_incidents)
+
+def validate_file(path, cfg_obj):
+
+    state = State(current_file_name=os.path.basename(path))
+    incidents = []
+
+    # newline='' necessary because otherwise non-unix newlines are
+    # automagically converted to \n, see
+    # https://docs.python.org/3/library/functions.html#open
+    with open(path, newline='') as fin:
+
+        logger.info("Opening file %s", path)
+        block = []
+        for block in utils.next_block(fin):
+            # state.current_line = block[0][0]
+            comments = [(counter,line) for (counter,line) in block if line and line[0] == "#"]
+            tokens = [(counter,line) for (counter,line) in block if line and line[0].isdigit()]
+
+            for (counter, line) in comments:
+                match_sentid = crex.sentid.fullmatch(line)
+                if match_sentid:
+                    state.sentence_id = match_sentid.group(1)
+
+
+            if cfg_obj['block']:
+                params = {
+                    "block": block,
+                    "useless_param": True
+                }
+                run_checks(cfg_obj['block'], params, incidents, state)
+
+            block = [(counter,line) for (counter,line) in block if line]
+            if cfg_obj['line']:
+                for (counter,line) in block:
+                    params = {
+                        "line": (counter, line)
+                    }
+                    # state.current_line = counter
+                    run_checks(cfg_obj['line'], params, incidents, state)
+
+            if 'comment_lines' in cfg_obj:
+                params = {
+                    "comments" : comments,
+                    "allow_slash": True,
+                    "known_sent_ids": state.known_sent_ids,
+                    "state": state
+                }
+                run_checks(cfg_obj['comment_lines'], params, incidents, state)
+
+            # for (counter,line) in tokens:
+            #     state.current_line = counter
+            #     run_checks(cfg_obj['token_lines'], line, incidents, state)
+
+            # state.current_line = block[0][0] # TODO: FIND MORE ELEGANT SOLUTION
+            tokens = [(counter,line.split("\t")) for (counter,line) in tokens]
+
+            if cfg_obj['tokens_cols']:
+                params = {
+                    "sentence": tokens
+                }
+                run_checks(cfg_obj['tokens_cols'], params, incidents, state)
+
+            for (counter,line) in tokens:
+                params = {
+                    "cols": (counter, line)
+                }
+                run_checks(cfg_obj['cols'], params, incidents, state)
+
+            if len(block) == 1 and not block[0][1]:
+                incidents.append(Error(
+                    testid='missing-empty-line',
+                    message='Missing empty line after the last sentence.'
+                ))
+
+        if 'file' in cfg_obj:
+            params = {
+                "inp": fin
+            }
+            run_checks(cfg_obj['file'], params, incidents, state)
+
+    return incidents
+
+
+#==============================================================================
+# Level 1 tests. Only CoNLL-U backbone. Values can be empty or non-UD.
+#==============================================================================
+
+#* DONE
+def check_invalid_lines(line:Tuple[int, str], **_) -> List[Incident]:
+    '''
+    check_invalid_lines checks for lines that are not empty, not comments and not tokens.
+
+    Empty lines are checked against the utils.is_whitespace() function.
+    Tokens are identified by the first character of the line being a digit.
+    Comments are identified by the first character of the line being a '#'.
+
+    Parameters
+    ----------
+    line : Tuple[int, str]
+        The input line to be tested.
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    invalid-line
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/invalid-lines.conllu
+    '''
+    lineno, line = line
+    incidents = []
+    if line and not (line[0].isdigit() or line[0] == "#" or utils.is_whitespace(line)):
+        incidents.append(Error(
+            testid='invalid-line',
+            message=(f"Spurious line: '{line}'. "
+                     "All non-empty lines should start with a digit or the # character. "
+                     "The line will be excluded from further tests."),
+            lineno=lineno
+        ))
+        logger.debug("'invalid-line' error triggered by line '%s'", line)
+    return incidents
+
+#* DONE
+def check_columns_format(line:Tuple[int, str], **_) -> List[Incident]:
+    '''check_columns_format checks that the line is made up of the right number of columns.
+    Moreover, it checks that no column is empty, that there is no leading or trailing
+    whitespace, and that no whitespace occurs inside a field, except in FORM and LEMMA
+    when the token is not a multiword token. In the case of a multiword token, whitespace
+    is not allowed in any field.
+
+    Parameters
+    ----------
+    line : Tuple[int, str]
+        The input line to be tested.
+        Tests are only performed if the line is a token (i.e., line starts with a digit)
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    number-of-columns, empty-column, leading-whitespace, trailing-whitespace,
+    repeated-whitespace, invalid-whitespace-mwt, invalid-whitespace
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/columns-format.conllu
+
+    Depends-on
+    ----------
+    To be run only if no
+    - 'invalid-line'
+    errors are found on the same line.
+ ''' + lineno, line = line + incidents = [] + + # the function is only defined on potential tokens + if not line[0].isdigit(): + return incidents + + cols = line.split("\t") + if not len(cols) == utils.COLCOUNT: + incidents.append(Error( + testid='number-of-columns', + message=(f"The line has {len(cols)} columns but {utils.COLCOUNT} are expected. " + "The line will be excluded from further tests.") + )) + logger.debug("'number-of-columns' triggered by line '%s'.", line) + logger.debug("No other checks performed") + return incidents + + for col_idx in range(utils.COLCOUNT): + + # Must never be empty + if not cols[col_idx]: + incidents.append(Error( + testid='empty-column', + message=f'Empty value in column {utils.COLNAMES[col_idx]}.' + )) + logger.debug("'empty-columns' error triggered by column '%s'.", utils.COLNAMES[col_idx]) + else: + + # Must never have leading/trailing whitespace + if cols[col_idx][0].isspace(): + incidents.append(Error( + testclass=TestClass.FORMAT, + testid='leading-whitespace', + message=f"Leading whitespace not allowed in column {utils.COLNAMES[col_idx]}: '{cols[col_idx]}'." + )) + logger.debug("'leading-whitespace' error triggered by column '%s'.", utils.COLNAMES[col_idx]) + + if cols[col_idx][-1].isspace(): + incidents.append(Error( + testclass=TestClass.FORMAT, + testid='trailing-whitespace', + message=f"Trailing whitespace not allowed in column {utils.COLNAMES[col_idx]}: '{cols[col_idx]}'." + )) + logger.debug("'trailing-whitespace' triggered by column '%s'.", utils.COLNAMES[col_idx]) + + # Must never contain two consecutive whitespace characters + if crex.ws2.search(cols[col_idx]): + incidents.append(Error( + testclass=TestClass.FORMAT, + testid='repeated-whitespace', + message=("Two or more consecutive whitespace characters not allowed " + f"in column {utils.COLNAMES[col_idx]}.") + )) + logger.debug("'repeated-whitespace' triggered by column '%s'.", utils.COLNAMES[col_idx]) + + # Multi-word tokens may have whitespaces in MISC but not in FORM or LEMMA. + # If it contains a space, it does not make sense to treat it as a MWT. + if utils.is_multiword_token(cols): + for col_idx in (utils.FORM, utils.LEMMA): + if crex.ws.search(cols[col_idx]): + incidents.append(Error( + testclass=TestClass.FORMAT, + testid='invalid-whitespace-mwt', + message=(f"White space not allowed in multi-word token '{cols[col_idx]}'. " + "If it contains a space, it is not one surface token.") + )) + logger.debug("'invalid-whitespace-mwt' triggered by column '%s'.", utils.COLNAMES[col_idx]) + + # These columns must not have whitespace. + for col_idx in (utils.ID, utils.UPOS, utils.XPOS, utils.FEATS, utils.HEAD, utils.DEPREL, utils.DEPS): + # if crex.ws.search(cols[col_idx]): + if crex.ws.search(cols[col_idx].strip()): + incidents.append(Error( + testclass=TestClass.FORMAT, + testid='invalid-whitespace', + message=f"White space not allowed in column {utils.COLNAMES[col_idx]}: '{cols[col_idx]}'." + )) + logger.debug("'invalid-whitespace' triggered by column '%s'.", utils.COLNAMES[col_idx]) + + # ! Comment from previous validator + # ?: get rid of this comment + # We should also check the ID format (e.g., '1' is good, '01' is wrong). + # Although it is checking just a single column, we will do it in + # validate_id_sequence() because that function has the power to block + # further tests, which could choke up on this. 
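+    # Illustrative triggers (hypothetical input, not from the test files):
+    # a token line with nine tab-separated fields yields 'number-of-columns';
+    # a FORM value of 'dog ' (with a trailing space) yields
+    # 'trailing-whitespace'; an ID value of '1 0' yields 'invalid-whitespace'.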
+
+    return incidents
+
+#* DONE
+def check_misplaced_comment(block: List[Tuple[int, str]], **_) -> List[Incident]:
+    '''check_misplaced_comment checks that comments (i.e., lines starting with '#') always precede
+    tokens (i.e., lines starting with digits)
+
+    Parameters
+    ----------
+    block : List[Tuple[int, str]]
+        The input lines to be tested.
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    misplaced-comment
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/misplaced-comment.conllu
+    '''
+    incidents = []
+
+    if len(block) > 1:
+        max_comment = len(block)
+        min_token = -1
+        error_lineno = 0
+        for (counter, (lineno, line)) in enumerate(block):
+            if line:
+                if line[0] == "#":
+                    max_comment = counter
+                    error_lineno = lineno
+                else:
+                    if min_token == -1:
+                        min_token = counter
+
+        if max_comment >= min_token:
+            error = Error(
+                testclass=TestClass.FORMAT,
+                testid='misplaced-comment',
+                message='Spurious comment line. Comments are only allowed before a sentence.',
+                lineno=error_lineno
+            )
+            incidents.append(error)
+            logger.debug("'misplaced-comment' error triggered by line: '%s'.", block[max_comment][1])
+
+    return incidents
+
+#* DONE
+def check_extra_empty_line(block: List[Tuple[int, str]], **_) -> List[Incident]:
+    '''check_extra_empty_line checks that exactly one empty line is present after every sentence
+
+    Parameters
+    ----------
+    block : List[Tuple[int, str]]
+        The input lines to be tested.
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    extra-empty-line
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/extra-empty-line.conllu
+    '''
+
+    incidents = []
+    if len(block) == 1 and (utils.is_whitespace(block[0][1]) or len(block[0][1])==0):
+        error = Error(
+            testclass=TestClass.FORMAT,
+            testid='extra-empty-line',
+            message='Spurious empty line. Only one empty line is expected after every sentence.',
+            lineno=block[0][0]
+        )
+        incidents.append(error)
+        logger.debug("'extra-empty-line' triggered by line '%s'", block[0][1])
+
+    return incidents
+
+#* DONE
+def check_pseudo_empty_line(line:Tuple[int, str], **_) -> List[Incident]:
+    '''check_pseudo_empty_line checks whether a line that appears empty actually contains
+    whitespace characters.
+
+    Parameters
+    ----------
+    line : Tuple[int, str]
+        The input line to be tested.
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    pseudo-empty-line
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/pseudo-empty-line.conllu
+    '''
+    lineno, line = line
+    incidents = []
+    if utils.is_whitespace(line):
+        error = Error(testclass=TestClass.FORMAT,
+                      testid='pseudo-empty-line',
+                      message=("Spurious line that appears empty but is not; "
+                               "there are whitespace characters."),
+                      lineno=lineno)
+        incidents.append(error)
+        logger.debug("'pseudo-empty-line' triggered by line '%s'", line)
+    return incidents
+
+#* DONE
+def check_unicode_normalization(line:Tuple[int, str], **_) -> List[Incident]:
+    '''check_unicode_normalization checks that letters composed of multiple Unicode characters
+    (such as a base letter plus combining diacritics) conform to NFC normalization (canonical
+    decomposition followed by canonical composition).
+
+    Parameters
+    ----------
+    line : Tuple[int, str]
+        The input line to be tested. If the line consists of TAB-separated
+
+#* DONE
+def check_unicode_normalization(line: Tuple[int, str], **_) -> List[Incident]:
+    '''check_unicode_normalization checks that letters composed of multiple Unicode characters
+    (such as a base letter plus combining diacritics) conform to NFC normalization (canonical
+    decomposition followed by canonical composition).
+
+    Parameters
+    ----------
+    line : Tuple[int, str]
+        The input line to be tested. If the line consists of TAB-separated
+        fields (token line), error reports will specify the field where the
+        error occurred. Otherwise (comment line), the error report will not be
+        localized.
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    unicode-normalization
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/unicode-normalization.conllu
+    '''
+    lineno, line = line
+    incidents = []
+    normalized_text = unicodedata.normalize('NFC', line)
+    if line != normalized_text:
+        # Find the first mismatching character and include it in the report.
+        firsti = -1
+        firstj = -1
+        inpfirst = ''
+        nfcfirst = ''
+        tcols = line.split("\t")
+        ncols = normalized_text.split("\t")
+
+        for i in range(min(len(tcols), len(ncols))):
+            for j in range(min(len(tcols[i]), len(ncols[i]))):
+                if tcols[i][j] != ncols[i][j]:
+                    firsti = i
+                    firstj = j
+                    inpfirst = unicodedata.name(tcols[i][j])
+                    nfcfirst = unicodedata.name(ncols[i][j])
+                    break
+            if firsti >= 0:
+                break
+        if len(tcols) > 1:
+            testmessage = f"Unicode not normalized: {utils.COLNAMES[firsti]}.character[{firstj}] is {inpfirst}, should be {nfcfirst}."
+        else:
+            testmessage = f"Unicode not normalized: character[{firstj}] is {inpfirst}, should be {nfcfirst}."
+
+        # TODO: The old validator additionally explained that this usually means
+        # a base character followed by combining diacritics, named the following
+        # input character, and pointed to the normalize_unicode.pl script in the
+        # tools repository. Decide whether to restore that longer explanation.
+
+        incidents.append(Error(
+            testclass=TestClass.UNICODE,
+            testid='unicode-normalization',
+            message=testmessage,
+            lineno=lineno
+        ))
+        logger.debug("'unicode-normalization' error triggered by line '%s'", line)
+
+    return incidents
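For intuition about what NFC normalization does here: a base letter followed by a combining diacritic is replaced by the single precomposed character. A standalone snippet, independent of the validator:

    import unicodedata

    decomposed = 'Jose\u0301'                 # 'e' + COMBINING ACUTE ACCENT
    composed = unicodedata.normalize('NFC', decomposed)
    print(decomposed == composed)             # False: the input was not NFC
    print(unicodedata.name(decomposed[3]), '->', unicodedata.name(composed[3]))
    # LATIN SMALL LETTER E -> LATIN SMALL LETTER E WITH ACUTE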
+
+#? one if to check
+def check_id_sequence(sentence: List[Tuple[int, List[str]]], **_) -> List[Incident]:
+    '''check_id_sequence checks that the ID sequence is correctly formed.
+    If this function returns a nonempty list, subsequent tests should not be run.
+
+    Parameters
+    ----------
+    sentence : List[Tuple[int, List[str]]]
+        A list of lists representing a sentence in tabular format.
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    invalid-word-id, misplaced-word-interval, misplaced-empty-node,
+    word-id-sequence, reversed-word-interval, word-interval-out
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/invalid-word-id.conllu
+    test-cases/invalid-functions/misplaced-empty-node.conllu
+    test-cases/invalid-functions/misplaced-empty-node-2.conllu
+    test-cases/invalid-functions/misplaced-word-interval.conllu
+    test-cases/invalid-functions/word-id-sequence.conllu
+    test-cases/invalid-functions/word-id-sequence-2.conllu
+    test-cases/invalid-functions/reversed-word-interval.conllu
+    '''
+    incidents = []
+
+    words = []
+    tokens = []
+    current_word_id, next_empty_id = 0, 1
+    for lineno, cols in sentence:
+        # Check the format of the ID value. (ID must not be empty.)
+        if not (utils.is_word(cols) or utils.is_empty_node(cols) or utils.is_multiword_token(cols)):
+            error = Error(
+                testid='invalid-word-id',
+                message=f"Unexpected ID format '{cols[utils.ID]}'.",
+                lineno=lineno
+            )
+            incidents.append(error)
+            logger.debug("'invalid-word-id' error triggered by line '%s'", '\t'.join(cols))
+            continue
+        if not utils.is_empty_node(cols):
+            next_empty_id = 1  # reset sequence
+        if utils.is_word(cols):
+            t_id = int(cols[utils.ID])
+            current_word_id = t_id
+            words.append(t_id)
+            # Not covered by the previous interval?
+            if not (tokens and tokens[-1][0] <= t_id <= tokens[-1][1]):
+                tokens.append((t_id, t_id))  # nope - let's make a default interval for it
+        #! looks like a duplicate of check_token_ranges
+        elif utils.is_multiword_token(cols):
+            # utils.is_multiword_token() returns the interval match, so the
+            # format of the interval needs no further checking here.
+            match = utils.is_multiword_token(cols)
+            beg, end = int(match.group(1)), int(match.group(2))
+            if not ((not words and beg >= 1) or (words and beg >= words[-1] + 1)):
+                error = Error(
+                    testid='misplaced-word-interval',
+                    message='Multiword range not before its first word.',
+                    lineno=lineno
+                )
+                incidents.append(error)
+                logger.debug("'misplaced-word-interval' error triggered by line '%s'", '\t'.join(cols))
+                continue
+            tokens.append((beg, end))
+        elif utils.is_empty_node(cols):
+            word_id, empty_id = (int(i) for i in utils.parse_empty_node_id(cols))
+            if word_id != current_word_id or empty_id != next_empty_id:
+                incidents.append(Error(
+                    testid='misplaced-empty-node',
+                    message=f'Empty node id {cols[utils.ID]}, expected {current_word_id}.{next_empty_id}',
+                    lineno=lineno
+                ))
+                logger.debug("'misplaced-empty-node' error triggered by line '%s'", '\t'.join(cols))
+            next_empty_id += 1
+            # Interaction of multiword tokens and empty nodes if there is an empty
+            # node between the first word of a multiword token and the previous word:
+            # This sequence is correct: 4 4.1 5-6 5 6
+            # This sequence is wrong:   4 5-6 4.1 5 6
+            if word_id == current_word_id and tokens and word_id < tokens[-1][0]:
+                incidents.append(Error(
+                    testid='misplaced-empty-node',
+                    message=(f"Empty node id {cols[utils.ID]} must occur before multiword token "
+                             f"{tokens[-1][0]}-{tokens[-1][1]}."),
+                    lineno=lineno
+                ))
+                logger.debug("'misplaced-empty-node' error triggered by line '%s'", '\t'.join(cols))
+    # Now let's do some basic sanity checks on the sequences.
+    # Expected sequence of word IDs is 1, 2, ...
+    expstrseq = ','.join(str(x) for x in range(1, len(words) + 1))
+    wrdstrseq = ','.join(str(x) for x in words)
+    if wrdstrseq != expstrseq:
+        incidents.append(Error(
+            testid='word-id-sequence',
+            message=f"Words do not form a sequence. Got '{wrdstrseq}'. Expected '{expstrseq}'.",
+            lineno=sentence[0][0]
+        ))
+        logger.debug("'word-id-sequence' error triggered by sequence '%s'", wrdstrseq)
+
+    # Check elementary sanity of word intervals.
+    # Remember that these are not just multi-word tokens. Here we have intervals
+    # even for single-word tokens (b=e)!
+    for (b, e) in tokens:
+        #? how can this be triggered?
+        if e < b:  # end before beginning
+            incidents.append(Error(
+                testid='reversed-word-interval',
+                message=f'Spurious token interval {b}-{e}',
+                lineno=sentence[0][0]
+            ))
+            logger.debug("'reversed-word-interval' error triggered by sequence '%s-%s'", b, e)
+            continue
+        if b < 1 or e > len(words):  # out of range
+            incidents.append(Error(
+                testid='word-interval-out',
+                message=f'Spurious token interval {b}-{e} (out of range)',
+                lineno=sentence[0][0]
+            ))
+            logger.debug("'word-interval-out' error triggered by sequence '%s-%s'", b, e)
+            continue
+
+    return incidents
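The core of the word-id-sequence test reduces to comparing the collected word IDs with the expected 1..n range; a standalone mini-example that reproduces the message format used above:

    words = [1, 1]    # a duplicate ID

    expected = ','.join(str(x) for x in range(1, len(words) + 1))
    got = ','.join(str(x) for x in words)
    if got != expected:
        print(f"Words do not form a sequence. Got '{got}'. Expected '{expected}'.")
    # -> Words do not form a sequence. Got '1,1'. Expected '1,2'.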
+
+#* DONE
+def check_token_ranges(sentence: List[Tuple[int, List[str]]], **_) -> List[Incident]:
+    '''check_token_ranges checks that the word ranges for multiword tokens are valid.
+
+    Parameters
+    ----------
+    sentence : List[Tuple[int, List[str]]]
+        A list of lists representing a sentence in tabular format.
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    invalid-word-interval, overlapping-word-intervals
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/invalid-word-id.conllu
+    test-cases/invalid-functions/overlapping-word-interval.conllu
+    '''
+
+    incidents = []
+    covered = set()
+    for lineno, cols in sentence:
+        if "-" not in cols[utils.ID]:
+            continue
+        m = crex.mwtid.fullmatch(cols[utils.ID])
+        if not m:
+            incidents.append(Error(
+                testid="invalid-word-interval",
+                message=f"Spurious word interval definition: '{cols[utils.ID]}'.",
+                lineno=lineno
+            ))
+            logger.debug("'invalid-word-interval' error triggered by line '%s'", cols)
+            continue
+        start, end = m.groups()
+        start, end = int(start), int(end)
+        # Do not test if start >= end:
+        # this is tested in check_id_sequence().
+        if covered & set(range(start, end + 1)):
+            incidents.append(Error(
+                testid='overlapping-word-intervals',
+                message=f'Range overlaps with others: {cols[utils.ID]}',
+                lineno=lineno))
+            logger.debug("'overlapping-word-intervals' error triggered by line '%s'", cols)
+        covered |= set(range(start, end + 1))
+    return incidents
+
+#* DONE
+def check_newlines(inp: TextIO, **_) -> List[Incident]:
+    '''check_newlines checks that the input file consistently uses unix-style newlines
+    (LF only, not CR LF as on Windows). To be run on the input file handle after the
+    whole input has been read.
+    This check is universal and not configurable.
+
+    Parameters
+    ----------
+    inp : TextIO
+        File handle that is being read.
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    non-unix-newline
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/non-unix-newline.conllu
+    '''
+
+    incidents = []
+    if inp.newlines and inp.newlines != '\n':
+        incidents.append(Error(
+            level=1,
+            testclass=TestClass.FORMAT,
+            testid='non-unix-newline',
+            message='Only the unix-style LF line terminator is allowed.'
+        ))
+        logger.debug("'non-unix-newline' error triggered")
+
+    return incidents
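check_newlines relies on the newlines attribute that Python's text-mode file objects expose once data has been read; a standalone demonstration with an in-memory stream standing in for the input file:

    import io

    # Simulate a file with Windows-style line terminators.
    buf = io.BytesIO(b'# sent_id = s1\r\n1\tHi\thi\tINTJ\t_\t_\t0\troot\t_\t_\r\n')
    with io.TextIOWrapper(buf, encoding='utf-8') as inp:
        inp.read()                        # newlines is only populated by reading
        if inp.newlines and inp.newlines != '\n':
            print('Only the unix-style LF line terminator is allowed.')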
+#==============================================================================
+# Level 2 tests. Tree structure, universal tags and deprels. Note that any
+# well-formed Feature=Value pair is allowed (because it could be language-
+# specific) and any word form or lemma can contain spaces (because language-
+# specific guidelines may permit it).
+#==============================================================================
+
+#* DONE
+def check_sent_id(comments: List[Tuple[int, str]],
+                  allow_slash: bool,
+                  known_sent_ids: Set,
+                  state: State = None,
+                  **_) -> List[Incident]:
+    '''check_sent_id checks that the sentence id exists, is well-formed and unique.
+
+    Parameters
+    ----------
+    comments : List[Tuple[int, str]]
+        A list of comments, represented as strings.
+    allow_slash : bool
+        Whether exactly one "/" character is allowed (this is reserved for
+        parallel treebanks). This parameter replaces lang, which was used to
+        allow slashes when equal to "ud".
+    known_sent_ids : Set
+        The set of previously encountered sentence IDs.
+    state : State, optional
+        The object where known_sent_ids are stored and updated, by default None
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    invalid-sent-id, missing-sent-id, multiple-sent-id, non-unique-sent-id, slash-in-sent-id
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/multiple-sent-id.conllu
+    '''
+
+    incidents = []
+    matched = []
+    firstmatch = -1
+    for lineno, c in comments:
+        match = crex.sentid.fullmatch(c)
+        if match:
+            matched.append(match)
+            firstmatch = lineno
+        elif c.startswith('# sent_id') or c.startswith('#sent_id'):
+            incidents.append(Error(
+                testclass=TestClass.METADATA,
+                level=2,
+                testid='invalid-sent-id',
+                message=(f"Spurious sent_id line: '{c}' should look like '# sent_id = xxxxx' "
+                         "where xxxxx is not whitespace. Forward slash reserved for special purposes."),
+                lineno=lineno
+            ))
+            logger.debug("'invalid-sent-id' triggered by line '%s'", c)
+
+    if not matched:
+        incidents.append(Error(
+            testclass=TestClass.METADATA,
+            level=2,
+            testid='missing-sent-id',
+            message='Missing the sent_id attribute.',
+            lineno=comments[0][0]
+        ))
+        logger.debug("'missing-sent-id' triggered by comments '%s'", '\n'.join([c for _, c in comments]))
+    elif len(matched) > 1:
+        incidents.append(Error(
+            testclass=TestClass.METADATA,
+            level=2,
+            testid='multiple-sent-id',
+            message='Multiple sent_id attributes.',
+            lineno=comments[0][0]
+        ))
+        logger.debug("'multiple-sent-id' triggered by comments '%s'", '\n'.join([c for _, c in comments]))
+    else:
+        # Uniqueness of sentence ids should be tested treebank-wide, not just file-wide.
+        # For that to happen, all three files should be tested at once.
+        sid = matched[0].group(1)
+        if sid in known_sent_ids:
+            incidents.append(Error(
+                testclass=TestClass.METADATA,
+                level=2,
+                testid='non-unique-sent-id',
+                message=f"Non-unique sent_id attribute '{sid}'.",
+                lineno=firstmatch
+            ))
+            logger.debug("'non-unique-sent-id' triggered by sid '%s'", sid)
+
+        if sid.count('/') > 1 or (sid.count('/') == 1 and not allow_slash):
+            incidents.append(Error(
+                testclass=TestClass.METADATA,
+                level=2,
+                testid='slash-in-sent-id',
+                message=f"The forward slash is reserved for special use in parallel treebanks: '{sid}'",
+                lineno=firstmatch
+            ))
+            logger.debug("'slash-in-sent-id' triggered by sid '%s'", sid)
+
+        if state:
+            state.known_sent_ids.add(sid)
+    return incidents
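For illustration, a standalone approximation of the sent_id matching; the actual pattern lives in crex.sentid, so the regex below is only an assumption for the sketch:

    import re

    sentid = re.compile(r'# sent_id = (\S+)')    # hypothetical stand-in for crex.sentid

    for comment in ('# sent_id = train-001', '# sent_id = train 001'):
        match = sentid.fullmatch(comment)
        print(repr(comment), '->', match.group(1) if match else 'invalid-sent-id')
    # '# sent_id = train-001' -> train-001
    # '# sent_id = train 001' -> invalid-sent-id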
+
+#! needs checking and testing, I don't think it works
+def check_parallel_id(comments: List[Tuple[int, str]],
+                      known_parallel_ids: Set,
+                      parallel_id_lastalt: Any,    #TODO: define type
+                      parallel_id_lastpart: Any,   #TODO: define type
+                      state: State = None,
+                      **_) -> List[Incident]:
+    '''check_parallel_id checks the parallel_id sentence-level comment, which
+    is used after sent_id of sentences that are parallel translations of
+    sentences in other treebanks. Like sent_id, it must be well-formed and
+    unique. Unlike sent_id, it is optional. Sentences that do not have it are
+    not parallel.
+
+    Parameters
+    ----------
+    comments : List[Tuple[int, str]]
+        A list of comments, represented as strings.
+    known_parallel_ids : Set
+        The set of previously encountered parallel IDs.
+    parallel_id_lastalt : Any
+        For each parallel sentence id, the last observed 'alt' number (or None).
+    parallel_id_lastpart : Any
+        For each parallel sentence id, the last observed 'part' number (or None).
+    state : State, optional
+        The object where known_parallel_ids are stored and updated, by default None
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    invalid-parallel-id, multiple-parallel-id, non-unique-parallel-id, parallel-id-alt,
+    parallel-id-part
+
+    Reference-test
+    --------------
+    TODO
+    '''
+
+    incidents = []
+    matched = []
+    for lineno, c in comments:
+        match = crex.parallelid.fullmatch(c)
+        if match:
+            matched.append((lineno, match))
+        elif c.startswith('# parallel_id') or c.startswith('#parallel_id'):
+            error = Error(
+                level=2,
+                testclass=TestClass.METADATA,
+                lineno=lineno,
+                testid='invalid-parallel-id',
+                message=(f"Spurious parallel_id line: '{c}' should look like "
+                         "'# parallel_id = corpus/sentence' where corpus is [a-z]+ "
+                         "and sentence is [-0-9a-z]. "
+                         "Optionally, '/alt[1-9][0-9]*' and/or 'part[1-9][0-9]*' may follow.")
+            )
+            incidents.append(error)
+
+    if len(matched) > 1:
+        incidents.append(Error(
+            level=2,
+            testclass=TestClass.METADATA,
+            testid='multiple-parallel-id',
+            lineno=matched[1][0],  # error on second parallel_id found
+            message="Multiple parallel_id attributes."
+        ))
+    elif matched:
+        lineno, match = matched[0]
+        # Uniqueness of parallel ids should be tested treebank-wide, not just file-wide.
+        # For that to happen, all three files should be tested at once.
+        pid = match.group(1)
+        if pid in known_parallel_ids:
+            incidents.append(Error(
+                level=2,
+                testclass=TestClass.METADATA,
+                testid='non-unique-parallel-id',
+                lineno=lineno,
+                message=f"Non-unique parallel_id attribute '{pid}'."
+            ))
+        else:
+            # Additional tests when pid has altN or partN.
+            # Do them only if the whole pid is unique.
+            sid = match.group(2) + '/' + match.group(3)
+            alt = None
+            part = None
+            altpart = match.group(4)
+            if altpart:
+                apmatch = re.fullmatch(r"(?:alt([0-9]+))?(?:part([0-9]+))?", altpart)
+                if apmatch:
+                    alt = apmatch.group(1)
+                    part = apmatch.group(2)
+            if alt:
+                alt = int(alt)
+            if part:
+                part = int(part)
+            if sid in parallel_id_lastalt:
+                if (parallel_id_lastalt[sid] is None and alt is not None) or \
+                   (parallel_id_lastalt[sid] is not None and alt is None):
+                    incidents.append(Error(
+                        level=2,
+                        testid='parallel-id-alt',
+                        testclass=TestClass.METADATA,
+                        message=(f"Some instances of parallel sentence '{sid}' have the 'alt' "
+                                 "suffix while others do not.")
+                    ))
+                elif alt is not None and alt != parallel_id_lastalt[sid] + 1:
+                    incidents.append(Error(
+                        level=2,
+                        testid='parallel-id-alt',
+                        testclass=TestClass.METADATA,
+                        message=(f"The alt suffix of parallel sentence '{sid}' should be "
+                                 f"{parallel_id_lastalt[sid]}+1 but it is {alt}.")
+                    ))
+
+            parallel_id_lastalt[sid] = alt
+            if state:
+                state.parallel_id_lastalt[sid] = alt
+
+            if sid in parallel_id_lastpart:
+                if (parallel_id_lastpart[sid] is None and part is not None) or \
+                   (parallel_id_lastpart[sid] is not None and part is None):
+                    incidents.append(Error(
+                        testid='parallel-id-part',
+                        level=2,
+                        testclass=TestClass.METADATA,
+                        message=(f"Some instances of parallel sentence '{sid}' have the 'part' "
+                                 "suffix while others do not.")
+                    ))
+                elif part is not None and part != parallel_id_lastpart[sid] + 1:
+                    incidents.append(Error(
+                        testid='parallel-id-part',
+                        level=2,
+                        testclass=TestClass.METADATA,
+                        message=(f"The part suffix of parallel sentence '{sid}' should be "
+                                 f"{parallel_id_lastpart[sid]}+1 but it is {part}.")
+                    ))
+            parallel_id_lastpart[sid] = part
+            if state:
+                state.parallel_id_lastpart[sid] = part
+        if state:
+            state.known_parallel_ids.add(pid)
+    return incidents
+
+#! needs checking and testing
+def check_text_meta(comments: List[Tuple[int, str]],
+                    sentence: List[Tuple[int, List[str]]],
+                    spaceafterno_in_effect: bool,
+                    state: State = None,
+                    **_) -> List[Incident]:
+    '''check_text_meta checks metadata other than sentence id, that is, document breaks,
+    paragraph breaks and sentence text (which is also compared to the sequence of the
+    forms of individual tokens, and the spaces vs. SpaceAfter=No in MISC).
+
+    Parameters
+    ----------
+    comments : List[Tuple[int, str]]
+        A list of comments, represented as strings.
+    sentence : List[Tuple[int, List[str]]]
+        A list of lists representing a sentence in tabular format.
+    spaceafterno_in_effect : bool
+        Whether SpaceAfter=No was in effect at the end of the previous sentence.
+    state : State, optional
+        The object where spaceafterno_in_effect is updated, by default None
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+    '''
+
+    incidents = []
+
+    firstline = 0
+    newdoc_matched = []
+    newpar_matched = []
+    text_matched = []
+    for lineno, c in comments:
+        newdoc_match = crex.newdoc.fullmatch(c)
+        if newdoc_match:
+            newdoc_matched.append((lineno, newdoc_match))
+        newpar_match = crex.newpar.fullmatch(c)
+        if newpar_match:
+            newpar_matched.append((lineno, newpar_match))
+        text_match = crex.text.fullmatch(c)
+        if text_match:
+            text_matched.append((lineno, text_match))
+
+        if firstline == 0:
+            firstline = lineno
+
+    if len(newdoc_matched) > 1:
+        incidents.append(Error(
+            testclass=TestClass.METADATA,
+            level=2,
+            lineno=newdoc_matched[1][0],
+            testid='multiple-newdoc',
+            message='Multiple newdoc attributes.'
+        ))
+    if len(newpar_matched) > 1:
+        incidents.append(Error(
+            testclass=TestClass.METADATA,
+            level=2,
+            lineno=newpar_matched[1][0],
+            testid='multiple-newpar',
+            message='Multiple newpar attributes.'
+        ))
+
+    if (newdoc_matched or newpar_matched) and spaceafterno_in_effect:
+        incidents.append(Error(
+            testclass=TestClass.METADATA,
+            level=2,
+            lineno=firstline,
+            testid='spaceafter-newdocpar',
+            message=("New document or paragraph starts when the last token of the previous "
+                     "sentence says SpaceAfter=No.")
+        ))
+
+    if not text_matched:
+        incidents.append(Error(
+            testclass=TestClass.METADATA,
+            level=2,
+            lineno=firstline,
+            testid='missing-text',
+            message='Missing the text attribute.'
+        ))
+    elif len(text_matched) > 1:
+        incidents.append(Error(
+            testclass=TestClass.METADATA,
+            level=2,
+            lineno=text_matched[1][0],
+            testid='multiple-text',
+            message='Multiple text attributes.'
+        ))
+    else:
+        lineno, text_match = text_matched[0]
+        stext = text_match.group(1)
+        if stext[-1].isspace():
+            incidents.append(Error(
+                testclass=TestClass.METADATA,
+                level=2,
+                lineno=lineno,
+                testid='text-trailing-whitespace',
+                message='The text attribute must not end with whitespace.'
+            ))
+
+        # Validate the text against the SpaceAfter attribute in MISC.
+        skip_words = set()
+        mismatch_reported = 0  # do not report multiple mismatches in the same sentence; they usually have the same cause
+
+        for lineno, cols in sentence:
+            if 'NoSpaceAfter=Yes' in cols[utils.MISC]:  # I leave this without the split("|") to catch all
+                incidents.append(Error(
+                    testclass=TestClass.METADATA,
+                    level=2,
+                    lineno=lineno,
+                    testid='nospaceafter-yes',
+                    message="'NoSpaceAfter=Yes' should be replaced with 'SpaceAfter=No'."
+                ))
+            misc_attributes_spaceafter = [x for x in cols[utils.MISC].split('|') if re.match(r"^SpaceAfter=", x) and x != 'SpaceAfter=No']
+            if len(misc_attributes_spaceafter) > 0:
+                incidents.append(Error(
+                    testclass=TestClass.METADATA,
+                    level=2,
+                    lineno=lineno,
+                    testid='spaceafter-value',
+                    message="Unexpected value of the 'SpaceAfter' attribute in MISC. Did you mean 'SpacesAfter'?"
+                ))
+
+            #? can we change the order of these conditions and avoid the 'continue'?
+            if utils.is_empty_node(cols):
+                if 'SpaceAfter=No' in cols[utils.MISC]:  # I leave this without the split("|") to catch all
+                    incidents.append(Error(
+                        testclass=TestClass.METADATA,
+                        level=2,
+                        lineno=lineno,
+                        testid='spaceafter-empty-node',
+                        message="'SpaceAfter=No' cannot occur with empty nodes."
+                    ))
+                continue
+            elif utils.is_multiword_token(cols):
+                beg, end = cols[utils.ID].split('-')
+                begi, endi = int(beg), int(end)
+                # If we see a multi-word token, add its words to an ignore-set:
+                # these will be skipped, and also checked for absence of SpaceAfter=No.
+                for i in range(begi, endi + 1):
+                    skip_words.add(str(i))
+            elif cols[utils.ID] in skip_words:
+                if 'SpaceAfter=No' in cols[utils.MISC]:
+                    incidents.append(Error(
+                        testclass=TestClass.METADATA,
+                        level=2,
+                        lineno=lineno,
+                        testid='spaceafter-mwt-node',
+                        message="'SpaceAfter=No' cannot occur with words that are part of a multi-word token."
+                    ))
+                continue
+
+            # So now we have either a multi-word token or a word which is also a token in its entirety.
+            if not stext.startswith(cols[utils.FORM]):
+                if not mismatch_reported:
+                    extra_message = ''
+                    if len(stext) >= 1 and stext[0].isspace():
+                        extra_message = ' (perhaps extra SpaceAfter=No at previous token?)'
+                    incidents.append(Error(
+                        testclass=TestClass.METADATA,
+                        level=2,
+                        lineno=lineno,
+                        testid='text-form-mismatch',
+                        message=(f"Mismatch between the text attribute and the FORM field. "
+                                 f"Form[{cols[utils.ID]}] is '{cols[utils.FORM]}' but text is "
+                                 f"'{stext[:len(cols[utils.FORM])+20]}...'" + extra_message)
+                    ))
+                    mismatch_reported = 1
+            else:
+                stext = stext[len(cols[utils.FORM]):]  # eat the form
+                # Remember if SpaceAfter=No applies to the last word of the sentence.
+                # This is not prohibited in general but it is prohibited at the end of a paragraph or document.
+                #? do we need to do it for every word? Maybe just the last one
+                if 'SpaceAfter=No' in cols[utils.MISC].split("|"):
+                    if state:
+                        state.spaceafterno_in_effect = True
+                else:
+                    if state:
+                        state.spaceafterno_in_effect = False
+                    if stext and not stext[0].isspace():
+                        incidents.append(Error(
+                            testclass=TestClass.METADATA,
+                            level=2,
+                            lineno=lineno,
+                            testid='missing-spaceafter',
+                            message=(f"'SpaceAfter=No' is missing in the MISC field of node "
+                                     f"{cols[utils.ID]} because the text is '{utils.shorten(cols[utils.FORM]+stext)}'.")
+                        ))
+                    stext = stext.lstrip()
+        if stext:
+            incidents.append(Error(
+                testclass=TestClass.METADATA,
+                level=2,
+                lineno=lineno,
+                testid='text-extra-chars',
+                message=(f"Extra characters at the end of the text attribute, "
+                         f"not accounted for in the FORM fields: '{stext}'")
+            ))
+    return incidents
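To make the reconciliation loop concrete, here is a standalone toy version: it eats each FORM from the text attribute and expects a following space unless SpaceAfter=No is set (the token data below is invented for the example):

    # (FORM, has SpaceAfter=No) for the sentence text 'Hello, world!'
    tokens = [('Hello', True), (',', False), ('world', True), ('!', True)]
    stext = 'Hello, world!'

    for form, spaceafter_no in tokens:
        assert stext.startswith(form), f'text-form-mismatch at {form!r}'
        stext = stext[len(form):]             # eat the form
        if not spaceafter_no:
            if stext and not stext[0].isspace():
                print(f'missing-spaceafter at {form!r}')
            stext = stext.lstrip()
    print('extra characters:', repr(stext))   # '' -> fully reconciled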
+
+#* DONE
+def check_mwt_empty_vals(cols: Tuple[int, List[str]], **_) -> List[Incident]:
+    '''check_mwt_empty_vals checks that a multi-word token has _ empty values
+    in all fields except MISC.
+    This is required by UD guidelines although it is not a problem in general,
+    therefore a level 2 test.
+
+    Parameters
+    ----------
+    cols : Tuple[int, List[str]]
+        The values of the columns on the current node / token line.
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    mwt-nonempty-field
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/mwt-non-empty-field.conllu
+    '''
+    lineno, cols = cols
+
+    incidents = []
+
+    if not utils.is_multiword_token(cols):
+        return incidents
+
+    # All columns except the first two (ID, FORM) and the last one (MISC).
+    for col_idx in range(utils.LEMMA, utils.MISC):
+        # Exception: The feature Typo=Yes may occur in FEATS of a multi-word token.
+        if cols[col_idx] != '_' and (col_idx != utils.FEATS or cols[col_idx] != 'Typo=Yes'):
+            incidents.append(
+                Error(level=2,
+                      testclass=TestClass.FORMAT,
+                      testid='mwt-nonempty-field',
+                      message=f"A multi-word token line must have '_' in the column {utils.COLNAMES[col_idx]}. Now: '{cols[col_idx]}'.",
+                      lineno=lineno
+                      )
+            )
+            logger.debug("'mwt-nonempty-field' triggered by column '%s'", utils.COLNAMES[col_idx])
+
+    return incidents
+
+#? change testid
+def check_empty_node_empty_vals(cols: Tuple[int, List[str]], **_) -> List[Incident]:
+    '''check_empty_node_empty_vals checks that an empty node has _ empty values in HEAD and DEPREL.
+    This is required by UD guidelines but not necessarily by CoNLL-U, therefore
+    a level 2 test.
+
+    Parameters
+    ----------
+    cols : Tuple[int, List[str]]
+        The values of the columns on the current node / token line.
+
+    Returns
+    -------
+    List[Incident]
+        A list of Incidents (empty if validation is successful).
+
+    Test-ids
+    --------
+    mwt-nonempty-field #?
+
+    Reference-test
+    --------------
+    test-cases/invalid-functions/mwt-non-empty-field.conllu #?
+    '''
+    lineno, cols = cols
+    incidents = []
+
+    if not utils.is_empty_node(cols):
+        return incidents
+
+    for col_idx in (utils.HEAD, utils.DEPREL):
+        if cols[col_idx] != '_':
+            incidents.append(Error(
+                level=2,
+                testclass=TestClass.FORMAT,
+                testid='mwt-nonempty-field',
+                message=(f"An empty node must have '_' in the column {utils.COLNAMES[col_idx]}. "
+                         f"Now: '{cols[col_idx]}'."),
+                lineno=lineno
+            ))
+            logger.debug("'mwt-nonempty-field' triggered by column '%s'", utils.COLNAMES[col_idx])
+
+    return incidents
+
+#! proposal: rename into check_deps_deprel_constraints, or also check UPOS format (not value)
+#! I don't like that it relies on crex
+def check_character_constraints(cols):
+    """
+    Checks general constraints on valid characters, e.g. that UPOS
+    only contains [A-Z].
+
+    Parameters
+    ----------
+    cols : list
+        The values of the columns on the current node / token line.
+    """
+    incidents = []
+    if utils.is_multiword_token(cols):
+        logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+        return incidents
+
+    # Do not test the regular expression crex.upos here. We will test UPOS
+    # directly against the list of known tags. That is a level 2 test, too.
+
+    if utils.is_empty_node(cols) and cols[utils.DEPREL] == '_':
+        logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+        return incidents
+
+    if not crex.deprel.fullmatch(cols[utils.DEPREL]):
+        incidents.append(
+            Error(
+                level=2,
+                testclass=TestClass.SYNTAX,
+                testid='invalid-deprel',
+                message=(f"Invalid DEPREL value '{cols[utils.DEPREL]}'. Only lowercase letters "
+                         "and at most one colon-separated subtype are allowed.")
+            )
+        )
+
+    try:
+        deps = utils.deps_list(cols)
+    except ValueError:
+        incidents.append(
+            Error(
+                level=2,
+                testclass=TestClass.ENHANCED,
+                testid='invalid-deps',
+                message=f"Failed to parse DEPS: '{cols[utils.DEPS]}'."
+            )
+        )
+        logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+        return incidents
+
+    for _, edep in deps:
+        if not crex.edeprel.fullmatch(edep):
+            incidents.append(
+                Error(
+                    level=2,
+                    testclass=TestClass.ENHANCED,
+                    testid='invalid-edeprel',
+                    message=f"Invalid enhanced relation type: '{edep}' in '{cols[utils.DEPS]}'."
+                )
+            )
+    logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+    return incidents
+
+def check_upos(cols, specs):
+    """
+    Checks that the UPOS field contains one of the 17 known tags.
+
+    Parameters
+    ----------
+    cols : list
+        The values of the columns on the current node / token line.
+    specs : UDSpecs
+        The object containing specific information about the allowed values.
+    """
+    incidents = []
+    #! added checking for mwt?
+    if utils.is_multiword_token(cols) and cols[utils.UPOS] == '_':
+        logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+        return incidents
+
+    if utils.is_empty_node(cols) and cols[utils.UPOS] == '_':
+        logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+        return incidents
+
+    # Just in case, we still match UPOS against the regular expression that
+    # checks general character constraints. However, the list of UPOS, loaded
+    # from a JSON file, should conform to the regular expression.
+    if not crex.upos.fullmatch(cols[utils.UPOS]) or cols[utils.UPOS] not in specs.upos:
+        incidents.append(
+            Error(
+                level=2,
+                testclass=TestClass.MORPHO,
+                testid='unknown-upos',
+                message=f"Unknown UPOS tag: '{cols[utils.UPOS]}'."
+            )
+        )
+
+    logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+    return incidents
+
+#! proposal: rename into feature format or something alike
+def check_features_level2(cols):
+    """
+    Checks general constraints on feature-value format: permitted characters in
+    feature name and value, features must be sorted alphabetically, features
+    cannot be repeated, etc.
+
+    Parameters
+    ----------
+    cols : list
+        The values of the columns on the current node / token line.
+    """
+    incidents = []
+
+    feats = cols[utils.FEATS]
+    if feats == '_':
+        return incidents
+
+    # self.features_present(state)  # TODO: do elsewhere
+
+    feat_list = feats.split('|')  #! why not a function in utils? Like the one that gets deps
+    if [f.lower() for f in feat_list] != sorted(f.lower() for f in feat_list):
+        incidents.append(
+            Error(
+                level=2,
+                testclass=TestClass.MORPHO,
+                testid='unsorted-features',
+                message=f"Morphological features must be alphabetically sorted: '{feats}'."
+            )
+        )
+
+    # Gather the set of features here to check later that none is repeated.
+    attr_set = set()
+    # Level 2 tests character properties and canonical order but not that the f-v pair is known.
+    for feat_val in feat_list:
+        match = crex.featval.fullmatch(feat_val)
+        if not match:
+            incidents.append(
+                Error(
+                    level=2,
+                    testclass=TestClass.MORPHO,
+                    testid='invalid-feature',
+                    message=f"Spurious morphological feature: '{feat_val}'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9]."
+                )
+            )
+            # To prevent the misleading error "Repeated features are disallowed".
+            attr_set.add(feat_val)
+        else:
+            # Check that the values are sorted as well.
+            attr = match.group(1)
+            attr_set.add(attr)
+            values = match.group(2).split(',')
+            if len(values) != len(set(values)):
+                incidents.append(
+                    Error(
+                        level=2,
+                        testclass=TestClass.MORPHO,
+                        testid='repeated-feature-value',
+                        message=f"Repeated feature values are disallowed: '{feats}' (error generated by feature '{attr}')"
+                    )
+                )
+            if [v.lower() for v in values] != sorted(v.lower() for v in values):
+                incidents.append(
+                    Error(
+                        level=2,
+                        testclass=TestClass.MORPHO,
+                        testid='unsorted-feature-values',
+                        message=f"If a feature has multiple values, these must be sorted: '{feat_val}'"
+                    )
+                )
+            for v in values:
+                if not crex.val.fullmatch(v):  #! can this ever be true? If val.fullmatch() does not match, then featval.fullmatch() wouldn't either
+                    incidents.append(
+                        Error(
+                            level=2,
+                            testclass=TestClass.MORPHO,
+                            testid='invalid-feature-value',
+                            message=f"Spurious value '{v}' in '{feat_val}'. Must start with [A-Z0-9] and only contain [A-Za-z0-9]."
+                        )
+                    )
+
+    if len(attr_set) != len(feat_list):
+        incidents.append(
+            Error(
+                level=2,
+                testclass=TestClass.MORPHO,
+                testid='repeated-feature',
+                message=f"Repeated features are disallowed: '{feats}'."
+            )
+        )
+
+    # Subsequent higher-level tests could fail if a feature is not in the
+    # Feature=Value format, so the caller should skip the more fragile tests
+    # when 'invalid-feature' is reported.
+    # TODO: the engine has to know that 'invalid-feature' is a testid that prevents further testing.
+    return incidents
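The sorting and duplication rules in isolation, on a deliberately malformed FEATS string (standalone, no validator imports):

    feats = 'Number=Sing|Gender=Masc|Number=Sing'
    feat_list = feats.split('|')

    if [f.lower() for f in feat_list] != sorted(f.lower() for f in feat_list):
        print(f"Morphological features must be alphabetically sorted: '{feats}'.")
    attrs = [fv.split('=', 1)[0] for fv in feat_list]
    if len(set(attrs)) != len(attrs):
        print(f"Repeated features are disallowed: '{feats}'.")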
+
+# TODO: write tests
+def check_deps(cols):
+    """
+    Validates that DEPS is correctly formatted and that there are no
+    self-loops in DEPS (longer cycles are allowed in enhanced graphs but
+    self-loops are not).
+
+    This function must be run on raw DEPS before it is fed into Udapi because
+    it checks the order of relations, which is not guaranteed to be preserved
+    in Udapi. On the other hand, we assume that it is run after
+    check_id_references() and only if DEPS is parsable and the head indices
+    in it are OK.
+
+    Parameters
+    ----------
+    cols : list
+        The values of the columns on the current node / token line.
+    """
+    # TODO: the engine must assume that it is run after check_id_references()
+    # and only if DEPS is parsable and the head indices in it are OK.
+
+    incidents = []
+    if not (utils.is_word(cols) or utils.is_empty_node(cols)):
+        logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+        return incidents
+
+    # TODO: move elsewhere
+    # Remember whether there is at least one difference between the basic
+    # tree and the enhanced graph in the entire dataset.
+    #if cols[utils.DEPS] != '_' and cols[utils.DEPS] != cols[utils.HEAD]+':'+cols[utils.DEPREL]:
+    #    state.seen_enhancement = line
+
+    # We already know that the contents of DEPS is parsable (deps_list() was
+    # first called from check_id_references() and the head indices are OK).
+    deps = utils.deps_list(cols)
+    # Head references must be sorted numerically, but float conversion would
+    # not work: '1.10' is not equivalent to '1.1'. Compare the major (word)
+    # and minor (empty node) parts as integers instead.
+    heads = [h for h, _ in deps]
+    def head_key(h):
+        major, _, minor = h.partition('.')
+        return (int(major), int(minor or 0))
+    if heads != sorted(heads, key=head_key):
+        incidents.append(
+            Error(
+                level=2,
+                testclass=TestClass.FORMAT,
+                testid='unsorted-deps',
+                message=f"DEPS not sorted by head index: '{cols[utils.DEPS]}'"
+            )
+        )
+    else:
+        lasth = None
+        lastd = None
+        for h, d in deps:
+            if h == lasth:
+                if d < lastd:
+                    incidents.append(
+                        Error(
+                            level=2,
+                            testclass=TestClass.FORMAT,
+                            testid='unsorted-deps-2',
+                            message=f"DEPS pointing to head '{h}' not sorted by relation type: '{cols[utils.DEPS]}'"
+                        )
+                    )
+                elif d == lastd:
+                    incidents.append(
+                        Error(
+                            level=2,
+                            testclass=TestClass.FORMAT,
+                            testid='unsorted-deps',
+                            message=f"DEPS contain multiple instances of the same relation '{h}:{d}'"
+                        )
+                    )
+            lasth = h
+            lastd = d
+
+    # IDs and head references are both strings in canonical form, so the
+    # self-loop test can compare them directly.
+    if cols[utils.ID] in heads:
+        incidents.append(
+            Error(
+                level=2,
+                testclass=TestClass.ENHANCED,
+                testid='deps-self-loop',
+                message=f"Self-loop in DEPS for '{cols[utils.ID]}'"
+            )
+        )
+
+    logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+    return incidents
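The ordering subtlety that check_deps guards against, in isolation: float-based sorting would place '1.10' before '1.2', while the integer-pair key preserves the intended CoNLL-U order:

    heads = ['1', '1.2', '1.10', '2']     # correct CoNLL-U order

    def head_key(h):
        major, _, minor = h.partition('.')
        return (int(major), int(minor or 0))

    print(sorted(heads, key=float))       # ['1', '1.10', '1.2', '2']  -- wrong
    print(sorted(heads, key=head_key))    # ['1', '1.2', '1.10', '2']  -- right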
+
+def check_misc(cols):
+    """
+    In general, the MISC column can contain almost anything. However, if there
+    is a vertical bar character, it is interpreted as the separator of two
+    MISC attributes, which may or may not have the form of an attribute=value pair.
+    In general it is not forbidden that the same attribute appears several times
+    with different values, but this should not happen for selected attributes
+    that are described in the UD documentation.
+
+    This function must be run on raw MISC before it is fed into Udapi because
+    Udapi is not prepared for some of the less recommended usages of MISC.
+
+    Parameters
+    ----------
+    cols : list
+        The values of the columns on the current node / token line.
+    """
+    incidents = []
+
+    if cols[utils.MISC] == '_':
+        return incidents
+
+    misc = [ma.split('=', 1) for ma in cols[utils.MISC].split('|')]  #! why not use a function in utils? Just like the one for features
+    mamap = collections.defaultdict(int)
+    for ma in misc:
+        if ma[0] == '':
+            if len(ma) == 1:
+                incidents.append(
+                    Warning(
+                        level=2,
+                        testid='empty-misc',
+                        message="Empty attribute in MISC; possible misinterpreted vertical bar?"
+                    )
+                )
+            else:
+                incidents.append(
+                    Warning(
+                        level=2,
+                        testid='empty-misc-key',
+                        message=f"Empty MISC attribute name in '{ma[0]}={ma[1]}'."
+                    )
+                )
+        # We do not warn about MISC items that do not contain '='.
+        # But the remaining error messages below assume that ma[1] exists.
+        if len(ma) == 1:
+            ma.append('')
+        if re.search(r"^\s", ma[0]) or \
+           re.search(r"\s$", ma[0]) or \
+           re.search(r"^\s", ma[1]) or \
+           re.search(r"\s$", ma[1]):
+            incidents.append(Error(
+                level=2,
+                testid='misc-extra-space',
+                message=f"MISC attribute: leading or trailing extra space in '{'='.join(ma)}'."
+            ))
+
+        if re.match(r"^(SpaceAfter|Lang|Translit|LTranslit|Gloss|LId|LDeriv)$", ma[0]):
+            mamap[ma[0]] += 1
+        elif re.match(r"^\s*(spaceafter|lang|translit|ltranslit|gloss|lid|lderiv)\s*$", ma[0], re.IGNORECASE):
+            incidents.append(
+                Warning(
+                    level=2,
+                    testid='misc-attr-typo',
+                    message=f"Possible typo (case or spaces) in MISC attribute '{'='.join(ma)}'."
+                )
+            )
+
+    for ma in mamap:
+        if mamap[ma] > 1:
+            incidents.append(
+                Error(
+                    level=2,
+                    testclass=TestClass.FORMAT,
+                    testid='repeated-misc',
+                    message=f"MISC attribute '{ma}' not supposed to occur twice"
+                )
+            )
+
+    logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+    return incidents
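How a MISC value decomposes under these rules, as a standalone sketch:

    misc = 'SpaceAfter=No|Translit=praha|spaceafter=no'
    items = [ma.split('=', 1) for ma in misc.split('|')]
    print(items)
    # [['SpaceAfter', 'No'], ['Translit', 'praha'], ['spaceafter', 'no']]
    # 'SpaceAfter' counts toward the repeated-misc bookkeeping; 'spaceafter'
    # only triggers the misc-attr-typo warning because of its casing.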
+
+# TODO: write tests
+def check_deps_all_or_none(sentence, seen_enhanced_graph):
+    """
+    Takes the list of non-comment lines (line = list of columns) describing
+    a sentence. Checks that enhanced dependencies are present if they were
+    present at another sentence, and absent if they were absent at another
+    sentence.
+    """
+    incidents = []
+    egraph_exists = False  # enhanced deps are optional
+    for cols in sentence:
+        if not utils.is_multiword_token(cols) and (utils.is_empty_node(cols) or cols[utils.DEPS] != '_'):
+            egraph_exists = True
+
+    # We are currently testing the existence of enhanced graphs separately for each sentence.
+    # However, we should not allow that one sentence has a connected egraph and another
+    # has no enhanced dependencies. Such inconsistency could come as a nasty surprise
+    # to the users.
+    if egraph_exists:
+        if not seen_enhanced_graph:
+            # TODO: do elsewhere: state.seen_enhanced_graph = state.sentence_line
+            incidents.append(
+                Error(
+                    level=2,
+                    testclass=TestClass.ENHANCED,
+                    testid='edeps-only-sometimes',
+                    message="Enhanced graph must be empty because we saw empty DEPS earlier."
+                )
+            )
+            #! we should add something to this message in the engine where we have access to the state:
+            # on line {state.seen_tree_without_enhanced_graph}
+    else:
+        # TODO: do elsewhere:
+        # if not state.seen_tree_without_enhanced_graph:
+        #     state.seen_tree_without_enhanced_graph = state.sentence_line
+        if seen_enhanced_graph:
+            incidents.append(
+                Error(
+                    level=2,
+                    testclass=TestClass.ENHANCED,
+                    testid='edeps-only-sometimes',
+                    message="Enhanced graph cannot be empty because we saw non-empty DEPS earlier."
+                )
+            )
+            #! we should add something to this message in the engine where we have access to the state:
+            # on line {state.seen_enhanced_graph}
+
+    return incidents
+
+
+# TODO: move elsewhere
+# # If a multi-word token has Typo=Yes, its component words must not have it.
+# # We must remember the span of the MWT and check it in check_features_level4().
+# m = crex.mwtid.fullmatch(cols[ID])
+# state.mwt_typo_span_end = m.group(2)
+
+
+def check_id_references(sentence):
+    """
+    Verifies that HEAD and DEPS reference existing IDs. If this function
+    returns a nonempty list, most of the other tests should be skipped for the
+    current sentence (in particular anything that considers the tree structure).
+
+    Parameters
+    ----------
+    sentence : list
+        A list of lists representing a sentence in tabular format.
+
+    Returns
+    -------
+    incidents : list
+        A list of Incidents (empty if validation is successful).
+    """
+    incidents = []
+    word_tree = [cols for cols in sentence if utils.is_word(cols) or utils.is_empty_node(cols)]
+    ids = set([cols[utils.ID] for cols in word_tree])
+    for cols in word_tree:
+        # Test the basic HEAD only for non-empty nodes.
+        # We have checked elsewhere that it is empty for empty nodes.
+        if not utils.is_empty_node(cols):
+            match = crex.head.fullmatch(cols[utils.HEAD])
+            if match is None:
+                incidents.append(Error(
+                    testid='invalid-head',
+                    message=f"Invalid HEAD: '{cols[utils.HEAD]}'."
+                ))
+            if not (cols[utils.HEAD] in ids or cols[utils.HEAD] == '0'):
+                incidents.append(Error(
+                    testclass=TestClass.SYNTAX,
+                    testid='unknown-head',
+                    message=f"Undefined HEAD (no such ID): '{cols[utils.HEAD]}'."
+                ))
+        try:
+            deps = utils.deps_list(cols)
+        except ValueError:
+            # Similar errors have probably been reported earlier.
+            incidents.append(Error(
+                testid='invalid-deps',
+                message=f"Failed to parse DEPS: '{cols[utils.DEPS]}'."
+            ))
+            continue
+        for head, _ in deps:
+            match = crex.ehead.fullmatch(head)
+            if match is None:
+                incidents.append(Error(
+                    testid='invalid-ehead',
+                    message=f"Invalid enhanced head reference: '{head}'."
+                ))
+            if not (head in ids or head == '0'):
+                incidents.append(Error(
+                    testclass=TestClass.ENHANCED,
+                    testid='unknown-ehead',
+                    message=f"Undefined enhanced head reference (no such ID): '{head}'."
+                ))
+    logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+    return incidents
+
+def check_tree(sentence, node_line, single_root):
+    """
+    Performs basic validation of the tree structure (without Udapi).
+
+    This function originally served to build a data structure that would
+    describe the tree and make it accessible during subsequent tests. Now we
+    use the Udapi data structures instead but we still have to call this
+    function first because it will survive and report ill-formed input. In
+    such a case, the Udapi data structure will not be built and Udapi-based
+    tests will be skipped.
+
+    This function should be called only if both ID and HEAD values have been
+    found valid for all tree nodes, including the sequence of IDs and the
+    references from HEAD to existing IDs.
+
+    Parameters
+    ----------
+    sentence : list
+        A list of lists representing a sentence in tabular format.
+    node_line : int
+        A file-wide line counter.
+    single_root : bool
+        A flag indicating whether we should check that there is a single root.
+
+    Returns
+    -------
+    incidents : list
+        A list of Incidents (empty if validation is successful).
+    """
+    # node_line = state.sentence_line - 1  TODO: this should be done by the engine
+    incidents = []
+    children = {}  # int(node id) -> set of children
+    n_words = 0
+    for cols in sentence:
+        node_line += 1
+        if not utils.is_word(cols):
+            continue
+        n_words += 1
+        # ID and HEAD values have been validated before and this function would
+        # not be called if they were not OK. So we can now safely convert them
+        # to integers.
+        id_ = int(cols[utils.ID])
+        head = int(cols[utils.HEAD])
+        if head == id_:
+            incidents.append(Error(
+                testclass=TestClass.SYNTAX,
+                lineno=node_line,
+                testid='head-self-loop',
+                message=f'HEAD == ID for {cols[utils.ID]}'
+            ))
+        # Incrementally build the set of children of every node.
+        children.setdefault(head, set()).add(id_)
+    word_ids = list(range(1, n_words + 1))
+    # Check that there is just one node with the root relation.
+    children_0 = sorted(children.get(0, []))
+    if len(children_0) > 1 and single_root:
+        incidents.append(Error(
+            testclass=TestClass.SYNTAX,
+            testid='multiple-roots',
+            message=f"Multiple root words: {children_0}"
+        ))
+    projection = set()
+    node_id = 0
+    nodes = [node_id]
+    while nodes:
+        node_id = nodes.pop()
+        children_id = sorted(children.get(node_id, []))
+        for child in children_id:
+            if child in projection:
+                continue  # skip cycles
+            projection.add(child)
+            nodes.append(child)
+    unreachable = set(word_ids) - projection
+    if unreachable:
+        str_unreachable = ','.join(str(w) for w in sorted(unreachable))
+        incidents.append(Error(
+            testclass=TestClass.SYNTAX,
+            testid='non-tree',
+            message=f'Non-tree structure. Words {str_unreachable} are not reachable from the root 0.'
+        ))
+    logger.debug("%d incidents occurred in %s", len(incidents), inspect.stack()[0][3])
+    return incidents
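The reachability walk at the heart of check_tree, reduced to a standalone example with a deliberate cycle (words 2 and 3 point at each other, so they are never reached from the root), reproducing the non-tree message format:

    children = {0: {1}, 2: {3}, 3: {2}}   # head -> set of children
    n_words = 3

    projection = set()
    nodes = [0]
    while nodes:
        node_id = nodes.pop()
        for child in sorted(children.get(node_id, [])):
            if child not in projection:   # skip cycles
                projection.add(child)
                nodes.append(child)
    unreachable = sorted(set(range(1, n_words + 1)) - projection)
    print(f"Non-tree structure. Words {','.join(map(str, unreachable))} "
          "are not reachable from the root 0.")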
+
+def check_deprels_level2(node, deprels, lang):
+    """
+    Checks that a dependency relation label is listed as approved in the given
+    language. As a language-specific test, this function generally belongs to
+    level 4, but it can be also used on levels 2 and 3, in which case it will
+    check only the main dependency type and ignore any subtypes.
+
+    Parameters
+    ----------
+    node : udapi.core.node.Node object
+        The node whose incoming relation will be validated.
+    deprels : TODO
+    lang : TODO
+    """
+    # The list of permitted relations is language-specific.
+    # The current token may be in a different language due to code switching.
+    # Unlike with features and auxiliaries, with deprels it is less clear
+    # whether we want to switch the set of labels when the token belongs to
+    # another language. Especially with subtypes that are not so much language
+    # specific. For example, we may have allowed 'flat:name' for our language,
+    # the maintainers of the other language have not allowed it, and then we
+    # could not use it when the foreign language is active. (This actually
+    # happened in French GSD.) We will thus allow the union of the main and the
+    # alternative deprelset when both the parent and the child belong to the
+    # same alternative language. Otherwise, only the main deprelset is allowed.
+
+    incidents = []
+
+    naltlang = utils.get_alt_language(node)
+
+    # The basic relation should be tested on regular nodes but not on empty nodes.
+    if not node.is_empty():
+        paltlang = utils.get_alt_language(node.parent)
+
+        # Test only the universal part if testing at universal level.
+        deprel = node.udeprel
+        check = False
+        if deprel in deprels[lang] and deprels[lang][deprel]["permitted"]:
+            check = True
+
+        if naltlang is not None and naltlang != lang and naltlang == paltlang:
+            if deprel in deprels[naltlang] and deprels[naltlang][deprel]["permitted"]:
+                check = True
+
+        if not check:
+            incidents.append(
+                Error(
+                    level=2,
+                    testclass=TestClass.SYNTAX,
+                    testid='unknown-deprel',
+                    message=f"Unknown DEPREL label: '{deprel}'"
+                )
+            )
+    # If there are enhanced dependencies, test their deprels, too.
+    # We already know that the contents of DEPS is parsable (deps_list() was
+    # first called from validate_id_references() and the head indices are OK).
+    # The order of enhanced dependencies was already checked in validate_deps().
+    if str(node.deps) != '_':
+        for edep in node.deps:
+            parent = edep['parent']
+            deprel = utils.lspec2ud(edep['deprel'])
+            paltlang = utils.get_alt_language(parent)
+
+            check = False
+            if deprel in deprels[lang] and deprels[lang][deprel]["permitted"]:
+                check = True
+
+            if naltlang is not None and naltlang != lang and naltlang == paltlang:
+                if deprel in deprels[naltlang] and deprels[naltlang][deprel]["permitted"]:
+                    check = True
+
+            if not check:
+                incidents.append(
+                    Error(
+                        level=2,
+                        testclass=TestClass.ENHANCED,
+                        testid='unknown-edeprel',
+                        message=f"Unknown enhanced relation type '{deprel}' in '{parent.ord}:{deprel}'"
+                    )
+                )
+
+    return incidents
+
+def check_deprels_level4(node, deprels, lang):
+    """
+    Checks that a dependency relation label is listed as approved in the given
+    language, including any language-specific subtype. Unlike
+    check_deprels_level2(), this level 4 test checks the full relation label.
+
+    Parameters
+    ----------
+    node : udapi.core.node.Node object
+        The node whose incoming relation will be validated.
+    deprels : TODO
+    lang : TODO
+    """
+    # See the comment in check_deprels_level2() above on how code switching
+    # affects the list of permitted relations.
+
+    incidents = []
+
+    naltlang = utils.get_alt_language(node)
+
+    # The basic relation should be tested on regular nodes but not on empty nodes.
+    if not node.is_empty():
+        paltlang = utils.get_alt_language(node.parent)
+
+        # Test the full relation label, including any subtype.
+        deprel = node.deprel
+
+        check = False
+        if deprel in deprels[lang] and deprels[lang][deprel]["permitted"]:
+            check = True
+
+        if naltlang is not None and naltlang != lang and naltlang == paltlang:
+            if deprel in deprels[naltlang] and deprels[naltlang][deprel]["permitted"]:
+                check = True
+
+        if not check:
+            incidents.append(
+                Error(
+                    level=4,
+                    testclass=TestClass.SYNTAX,
+                    testid='unknown-deprel',
+                    message=f"Unknown DEPREL label: '{deprel}'"
+                )
+            )
+    # If there are enhanced dependencies, test their deprels, too.
+    # We already know that the contents of DEPS is parsable (deps_list() was
+    # first called from validate_id_references() and the head indices are OK).
+    # The order of enhanced dependencies was already checked in validate_deps().
+    if str(node.deps) != '_':
+        for edep in node.deps:
+            parent = edep['parent']
+            deprel = edep['deprel']
+            paltlang = utils.get_alt_language(parent)
+
+            check = False
+            if deprel in deprels[lang] and deprels[lang][deprel]["permitted"]:
+                check = True
+
+            if naltlang is not None and naltlang != lang and naltlang == paltlang:
+                if deprel in deprels[naltlang] and deprels[naltlang][deprel]["permitted"]:
+                    check = True
+
+            if not check:
+                incidents.append(
+                    Error(
+                        level=4,
+                        testclass=TestClass.ENHANCED,
+                        testid='unknown-edeprel',
+                        message=f"Unknown enhanced relation type '{deprel}' in '{parent.ord}:{deprel}'"
+                    )
+                )
+
+    return incidents
+
+def check_root(node):
+    """
+    Checks that DEPREL is "root" iff HEAD is 0.
+
+    Parameters
+    ----------
+    node : udapi.core.node.Node object
+        The node whose incoming relation will be validated. This function
+        operates on both regular and empty nodes. Make sure to call it for
+        empty nodes, too!
+
+    Returns
+    -------
+    incidents : list
+        A list of Incidents (empty if validation is successful).
+    """
+    incidents = []
+    if not node.is_empty():
+        if node.parent.ord == 0 and node.udeprel != 'root':
+            incidents.append(Error(
+                level=2,
+                testclass=TestClass.SYNTAX,
+                testid='0-is-not-root',
+                message="DEPREL must be 'root' if HEAD is 0."
+            ))
+        if node.parent.ord != 0 and node.udeprel == 'root':
+            incidents.append(Error(
+                level=2,
+                testclass=TestClass.SYNTAX,
+                testid='root-is-not-0',
+                message="DEPREL cannot be 'root' if HEAD is not 0."
+            ))
+    # In the enhanced graph, test both regular and empty roots.
+    for edep in node.deps:
+        if edep['parent'].ord == 0 and utils.lspec2ud(edep['deprel']) != 'root':
+            incidents.append(Error(
+                level=2,
+                testclass=TestClass.SYNTAX,
+                testid='enhanced-0-is-not-root',
+                message="Enhanced relation type must be 'root' if head is 0."
+            ))
+        if edep['parent'].ord != 0 and utils.lspec2ud(edep['deprel']) == 'root':
+            incidents.append(Error(
+                level=2,
+                testclass=TestClass.SYNTAX,
+                testid='enhanced-root-is-not-0',
+                message="Enhanced relation type cannot be 'root' if head is not 0."
+            ))
+    return incidents
+
+def check_enhanced_orphan(node, seen_empty_node, seen_enhanced_orphan):
+    """
+    Checks universally valid consequences of the annotation guidelines in the
+    enhanced representation. Currently tests only phenomena specific to the
+    enhanced dependencies; however, we should also test things that are
+    required in the basic dependencies (such as left-to-right coordination),
+    unless it is obvious that in enhanced dependencies such things are legal.
+
+    Parameters
+    ----------
+    node : udapi.core.node.Node object
+        The node whose incoming relations will be validated. This function
+        operates on both regular and empty nodes. Make sure to call it for
+        empty nodes, too!
+    seen_empty_node : int
+        Number of the line with the first empty node seen so far, if any.
+    seen_enhanced_orphan : int
+        Number of the line with the first enhanced orphan seen so far, if any.
+
+    Returns
+    -------
+    incidents : list
+        A list of Incidents (empty if validation is successful).
+    """
+    incidents = []
+    # Enhanced dependencies should not contain the orphan relation.
+    # However, all types of enhancements are optional and orphans are excluded
+    # only if this treebank addresses gapping. We do not know it until we see
+    # the first empty node.
+    if str(node.deps) == '_':
+        return incidents
+    if node.is_empty():
+        # TODO: outside of this function: state.seen_empty_node = line
+        # An empty node itself is not an error. Report it only for the first time
+        # and only if an orphan occurred before it.
+        if not seen_empty_node and seen_enhanced_orphan:
+            incidents.append(Error(
+                level=3,
+                testclass=TestClass.ENHANCED,
+                nodeid=node.ord,
+                testid='empty-node-after-eorphan',
+                message=f"Empty node means that we address gapping and there should be no orphans in the enhanced graph; but we saw one on line {seen_enhanced_orphan}"
+            ))
+    udeprels = set([utils.lspec2ud(edep['deprel']) for edep in node.deps])
+    if 'orphan' in udeprels:
+        # TODO: outside of this function: state.seen_enhanced_orphan = line
+        # If we have seen an empty node, then the orphan is an error.
+        if seen_empty_node:
+            incidents.append(Error(
+                level=3,
+                testclass=TestClass.ENHANCED,
+                nodeid=node.ord,
+                testid='eorphan-after-empty-node',
+                message=f"'orphan' not allowed in enhanced graph because we saw an empty node on line {seen_empty_node}"
+            ))
+    return incidents
+
+def check_words_with_spaces(node, lang, specs):
+    """
+    Checks a single line for disallowed whitespace.
+    Here we assume that all language-independent whitespace-related tests have
+    already been done on level 1, so we only check for words with spaces that
+    are explicitly allowed in a given language.
+
+    Parameters
+    ----------
+    node : udapi.core.node.Node object
+        The node to be validated.
+    lang : str
+        Code of the main language of the corpus.
+    specs : UDSpecs
+        The object containing specific information about the allowed values.
+
+    Returns
+    -------
+    incidents : list
+        A list of Incidents (empty if validation is successful).
+    """
+    incidents = []
+    # The list of permitted words with spaces is language-specific.
+    # The current token may be in a different language due to code switching.
+    tospacedata = specs.get_tospace_for_language(lang)
+    altlang = utils.get_alt_language(node)
+    if altlang:
+        lang = altlang
+        tospacedata = specs.get_tospace_for_language(altlang)
+    for column in ('FORM', 'LEMMA'):
+        word = node.form if column == 'FORM' else node.lemma
+        # Is there whitespace in the word?
+        if crex.ws.search(word):
+            # Whitespace found. Does the word pass the regular expression that
+            # defines permitted words with spaces in this language?
+            # For the purpose of this test, NO-BREAK SPACE is equal to SPACE.
+            string_to_test = re.sub(r'\xA0', ' ', word)
+            if not tospacedata or not tospacedata[1].fullmatch(string_to_test):
+                incidents.append(Error(
+                    level=4,
+                    testclass=TestClass.FORMAT,
+                    nodeid=node.ord,
+                    testid='invalid-word-with-space',
+                    message=f"'{word}' in column {column} is not on the list of exceptions allowed to contain whitespace.",
+                ))
+    return incidents
+ """ + incidents = [] + if str(node.feats) == '_': + return incidents + # List of permitted features is language-specific. + # The current token may be in a different language due to code switching. + default_lang = lang + default_featset = featset = specs.get_feats_for_language(lang) + altlang = utils.get_alt_language(node) + if altlang: + lang = altlang + featset = specs.get_feats_for_language(altlang) + for f in node.feats: + values = node.feats[f].split(',') + for v in values: + # Level 2 tested character properties and canonical order but not that the f-v pair is known. + # Level 4 also checks whether the feature value is on the list. + # If only universal feature-value pairs are allowed, test on level 4 with lang='ud'. + # The feature Typo=Yes is the only feature allowed on a multi-word token line. + # If it occurs there, it cannot be duplicated on the lines of the component words. + if f == 'Typo' and mwt_typo_span_end and node.ord <= mwt_typo_span_end: + incidents.append(Error( + level=4, + testclass=TestClass.MORPHO, + nodeid=node.ord, + testid='mwt-typo-repeated-at-word', + message="Feature Typo cannot occur at a word if it already occurred at the corresponding multi-word token." + )) + # In case of code switching, the current token may not be in the default language + # and then its features are checked against a different feature set. An exception + # is the feature Foreign, which always relates to the default language of the + # corpus (but Foreign=Yes should probably be allowed for all UPOS categories in + # all languages). + effective_featset = featset + effective_lang = lang + if f == 'Foreign': + # Revert to the default. + effective_featset = default_featset + effective_lang = default_lang + if effective_featset is not None: + if f not in effective_featset: + incidents.append(Error( + level=4, + testclass=TestClass.MORPHO, + nodeid=node.ord, + testid='feature-unknown', + message=f"Feature {f} is not documented for language [{effective_lang}] ('{utils.formtl(node)}').", + )) + else: + lfrecord = effective_featset[f] + if lfrecord['permitted'] == 0: + incidents.append(Error( + level=4, + testclass=TestClass.MORPHO, + nodeid=node.ord, + testid='feature-not-permitted', + message=f"Feature {f} is not permitted in language [{effective_lang}] ('{utils.formtl(node)}').", + )) + else: + values = lfrecord['uvalues'] + lfrecord['lvalues'] + lfrecord['unused_uvalues'] + lfrecord['unused_lvalues'] + if not v in values: + incidents.append(Error( + level=4, + testclass=TestClass.MORPHO, + nodeid=node.ord, + testid='feature-value-unknown', + message=f"Value {v} is not documented for feature {f} in language [{effective_lang}] ('{utils.formtl(node)}').", + )) + elif not node.upos in lfrecord['byupos']: + incidents.append(Error( + level=4, + testclass=TestClass.MORPHO, + nodeid=node.ord, + testid='feature-upos-not-permitted', + message=f"Feature {f} is not permitted with UPOS {node.upos} in language [{effective_lang}] ('{utils.formtl(node)}').", + )) + elif not v in lfrecord['byupos'][node.upos] or lfrecord['byupos'][node.upos][v]==0: + incidents.append(Error( + level=4, + testclass=TestClass.MORPHO, + nodeid=node.ord, + testid='feature-value-upos-not-permitted', + message=f"Value {v} of feature {f} is not permitted with UPOS {node.upos} in language [{effective_lang}] ('{utils.formtl(node)}').", + )) + # TODO: (outside of this function) + #if mwt_typo_span_end and int(mwt_typo_span_end) <= int(node.ord): + # state.mwt_typo_span_end = None + + return incidents + +# ! 
+
+# ! proposal: rename to validate_auxiliaries, since some are particles, as per
+# ! the docstring below
+def check_auxiliary_verbs(node, lang, specs):
+    """
+    Checks that the UPOS tag AUX is used only with lemmas that are known to
+    act as auxiliary verbs or particles in the given language.
+
+    Parameters
+    ----------
+    node : udapi.core.node.Node object
+        The node to be validated.
+    lang : str
+        Code of the main language of the corpus.
+    specs : UDSpecs
+        The object containing specific information about the allowed values.
+
+    returns
+    -------
+    incidents : list
+        A list of Incidents (empty if validation is successful).
+    """
+    incidents = []
+    if node.upos == 'AUX' and node.lemma != '_':
+        altlang = utils.get_alt_language(node)
+        if altlang:
+            lang = altlang
+        auxlist = specs.get_aux_for_language(lang)
+        if not auxlist or node.lemma not in auxlist:
+            incidents.append(Error(
+                nodeid=node.ord,
+                level=5,
+                testclass=TestClass.MORPHO,
+                testid='aux-lemma',
+                message=f"'{node.lemma}' is not an auxiliary in language [{lang}]",
+            ))
+    return incidents
+
+def check_copula_lemmas(node, lang, specs):
+    """
+    Checks that the relation cop is used only with lemmas that are known to
+    act as copulas in the given language.
+
+    Parameters
+    ----------
+    node : udapi.core.node.Node object
+        The node to be validated.
+    lang : str
+        Code of the main language of the corpus.
+    specs : UDSpecs
+        The object containing specific information about the allowed values.
+
+    returns
+    -------
+    incidents : list
+        A list of Incidents (empty if validation is successful).
+    """
+    incidents = []
+    if node.udeprel == 'cop' and node.lemma != '_':
+        altlang = utils.get_alt_language(node)
+        if altlang:
+            lang = altlang
+        coplist = specs.get_cop_for_language(lang)
+        if not coplist or node.lemma not in coplist:
+            incidents.append(Error(
+                nodeid=node.ord,
+                level=5,
+                testclass=TestClass.SYNTAX,
+                testid='cop-lemma',
+                message=f"'{node.lemma}' is not a copula in language [{lang}]",
+            ))
+    return incidents
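+
+# ! illustration (reviewer note): get_aux_for_language() and
+# ! get_cop_for_language() are assumed to return collections of approved
+# ! lemmas; for English they would contain roughly:
+# auxlist = {'be', 'have', 'do', 'will', 'would', 'can', 'could',
+#            'may', 'might', 'must', 'shall', 'should', 'get'}
+# coplist = {'be'}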
+ """ + incidents = [] + nodes = tree.descendants + for node in nodes: + lineno = linenos[str(node.ord)] + incidents.extend(check_expected_features(node, lineno)) + #incidents.extend(validate_upos_vs_deprel(node, lineno)) + #incidents.extend(validate_flat_foreign(node, lineno, linenos)) + #incidents.extend(validate_left_to_right_relations(node, lineno)) + #incidents.extend(validate_single_subject(node, lineno)) + #incidents.extend(validate_single_object(node, lineno)) + #incidents.extend(validate_orphan(node, lineno)) + #incidents.extend(validate_functional_leaves(node, lineno, linenos)) + #incidents.extend(validate_fixed_span(node, lineno)) + #incidents.extend(validate_goeswith_span(node, lineno)) + #incidents.extend(validate_goeswith_morphology_and_edeps(node, lineno)) + #incidents.extend(validate_projective_punctuation(node, lineno)) + incidents = [] + +def check_expected_features(node, seen_morpho_feature, delayed_feature_errors): + """ + Certain features are expected to occur with certain UPOS or certain values + of other features. This function issues warnings instead of errors, as + features are in general optional and language-specific. Even the warnings + are issued only if the treebank has features. Note that the expectations + tested here are considered (more or less) universal. Checking that a given + feature-value pair is compatible with a particular UPOS is done using + language-specific lists at level 4. + + Parameters + ---------- + node : udapi.core.node.Node object + The tree node to be tested. + """ + incidents = [] + # TODO: + if node.upos in ['PRON', 'DET']: + incidents.extend(validate_required_feature( + node, 'PronType', None, + seen_morpho_feature, delayed_feature_errors, + IncidentType.ERROR, TestClass.MORPHO, 'pron-det-without-prontype' + )) + if node.feats['VerbForm'] == 'Fin' and node.feats['Mood'] == '': + incidents.append(Warning( + level=3, + # ! used to be Incident with testclass="Warning", but now Warning is an alternative to Error and TestClass.MORPHO makes sense here + testclass=TestClass.MORPHO, + testid='verbform-fin-without-mood', + message=f"Finite verb '{utils.formtl(node)}' lacks the 'Mood' feature" + )) + elif node.feats['Mood'] != '' and node.feats['VerbForm'] != 'Fin': + incidents.append(Warning( + level=3, + # ! used to be Incident with testclass="Warning", but now Warning is an alternative to Error and TestClass.MORPHO makes sense here + testclass=TestClass.MORPHO, + testid='mood-without-verbform-fin', + message=f"Non-empty 'Mood' feature at a word that is not finite verb ('{utils.formtl(node)}')" + )) + +def validate_required_feature(node, required_feature, required_value, seen_morpho_feature, delayed_feature_errors, incident_type, testclass, testid): + """ + In general, the annotation of morphological features is optional, although + highly encouraged. However, if the treebank does have features, then certain + features become required. This function will check the presence of a feature + and if it is missing, an error will be reported only if at least one feature + has been already encountered. Otherwise the error will be remembered and it + may be reported afterwards if any feature is encountered later. + + Parameters + ---------- + node : TODO: update + required_feature : str + The name of the required feature. + required_value : str + The required value of the feature. Multivalues are not supported (they + are just a string value containing one or more commas). 
+
+def validate_required_feature(node, required_feature, required_value, seen_morpho_feature, delayed_feature_errors, incident_type, testclass, testid):
+    """
+    In general, the annotation of morphological features is optional, although
+    highly encouraged. However, if the treebank does have features, then certain
+    features become required. This function will check the presence of a feature
+    and if it is missing, an error will be reported only if at least one feature
+    has already been encountered. Otherwise the error will be remembered and it
+    may be reported afterwards if any feature is encountered later.
+
+    Parameters
+    ----------
+    node : udapi.core.node.Node object
+        The node to be tested.
+    required_feature : str
+        The name of the required feature.
+    required_value : str
+        The required value of the feature. Multivalues are not supported (they
+        are just a string value containing one or more commas). If
+        required_value is None or an empty string, it means that we require any
+        non-empty value of required_feature.
+    seen_morpho_feature : int or None
+        Line number of the first morphological feature seen in the corpus.
+    delayed_feature_errors : dict
+        Feature errors collected until the first feature is seen.
+    incident_type : IncidentType
+        Whether to report the missing feature as an error or as a warning.
+    testclass : TestClass
+        Test class of the reported incident.
+    testid : str
+        Test id of the reported incident.
+
+    returns
+    -------
+    incidents : list
+        A list of Incidents (empty if validation is successful).
+    """
+    incidents = []
+    feats = node.feats
+    value = feats[required_feature]
+    # If no particular value is required, any non-empty value will do.
+    missing = value != required_value if required_value else value == ''
+    if missing:
+        if seen_morpho_feature:
+            incident_class = Error if incident_type == IncidentType.ERROR else Warning
+            incidents.append(incident_class(
+                level=3,
+                testclass=testclass,
+                testid=testid,
+                message=f"The word '{utils.formtl(node)}' is tagged '{node.upos}' but it lacks the '{required_feature}' feature"
+            ))
+        # TODO: outside of this function
+        #else:
+        #    if testid not in delayed_feature_errors:
+        #        state.delayed_feature_errors[incident.testid] = {'occurrences': []}
+        #    state.delayed_feature_errors[incident.testid]['occurrences'].append({'incident': incident})
+    return incidents
diff --git a/validator/src/validator/validate_lib.py b/validator/src/validator/validate_lib.py
new file mode 100755
index 000000000..ece7ee2ed
--- /dev/null
+++ b/validator/src/validator/validate_lib.py
@@ -0,0 +1,3805 @@
+# ! REMOVE #/usr/bin/env python3
+# ! REMOVE Original code (2015) by Filip Ginter and Sampo Pyysalo.
+# ! REMOVE DZ 2018-11-04: Porting the validator to Python 3.
+# ! REMOVE DZ: Many subsequent changes. See the git history.
+import sys
+import io
+import os.path
+import argparse
+import traceback
+# ! REMOVE According to https://stackoverflow.com/questions/1832893/python-regex-matching-unicode-properties,
+# ! REMOVE the regex module has the same API as re but it can check Unicode character properties using \p{}
+# ! REMOVE as in Perl.
+#import re
+import regex as re
+import unicodedata
+import json
+# Once we know that the low-level CoNLL-U format is OK, we will be able to use
+# the Udapi library to access the data and perform the tests at higher levels.
+import udapi.block.read.conllu
+
+import validator.compiled_regex as crex
+import validator.utils as utils
+import validator.output_utils as outils
+import validator.specifications as data
+
+
+
+# The folder where this script resides.
+# ! OLD THISDIR=os.path.dirname(os.path.realpath(os.path.abspath(__file__)))
+THISDIR=os.path.join(os.path.dirname(os.path.realpath(os.path.abspath(__file__))), "../../..")
+
+# Constants for the column indices
+COLCOUNT=10
+ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC=range(COLCOUNT)
+COLNAMES='ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL,DEPS,MISC'.split(',')
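+
+# ! illustration (reviewer note): the constants give symbolic access to the
+# ! columns of a token line, e.g. (hypothetical CoNLL-U node line):
+# cols = '1\tDog\tdog\tNOUN\tNN\tNumber=Sing\t2\tnsubj\t_\t_'.split('\t')
+# cols[FORM] # 'Dog'
+# cols[HEAD] # '2'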
+
+
+# TODO: turn into data class
+class State:
+    """
+    The State class holds various global data about where we are in the file
+    and what we have seen so far. Typically there will be just one instance of
+    this class.
+    """
+    def __init__(self, current_file_name):
+        # Name of the current input file.
+        self.current_file_name = current_file_name
+        # Current line in the input file, or, more precisely, the last line
+        # read so far. Once we start looking at tree integrity, we may find
+        # errors on previous lines as well.
+        self.current_line = 0
+        # The line in the input file on which the current sentence starts,
+        # including sentence-level comments.
+        self.comment_start_line = 0 # TODO: rename to something about sentences
+        # The line in the input file on which the current sentence starts
+        # (the first node/token line, skipping comments).
+        self.sentence_line = 0
+        # The most recently read sentence id.
+        self.sentence_id = None
+        # Needed to check that SpaceAfter=No on the last word of a sentence
+        # does not co-occur with a new paragraph or document.
+        # TODO: untangle this
+        self.spaceafterno_in_effect = False
+        # Error counter by error type. Key: error type; value: error count.
+        # Incremented in Incident.report().
+        self.error_counter = {} # TODO: replace with len(something)
+        # Set of detailed error explanations that have been printed so far.
+        # Each explanation will be printed only once. Typically, an explanation
+        # can be identified by test id + language code. Nevertheless, we put
+        # the whole explanation to the set.
+        self.explanation_printed = set() # TODO: scrap this
+        # Some feature-related errors can only be reported if the corpus
+        # contains feature annotation because features are optional in general.
+        # Once we see the first feature, we can flush all accumulated
+        # complaints about missing features.
+        # Key: testid; value: dict with parameters of the error and the list of
+        # its occurrences.
+        self.delayed_feature_errors = {}
+        # Remember all sentence ids seen in all input files (presumably one
+        # corpus). We need it to check that each id is unique.
+        self.known_sent_ids = set()
+        #----------------------------------------------------------------------
+        # Various things that we may have seen earlier in the corpus. The value
+        # is None if we have not seen it, otherwise it is the line number of
+        # the first occurrence.
+        #----------------------------------------------------------------------
+        self.seen_morpho_feature = None
+        self.seen_enhanced_graph = None
+        self.seen_tree_without_enhanced_graph = None
+        # Any difference between non-empty DEPS and HEAD:DEPREL.
+        # (Because we can see many enhanced graphs but no real enhancements.)
+        self.seen_enhancement = None
+        self.seen_empty_node = None
+        self.seen_enhanced_orphan = None
+
+        # If a multi-word token has Typo=Yes, its component words must not have
+        # it. When we see Typo=Yes on a MWT line, we will remember the span of
+        # the MWT here and will not allow Typo=Yes within that span (which is
+        # checked in another function).
+        self.mwt_typo_span_end = None
+
+        #----------------------------------------------------------------------
+        # Additional observations related to Entity annotation in MISC
+        # (only needed when validating entities and coreference).
+        #----------------------------------------------------------------------
+        # Remember the global.entity attribute string to be able to check that
+        # repeated declarations are identical.
+        # The global.entity comment line is needed for Entity annotations in MISC.
+        self.seen_global_entity = None
+        self.global_entity_attribute_string = None
+        # The number of entity attributes will be derived from the attribute
+        # string and will be used to check that an entity does not have extra
+        # attributes.
+        self.entity_attribute_number = 0
+        # Key: entity attribute name; value: the index of the attribute in the
+        # entity attribute list.
+        self.entity_attribute_index = {}
+        # Key: entity (cluster) id; value: tuple: (type of the entity, identity
+        # (Wikipedia etc.), line of the first mention).
+        self.entity_types = {}
+        # Indices of known entity ids in this and other documents.
+        # (Otherwise, if we only needed to know that an entity is known, we
+        # could use self.entity_types above.)
+        self.entity_ids_this_document = {}
+        self.entity_ids_other_documents = {}
+        # List of currently open entity mentions. Items are dictionaries with
+        # entity mention information.
+        self.open_entity_mentions = []
+        # For each entity that has a currently open discontinuous mention,
+        # describe the last part of the mention. Key: entity id; value is a
+        # dict with keys: last_ipart, npart, line.
+        self.open_discontinuous_mentions = {}
+        # Key: srceid
+        if args.max_err > 0 and state.error_counter[self.testclass] > args.max_err:
+            if state.error_counter[self.testclass] == args.max_err + 1:
+                print(f'...suppressing further errors regarding {self.testclass}', file=sys.stderr)
+            return # suppressed
+        # If we are here, the error message should really be printed.
+        # Address of the incident.
+        address = f'Line {self.lineno} Sent {self.sentid}'
+        if self.nodeid:
+            address += f' Node {self.nodeid}'
+        # Insert the file name if there are several input files.
+        if len(args.input) > 1:
+            address = f'File {self.filename} ' + address
+        # Classification of the incident.
+        levelclassid = f'L{self.level} {self.testclass} {self.testid}'
+        # Message (+ explanation, if this is the first error of its kind).
+        message = self.message
+        if self.explanation and self.explanation not in state.explanation_printed:
+            message += "\n\n" + self.explanation + "\n"
+            state.explanation_printed.add(self.explanation)
+        print(f'[{address}]: [{levelclassid}] {message}', file=sys.stderr)
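+
+# ! usage sketch (reviewer note): a test reports an incident by instantiating
+# ! it and calling report(); given the code above, the printed shape is
+# ! '[Line <lineno> Sent <sentid>]: [L<level> <testclass> <testid>] <message>'.
+# Incident(
+#     state=state,
+#     level=2,
+#     testclass='Syntax',
+#     testid='head-self-loop',
+#     message='HEAD == ID for 5'
+# ).report(state, args)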
+
+
+#==============================================================================
+# Level 1 tests. Only CoNLL-U backbone. Values can be empty or non-UD.
+#==============================================================================
+
+
+
+class Validator:
+    # TODO: only pass args that are actually useful
+    def __init__(self, args, specs):
+        self.conllu_reader = udapi.block.read.conllu.Conllu()
+        self.args = args
+        self.specs = specs
+
+    # TODO: rename to next_block
+    def OLD_next_sentence(self, state, inp):
+        """
+        `inp` ... a file-like object yielding lines as unicode
+
+        This function does elementary checking of the input and yields one
+        sentence at a time from the input stream. The function guarantees
+        elementary integrity of its yields. Some lines may be skipped (e.g.,
+        extra empty lines or misplaced comments), and a whole sentence will be
+        skipped if one of its token lines has an unexpected number of columns.
+
+        However, some low-level errors currently do not lead to excluding the
+        sentence from being yielded and put to subsequent tests. Specifically,
+        character constraints on individual fields are tested here but errors
+        are not considered fatal.
+
+        This function is a generator. The caller can call it in a 'for x in ...'
+        loop. In each iteration of the caller's loop, the generator will generate
+        the next sentence, that is, it will read the next sentence from the input
+        stream. (Technically, the function returns an object, and the object will
+        then read the sentences within the caller's loop.)
+        """
+        all_lines = [] # List of lines in the sentence (comments and tokens), minus final empty line, minus newline characters (and minus spurious lines that are neither comment lines nor token lines)
+        comment_lines = [] # List of comment lines to go with the current sentence; initial part of all_lines
+        token_lines_fields = [] # List of token/word lines of the current sentence, converted from string to list of fields
+        corrupted = False # In case of a wrong number of columns, keep checking the remaining lines of the sentence but do not yield the sentence for further processing.
+        state.comment_start_line = None
+
+        for line_counter, line in enumerate(inp):
+            state.current_line = line_counter+1
+            Incident.default_level = 1
+            Incident.default_testclass = 'Format'
+            Incident.default_lineno = None # use the most recently read line
+            if not state.comment_start_line:
+                state.comment_start_line = state.current_line
+            line = line.rstrip("\n")
+            self.validate_unicode_normalization(state, line)
+            if utils.is_whitespace(line):
+                Incident(
+                    state=state,
+                    testid='pseudo-empty-line',
+                    message='Spurious line that appears empty but is not; there are whitespace characters.'
+                ).report(state, self.args)
+                # We will pretend that the line terminates a sentence in order to
+                # avoid subsequent misleading error messages.
+                if token_lines_fields:
+                    if not corrupted:
+                        yield all_lines, comment_lines, token_lines_fields
+                    all_lines = []
+                    comment_lines = []
+                    token_lines_fields = []
+                    corrupted = False
+                    state.comment_start_line = None
+            elif not line: # empty line
+                if token_lines_fields: # sentence done
+                    if not corrupted:
+                        yield all_lines, comment_lines, token_lines_fields
+                    all_lines = []
+                    comment_lines = []
+                    token_lines_fields = []
+                    corrupted = False
+                    state.comment_start_line = None
+                else:
+                    Incident(
+                        state=state,
+                        testid='extra-empty-line',
+                        message='Spurious empty line. Only one empty line is expected after every sentence.'
+                    ).report(state, self.args)
+            elif line[0] == '#':
+                # We will really validate sentence ids later. But now we want to remember
+                # everything that looks like a sentence id and use it in the error messages.
+                # Line numbers themselves may not be sufficient if we are reading multiple
+                # files from a pipe.
+                match = crex.sentid.fullmatch(line)
+                if match:
+                    state.sentence_id = match.group(1)
+                if not token_lines_fields: # before sentence
+                    all_lines.append(line)
+                    comment_lines.append(line)
+                else:
+                    Incident(
+                        state=state,
+                        testid='misplaced-comment',
+                        message='Spurious comment line. Comments are only allowed before a sentence.'
+                    ).report(state, self.args)
+            elif line[0].isdigit():
+                if not token_lines_fields: # new sentence
+                    state.sentence_line = state.current_line
+                cols = line.split("\t")
+                # If there is an unexpected number of columns, do not test their contents.
+                # Maybe the contents belong to a different column. Also, we could see
+                # an exception if a column value is missing.
+                if len(cols) == COLCOUNT:
+                    all_lines.append(line)
+                    token_lines_fields.append(cols)
+                    # Low-level tests, mostly universal constraints on whitespace in fields, also format of the ID field.
+                    self.validate_whitespace(state, cols)
+                else:
+                    Incident(
+                        state=state,
+                        testid='number-of-columns',
+                        message=f'The line has {len(cols)} columns but {COLCOUNT} are expected. The line will be excluded from further tests.'
+                    ).report(state, self.args)
+                    corrupted = True
+            else: # A line which is neither a comment nor a token/word, nor empty. That's bad!
+                Incident(
+                    state=state,
+                    testid='invalid-line',
+                    message=f"Spurious line: '{line}'. All non-empty lines should start with a digit or the # character. The line will be excluded from further tests."
+                ).report(state, self.args)
+        else: # end of file
+            if comment_lines and not token_lines_fields:
+                # Comments at the end of the file, no sentence follows them.
+                Incident(
+                    state=state,
+                    testid='misplaced-comment',
+                    message='Spurious comment line. Comments are only allowed before a sentence.'
+                ).report(state, self.args)
+            elif comment_lines or token_lines_fields: # These should have been yielded on an empty line!
+                Incident(
+                    state=state,
+                    testid='missing-empty-line',
+                    message='Missing empty line after the last sentence.'
+                ).report(state, self.args)
+                if not corrupted:
+                    yield all_lines, comment_lines, token_lines_fields
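+
+# ! usage sketch (reviewer note): the engine is expected to consume the
+# ! generator roughly like this (hypothetical driver code):
+# validator = Validator(args, specs)
+# state = State(path)
+# with open(path, encoding='utf-8') as inp:
+#     for all_lines, comments, fields in validator.OLD_next_sentence(state, inp):
+#         pass # level 1 passed; sentence-level tests would run here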
+
+
+
+#------------------------------------------------------------------------------
+# Level 1 tests applicable to a single line independently of the others.
+#------------------------------------------------------------------------------
+
+
+
+    def validate_unicode_normalization(self, state, text):
+        """
+        Tests that letters composed of multiple Unicode characters (such as a base
+        letter plus combining diacritics) conform to NFC normalization (canonical
+        decomposition followed by canonical composition).
+
+        Parameters
+        ----------
+        text : str
+            The input line to be tested. If the line consists of TAB-separated
+            fields (token line), error reports will specify the field where the
+            error occurred. Otherwise (comment line), the error report will not be
+            localized.
+        """
+        normalized_text = unicodedata.normalize('NFC', text)
+        if text != normalized_text:
+            # Find the first unmatched character and include it in the report.
+            firsti = -1
+            firstj = -1
+            inpfirst = ''
+            inpsecond = ''
+            nfcfirst = ''
+            tcols = text.split("\t")
+            ncols = normalized_text.split("\t")
+            for i in range(len(tcols)):
+                for j in range(len(tcols[i])):
+                    if tcols[i][j] != ncols[i][j]:
+                        firsti = i
+                        firstj = j
+                        inpfirst = unicodedata.name(tcols[i][j])
+                        nfcfirst = unicodedata.name(ncols[i][j])
+                        if j+1 < len(tcols[i]):
+                            inpsecond = unicodedata.name(tcols[i][j+1])
+                        break
+                if firsti >= 0:
+                    break
+            if len(tcols) > 1:
+                testmessage = f"Unicode not normalized: {COLNAMES[firsti]}.character[{firstj}] is {inpfirst}, should be {nfcfirst}."
+            else:
+                testmessage = f"Unicode not normalized: character[{firstj}] is {inpfirst}, should be {nfcfirst}."
+            explanation_second = f" In this case, your next character is {inpsecond}." if inpsecond else ''
+            Incident(
+                state=state,
+                level=1,
+                testclass='Unicode',
+                testid='unicode-normalization',
+                message=testmessage,
+                explanation=f"This error usually does not mean that {inpfirst} is an invalid character. Usually it means that this is a base character followed by combining diacritics, and you should replace them by a single combined character.{explanation_second} You can fix normalization errors using the normalize_unicode.pl script from the tools repository."
+            ).report(state, self.args)
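+
+# ! illustration (reviewer note): NFC composes a base letter plus combining
+# ! mark into a single code point, e.g.:
+# import unicodedata
+# unicodedata.normalize('NFC', 'e\u0301') == '\u00e9' # True: 'e' + COMBINING ACUTE -> 'é'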
+
+
+
+    def validate_whitespace(self, state, cols):
+        """
+        Checks that columns are not empty and do not contain whitespace characters
+        except for patterns that could be allowed at level 4. Applies to all types
+        of TAB-containing lines: nodes / words, mwt ranges, empty nodes.
+
+        Parameters
+        ----------
+        cols : list
+            The values of the columns on the current node / token line.
+        """
+        Incident.default_level = 1
+        Incident.default_testclass = 'Format'
+        Incident.default_lineno = None # use the most recently read line
+        # Some whitespace may be permitted in FORM, LEMMA and MISC but not elsewhere.
+        for col_idx in range(COLCOUNT):
+            # Must never be empty
+            if not cols[col_idx]:
+                Incident(
+                    state=state,
+                    testid='empty-column',
+                    message=f'Empty value in column {COLNAMES[col_idx]}.'
+                ).report(state, self.args)
+            else:
+                # Must never have leading/trailing whitespace
+                if cols[col_idx][0].isspace():
+                    Incident(
+                        state=state,
+                        testid='leading-whitespace',
+                        message=f'Leading whitespace not allowed in column {COLNAMES[col_idx]}.'
+                    ).report(state, self.args)
+                if cols[col_idx][-1].isspace():
+                    Incident(
+                        state=state,
+                        testid='trailing-whitespace',
+                        message=f'Trailing whitespace not allowed in column {COLNAMES[col_idx]}.'
+                    ).report(state, self.args)
+                # Must never contain two consecutive whitespace characters
+                if crex.ws2.search(cols[col_idx]):
+                    Incident(
+                        state=state,
+                        testid='repeated-whitespace',
+                        message=f'Two or more consecutive whitespace characters not allowed in column {COLNAMES[col_idx]}.'
+                    ).report(state, self.args)
+        # Multi-word tokens may have whitespaces in MISC but not in FORM or LEMMA.
+        # If it contains a space, it does not make sense to treat it as a MWT.
+        if utils.is_multiword_token(cols):
+            for col_idx in (FORM, LEMMA):
+                if col_idx >= len(cols):
+                    break # this has already been reported in next_sentence()
+                if crex.ws.search(cols[col_idx]):
+                    Incident(
+                        state=state,
+                        testid='invalid-whitespace-mwt',
+                        message=f"White space not allowed in multi-word token '{cols[col_idx]}'. If it contains a space, it is not one surface token."
+                    ).report(state, self.args)
+        # These columns must not have whitespace.
+        for col_idx in (ID, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS):
+            if col_idx >= len(cols):
+                break # this has already been reported in next_sentence()
+            if crex.ws.search(cols[col_idx]):
+                Incident(
+                    state=state,
+                    testid='invalid-whitespace',
+                    message=f"White space not allowed in column {COLNAMES[col_idx]}: '{cols[col_idx]}'."
+                ).report(state, self.args)
+        # We should also check the ID format (e.g., '1' is good, '01' is wrong).
+        # Although it is checking just a single column, we will do it in
+        # validate_id_sequence() because that function has the power to block
+        # further tests, which could choke up on this.
+
+
+
+#------------------------------------------------------------------------------
+# Level 1 tests applicable to the whole sentence.
+#------------------------------------------------------------------------------
+
+
+
+    def OLD_validate_id_sequence(self, state, sentence):
+        """
+        Validates that the ID sequence is correctly formed.
+        Besides reporting the errors, it also returns False to the caller so it can
+        avoid building a tree from corrupt IDs.
+
+        sentence ... array of arrays, each inner array contains columns of one line
+        """
+        ok = True
+        Incident.default_level = 1
+        Incident.default_testclass = 'Format'
+        Incident.default_lineno = None # use the most recently read line
+        words=[]
+        tokens=[]
+        current_word_id, next_empty_id = 0, 1
+        for cols in sentence:
+            # Check the format of the ID value. (ID must not be empty.)
+            if not (utils.is_word(cols) or utils.is_empty_node(cols) or utils.is_multiword_token(cols)):
+                Incident(
+                    state=state,
+                    testid='invalid-word-id',
+                    message=f"Unexpected ID format '{cols[ID]}'."
+                ).report(state, self.args)
+                ok = False
+                continue
+            if not utils.is_empty_node(cols):
+                next_empty_id = 1 # reset sequence
+            if utils.is_word(cols):
+                t_id = int(cols[ID])
+                current_word_id = t_id
+                words.append(t_id)
+                # Not covered by the previous interval?
+                if not (tokens and tokens[-1][0] <= t_id and tokens[-1][1] >= t_id):
+                    tokens.append((t_id, t_id)) # nope - let's make a default interval for it
+            elif utils.is_multiword_token(cols):
+                match = crex.mwtid.fullmatch(cols[ID]) # Check the interval against the regex
+                if not match: # This should not happen. The function utils.is_multiword_token() would then not return True.
+                    Incident(
+                        state=state,
+                        testid='invalid-word-interval',
+                        message=f"Spurious word interval definition: '{cols[ID]}'."
+ ).report(state, self.args) + ok = False + continue + beg, end = int(match.group(1)), int(match.group(2)) + if not ((not words and beg >= 1) or (words and beg >= words[-1] + 1)): + Incident( + state=state, + testid='misplaced-word-interval', + message='Multiword range not before its first word.' + ).report(state, self.args) + ok = False + continue + tokens.append((beg, end)) + elif utils.is_empty_node(cols): + word_id, empty_id = (int(i) for i in utils.parse_empty_node_id(cols)) + if word_id != current_word_id or empty_id != next_empty_id: + Incident( + state=state, + testid='misplaced-empty-node', + message=f'Empty node id {cols[ID]}, expected {current_word_id}.{next_empty_id}' + ).report(state, self.args) + ok = False + next_empty_id += 1 + # Interaction of multiword tokens and empty nodes if there is an empty + # node between the first word of a multiword token and the previous word: + # This sequence is correct: 4 4.1 5-6 5 6 + # This sequence is wrong: 4 5-6 4.1 5 6 + if word_id == current_word_id and tokens and word_id < tokens[-1][0]: + Incident( + state=state, + testid='misplaced-empty-node', + message=f"Empty node id {cols[ID]} must occur before multiword token {tokens[-1][0]}-{tokens[-1][1]}." + ).report(state, self.args) + ok = False + # Now let's do some basic sanity checks on the sequences. + # Expected sequence of word IDs is 1, 2, ... + expstrseq = ','.join(str(x) for x in range(1, len(words) + 1)) + wrdstrseq = ','.join(str(x) for x in words) + if wrdstrseq != expstrseq: + Incident( + state=state, + lineno=-1, + testid='word-id-sequence', + message=f"Words do not form a sequence. Got '{wrdstrseq}'. Expected '{expstrseq}'." + ).report(state, self.args) + ok = False + # Check elementary sanity of word intervals. + # Remember that these are not just multi-word tokens. Here we have intervals even for single-word tokens (b=e)! + for (b, e) in tokens: + if e < b: # end before beginning + Incident( + state=state, + testid='reversed-word-interval', + message=f'Spurious token interval {b}-{e}' + ).report(state, self.args) + ok = False + continue + if b < 1 or e > len(words): # out of range + Incident( + state=state, + testid='word-interval-out', + message=f'Spurious token interval {b}-{e} (out of range)' + ).report(state, self.args) + ok = False + continue + return ok + + + + def OLD_validate_token_ranges(self, state, sentence): + """ + Checks that the word ranges for multiword tokens are valid. + + sentence ... array of arrays, each inner array contains columns of one line + """ + Incident.default_level = 1 + Incident.default_testclass = 'Format' + Incident.default_lineno = None # use the most recently read line + covered = set() + for cols in sentence: + if not utils.is_multiword_token(cols): + continue + m = crex.mwtid.fullmatch(cols[ID]) + if not m: # This should not happen. The function utils.is_multiword_token() would then not return True. + Incident( + state=state, + testid='invalid-word-interval', + message=f"Spurious word interval definition: '{cols[ID]}'." + ).report(state, self.args) + continue + start, end = m.groups() + start, end = int(start), int(end) + # Do not test if start >= end: This was already tested above in validate_id_sequence(). 
+ if covered & set(range(start, end+1)): + Incident( + state=state, + testid='overlapping-word-intervals', + message=f'Range overlaps with others: {cols[ID]}' + ).report(state, self.args) + covered |= set(range(start, end+1)) + + + + def OLD_validate_newlines(self, state, inp): + """ + Checks that the input file consistently uses linux-style newlines (LF only, + not CR LF like in Windows). To be run on the input file handle after the + whole input has been read. + """ + if inp.newlines and inp.newlines != '\n': + Incident( + state=state, + level=1, + testclass='Format', + lineno=state.current_line, + testid='non-unix-newline', + message='Only the unix-style LF line terminator is allowed.' + ).report(state, self.args) + + + +#============================================================================== +# Level 2 tests. Tree structure, universal tags and deprels. Note that any +# well-formed Feature=Value pair is allowed (because it could be language- +# specific) and any word form or lemma can contain spaces (because language- +# specific guidelines may permit it). +#============================================================================== + + + +#------------------------------------------------------------------------------ +# Level 2 tests of sentence metadata. +#------------------------------------------------------------------------------ + + + + def validate_sent_id(self, state, comments, lcode): + """ + Checks that sentence id exists, is well-formed and unique. + """ + Incident.default_level = 2 + Incident.default_testclass = 'Metadata' + Incident.default_lineno = -1 # use the first line after the comments + matched = [] + for c in comments: + match = crex.sentid.fullmatch(c) + if match: + matched.append(match) + else: + if c.startswith('# sent_id') or c.startswith('#sent_id'): + Incident( + state=state, + testid='invalid-sent-id', + message=f"Spurious sent_id line: '{c}' should look like '# sent_id = xxxxx' where xxxxx is not whitespace. Forward slash reserved for special purposes." + ).report(state, self.args) + if not matched: + Incident( + state=state, + testid='missing-sent-id', + message='Missing the sent_id attribute.' + ).report(state, self.args) + elif len(matched) > 1: + Incident( + state=state, + testid='multiple-sent-id', + message='Multiple sent_id attributes.' + ).report(state, self.args) + else: + # Uniqueness of sentence ids should be tested treebank-wide, not just file-wide. + # For that to happen, all three files should be tested at once. + sid = matched[0].group(1) + if sid in state.known_sent_ids: + Incident( + state=state, + testid='non-unique-sent-id', + message=f"Non-unique sent_id attribute '{sid}'." + ).report(state, self.args) + if sid.count('/') > 1 or (sid.count('/') == 1 and lcode != 'ud'): + Incident( + state=state, + testid='slash-in-sent-id', + message=f"The forward slash is reserved for special use in parallel treebanks: '{sid}'" + ).report(state, self.args) + state.known_sent_ids.add(sid) + + + + def validate_text_meta(self, state, comments, tree): + """ + Checks metadata other than sentence id, that is, document breaks, paragraph + breaks and sentence text (which is also compared to the sequence of the + forms of individual tokens, and the spaces vs. SpaceAfter=No in MISC). 
+ """ + Incident.default_level = 2 + Incident.default_testclass = 'Metadata' + Incident.default_lineno = -1 # use the first line after the comments + newdoc_matched = [] + newpar_matched = [] + text_matched = [] + for c in comments: + newdoc_match = crex.newdoc.fullmatch(c) + if newdoc_match: + newdoc_matched.append(newdoc_match) + newpar_match = crex.newpar.fullmatch(c) + if newpar_match: + newpar_matched.append(newpar_match) + text_match = crex.text.fullmatch(c) + if text_match: + text_matched.append(text_match) + if len(newdoc_matched) > 1: + Incident( + state=state, + testid='multiple-newdoc', + message='Multiple newdoc attributes.' + ).report(state, self.args) + if len(newpar_matched) > 1: + Incident( + state=state, + testid='multiple-newpar', + message='Multiple newpar attributes.' + ).report(state, self.args) + if (newdoc_matched or newpar_matched) and state.spaceafterno_in_effect: + Incident( + state=state, + testid='spaceafter-newdocpar', + message='New document or paragraph starts when the last token of the previous sentence says SpaceAfter=No.' + ).report(state, self.args) + if not text_matched: + Incident( + state=state, + testid='missing-text', + message='Missing the text attribute.' + ).report(state, self.args) + elif len(text_matched) > 1: + Incident( + state=state, + testid='multiple-text', + message='Multiple text attributes.' + ).report(state, self.args) + else: + stext = text_matched[0].group(1) + if stext[-1].isspace(): + Incident( + state=state, + testid='text-trailing-whitespace', + message='The text attribute must not end with whitespace.' + ).report(state, self.args) + # Validate the text against the SpaceAfter attribute in MISC. + skip_words = set() + mismatch_reported = 0 # do not report multiple mismatches in the same sentence; they usually have the same cause + # We will sum state.sentence_line + iline, and state.sentence_line already points at + # the first token/node line after the sentence comments. Hence iline shall + # be 0 once we enter the cycle. + iline = -1 + for cols in tree: + iline += 1 + if 'NoSpaceAfter=Yes' in cols[MISC]: # I leave this without the split("|") to catch all + Incident( + state=state, + testid='nospaceafter-yes', + message="'NoSpaceAfter=Yes' should be replaced with 'SpaceAfter=No'." + ).report(state, self.args) + if len([x for x in cols[MISC].split('|') if re.match(r"^SpaceAfter=", x) and x != 'SpaceAfter=No']) > 0: + Incident( + state=state, + lineno=state.sentence_line+iline, + testid='spaceafter-value', + message="Unexpected value of the 'SpaceAfter' attribute in MISC. Did you mean 'SpacesAfter'?" + ).report(state, self.args) + if utils.is_empty_node(cols): + if 'SpaceAfter=No' in cols[MISC]: # I leave this without the split("|") to catch all + Incident( + state=state, + lineno=state.sentence_line+iline, + testid='spaceafter-empty-node', + message="'SpaceAfter=No' cannot occur with empty nodes." + ).report(state, self.args) + continue + elif utils.is_multiword_token(cols): + beg, end = cols[ID].split('-') + begi, endi = int(beg), int(end) + # If we see a multi-word token, add its words to an ignore-set – these will be skipped, and also checked for absence of SpaceAfter=No. + for i in range(begi, endi+1): + skip_words.add(str(i)) + elif cols[ID] in skip_words: + if 'SpaceAfter=No' in cols[MISC]: + Incident( + state=state, + lineno=state.sentence_line+iline, + testid='spaceafter-mwt-node', + message="'SpaceAfter=No' cannot occur with words that are part of a multi-word token." 
+ ).report(state, self.args) + continue + else: + # Err, I guess we have nothing to do here. :) + pass + # So now we have either a multi-word token or a word which is also a token in its entirety. + if not stext.startswith(cols[FORM]): + if not mismatch_reported: + extra_message = '' + if len(stext) >= 1 and stext[0].isspace(): + extra_message = ' (perhaps extra SpaceAfter=No at previous token?)' + Incident( + state=state, + lineno=state.sentence_line+iline, + testid='text-form-mismatch', + message=f"Mismatch between the text attribute and the FORM field. Form[{cols[ID]}] is '{cols[FORM]}' but text is '{stext[:len(cols[FORM])+20]}...'"+extra_message + ).report(state, self.args) + mismatch_reported = 1 + else: + stext = stext[len(cols[FORM]):] # eat the form + # Remember if SpaceAfter=No applies to the last word of the sentence. + # This is not prohibited in general but it is prohibited at the end of a paragraph or document. + if 'SpaceAfter=No' in cols[MISC].split("|"): + state.spaceafterno_in_effect = True + else: + state.spaceafterno_in_effect = False + if (stext) and not stext[0].isspace(): + Incident( + state=state, + lineno=state.sentence_line+iline, + testid='missing-spaceafter', + message=f"'SpaceAfter=No' is missing in the MISC field of node {cols[ID]} because the text is '{utils.shorten(cols[FORM]+stext)}'." + ).report(state, self.args) + stext = stext.lstrip() + if stext: + Incident( + state=state, + testid='text-extra-chars', + message=f"Extra characters at the end of the text attribute, not accounted for in the FORM fields: '{stext}'" + ).report(state, self.args) + + + +#------------------------------------------------------------------------------ +# Level 2 tests applicable to a single line independently of the others. +#------------------------------------------------------------------------------ + + def OLD_validate_mwt_empty_vals(self, state, cols, line): + """ + Checks that a multi-word token has _ empty values in all fields except MISC. + This is required by UD guidelines although it is not a problem in general, + therefore a level 2 test. + + Parameters + ---------- + cols : list + The values of the columns on the current node / token line. + line : int + Number of the line where the node occurs in the file. + """ + assert utils.is_multiword_token(cols), 'internal error' + for col_idx in range(LEMMA, MISC): # all columns except the first two (ID, FORM) and the last one (MISC) + # Exception: The feature Typo=Yes may occur in FEATS of a multi-word token. + if col_idx == FEATS and cols[col_idx] == 'Typo=Yes': + # If a multi-word token has Typo=Yes, its component words must not have it. + # We must remember the span of the MWT and check it in validate_features_level4(). + m = crex.mwtid.fullmatch(cols[ID]) + state.mwt_typo_span_end = m.group(2) + elif cols[col_idx] != '_': + Incident( + state=state, + lineno=line, + level=2, + testclass='Format', + testid='mwt-nonempty-field', + message=f"A multi-word token line must have '_' in the column {COLNAMES[col_idx]}. Now: '{cols[col_idx]}'." + ).report(state, self.args) + + + + + def OLD_validate_empty_node_empty_vals(self, state, cols, line): + """ + Checks that an empty node has _ empty values in HEAD and DEPREL. This is + required by UD guidelines but not necessarily by CoNLL-U, therefore + a level 2 test. + + Parameters + ---------- + cols : list + The values of the columns on the current node / token line. + line : int + Number of the line where the node occurs in the file. 
+ """ + assert utils.is_empty_node(cols), 'internal error' + for col_idx in (HEAD, DEPREL): + if cols[col_idx]!= '_': + Incident( + state=state, + lineno=line, + level=2, + testclass='Format', + testid='mwt-nonempty-field', + message=f"An empty node must have '_' in the column {COLNAMES[col_idx]}. Now: '{cols[col_idx]}'." + ).report(state, self.args) + + + + def OLD_validate_character_constraints(self, state, cols, line): + """ + Checks general constraints on valid characters, e.g. that UPOS + only contains [A-Z]. + + Parameters + ---------- + cols : list + The values of the columns on the current node / token line. + line : int + Number of the line where the node occurs in the file. + """ + Incident.default_level = 2 + Incident.default_lineno = line + if utils.is_multiword_token(cols): + return + # Do not test the regular expression crex.upos here. We will test UPOS + # directly against the list of known tags. That is a level 2 test, too. + if not (crex.deprel.fullmatch(cols[DEPREL]) or (utils.is_empty_node(cols) and cols[DEPREL] == '_')): + Incident( + state=state, + testclass='Syntax', + testid='invalid-deprel', + message=f"Invalid DEPREL value '{cols[DEPREL]}'. Only lowercase English letters or a colon are expected." + ).report(state, self.args) + try: + utils.deps_list(cols) + except ValueError: + Incident( + state=state, + testclass='Enhanced', + testid='invalid-deps', + message=f"Failed to parse DEPS: '{cols[DEPS]}'." + ).report(state, self.args) + return + if any(deprel for head, deprel in utils.deps_list(cols) + if not crex.edeprel.fullmatch(deprel)): + Incident( + state=state, + testclass='Enhanced', + testid='invalid-edeprel', + message=f"Invalid enhanced relation type: '{cols[DEPS]}'." + ).report(state, self.args) + + + + def OLD_validate_upos(self, state, cols, line): + """ + Checks that the UPOS field contains one of the 17 known tags. + + Parameters + ---------- + cols : list + The values of the columns on the current node / token line. + line : int + Number of the line where the node occurs in the file. + """ + if utils.is_empty_node(cols) and cols[UPOS] == '_': + return + # Just in case, we still match UPOS against the regular expression that + # checks general character constraints. However, the list of UPOS, loaded + # from a JSON file, should conform to the regular expression. + if not crex.upos.fullmatch(cols[UPOS]) or cols[UPOS] not in self.specs.upos: + Incident( + state=state, + lineno=line, + level=2, + testclass='Morpho', + testid='unknown-upos', + message=f"Unknown UPOS tag: '{cols[UPOS]}'." + ).report(state, self.args) + + + + def OLD_validate_features_level2(self, state, cols, line): + """ + Checks general constraints on feature-value format: Permitted characters in + feature name and value, features must be sorted alphabetically, features + cannot be repeated etc. + + Parameters + ---------- + cols : list + The values of the columns on the current node / token line. + line : int + Number of the line where the node occurs in the file. + + Returns + ------- + safe : bool + There were no errors or the errors are not so severe that we should + refrain from loading the sentence into Udapi. 
+ """ + Incident.default_lineno = line + Incident.default_level = 2 + Incident.default_testclass = 'Morpho' + feats = cols[FEATS] + if feats == '_': + return True + self.features_present(state) + feat_list = feats.split('|') + if [f.lower() for f in feat_list] != sorted(f.lower() for f in feat_list): + Incident( + state=state, + testid='unsorted-features', + message=f"Morphological features must be sorted: '{feats}'." + ).report(state, self.args) + attr_set = set() # I'll gather the set of features here to check later that none is repeated. + # Subsequent higher-level tests could fail if a feature is not in the + # Feature=Value format. If that happens, we will return False and the caller + # can skip the more fragile tests. + safe = True + for f in feat_list: + match = crex.featval.fullmatch(f) + if match is None: + Incident( + state=state, + testid='invalid-feature', + message=f"Spurious morphological feature: '{f}'. Should be of the form Feature=Value and must start with [A-Z] and only contain [A-Za-z0-9]." + ).report(state, self.args) + attr_set.add(f) # to prevent misleading error "Repeated features are disallowed" + safe = False + else: + # Check that the values are sorted as well + attr = match.group(1) + attr_set.add(attr) + values = match.group(2).split(',') + if len(values) != len(set(values)): + Incident( + state=state, + testid='repeated-feature-value', + message=f"Repeated feature values are disallowed: '{feats}'" + ).report(state, self.args) + if [v.lower() for v in values] != sorted(v.lower() for v in values): + Incident( + state=state, + testid='unsorted-feature-values', + message=f"If a feature has multiple values, these must be sorted: '{f}'" + ).report(state, self.args) + for v in values: + if not crex.val.fullmatch(v): + Incident( + state=state, + testid='invalid-feature-value', + message=f"Spurious value '{v}' in '{f}'. Must start with [A-Z0-9] and only contain [A-Za-z0-9]." + ).report(state, self.args) + # Level 2 tests character properties and canonical order but not that the f-v pair is known. + if len(attr_set) != len(feat_list): + Incident( + state=state, + testid='repeated-feature', + message=f"Repeated features are disallowed: '{feats}'." + ).report(state, self.args) + return safe + + + + @staticmethod + def features_present(state): + """ + In general, the annotation of morphological features is optional, although + highly encouraged. However, if the treebank does have features, then certain + features become required. This function is called when the first morphological + feature is encountered. It remembers that from now on, missing features can + be reported as errors. In addition, if any such errors have already been + encountered, they will be reported now. + """ + if not state.seen_morpho_feature: + state.seen_morpho_feature = state.current_line + for testid in state.delayed_feature_errors: + for occurrence in state.delayed_feature_errors[testid]['occurrences']: + occurrence.report(state, self.args) + + + + def OLD_validate_deps(self, state, cols, line): + """ + Validates that DEPS is correctly formatted and that there are no + self-loops in DEPS (longer cycles are allowed in enhanced graphs but + self-loops are not). + + This function must be run on raw DEPS before it is fed into Udapi because + it checks the order of relations, which is not guaranteed to be preserved + in Udapi. On the other hand, we assume that it is run after + validate_id_references() and only if DEPS is parsable and the head indices + in it are OK. 
+ + Parameters + ---------- + cols : list + The values of the columns on the current node / token line. + line : int + Number of the line where the node occurs in the file. + """ + Incident.default_lineno = line + Incident.default_level = 2 + Incident.default_testclass = 'Format' + if not (utils.is_word(cols) or utils.is_empty_node(cols)): + return + # Remember whether there is at least one difference between the basic + # tree and the enhanced graph in the entire dataset. + if cols[DEPS] != '_' and cols[DEPS] != cols[HEAD]+':'+cols[DEPREL]: + state.seen_enhancement = line + # We already know that the contents of DEPS is parsable (deps_list() was + # first called from validate_id_references() and the head indices are OK). + deps = utils.deps_list(cols) + ###!!! Float will not work if there are 10 empty nodes between the same two + ###!!! regular nodes. '1.10' is not equivalent to '1.1'. + heads = [float(h) for h, d in deps] + if heads != sorted(heads): + Incident( + state=state, + testid='unsorted-deps', + message=f"DEPS not sorted by head index: '{cols[DEPS]}'" + ).report(state, self.args) + else: + lasth = None + lastd = None + for h, d in deps: + if h == lasth: + if d < lastd: + Incident( + state=state, + testid='unsorted-deps-2', + message=f"DEPS pointing to head '{h}' not sorted by relation type: '{cols[DEPS]}'" + ).report(state, self.args) + elif d == lastd: + Incident( + state=state, + testid='repeated-deps', + message=f"DEPS contain multiple instances of the same relation '{h}:{d}'" + ).report(state, self.args) + lasth = h + lastd = d + try: + id_ = float(cols[ID]) + except ValueError: + # This error has been reported previously. + return + if id_ in heads: + Incident( + state=state, + testclass='Enhanced', + testid='deps-self-loop', + message=f"Self-loop in DEPS for '{cols[ID]}'" + ).report(state, self.args) + + + + def OLD_validate_misc(self, state, cols, line): + """ + In general, the MISC column can contain almost anything. However, if there + is a vertical bar character, it is interpreted as the separator of two + MISC attributes, which may or may not have the form of attribute=value pair. + In general it is not forbidden that the same attribute appears several times + with different values, but this should not happen for selected attributes + that are described in the UD documentation. + + This function must be run on raw MISC before it is fed into Udapi because + Udapi is not prepared for some of the less recommended usages of MISC. + + Parameters + ---------- + cols : list + The values of the columns on the current node / token line. + line : int + Number of the line where the node occurs in the file. + """ + Incident.default_lineno = line + Incident.default_level = 2 + Incident.default_testclass = 'Warning' + if cols[MISC] == '_': + return + misc = [ma.split('=', 1) for ma in cols[MISC].split('|')] + mamap = {} + for ma in misc: + if ma[0] == '': + if len(ma) == 1: + Incident( + state=state, + testid='empty-misc', + message="Empty attribute in MISC; possible misinterpreted vertical bar?" + ).report(state, self.args) + else: + Incident( + state=state, + testid='empty-misc-key', + message=f"Empty MISC attribute name in '{ma[0]}={ma[1]}'." + ).report(state, self.args) + # We do not warn about MISC items that do not contain '='. + # But the remaining error messages below assume that ma[1] exists. 
+ if len(ma) == 1: + ma.append('') + if re.match(r"^\s", ma[0]): + Incident( + state=state, + testid='misc-extra-space', + message=f"MISC attribute name starts with space in '{ma[0]}={ma[1]}'." + ).report(state, self.args) + elif re.search(r"\s$", ma[0]): + Incident( + state=state, + testid='misc-extra-space', + message=f"MISC attribute name ends with space in '{ma[0]}={ma[1]}'." + ).report(state, self.args) + elif re.match(r"^\s", ma[1]): + Incident( + state=state, + testid='misc-extra-space', + message=f"MISC attribute value starts with space in '{ma[0]}={ma[1]}'." + ).report(state, self.args) + elif re.search(r"\s$", ma[1]): + Incident( + state=state, + testid='misc-extra-space', + message=f"MISC attribute value ends with space in '{ma[0]}={ma[1]}'." + ).report(state, self.args) + if re.match(r"^(SpaceAfter|Lang|Translit|LTranslit|Gloss|LId|LDeriv)$", ma[0]): + mamap.setdefault(ma[0], 0) + mamap[ma[0]] = mamap[ma[0]] + 1 + elif re.match(r"^\s*(spaceafter|lang|translit|ltranslit|gloss|lid|lderiv)\s*$", ma[0], re.IGNORECASE): + Incident( + state=state, + testid='misc-attr-typo', + message=f"Possible typo (case or spaces) in MISC attribute '{ma[0]}={ma[1]}'." + ).report(state, self.args) + for a in list(mamap): + if mamap[a] > 1: + Incident( + state=state, + testclass='Format', # this one is real error + testid='repeated-misc', + message=f"MISC attribute '{a}' not supposed to occur twice" + ).report(state, self.args) + + + +#------------------------------------------------------------------------------ +# Level 2 tests applicable to the whole sentence. +#------------------------------------------------------------------------------ + + + + def OLD_validate_id_references(self, state, sentence): + """ + Verifies that HEAD and DEPS reference existing IDs. If this function does + not return True, most of the other tests should be skipped for the current + sentence (in particular anything that considers the tree structure). + + Parameters + ---------- + sentence : list + Lines (arrays of columns): words, mwt tokens, empty nodes. + + Returns + ------- + ok : bool + """ + ok = True + Incident.default_level = 2 + Incident.default_testclass = 'Format' + word_tree = [cols for cols in sentence if utils.is_word(cols) or utils.is_empty_node(cols)] + ids = set([cols[ID] for cols in word_tree]) + for cols in word_tree: + # Test the basic HEAD only for non-empty nodes. + # We have checked elsewhere that it is empty for empty nodes. + if not utils.is_empty_node(cols): + match = crex.head.fullmatch(cols[HEAD]) + if match is None: + Incident( + state=state, + testid='invalid-head', + message=f"Invalid HEAD: '{cols[HEAD]}'." + ).report(state, self.args) + ok = False + if not (cols[HEAD] in ids or cols[HEAD] == '0'): + Incident( + state=state, + testclass='Syntax', + testid='unknown-head', + message=f"Undefined HEAD (no such ID): '{cols[HEAD]}'." + ).report(state, self.args) + ok = False + try: + deps = utils.deps_list(cols) + except ValueError: + # Similar errors have probably been reported earlier. + Incident( + state=state, + testid='invalid-deps', + message=f"Failed to parse DEPS: '{cols[DEPS]}'." + ).report(state, self.args) + ok = False + continue + for head, deprel in deps: + match = crex.ehead.fullmatch(head) + if match is None: + Incident( + state=state, + testid='invalid-ehead', + message=f"Invalid enhanced head reference: '{head}'." 
+                    ).report(state, self.args)
+                    ok = False
+                if not (head in ids or head == '0'):
+                    Incident(
+                        state=state,
+                        testclass='Enhanced',
+                        testid='unknown-ehead',
+                        message=f"Undefined enhanced head reference (no such ID): '{head}'."
+                    ).report(state, self.args)
+                    ok = False
+        return ok
+
+
+    def validate_tree(self, state, sentence):
+        """
+        Takes the list of non-comment lines (line = list of columns) describing
+        a sentence and checks that the basic dependencies form a rooted tree.
+        In case of fatal problems (self-loops, multiple roots, cycles) it
+        returns False (and reports the error, unless it is something that
+        should have been reported earlier).
+
+        We will assume that this function is called only if both ID and HEAD values
+        have been found valid for all tree nodes, including the sequence of IDs
+        and the references from HEAD to existing IDs.
+
+        This function originally served to build a data structure that would
+        describe the tree and make it accessible during subsequent tests. Now we
+        use the Udapi data structures instead but we still have to call this
+        function first because it will survive and report ill-formed input. In
+        such a case, the Udapi data structure will not be built and Udapi-based
+        tests will be skipped.
+
+        Parameters
+        ----------
+        sentence : list
+            Lines (arrays of columns): words, mwt tokens, empty nodes.
+
+        Returns
+        -------
+        ok : bool
+        """
+        Incident.default_level = 2
+        Incident.default_testclass = 'Syntax'
+        node_line = state.sentence_line - 1
+        children = {} # int(node id) -> set of children
+        n_words = 0
+        for cols in sentence:
+            node_line += 1
+            if not utils.is_word(cols):
+                continue
+            n_words += 1
+            # ID and HEAD values have been validated before and this function would
+            # not be called if they were not OK. So we can now safely convert them
+            # to integers.
+            id_ = int(cols[ID])
+            head = int(cols[HEAD])
+            if head == id_:
+                Incident(
+                    state=state,
+                    lineno=node_line,
+                    testid='head-self-loop',
+                    message=f'HEAD == ID for {cols[ID]}'
+                ).report(state, self.args)
+                return False
+            # Incrementally build the set of children of every node.
+            children.setdefault(head, set()).add(id_)
+        word_ids = list(range(1, n_words+1))
+        # Check that there is just one node with the root relation.
+        children_0 = sorted(children.get(0, []))
+        if len(children_0) > 1 and self.args.single_root:
+            Incident(
+                state=state,
+                lineno=-1,
+                testid='multiple-roots',
+                message=f"Multiple root words: {children_0}"
+            ).report(state, self.args)
+            return False
+        # Return False if there are any cycles. Otherwise we could not later ask
+        # Udapi to build a data structure representing the tree.
+        # Presence of cycles is equivalent to presence of unreachable nodes.
+        projection = set()
+        node_id = 0
+        nodes = list((node_id,))
+        while nodes:
+            node_id = nodes.pop()
+            children_id = sorted(children.get(node_id, []))
+            for child in children_id:
+                if child in projection:
+                    continue # skip cycles
+                projection.add(child)
+                nodes.append(child)
+        unreachable = set(word_ids) - projection
+        if unreachable:
+            str_unreachable = ','.join(str(w) for w in sorted(unreachable))
+            Incident(
+                state=state,
+                lineno=-1,
+                testid='non-tree',
+                message=f'Non-tree structure. Words {str_unreachable} are not reachable from the root 0.'
+            ).report(state, self.args)
+            return False
+        return True
+
+
+    def validate_root(self, state, node, line):
+        """
+        Checks that DEPREL is "root" iff HEAD is 0.
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The node whose incoming relation will be validated. This function
+            operates on both regular and empty nodes. Make sure to call it for
+            empty nodes, too!
+        line : int
+            Number of the line where the node occurs in the file.
+        """
+        Incident.default_lineno = line
+        Incident.default_level = 2
+        Incident.default_testclass = 'Syntax'
+        if not node.is_empty():
+            if node.parent.ord == 0 and node.udeprel != 'root':
+                Incident(
+                    state=state,
+                    testid='0-is-not-root',
+                    message="DEPREL must be 'root' if HEAD is 0."
+                ).report(state, self.args)
+            if node.parent.ord != 0 and node.udeprel == 'root':
+                Incident(
+                    state=state,
+                    testid='root-is-not-0',
+                    message="DEPREL cannot be 'root' if HEAD is not 0."
+                ).report(state, self.args)
+        # In the enhanced graph, test both regular and empty roots.
+        for edep in node.deps:
+            if edep['parent'].ord == 0 and utils.lspec2ud(edep['deprel']) != 'root':
+                Incident(
+                    state=state,
+                    testclass='Enhanced',
+                    testid='enhanced-0-is-not-root',
+                    message="Enhanced relation type must be 'root' if head is 0."
+                ).report(state, self.args)
+            if edep['parent'].ord != 0 and utils.lspec2ud(edep['deprel']) == 'root':
+                Incident(
+                    state=state,
+                    testclass='Enhanced',
+                    testid='enhanced-root-is-not-0',
+                    message="Enhanced relation type cannot be 'root' if head is not 0."
+                ).report(state, self.args)
+
+
+    def OLD_validate_deps_all_or_none(self, state, sentence):
+        """
+        Takes the list of non-comment lines (line = list of columns) describing
+        a sentence. Checks that enhanced dependencies are present if they were
+        present in another sentence, and absent if they were absent in another
+        sentence.
+        """
+        egraph_exists = False # enhanced deps are optional
+        for cols in sentence:
+            if utils.is_multiword_token(cols):
+                continue
+            if utils.is_empty_node(cols) or cols[DEPS] != '_':
+                egraph_exists = True
+        # We are currently testing the existence of enhanced graphs separately for each sentence.
+        # However, we should not allow one sentence to have a connected egraph while another
+        # has no enhanced dependencies. Such inconsistency could come as a nasty surprise
+        # to the users.
+        Incident.default_lineno = state.sentence_line
+        Incident.default_level = 2
+        Incident.default_testclass = 'Enhanced'
+        if egraph_exists:
+            if not state.seen_enhanced_graph:
+                state.seen_enhanced_graph = state.sentence_line
+            if state.seen_tree_without_enhanced_graph:
+                Incident(
+                    state=state,
+                    testid='edeps-only-sometimes',
+                    message=f"Enhanced graph must be empty because we saw empty DEPS on line {state.seen_tree_without_enhanced_graph}"
+                ).report(state, self.args)
+        else:
+            if not state.seen_tree_without_enhanced_graph:
+                state.seen_tree_without_enhanced_graph = state.sentence_line
+            if state.seen_enhanced_graph:
+                Incident(
+                    state=state,
+                    testid='edeps-only-sometimes',
+                    message=f"Enhanced graph cannot be empty because we saw non-empty DEPS on line {state.seen_enhanced_graph}"
+                ).report(state, self.args)
+
+
+    def validate_egraph_connected(self, state, nodes, linenos):
+        """
+        Takes the list of nodes (including empty nodes). If there are enhanced
+        dependencies in DEPS, builds the enhanced graph and checks that it is
+        rooted and connected.
+
+        Parameters
+        ----------
+        nodes : list of udapi.core.node.Node objects
+            List of nodes in the sentence, including empty nodes, sorted by word
+            order.
+        linenos : dict
+            Indexed by node ID (string), contains the line number on which the node
+            occurs.
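+
+        Example
+        -------
+        An illustrative sketch (hypothetical two-word sentence): with DEPS
+        '0:root' on node 1 and '1:nsubj' on node 2, the adjacency structure
+        built below comes out as::
+
+            {'0': {'children': {'1'}},
+             '1': {'children': {'2'}},
+             '2': {'children': set()}}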
+ """ + egraph_exists = False # enhanced deps are optional + egraph = {'0': {'children': set()}} + nodeids = set() + for node in nodes: + parents = [x['parent'] for x in node.deps] + if node.is_empty() or len(parents) > 0: + egraph_exists = True + nodeids.add(str(node.ord)) + # The graph may already contain a record for the current node if one of + # the previous nodes is its child. If it doesn't, we will create it now. + egraph.setdefault(str(node.ord), {}) + egraph[str(node.ord)].setdefault('children', set()) + # Incrementally build the set of children of every node. + for p in parents: + egraph.setdefault(str(p.ord), {}) + egraph[str(p.ord)].setdefault('children', set()).add(str(node.ord)) + # If there is no trace of enhanced annotation, there are no requirements + # on the enhanced graph. + if not egraph_exists: + return + # Check that the graph is rooted and connected. The UD guidelines do not + # license unconnected graphs. Projection of the technical root (ord '0') + # must contain all nodes. + projection = set() + node_id = '0' + projnodes = list((node_id,)) + while projnodes: + node_id = projnodes.pop() + for child in egraph[node_id]['children']: + if child in projection: + continue # skip cycles + projection.add(child) + projnodes.append(child) + unreachable = nodeids - projection + if unreachable: + sur = sorted(unreachable) + Incident( + state=state, + lineno=linenos[sur[0]], + level=2, + testclass='Enhanced', + testid='unconnected-egraph', + message=f"Enhanced graph is not connected. Nodes {sur} are not reachable from any root" + ).report(state, self.args) + return None + + + +#============================================================================== +# Level 3 tests. Annotation content vs. the guidelines (only universal tests). +#============================================================================== + + + def validate_required_feature(self, state, feats, required_feature, required_value, incident): + """ + In general, the annotation of morphological features is optional, although + highly encouraged. However, if the treebank does have features, then certain + features become required. This function will check the presence of a feature + and if it is missing, an error will be reported only if at least one feature + has been already encountered. Otherwise the error will be remembered and it + may be reported afterwards if any feature is encountered later. + + Parameters + ---------- + feats : udapi.core.dualdict.DualDict object + The feature-value set to be tested whether they contain the required one. + required_feature : str + The name of the required feature. + required_value : str + The required value of the feature. Multivalues are not supported (they + are just a string value containing one or more commas). If + required_value is None or an empty string, it means that we require any + non-empty value of required_feature. + incident : Incident object + The message that should be printed if the error is confirmed. 
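+
+        Example
+        -------
+        An illustrative use, mirroring the real call in validate_goeswith_span()
+        (requiring Typo=Yes on the head of a goeswith group)::
+
+            self.validate_required_feature(state, node.feats, 'Typo', 'Yes',
+                                           incident)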
+        """
+        ok = True
+        if required_value:
+            if feats[required_feature] != required_value:
+                ok = False
+        else:
+            if feats[required_feature] == '':
+                ok = False
+        if not ok:
+            if state.seen_morpho_feature:
+                incident.report(state, self.args)
+            else:
+                if incident.testid not in state.delayed_feature_errors:
+                    state.delayed_feature_errors[incident.testid] = {'occurrences': []}
+                state.delayed_feature_errors[incident.testid]['occurrences'].append({'incident': incident})
+
+
+    def validate_expected_features(self, state, node, lineno):
+        """
+        Certain features are expected to occur with certain UPOS or certain values
+        of other features. This function issues warnings instead of errors, as
+        features are in general optional and language-specific. Even the warnings
+        are issued only if the treebank has features. Note that the expectations
+        tested here are considered (more or less) universal. Checking that a given
+        feature-value pair is compatible with a particular UPOS is done using
+        language-specific lists at level 4.
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The tree node to be tested.
+        lineno : int
+            The 1-based index of the line where the node occurs.
+        """
+        Incident.default_lineno = lineno
+        Incident.default_level = 3
+        Incident.default_testclass = 'Warning'
+        if node.upos in ['PRON', 'DET']:
+            self.validate_required_feature(state, node.feats, 'PronType', None, Incident(
+                state=state,
+                testid='pron-det-without-prontype',
+                message=f"The word '{utils.formtl(node)}' is tagged '{node.upos}' but it lacks the 'PronType' feature"
+            ))
+        if node.feats['VerbForm'] == 'Fin' and node.feats['Mood'] == '':
+            Incident(
+                state=state,
+                testid='verbform-fin-without-mood',
+                message=f"Finite verb '{utils.formtl(node)}' lacks the 'Mood' feature"
+            ).report(state, self.args)
+        elif node.feats['Mood'] != '' and node.feats['VerbForm'] != 'Fin':
+            Incident(
+                state=state,
+                testid='mood-without-verbform-fin',
+                message=f"Non-empty 'Mood' feature at a word that is not a finite verb ('{utils.formtl(node)}')"
+            ).report(state, self.args)
+
+
+
+    def validate_upos_vs_deprel(self, state, node, lineno):
+        """
+        For certain relations checks that the dependent word belongs to an expected
+        part-of-speech category. Occasionally we may have to check the children of
+        the node, too.
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The tree node to be tested.
+        lineno : int
+            The 1-based index of the line where the node occurs.
+        """
+        Incident.default_lineno = lineno
+        Incident.default_level = 3
+        Incident.default_testclass = 'Syntax'
+        # Occasionally a word may be marked by the feature ExtPos as acting as
+        # a part of speech different from its usual one (which is given in UPOS).
+        # Typical examples are words that head fixed multiword expressions (the
+        # whole expression acts like a word of that alien part of speech), but
+        # ExtPos may be used also on single words whose external POS is altered.
+        upos = node.upos
+        # Nodes with a fixed child may need ExtPos to signal the part of speech of
+        # the whole fixed expression.
+        if node.feats['ExtPos']:
+            upos = node.feats['ExtPos']
+        # This is a level 3 test, we will check only the universal part of the relation.
+        deprel = node.udeprel
+        childrels = set([x.udeprel for x in node.children])
+        # It is recommended that the head of a fixed expression always have ExtPos,
+        # even if it does not need it to pass the tests in this function.
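+        # A hypothetical illustration: a fixed expression like French "quand même",
+        # attached as 'advmod' and headed by the SCONJ "quand", would fail the
+        # 'rel-upos-advmod' test below; ExtPos=ADV on the head makes it pass,
+        # because the test then checks ADV instead of SCONJ.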
+ if 'fixed' in childrels and not node.feats['ExtPos']: + fixed_forms = [node.form] + [x.form for x in node.children if x.udeprel == 'fixed'] + str_fixed_forms = ' '.join(fixed_forms) + Incident( + state=state, + testclass='Warning', + testid='fixed-without-extpos', + message=f"Fixed expression '{str_fixed_forms}' does not have the 'ExtPos' feature" + ).report(state, self.args) + # Certain relations are reserved for nominals and cannot be used for verbs. + # Nevertheless, they can appear with adjectives or adpositions if they are promoted due to ellipsis. + # Unfortunately, we cannot enforce this test because a word can be cited + # rather than used, and then it can take a nominal function even if it is + # a verb, as in this Upper Sorbian sentence where infinitives are appositions: + # [hsb] Z werba danci "rejować" móže substantiw nastać danco "reja", adjektiw danca "rejowanski" a adwerb dance "rejowansce", ale tež z substantiwa martelo "hamor" móže nastać werb marteli "klepać z hamorom", adjektiw martela "hamorowy" a adwerb martele "z hamorom". + # Determiner can alternate with a pronoun. + if deprel == 'det' and not re.match(r"^(DET|PRON)", upos): + Incident( + state=state, + testid='rel-upos-det', + message=f"'det' should be 'DET' or 'PRON' but it is '{upos}' ('{utils.formtl(node)}')" + ).report(state, self.args) + # Nummod is for "number phrases" only. This could be interpreted as NUM only, + # but some languages treat some cardinal numbers as NOUNs, and in + # https://github.com/UniversalDependencies/docs/issues/596, + # we concluded that the validator will tolerate them. + if deprel == 'nummod' and not re.match(r"^(NUM|NOUN|SYM)$", upos): + Incident( + state=state, + testid='rel-upos-nummod', + message=f"'nummod' should be 'NUM' but it is '{upos}' ('{utils.formtl(node)}')" + ).report(state, self.args) + # Advmod is for adverbs, perhaps particles but not for prepositional phrases or clauses. + # Nevertheless, we should allow adjectives because they can be used as adverbs in some languages. + # https://github.com/UniversalDependencies/docs/issues/617#issuecomment-488261396 + # Bohdan reports that some DET can modify adjectives in a way similar to ADV. + # I am not sure whether advmod is the best relation for them but the alternative + # det is not much better, so maybe we should not enforce it. Adding DET to the tolerated UPOS tags. + if deprel == 'advmod' and not re.match(r"^(ADV|ADJ|CCONJ|DET|PART|SYM)", upos) and not 'goeswith' in childrels: + Incident( + state=state, + testid='rel-upos-advmod', + message=f"'advmod' should be 'ADV' but it is '{upos}' ('{utils.formtl(node)}')" + ).report(state, self.args) + # Known expletives are pronouns. Determiners and particles are probably acceptable, too. + if deprel == 'expl' and not re.match(r"^(PRON|DET|PART)$", upos): + Incident( + state=state, + testid='rel-upos-expl', + message=f"'expl' should normally be 'PRON' but it is '{upos}' ('{utils.formtl(node)}')" + ).report(state, self.args) + # Auxiliary verb/particle must be AUX. + if deprel == 'aux' and not re.match(r"^(AUX)", upos): + Incident( + state=state, + testid='rel-upos-aux', + message=f"'aux' should be 'AUX' but it is '{upos}' ('{utils.formtl(node)}')" + ).report(state, self.args) + # Copula is an auxiliary verb/particle (AUX) or a pronoun (PRON|DET). 
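+        # For instance, cop+AUX and cop+PRON both pass the test below, while a
+        # hypothetical cop+VERB would be reported as 'rel-upos-cop'.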
+ if deprel == 'cop' and not re.match(r"^(AUX|PRON|DET|SYM)", upos): + Incident( + state=state, + testid='rel-upos-cop', + message=f"'cop' should be 'AUX' or 'PRON'/'DET' but it is '{upos}' ('{utils.formtl(node)}')" + ).report(state, self.args) + # Case is normally an adposition, maybe particle. + # However, there are also secondary adpositions and they may have the original POS tag: + # NOUN: [cs] pomocí, prostřednictvím + # VERB: [en] including + # Interjection can also act as case marker for vocative, as in Sanskrit: भोः भगवन् / bhoḥ bhagavan / oh sir. + if deprel == 'case' and re.match(r"^(PROPN|ADJ|PRON|DET|NUM|AUX)", upos): + Incident( + state=state, + testid='rel-upos-case', + message=f"'case' should not be '{upos}' ('{utils.formtl(node)}')" + ).report(state, self.args) + # Mark is normally a conjunction or adposition, maybe particle but definitely not a pronoun. + ###!!! February 2022: Temporarily allow mark+VERB ("regarding"). In the future, it should be banned again + ###!!! by default (and case+VERB too), but there should be a language-specific list of exceptions. + ###!!! In 2024 I wanted to re-enable the test because people could use the + ###!!! newly approved ExtPos feature to signal that "regarding" is acting + ###!!! as a function word, but Amir was opposed to the idea that ExtPos would + ###!!! now be required also for single-word expressions. + if deprel == 'mark' and re.match(r"^(NOUN|PROPN|ADJ|PRON|DET|NUM|AUX|INTJ)", upos): + Incident( + state=state, + testid='rel-upos-mark', + message=f"'mark' should not be '{upos}' ('{utils.formtl(node)}')" + ).report(state, self.args) + # Cc is a conjunction, possibly an adverb or particle. + if deprel == 'cc' and re.match(r"^(NOUN|PROPN|ADJ|PRON|DET|NUM|VERB|AUX|INTJ)", upos): + Incident( + state=state, + testid='rel-upos-cc', + message=f"'cc' should not be '{upos}' ('{utils.formtl(node)}')" + ).report(state, self.args) + if deprel == 'punct' and upos != 'PUNCT': + Incident( + state=state, + testid='rel-upos-punct', + message=f"'punct' must be 'PUNCT' but it is '{upos}' ('{utils.formtl(node)}')" + ).report(state, self.args) + if upos == 'PUNCT' and not re.match(r"^(punct|root)", deprel): + Incident( + state=state, + testid='upos-rel-punct', + message=f"'PUNCT' must be 'punct' but it is '{node.deprel}' ('{utils.formtl(node)}')" + ).report(state, self.args) + if upos == 'PROPN' and (deprel == 'fixed' or 'fixed' in childrels): + Incident( + state=state, + testid='rel-upos-fixed', + message=f"'fixed' should not be used for proper nouns ('{utils.formtl(node)}')." + ).report(state, self.args) + + + + def validate_flat_foreign(self, state, node, lineno, linenos): + """ + flat:foreign is an optional subtype of flat. It is used to connect two words + in a code-switched segment of foreign words if the annotators did not want + to provide the analysis according to the source language. If flat:foreign + is used, both the parent and the child should have the Foreign=Yes feature + and their UPOS tag should be X. + + Parameters + ---------- + node : udapi.core.node.Node object + The tree node to be tested. + lineno : int + The 1-based index of the line where the node occurs. + linenos : dict + Key is node ID (string, not int or float!) Value is the 1-based index + of the line where the node occurs (int). 
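+
+        Example
+        -------
+        A hypothetical CoNLL-U fragment that would pass this test::
+
+            1   ogni   _   X   _   Foreign=Yes   0   root           _   _
+            2   cosa   _   X   _   Foreign=Yes   1   flat:foreign   _   _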
+        """
+        Incident.default_level = 3
+        Incident.default_testclass = 'Warning' # or Morpho
+        if node.deprel != 'flat:foreign':
+            return
+        parent = node.parent
+        if node.upos != 'X' or str(node.feats) != 'Foreign=Yes':
+            Incident(
+                state=state,
+                lineno=lineno,
+                nodeid=node.ord,
+                testid='flat-foreign-upos-feats',
+                message="The child of a flat:foreign relation should have UPOS X and Foreign=Yes (but no other features)."
+            ).report(state, self.args)
+        if parent.upos != 'X' or str(parent.feats) != 'Foreign=Yes':
+            Incident(
+                state=state,
+                lineno=linenos[str(parent.ord)],
+                nodeid=parent.ord,
+                testid='flat-foreign-upos-feats',
+                message="The parent of a flat:foreign relation should have UPOS X and Foreign=Yes (but no other features)."
+            ).report(state, self.args)
+
+
+
+    def validate_left_to_right_relations(self, state, node, lineno):
+        """
+        Certain UD relations must always go left-to-right (in the logical order,
+        meaning that the parent precedes the child, disregarding that some
+        languages have right-to-left writing systems).
+        Here we currently check the rule for the basic dependencies.
+        The same should also be tested for the enhanced dependencies!
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The tree node to be tested.
+        lineno : int
+            The 1-based index of the line where the node occurs.
+        """
+        # According to the v2 guidelines, apposition should also be left-headed, although the definition of apposition may need to be improved.
+        if re.match(r"^(conj|fixed|flat|goeswith|appos)", node.deprel):
+            ichild = node.ord
+            iparent = node.parent.ord
+            if ichild < iparent:
+                # We must recognize the relation type in the test id so we can manage exceptions for legacy treebanks.
+                # For conj, flat, and fixed the requirement was introduced already before UD 2.2.
+                # For appos and goeswith the requirement was introduced before UD 2.4.
+                # The designation "right-to-left" is confusing in languages with right-to-left writing systems.
+                # We keep it in the testid but we make the testmessage more neutral.
+                Incident(
+                    state=state,
+                    lineno=lineno,
+                    nodeid=node.ord,
+                    level=3,
+                    testclass='Syntax',
+                    testid=f"right-to-left-{node.udeprel}",
+                    message=f"Parent of relation '{node.deprel}' must precede the child in the word order."
+                ).report(state, self.args)
+
+
+
+    def validate_single_subject(self, state, node, lineno):
+        """
+        No predicate should have more than one subject.
+        An xcomp dependent normally has no subject, but in some languages the
+        requirement may be weaker: it could have an overt subject if it is
+        coreferential with a particular argument of the matrix verb. Hence we do
+        not check zero subjects of xcomp dependents at present.
+        Furthermore, in some situations we must allow multiple subjects. If a clause
+        acts as a nonverbal predicate of another clause, then we must attach two
+        subjects to the predicate of the inner clause: one is the subject of the
+        inner clause, the other is the subject of the outer clause. This could in
+        theory be recursive but in practice it isn't. As of UD 2.10, an amendment
+        of the guidelines says that the inner predicate of the predicate clause
+        should govern both subjects even if there is a copula (previously such
+        cases were an exception from the UD approach that copulas should not be
+        heads); however, the outer subjects should be attached as [nc]subj:outer.
+        See https://universaldependencies.org/changes.html#multiple-subjects.
+        See also issue 34 (https://github.com/UniversalDependencies/tools/issues/34).
+ Strictly speaking, :outer is optional because it is a subtype, and some + treebanks may want to avoid it. For example, in Coptic Scriptorium, there + is only one occurrence in dev, one in test, and none in train, so it would + be impossible to train a parser that gets it right. For that reason, it is + possible to replace the :outer subtype with Subject=Outer in MISC. The MISC + attribute is just a directive for the validator and no parser is expected + to predict it. + + Parameters + ---------- + node : udapi.core.node.Node object + The tree node to be tested. + lineno : int + The 1-based index of the line where the node occurs. + """ + + def is_inner_subject(node): + """ + Takes a node (udapi.core.node.Node). Tells whether the node's deprel is + nsubj or csubj without the :outer subtype. Alternatively, instead of the + :outer subtype, the node could have Subject=Outer in MISC. + """ + if not re.search(r'subj', node.udeprel): + return False + if re.match(r'^[nc]subj:outer$', node.deprel): + return False + if node.misc['Subject'] == 'Outer': + return False + return True + + subjects = [x for x in node.children if is_inner_subject(x)] + subject_ids = [x.ord for x in subjects] + subject_forms = [utils.formtl(x) for x in subjects] + if len(subjects) > 1: + Incident( + state=state, + lineno=lineno, + nodeid=node.ord, + level=3, + testclass='Syntax', + testid='too-many-subjects', + message=f"Multiple subjects {str(subject_ids)} ({str(subject_forms)[1:-1]}) not subtyped as ':outer'.", + explanation="Outer subjects are allowed if a clause acts as the predicate of another clause." + ).report(state, self.args) + + + + def validate_single_object(self, state, node, lineno): + """ + No predicate should have more than one direct object (number of indirect + objects is unlimited). Theoretically, ccomp should be understood as a + clausal equivalent of a direct object, but we do not have an indirect + equivalent, so it seems better to tolerate additional ccomp at present. + + Parameters + ---------- + node : udapi.core.node.Node object + The tree node to be tested. + lineno : int + The 1-based index of the line where the node occurs. + """ + objects = [x for x in node.children if x.udeprel == 'obj'] + object_ids = [x.ord for x in objects] + object_forms = [utils.formtl(x) for x in objects] + if len(objects) > 1: + Incident( + state=state, + lineno=lineno, + nodeid=node.ord, + level=3, + testclass='Syntax', + testid='too-many-objects', + message=f"Multiple direct objects {str(object_ids)} ({str(object_forms)[1:-1]}) under one predicate." + ).report(state, self.args) + + + + def validate_orphan(self, state, node, lineno): + """ + The orphan relation is used to attach an unpromoted orphan to the promoted + orphan in gapping constructions. A common error is that the promoted orphan + gets the orphan relation too. The parent of orphan is typically attached + via a conj relation, although some other relations are plausible too. + + Parameters + ---------- + node : udapi.core.node.Node object + The tree node to be tested. + lineno : int + The 1-based index of the line where the node occurs. + """ + # This is a level 3 test, we will check only the universal part of the relation. + if node.udeprel == 'orphan': + # We include advcl because gapping (or something very similar) can also + # occur in subordinate clauses: "He buys companies like my mother [does] vegetables." + # In theory, a similar pattern could also occur with reparandum. + # A similar pattern also occurs with acl, e.g. 
in Latvian:
+            # viņš ēd tos ābolus, ko pirms tam [ēda] tārpi ('he eats the same apples, which were [eaten] by worms before that')
+            # Other clausal heads (ccomp, csubj) may be eligible as well, e.g. in Latvian
+            # (see also issue 635 2019-09-19):
+            # atjēdzos, ka bez angļu valodas nekur [netikšu] '[I] realised that [I will get] nowhere without English'
+            # 2023-04-14: Reclassifying the test as warning only. Due to promotion,
+            # the parent of orphan may receive many other relations. See issue 635
+            # for details and a Latin example.
+            if not re.match(r"^(conj|parataxis|root|csubj|ccomp|advcl|acl|reparandum)$", node.parent.udeprel):
+                Incident(
+                    state=state,
+                    lineno=lineno,
+                    nodeid=node.ord,
+                    level=3,
+                    testclass='Warning',
+                    testid='orphan-parent',
+                    message=f"The parent of 'orphan' should normally be 'conj' but it is '{node.parent.udeprel}'."
+                ).report(state, self.args)
+
+
+
+    def validate_functional_leaves(self, state, node, lineno, linenos):
+        """
+        Most of the time, function-word nodes should be leaves. This function
+        checks for known exceptions and warns in the other cases.
+        (https://universaldependencies.org/u/overview/syntax.html#function-word-modifiers)
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The tree node to be tested.
+        lineno : int
+            The 1-based index of the line where the node occurs.
+        linenos : dict
+            Key is node ID (string, not int or float!) Value is the 1-based index
+            of the line where the node occurs (int).
+        """
+        # This is a level 3 test, we will check only the universal part of the relation.
+        deprel = node.udeprel
+        if re.match(r"^(case|mark|cc|aux|cop|det|clf|fixed|goeswith|punct)$", deprel):
+            idparent = node.ord
+            pdeprel = deprel
+            pfeats = node.feats
+            for child in node.children:
+                idchild = child.ord
+                Incident.default_lineno = linenos[str(idchild)]
+                Incident.default_level = 3
+                Incident.default_testclass = 'Syntax'
+                cdeprel = child.udeprel
+                # The guidelines explicitly say that negation can modify any function word
+                # (see https://universaldependencies.org/u/overview/syntax.html#function-word-modifiers).
+                # We cannot recognize negation simply by deprel; we have to look at the
+                # part-of-speech tag and the Polarity feature as well.
+                cupos = child.upos
+                cfeats = child.feats
+                if pdeprel != 'punct' and cdeprel == 'advmod' and re.match(r"^(PART|ADV)$", cupos) and cfeats['Polarity'] == 'Neg':
+                    continue
+                # Punctuation should not depend on function words if it can be projectively
+                # attached to a content word. But sometimes it cannot. Czech example:
+                # "Budou - li však zbývat , ukončíme" (lit. "will - if however remain , we-stop")
+                # "však" depends on "ukončíme" while "budou" and "li" depend nonprojectively
+                # on "zbývat" (which depends on "ukončíme"). "Budou" is aux and "li" is mark.
+                # Yet the hyphen must depend on one of them because any other attachment would
+                # be non-projective. Here we assume that if the parent of a punctuation node
+                # is attached nonprojectively, punctuation can be attached to it to avoid its
+                # own nonprojectivity.
+                if node.is_nonprojective() and cdeprel == 'punct':
+                    continue
+                # Auxiliaries, conjunctions and case markers will tolerate a few special
+                # types of modifiers.
+                # Punctuation should normally not depend on a functional node. However,
+                # it is possible that a functional node such as an auxiliary verb is in
+                # quotation marks or brackets ("must") and then these symbols should depend
+                # on the functional node.
We temporarily allow punctuation here, until we + # can detect precisely the bracket situation and disallow the rest. + # According to the guidelines + # (https://universaldependencies.org/u/overview/syntax.html#function-word-modifiers), + # mark can have a limited set of adverbial/oblique dependents, while the same + # is not allowed for nodes attached as case. Nevertheless, there are valid + # objections against this (see https://github.com/UniversalDependencies/docs/issues/618) + # and we may want to revisit the guideline in UD v3. For the time being, + # we make the validator more benevolent to 'case' too. (If we now force people + # to attach adverbials higher, information will be lost and later reversal + # of the step will not be possible.) + # Coordinating conjunctions usually depend on a non-first conjunct, i.e., + # on a node whose deprel is 'conj'. However, there are paired conjunctions + # such as "both-and", "either-or". Here the first part is attached to the + # first conjunct. Since some function nodes (mark, case, aux, cop) can be + # coordinated, we must allow 'cc' children under these nodes, too. However, + # we do not want to allow 'cc' under another 'cc'. (Still, 'cc' can have + # a 'conj' dependent. In "and/or", "or" will depend on "and" as 'conj'.) + if re.match(r"^(mark|case)$", pdeprel) and not re.match(r"^(advmod|obl|goeswith|fixed|reparandum|conj|cc|punct)$", cdeprel): + Incident( + state=state, + nodeid=node.ord, + testid='leaf-mark-case', + message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" + ).report(state, self.args) + if re.match(r"^(aux|cop)$", pdeprel) and not re.match(r"^(goeswith|fixed|reparandum|conj|cc|punct)$", cdeprel): + Incident( + state=state, + nodeid=node.ord, + testid='leaf-aux-cop', + message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" + ).report(state, self.args) + # Classifiers must be allowed under demonstrative determiners according to the clf guidelines. + # People have identified various constructions where the restriction + # on children of det dependents may have to be relaxed even if not + # mentioned directly in the universal guidelines. + # https://universaldependencies.org/workgroups/newdoc/children_of_determiners.html + # Latvian: There are compound determiners, composed of a PART and a head PRON. + # They are not fixed, so they need a separate exception for the compound deprel. + # (Laura, https://github.com/UniversalDependencies/docs/issues/1059#issuecomment-2413484624) + # Hebrew: Demonstrative pronouns have their own determiners, as in “the men the these” = “these men”. + # It is also parallel to how adjectival modification works in Modern Hebrew. + # Maybe determiners under demonstratives could be allowed in some languages but not the others? + # (Daniel, https://github.com/UniversalDependencies/docs/issues/1059#issuecomment-2400694043) + # Classical Armenian: Case marker may be repeated both at a noun and at its demonstrative. + # We probably should allow demonstratives to have their own case child, but ideally we should + # not allow it for all determiners in all languages because it opens the door for errors + # (currently there are such errors in Chinese data). ###!!! For now I am allowing it everywhere. 
+ # (Petr, https://github.com/UniversalDependencies/docs/issues/1059#issuecomment-2441260051) + # Spoken data: + # There is a lot of fillers ("euh"), tagged INTJ and attached as discourse + # "to the most relevant nearby unit" (that is the guideline). The most + # relevant nearby unit may be a determiner. Similarly, parentheticals + # should be attached as parataxis to the most relevant unit, and again + # the unit is not necessarily a clause. For example, Latvian: + # "tādā godīgā iestādē ieperinājušies daži (tikai daži!) zagļi" + # “a few (only a few!) thieves have nested in such an honest institution” + # (Laura, https://github.com/UniversalDependencies/docs/issues/1059#issuecomment-2438448236) + # Several treebanks have problems with possessive determiners, which + # are referential and can thus take dependents such as appos, acl:relcl, even nmod. + # Joakim thinks that such possessives should be nmod rather than det, + # but that's not how many of us understand the UD guidelines. For now, + # the test should be thus relaxed if the determiner has Poss=Yes. + # Flavio also argued that certain multiword det expressions should be + # connected by flat:redup (rather than fixed), which is why flat should + # be another exception. + if re.match(r"^(det)$", pdeprel) and not re.match(r"^(det|case|advmod|obl|clf|goeswith|fixed|flat|compound|reparandum|discourse|parataxis|conj|cc|punct)$", cdeprel) and not (pfeats['Poss'] == 'Yes' and re.match(r"^(appos|acl|nmod)$", cdeprel)): + Incident( + state=state, + nodeid=node.ord, + testid='leaf-det', + message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" + ).report(state, self.args) + if re.match(r"^(clf)$", pdeprel) and not re.match(r"^(advmod|obl|goeswith|fixed|reparandum|conj|cc|punct)$", cdeprel): + Incident( + state=state, + nodeid=node.ord, + testid='leaf-clf', + message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" + ).report(state, self.args) + if re.match(r"^(cc)$", pdeprel) and not re.match(r"^(goeswith|fixed|reparandum|conj|punct)$", cdeprel): + Incident( + state=state, + nodeid=node.ord, + testid='leaf-cc', + message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" + ).report(state, self.args) + # Fixed expressions should not be nested, i.e., no chains of fixed relations. + # As they are supposed to represent functional elements, they should not have + # other dependents either, with the possible exception of conj. + # We also allow a punct child, at least temporarily, because of fixed + # expressions that have a hyphen in the middle (e.g. Russian "вперед-назад"). + # It would be better to keep these expressions as one token. But sometimes + # the tokenizer is out of control of the UD data providers and it is not + # practical to retokenize. + elif pdeprel == 'fixed' and not re.match(r"^(goeswith|reparandum|conj|punct)$", cdeprel): + Incident( + state=state, + nodeid=node.ord, + testid='leaf-fixed', + message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})" + ).report(state, self.args) + # Goeswith cannot have any children, not even another goeswith. 
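+                # E.g. (hypothetical): in "with out" written for "without", the
+                # part "out" attached as goeswith must be a leaf; any child of
+                # it, even punct, triggers the error below.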
+                elif pdeprel == 'goeswith':
+                    Incident(
+                        state=state,
+                        nodeid=node.ord,
+                        testid='leaf-goeswith',
+                        message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})"
+                    ).report(state, self.args)
+                # Punctuation can exceptionally have other punct children if an exclamation
+                # mark is in brackets or quotes. It cannot have other children.
+                elif pdeprel == 'punct' and cdeprel != 'punct':
+                    Incident(
+                        state=state,
+                        nodeid=node.ord,
+                        testid='leaf-punct',
+                        message=f"'{pdeprel}' not expected to have children ({idparent}:{node.form}:{pdeprel} --> {idchild}:{child.form}:{cdeprel})"
+                    ).report(state, self.args)
+
+
+
+    def validate_fixed_span(self, state, node, lineno):
+        """
+        As with goeswith, the fixed relation should not in general skip words that
+        are not part of the fixed expression. Unlike with goeswith, however, there
+        can be an intervening punctuation symbol. Moreover, the rule that fixed
+        expressions cannot be discontiguous has been challenged with examples from
+        Swedish and Coptic, see https://github.com/UniversalDependencies/docs/issues/623.
+        Hence, the test was turned off 2019-04-13. I am re-activating it 2023-09-03
+        as just a warning.
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The tree node to be tested.
+        lineno : int
+            The 1-based index of the line where the node occurs.
+        """
+        fxchildren = [c for c in node.children if c.udeprel == 'fixed']
+        if fxchildren:
+            fxlist = sorted([node] + fxchildren)
+            fxrange = [n for n in node.root.descendants if n.ord >= node.ord and n.ord <= fxchildren[-1].ord]
+            # All nodes between me and my last fixed child should be either fixed or punct.
+            fxgap = [n for n in fxrange if n.udeprel != 'punct' and n not in fxlist]
+            if fxgap:
+                fxordlist = [n.ord for n in fxlist]
+                fxexpr = ' '.join([(n.form if n in fxlist else '*') for n in fxrange])
+                Incident(
+                    state=state,
+                    lineno=lineno,
+                    nodeid=node.ord,
+                    level=3,
+                    testclass='Warning',
+                    testid='fixed-gap',
+                    message=f"Gaps in fixed expression {str(fxordlist)} '{fxexpr}'"
+                ).report(state, self.args)
+
+
+    def validate_goeswith_span(self, state, node, lineno):
+        """
+        The relation 'goeswith' is used to connect word parts that are separated
+        by whitespace and should be one word instead. We assume that the relation
+        goes left-to-right, which is checked elsewhere. Here we check that the
+        nodes really were separated by whitespace. If there is another node in the
+        middle, it must also be attached via 'goeswith'. The node parameter refers
+        to the node whose goeswith children we test.
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The tree node to be tested.
+        lineno : int
+            The 1-based index of the line where the node occurs.
+        """
+        Incident.default_lineno = lineno
+        Incident.default_level = 3
+        Incident.default_testclass = 'Syntax'
+        gwchildren = [c for c in node.children if c.udeprel == 'goeswith']
+        if gwchildren:
+            gwlist = sorted([node] + gwchildren)
+            gwrange = [n for n in node.root.descendants if n.ord >= node.ord and n.ord <= gwchildren[-1].ord]
+            # All nodes between me and my last goeswith child should be goeswith too.
+            if gwlist != gwrange:
+                gwordlist = [n.ord for n in gwlist]
+                gwordrange = [n.ord for n in gwrange]
+                Incident(
+                    state=state,
+                    nodeid=node.ord,
+                    testid='goeswith-gap',
+                    message=f"Gaps in goeswith group {str(gwordlist)} != {str(gwordrange)}."
+                ).report(state, self.args)
+            # Non-last node in a goeswith range must have a space after itself.
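+            # Illustration (hypothetical): in the pair "with out" connected by
+            # goeswith, SpaceAfter=No on "with" would contradict the claim that
+            # the parts were separated by whitespace, and is reported below.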
+            nospaceafter = [x for x in gwlist[:-1] if x.misc['SpaceAfter'] == 'No']
+            if nospaceafter:
+                Incident(
+                    state=state,
+                    nodeid=node.ord,
+                    testid='goeswith-nospace',
+                    message="'goeswith' cannot connect nodes that are not separated by whitespace."
+                ).report(state, self.args)
+            # This is not about the span of the interrupted word, but since we already
+            # know that we are at the head of a goeswith word, let's do it here, too.
+            # Every goeswith parent should also have Typo=Yes. However, this is not
+            # required if the treebank does not have features at all.
+            incident = Incident(
+                state=state,
+                nodeid=node.ord,
+                testclass='Morpho',
+                testid='goeswith-missing-typo',
+                message="Since the treebank has morphological features, 'Typo=Yes' must be used with 'goeswith' heads."
+            )
+            self.validate_required_feature(state, node.feats, 'Typo', 'Yes', incident)
+
+
+
+    def validate_goeswith_morphology_and_edeps(self, state, node, lineno):
+        """
+        If a node has the 'goeswith' incoming relation, it is a non-first part of
+        a mistakenly interrupted word. The lemma, upos tag and morphological features
+        of the word should be annotated at the first part, not here.
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The tree node to be tested.
+        lineno : int
+            The 1-based index of the line where the node occurs.
+        """
+        Incident.default_lineno = lineno
+        Incident.default_level = 3
+        Incident.default_testclass = 'Morpho'
+        if node.udeprel == 'goeswith':
+            if node.lemma != '_':
+                Incident(
+                    state=state,
+                    nodeid=node.ord,
+                    testid='goeswith-lemma',
+                    message="The lemma of a 'goeswith'-connected word must be annotated only at the first part."
+                ).report(state, self.args)
+            if node.upos != 'X':
+                Incident(
+                    state=state,
+                    nodeid=node.ord,
+                    testid='goeswith-upos',
+                    message="The UPOS tag of a 'goeswith'-connected word must be annotated only at the first part; the other parts must be tagged 'X'."
+                ).report(state, self.args)
+            if str(node.feats) != '_':
+                Incident(
+                    state=state,
+                    nodeid=node.ord,
+                    testid='goeswith-feats',
+                    message="The morphological features of a 'goeswith'-connected word must be annotated only at the first part."
+                ).report(state, self.args)
+            if str(node.raw_deps) != '_' and str(node.raw_deps) != str(node.parent.ord)+':'+node.deprel:
+                Incident(
+                    state=state,
+                    nodeid=node.ord,
+                    testclass='Enhanced',
+                    testid='goeswith-edeps',
+                    message="A 'goeswith' dependent cannot have any additional dependencies in the enhanced graph."
+                ).report(state, self.args)
+
+
+    def get_caused_nonprojectivities(self, node):
+        """
+        Checks whether a node is in a gap of a nonprojective edge. It reports
+        the nonprojectivity only if the node's parent is not in the same gap.
+        (We use this function to check that a punctuation node does not cause
+        nonprojectivity. But if it has been dragged into the gap with a larger
+        subtree, then we do not blame it.) This extra condition makes this
+        function different from node.is_nonprojective_gap(); another difference
+        is that instead of just detecting the nonprojectivity, we return the
+        nonprojective nodes so we can report them.
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The tree node to be tested.
+
+        Returns
+        -------
+        cross : list of udapi.core.node.Node objects
+            The nodes whose attachment is nonprojective because of the current node.
+        """
+        nodes = node.root.descendants
+        iid = node.ord
+        # We need to find all nodes that are not ancestors of this node and lie
+        # on the other side of this node than their parent.
First get the set of + # ancestors. + ancestors = [] + current_node = node + while not current_node.is_root(): + current_node = current_node.parent + ancestors.append(current_node) + maxid = nodes[-1].ord + # Get the lists of nodes to either side of id. + # Do not look beyond the parent (if it is in the same gap, it is the parent's responsibility). + pid = node.parent.ord + if pid < iid: + leftidrange = range(pid + 1, iid) # ranges are open from the right (i.e. iid-1 is the last number) + rightidrange = range(iid + 1, maxid + 1) + else: + leftidrange = range(1, iid) + rightidrange = range(iid + 1, pid) + left = [n for n in nodes if n.ord in leftidrange] + right = [n for n in nodes if n.ord in rightidrange] + # Exclude nodes whose parents are ancestors of id. + leftna = [x for x in left if x.parent not in ancestors] + rightna = [x for x in right if x.parent not in ancestors] + leftcross = [x for x in leftna if x.parent.ord > iid] + rightcross = [x for x in rightna if x.parent.ord < iid] + # Once again, exclude nonprojectivities that are caused by ancestors of id. + if pid < iid: + rightcross = [x for x in rightcross if x.parent.ord > pid] + else: + leftcross = [x for x in leftcross if x.parent.ord < pid] + # Do not return just a boolean value. Return the nonprojective nodes so we can report them. + return sorted(leftcross + rightcross) + + + + @staticmethod + def get_gap(node): + """ + Returns the list of nodes between node and its parent that are not dominated + by the parent. If the list is not empty, the node is attached nonprojectively. + + Note that the Udapi Node class does not have a method like this. It has + is_nonprojective(), which returns the boolean decision without showing the + nodes in the gap. There is also the function is_nonprojective_gap() but it, + too, does not deliver what we need. + + Parameters + ---------- + node : udapi.core.node.Node object + The tree node to be tested. + + Returns + ------- + gap : list of udapi.core.node.Node objects + The nodes in the gap of the current node's relation to its parent, + sorted by their ords (IDs). + """ + iid = node.ord + pid = node.parent.ord + if iid < pid: + rangebetween = range(iid + 1, pid) + else: + rangebetween = range(pid + 1, iid) + gap = [] + if rangebetween: + gap = [n for n in node.root.descendants if n.ord in rangebetween and not n in node.parent.descendants] + return gap + + + + def validate_projective_punctuation(self, state, node, lineno): + """ + Punctuation is not supposed to cause nonprojectivity or to be attached + nonprojectively. + + Parameters + ---------- + node : udapi.core.node.Node object + The tree node to be tested. + lineno : int + The 1-based index of the line where the node occurs. + """ + Incident.default_lineno = lineno + Incident.default_level = 3 + Incident.default_testclass = 'Syntax' + if node.udeprel == 'punct': + nonprojnodes = self.get_caused_nonprojectivities(node) + if nonprojnodes: + Incident( + state=state, + nodeid=node.ord, + testid='punct-causes-nonproj', + message=f"Punctuation must not cause non-projectivity of nodes {nonprojnodes}" + ).report(state, self.args) + gap = self.get_gap(node) + if gap: + Incident( + state=state, + nodeid=node.ord, + testid='punct-is-nonproj', + message=f"Punctuation must not be attached non-projectively over nodes {sorted(gap)}" + ).report(state, self.args) + + + # TODO: rename to something more meaningful + def validate_annotation(self, state, tree, linenos): + """ + Checks universally valid consequences of the annotation guidelines. 
Looks + at regular nodes and basic tree, not at enhanced graph (which is checked + elsewhere). + + Parameters + ---------- + tree : udapi.core.root.Root object + linenos : dict + Key is node ID (string, not int or float!) Value is the 1-based index + of the line where the node occurs (int). + """ + nodes = tree.descendants + for node in nodes: + lineno = linenos[str(node.ord)] + self.validate_expected_features(state, node, lineno) + self.validate_upos_vs_deprel(state, node, lineno) + self.validate_flat_foreign(state, node, lineno, linenos) + self.validate_left_to_right_relations(state, node, lineno) + self.validate_single_subject(state, node, lineno) + self.validate_single_object(state, node, lineno) + self.validate_orphan(state, node, lineno) + self.validate_functional_leaves(state, node, lineno, linenos) + self.validate_fixed_span(state, node, lineno) + self.validate_goeswith_span(state, node, lineno) + self.validate_goeswith_morphology_and_edeps(state, node, lineno) + self.validate_projective_punctuation(state, node, lineno) + + + + def validate_enhanced_orphan(self, state, node, line): + """ + Checks universally valid consequences of the annotation guidelines in the + enhanced representation. Currently tests only phenomena specific to the + enhanced dependencies; however, we should also test things that are + required in the basic dependencies (such as left-to-right coordination), + unless it is obvious that in enhanced dependencies such things are legal. + + Parameters + ---------- + node : udapi.core.node.Node object + The node whose incoming relations will be validated. This function + operates on both regular and empty nodes. Make sure to call it for + empty nodes, too! + line : int + Number of the line where the node occurs in the file. + """ + Incident.default_lineno = line + Incident.default_level = 3 + Incident.default_testclass = 'Enhanced' + # Enhanced dependencies should not contain the orphan relation. + # However, all types of enhancements are optional and orphans are excluded + # only if this treebank addresses gapping. We do not know it until we see + # the first empty node. + if str(node.deps) == '_': + return + if node.is_empty(): + if not state.seen_empty_node: + state.seen_empty_node = line + # Empty node itself is not an error. Report it only for the first time + # and only if an orphan occurred before it. + if state.seen_enhanced_orphan: + Incident( + state=state, + nodeid=node.ord, + testid='empty-node-after-eorphan', + message=f"Empty node means that we address gapping and there should be no orphans in the enhanced graph; but we saw one on line {state.seen_enhanced_orphan}" + ).report(state, self.args) + udeprels = set([utils.lspec2ud(edep['deprel']) for edep in node.deps]) + if 'orphan' in udeprels: + if not state.seen_enhanced_orphan: + state.seen_enhanced_orphan = line + # If we have seen an empty node, then the orphan is an error. + if state.seen_empty_node: + Incident( + state=state, + nodeid=node.ord, + testid='eorphan-after-empty-node', + message=f"'orphan' not allowed in enhanced graph because we saw an empty node on line {state.seen_empty_node}" + ).report(state, self.args) + + + +#============================================================================== +# Level 4 tests. Language-specific formal tests. Now we can check in which +# words spaces are permitted, and which Feature=Value pairs are defined. 
+#==============================================================================
+
+
+
+    def validate_words_with_spaces(self, state, node, line, lang):
+        """
+        Checks a single line for disallowed whitespace.
+        Here we assume that all language-independent whitespace-related tests have
+        already been done on level 1, so we only check for words with spaces that
+        are explicitly allowed in a given language.
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The node to be validated.
+        line : int
+            Number of the line where the node occurs in the file.
+        lang : str
+            Code of the main language of the corpus.
+        """
+        Incident.default_lineno = line
+        Incident.default_level = 4
+        Incident.default_testclass = 'Format'
+        # The list of permitted words with spaces is language-specific.
+        # The current token may be in a different language due to code switching.
+        tospacedata = self.specs.get_tospace_for_language(lang)
+        altlang = utils.get_alt_language(node)
+        if altlang:
+            lang = altlang
+            tospacedata = self.specs.get_tospace_for_language(altlang)
+        for column in ('FORM', 'LEMMA'):
+            word = node.form if column == 'FORM' else node.lemma
+            # Is there whitespace in the word?
+            if crex.ws.search(word):
+                # Whitespace found. Does the word pass the regular expression that defines permitted words with spaces in this language?
+                if tospacedata:
+                    # For the purpose of this test, NO-BREAK SPACE is equal to SPACE.
+                    string_to_test = re.sub(r'\xA0', ' ', word)
+                    if not tospacedata[1].fullmatch(string_to_test):
+                        Incident(
+                            state=state,
+                            nodeid=node.ord,
+                            testid='invalid-word-with-space',
+                            message=f"'{word}' in column {column} is not on the list of exceptions allowed to contain whitespace.",
+                            explanation=outils.explain_tospace(lang)
+                        ).report(state, self.args)
+                else:
+                    Incident(
+                        state=state,
+                        nodeid=node.ord,
+                        testid='invalid-word-with-space',
+                        message=f"'{word}' in column {column} is not on the list of exceptions allowed to contain whitespace.",
+                        explanation=outils.explain_tospace(lang)
+                    ).report(state, self.args)
+
+
+
+    def validate_features_level4(self, state, node, line, lang):
+        """
+        Checks that a feature-value pair is listed as approved. Feature lists are
+        language-specific. To disallow non-universal features, test on level 4 with
+        language 'ud'.
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The node to be validated.
+        line : int
+            Number of the line where the node occurs in the file.
+        lang : str
+            Code of the main language of the corpus.
+        """
+        Incident.default_lineno = line
+        Incident.default_level = 4
+        Incident.default_testclass = 'Morpho'
+        if str(node.feats) == '_':
+            return True
+        # The list of permitted features is language-specific.
+        # The current token may be in a different language due to code switching.
+        default_lang = lang
+        default_featset = featset = self.specs.get_feats_for_language(lang)
+        altlang = utils.get_alt_language(node)
+        if altlang:
+            lang = altlang
+            featset = self.specs.get_feats_for_language(altlang)
+        for f in node.feats:
+            values = node.feats[f].split(',')
+            for v in values:
+                # Level 2 tested character properties and canonical order but not that the f-v pair is known.
+                # Level 4 also checks whether the feature value is on the list.
+                # If only universal feature-value pairs are allowed, test on level 4 with lang='ud'.
+                # The feature Typo=Yes is the only feature allowed on a multi-word token line.
+                # If it occurs there, it cannot be duplicated on the lines of the component words.
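+                # E.g. (hypothetical): if a multi-word token '3-4' carried
+                # Typo=Yes, state.mwt_typo_span_end is assumed to hold 4, and
+                # Typo on word 3 or 4 is then reported below.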
+                if f == 'Typo' and state.mwt_typo_span_end and node.ord <= state.mwt_typo_span_end:
+                    Incident(
+                        state=state,
+                        nodeid=node.ord,
+                        testid='mwt-typo-repeated-at-word',
+                        message="Feature Typo cannot occur at a word if it already occurred at the corresponding multi-word token."
+                    ).report(state, self.args)
+                # In case of code switching, the current token may not be in the default language
+                # and then its features are checked against a different feature set. An exception
+                # is the feature Foreign, which always relates to the default language of the
+                # corpus (but Foreign=Yes should probably be allowed for all UPOS categories in
+                # all languages).
+                effective_featset = featset
+                effective_lang = lang
+                if f == 'Foreign':
+                    # Revert to the default.
+                    effective_featset = default_featset
+                    effective_lang = default_lang
+                if effective_featset is not None:
+                    if f not in effective_featset:
+                        Incident(
+                            state=state,
+                            nodeid=node.ord,
+                            testid='feature-unknown',
+                            message=f"Feature {f} is not documented for language [{effective_lang}] ('{utils.formtl(node)}').",
+                            explanation=outils.explain_feats(effective_lang)
+                        ).report(state, self.args)
+                    else:
+                        lfrecord = effective_featset[f]
+                        if lfrecord['permitted'] == 0:
+                            Incident(
+                                state=state,
+                                nodeid=node.ord,
+                                testid='feature-not-permitted',
+                                message=f"Feature {f} is not permitted in language [{effective_lang}] ('{utils.formtl(node)}').",
+                                explanation=outils.explain_feats(effective_lang)
+                            ).report(state, self.args)
+                        else:
+                            known_values = lfrecord['uvalues'] + lfrecord['lvalues'] + lfrecord['unused_uvalues'] + lfrecord['unused_lvalues']
+                            if v not in known_values:
+                                Incident(
+                                    state=state,
+                                    nodeid=node.ord,
+                                    testid='feature-value-unknown',
+                                    message=f"Value {v} is not documented for feature {f} in language [{effective_lang}] ('{utils.formtl(node)}').",
+                                    explanation=outils.explain_feats(effective_lang)
+                                ).report(state, self.args)
+                            elif node.upos not in lfrecord['byupos']:
+                                Incident(
+                                    state=state,
+                                    nodeid=node.ord,
+                                    testid='feature-upos-not-permitted',
+                                    message=f"Feature {f} is not permitted with UPOS {node.upos} in language [{effective_lang}] ('{utils.formtl(node)}').",
+                                    explanation=outils.explain_feats(effective_lang)
+                                ).report(state, self.args)
+                            elif v not in lfrecord['byupos'][node.upos] or lfrecord['byupos'][node.upos][v] == 0:
+                                Incident(
+                                    state=state,
+                                    nodeid=node.ord,
+                                    testid='feature-value-upos-not-permitted',
+                                    message=f"Value {v} of feature {f} is not permitted with UPOS {node.upos} in language [{effective_lang}] ('{utils.formtl(node)}').",
+                                    explanation=outils.explain_feats(effective_lang)
+                                ).report(state, self.args)
+        if state.mwt_typo_span_end and int(state.mwt_typo_span_end) <= int(node.ord):
+            state.mwt_typo_span_end = None
+
+
+
+    def OLD_validate_deprels(self, state, node, line):
+        """
+        Checks that a dependency relation label is listed as approved in the given
+        language. As a language-specific test, this function generally belongs to
+        level 4, but it can also be used on levels 2 and 3, in which case it will
+        check only the main dependency type and ignore any subtypes.
+
+        Parameters
+        ----------
+        node : udapi.core.node.Node object
+            The node whose incoming relation will be validated.
+        line : int
+            Number of the line where the node occurs in the file.
+        """
+        Incident.default_lineno = line
+        Incident.default_level = 4
+        Incident.default_testclass = 'Syntax'
+        # The list of permitted relations is language-specific.
+        # The current token may be in a different language due to code switching.
+ # Unlike with features and auxiliaries, with deprels it is less clear + # whether we want to switch the set of labels when the token belongs to + # another language, especially with subtypes, which are not so + # language-specific. For example, we may have allowed 'flat:name' for our + # language while the maintainers of the other language have not, and then we + # could not use it when the foreign language is active. (This actually + # happened in French GSD.) We will thus allow the union of the main and the + # alternative deprelset when both the parent and the child belong to the + # same alternative language. Otherwise, only the main deprelset is allowed. + mainlang = self.args.lang + naltlang = utils.get_alt_language(node) + # The basic relation should be tested on regular nodes but not on empty nodes. + if not node.is_empty(): + paltlang = utils.get_alt_language(node.parent) + main_deprelset = self.specs.get_deprel_for_language(mainlang) + alt_deprelset = set() + if naltlang is not None and naltlang != mainlang and naltlang == paltlang: + alt_deprelset = self.specs.get_deprel_for_language(naltlang) + # Test only the universal part if testing at universal level. + deprel = node.deprel + if self.args.level < 4: + deprel = node.udeprel + Incident.default_level = 2 + if deprel not in main_deprelset and deprel not in alt_deprelset: + Incident( + state=state, + nodeid=node.ord, + testid='unknown-deprel', + message=f"Unknown DEPREL label: '{deprel}'", + explanation=outils.explain_deprel(mainlang) + ).report(state, self.args) + # If there are enhanced dependencies, test their deprels, too. + # We already know that the contents of DEPS is parsable (deps_list() was + # first called from validate_id_references() and the head indices are OK). + # The order of enhanced dependencies was already checked in validate_deps(). + Incident.default_testclass = 'Enhanced' + if str(node.deps) != '_': + main_edeprelset = self.specs.get_edeprel_for_language(mainlang) + alt_edeprelset = self.specs.get_edeprel_for_language(naltlang) + for edep in node.deps: + parent = edep['parent'] + deprel = edep['deprel'] + paltlang = utils.get_alt_language(parent) + if self.args.level < 4: + deprel = utils.lspec2ud(deprel) + Incident.default_level = 2 + if not (deprel in main_edeprelset or naltlang is not None and naltlang != mainlang and naltlang == paltlang and deprel in alt_edeprelset): + Incident( + state=state, + nodeid=node.ord, + testid='unknown-edeprel', + message=f"Unknown enhanced relation type '{deprel}' in '{parent.ord}:{deprel}'", + explanation=outils.explain_edeprel(mainlang) + ).report(state, self.args) + + + +#============================================================================== +# Level 5 tests. Annotation content vs. the guidelines, language-specific. +#============================================================================== + + + + def validate_auxiliary_verbs(self, state, node, line, lang): + """ + Verifies that the UPOS tag AUX is used only with lemmas that are known to + act as auxiliary verbs or particles in the given language. + + Parameters + ---------- + node : udapi.core.node.Node object + The node to be validated. + line : int + Number of the line where the node occurs in the file. + lang : str + Code of the main language of the corpus.
+ """ + if node.upos == 'AUX' and node.lemma != '_': + altlang = utils.get_alt_language(node) + if altlang: + lang = altlang + auxlist = self.specs.get_aux_for_language(lang) + if not auxlist or not node.lemma in auxlist: + Incident( + state=state, + lineno=line, + nodeid=node.ord, + level=5, + testclass='Morpho', + testid='aux-lemma', + message=f"'{node.lemma}' is not an auxiliary in language [{lang}]", + explanation=outils.explain_aux(lang) + ).report(state, self.args) + + + + def validate_copula_lemmas(self, state, node, line, lang): + """ + Verifies that the relation cop is used only with lemmas that are known to + act as copulas in the given language. + + Parameters + ---------- + node : udapi.core.node.Node object + The node to be validated. + line : int + Number of the line where the node occurs in the file. + lang : str + Code of the main language of the corpus. + """ + if node.udeprel == 'cop' and node.lemma != '_': + altlang = utils.get_alt_language(node) + if altlang: + lang = altlang + coplist = self.specs.get_cop_for_language(lang) + if not coplist or not node.lemma in coplist: + Incident( + state=state, + lineno=line, + nodeid=node.ord, + level=5, + testclass='Syntax', + testid='cop-lemma', + message=f"'{node.lemma}' is not a copula in language [{lang}]", + explanation=outils.explain_cop(lang) + ).report(state, self.args) + + + +#============================================================================== +# Level 6 tests for annotation of coreference and named entities. This is +# tested on demand only, as the requirements are not compulsory for UD +# releases. +#============================================================================== + + + + def validate_misc_entity(self, state, comments, sentence): + """ + Optionally checks the well-formedness of the MISC attributes that pertain + to coreference and named entities. + """ + Incident.default_level = 6 + Incident.default_testclass = 'Coref' + iline = 0 + sentid = '' + for c in comments: + Incident.default_lineno = state.comment_start_line+iline + global_entity_match = crex.global_entity.fullmatch(c) + newdoc_match = crex.newdoc.fullmatch(c) + sentid_match = crex.sentid.fullmatch(c) + if global_entity_match: + # As a global declaration, global.Entity is expected only once per file. + # However, we may be processing multiple files or people may have created + # the file by concatening smaller files, so we will allow repeated + # declarations iff they are identical to the first one. + if state.seen_global_entity: + if global_entity_match.group(1) != state.global_entity_attribute_string: + Incident( + state=state, + testid='global-entity-mismatch', + message=f"New declaration of global.Entity '{global_entity_match.group(1)}' does not match the first declaration '{state.global_entity_attribute_string}' on line {state.seen_global_entity}." + ).report(state, self.args) + else: + state.seen_global_entity = state.comment_start_line + iline + state.global_entity_attribute_string = global_entity_match.group(1) + if not re.match(r"^[a-z]+(-[a-z]+)*$", state.global_entity_attribute_string): + Incident( + state=state, + testid='spurious-global-entity', + message=f"Cannot parse global.Entity attribute declaration '{state.global_entity_attribute_string}'." 
+ ).report(state, self.args) + else: + global_entity_attributes = state.global_entity_attribute_string.split('-') + if not 'eid' in global_entity_attributes: + Incident( + state=state, + testid='spurious-global-entity', + message=f"Global.Entity attribute declaration '{state.global_entity_attribute_string}' does not include 'eid'." + ).report(state, self.args) + elif global_entity_attributes[0] != 'eid': + Incident( + state=state, + testid='spurious-global-entity', + message=f"Attribute 'eid' must come first in global.Entity attribute declaration '{state.global_entity_attribute_string}'." + ).report(state, self.args) + if not 'etype' in global_entity_attributes: + Incident( + state=state, + testid='spurious-global-entity', + message=f"Global.Entity attribute declaration '{state.global_entity_attribute_string}' does not include 'etype'." + ).report(state, self.args) + elif global_entity_attributes[1] != 'etype': + Incident( + state=state, + testid='spurious-global-entity', + message=f"Attribute 'etype' must come second in global.Entity attribute declaration '{state.global_entity_attribute_string}'." + ).report(state, self.args) + if not 'head' in global_entity_attributes: + Incident( + state=state, + testid='spurious-global-entity', + message=f"Global.Entity attribute declaration '{state.global_entity_attribute_string}' does not include 'head'." + ).report(state, self.args) + elif global_entity_attributes[2] != 'head': + Incident( + state=state, + testid='spurious-global-entity', + message=f"Attribute 'head' must come third in global.Entity attribute declaration '{state.global_entity_attribute_string}'." + ).report(state, self.args) + if 'other' in global_entity_attributes and global_entity_attributes[3] != 'other': + Incident( + state=state, + testid='spurious-global-entity', + message=f"Attribute 'other', if present, must come fourth in global.Entity attribute declaration '{state.global_entity_attribute_string}'." + ).report(state, self.args) + # Fill the global dictionary that maps attribute names to list indices. + i = 0 + for a in global_entity_attributes: + if a in state.entity_attribute_index: + Incident( + state=state, + testid='spurious-global-entity', + message=f"Attribute '{a}' occurs more than once in global.Entity attribute declaration '{state.global_entity_attribute_string}'." + ).report(state, self.args) + else: + state.entity_attribute_index[a] = i + i += 1 + state.entity_attribute_number = len(global_entity_attributes) + elif newdoc_match: + for eid in state.entity_ids_this_document: + state.entity_ids_other_documents[eid] = state.entity_ids_this_document[eid] + state.entity_ids_this_document = {} + elif sentid_match: + sentid = sentid_match.group(1) + iline += 1 + iline = 0 + for cols in sentence: + Incident.default_lineno = state.sentence_line+iline + # Add the current word to all currently open mentions. We will use it in error messages. + # Do this for regular and empty nodes but not for multi-word-token lines. 
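The block above enforces a fixed attribute order in the global.Entity declaration: 'eid' first, 'etype' second, 'head' third, and 'other' fourth if present, with no attribute repeated. A condensed sketch of just the ordering logic, as a hypothetical helper that returns error strings instead of reporting Incidents:

    # Editorial sketch, not part of the patch.
    def check_global_entity_order(decl):
        attrs = decl.split('-')
        errors = []
        for i, req in enumerate(('eid', 'etype', 'head')):
            if req not in attrs:
                errors.append(f"missing '{req}'")
            elif attrs[i] != req:
                errors.append(f"'{req}' must be attribute {i + 1}")
        if 'other' in attrs and attrs.index('other') != 3:
            errors.append("'other', if present, must be attribute 4")
        if len(set(attrs)) != len(attrs):
            errors.append('repeated attribute')
        return errors

    # check_global_entity_order('eid-etype-head-other-infstat') -> []
    # check_global_entity_order('etype-eid-head') -> two ordering errors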
+ if not utils.is_multiword_token(cols): + for m in state.open_entity_mentions: + m['span'].append(cols[ID]) + m['text'] += ' '+cols[FORM] + m['length'] += 1 + misc = cols[MISC].split('|') + entity = [x for x in misc if re.match(r"^Entity=", x)] + bridge = [x for x in misc if re.match(r"^Bridge=", x)] + splitante = [x for x in misc if re.match(r"^SplitAnte=", x)] + if utils.is_multiword_token(cols) and (len(entity)>0 or len(bridge)>0 or len(splitante)>0): + Incident( + state=state, + testid='entity-mwt', + message="Entity or coreference annotation must not occur at a multiword-token line." + ).report(state, self.args) + continue + if len(entity)>1: + Incident( + state=state, + testid='multiple-entity-statements', + message=f"There can be at most one 'Entity=' statement in MISC but we have {str(misc)}." + ).report(state, self.args) + continue + if len(bridge)>1: + Incident( + state=state, + testid='multiple-bridge-statements', + message=f"There can be at most one 'Bridge=' statement in MISC but we have {str(misc)}." + ).report(state, self.args) + continue + if len(splitante)>1: + Incident( + state=state, + testid='multiple-splitante-statements', + message=f"There can be at most one 'SplitAnte=' statement in MISC but we have {str(misc)}." + ).report(state, self.args) + continue + if len(bridge)>0 and len(entity)==0: + Incident( + state=state, + testid='bridge-without-entity', + message=f"The 'Bridge=' statement can only occur together with 'Entity=' in MISC but we have {str(misc)}." + ).report(state, self.args) + continue + if len(splitante)>0 and len(entity)==0: + Incident( + state=state, + testid='splitante-without-entity', + message=f"The 'SplitAnte=' statement can only occur together with 'Entity=' in MISC but we have {str(misc)}." + ).report(state, self.args) + continue + # There is at most one Entity (and only if it is there, there may be also one Bridge and/or one SplitAnte). + if len(entity)>0: + if not state.seen_global_entity: + Incident( + state=state, + testid='entity-without-global-entity', + message="No global.Entity comment was found before the first 'Entity' in MISC." + ).report(state, self.args) + continue + match = re.match(r"^Entity=((?:\([^( )]+(?:-[^( )]+)*\)?|[^( )]+\))+)$", entity[0]) + if not match: + Incident( + state=state, + testid='spurious-entity-statement', + message=f"Cannot parse the Entity statement '{entity[0]}'." + ).report(state, self.args) + else: + entity_string = match.group(1) + # We cannot check the rest if we cannot identify the 'eid' attribute. + if 'eid' not in state.entity_attribute_index: + continue + # Items of entities are pairs of [012] and a string. + # 0 ... opening bracket; 1 ... closing bracket; 2 ... both brackets + entities = [] + while entity_string: + match = re.match(r"^\(([^( )]+(-[^( )]+)*)\)", entity_string) + if match: + entities.append((2, match.group(1))) + entity_string = re.sub(r"^\([^( )]+(-[^( )]+)*\)", '', entity_string, count=1) + continue + match = re.match(r"^\(([^( )]+(-[^( )]+)*)", entity_string) + if match: + entities.append((0, match.group(1))) + entity_string = re.sub(r"^\([^( )]+(-[^( )]+)*", '', entity_string, count=1) + continue + match = re.match(r"^([^( )]+)\)", entity_string) + if match: + entities.append((1, match.group(1))) + entity_string = re.sub(r"^[^( )]+\)", '', entity_string, count=1) + continue + # If we pre-checked the string well, we should never arrive here! 
+ Incident( + state=state, + testid='internal-error', + message='INTERNAL ERROR' + ).report(state, self.args) + # All 1 cases should precede all 0 cases. + # The 2 cases can be either before the first 1 case, or after the last 0 case. + seen0 = False + seen1 = False + seen2 = False + # To be able to check validity of Bridge and SplitAnte, we will hash eids of mentions that start here. + # To be able to check that no two mentions have the same span, we will hash start-end intervals for mentions that end here. + starting_mentions = {} + ending_mentions = {} + for b, e in entities: + # First get attributes, entity id, and if applicable, part of discontinuous mention. + attributes = e.split('-') + if b==0 or b==2: + # Fewer attributes are allowed because trailing empty values can be omitted. + # More attributes are not allowed. + if len(attributes) > state.entity_attribute_number: + Incident( + state=state, + testid='too-many-entity-attributes', + message=f"Entity '{e}' has {len(attributes)} attributes while only {state.entity_attribute_number} attributes are globally declared." + ).report(state, self.args) + # The raw eid (bracket eid) may include an identification of a part of a discontinuous mention, + # as in 'e155[1/2]'. This is fine for matching opening and closing brackets + # because the closing bracket must contain it too. However, to identify the + # cluster, we need to take the real id. + beid = attributes[state.entity_attribute_index['eid']] + else: + # No attributes other than eid are expected at the closing bracket. + if len(attributes) > 1: + Incident( + state=state, + testid='too-many-entity-attributes', + message=f"Entity '{e}' has {len(attributes)} attributes while only eid is expected at the closing bracket." + ).report(state, self.args) + beid = attributes[0] + eid = beid + ipart = 1 + npart = 1 + eidnpart = eid + match = re.match(r"^(.+)\[([1-9]\d*)/([1-9]\d*)\]$", beid) + if match: + eid = match.group(1) + ipart = int(match.group(2)) + npart = int(match.group(3)) + eidnpart = eid+'['+match.group(3)+']' + # We should omit the square brackets if they would be [1/1]. + if ipart == 1 and npart == 1: + Incident( + state=state, + testid='spurious-entity-id', + message=f"Discontinuous mention must have at least two parts but it has one in '{beid}'." + ).report(state, self.args) + if ipart > npart: + Incident( + state=state, + testid='spurious-entity-id', + message=f"Entity id '{beid}' of discontinuous mention says the current part is higher than the total number of parts." + ).report(state, self.args) + else: + # Use re.search, not re.match: a stray bracket may occur anywhere in the id, not just at its start. + if re.search(r"[\[\]]", beid): + Incident( + state=state, + testid='spurious-entity-id', + message=f"Entity id '{beid}' contains square brackets but does not have the form used in discontinuous mentions." + ).report(state, self.args) + + #-------------------------------------------------------------------------------------------------------------------------------- + # The code that we will have to execute at single-node continuous parts and at the opening brackets of multi-node continuous parts. + # We assume that we have already parsed beid and established whether this is a part of a discontinuous mention. + def opening_bracket(): + attrstring_to_match = '' + # If this is a part of a discontinuous mention, remember the attribute string. + # At the beginning of each part, we will check that its attribute string is identical to the first part.
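The bracket-id convention parsed above is dense, so here is a minimal sketch of just that parsing, using the same regex as the code:

    # Editorial sketch, not part of the patch.
    import re

    def parse_beid(beid):
        """'e155[2/3]' -> ('e155', 2, 3); a plain 'e155' is part 1 of 1."""
        m = re.match(r"^(.+)\[([1-9]\d*)/([1-9]\d*)\]$", beid)
        if m:
            return m.group(1), int(m.group(2)), int(m.group(3))
        return beid, 1, 1

    # parse_beid('e155[1/2]') -> ('e155', 1, 2)
    # parse_beid('e155')      -> ('e155', 1, 1)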
+ if npart > 1: + # We want to check that values of all attributes are same in all parts (except the eid which differs in the brackets). + attributes_without_eid = [attributes[i] for i in range(len(attributes)) if i != state.entity_attribute_index['eid']] + # For better readability of the error messages, reintroduce eid anyway, but without the brackets. + attrstring_to_match = eid+'-'+('-'.join(attributes_without_eid)) + if ipart == 1: + # If this is the first part, create a new record for the mention in the global dictionary. + # We actually keep a stack of open mentions with the same eidnpart because they may be nested. + # The length and the span of the mention will be updated when we encounter the closing bracket of the current part. + discontinuous_mention = {'last_ipart': 1, 'npart': npart, + 'first_part_line': state.sentence_line+iline, + 'last_part_line': state.sentence_line+iline, + 'attributes': attrstring_to_match, + 'length': 0, 'span': []} + if eidnpart in state.open_discontinuous_mentions: + state.open_discontinuous_mentions[eidnpart].append(discontinuous_mention) + else: + state.open_discontinuous_mentions[eidnpart] = [discontinuous_mention] + else: + if eidnpart in state.open_discontinuous_mentions: + discontinuous_mention = state.open_discontinuous_mentions[eidnpart][-1] + if ipart != discontinuous_mention['last_ipart']+1: + Incident( + state=state, + testid='misplaced-mention-part', + message=f"Unexpected part of discontinuous mention '{beid}': last part was '{discontinuous_mention['last_ipart']}/{discontinuous_mention['npart']}' on line {discontinuous_mention['last_part_line']}." + ).report(state, self.args) + # We will update last_ipart at closing bracket, i.e., after the current part has been entirely processed. + # Otherwise nested discontinuous mentions might wrongly assess where they belong. + elif attrstring_to_match != discontinuous_mention['attributes']: + Incident( + state=state, + testid='mention-attribute-mismatch', + message=f"Attribute mismatch of discontinuous mention: current part has '{attrstring_to_match}', first part '{discontinuous_mention['attributes']}' was at line {discontinuous_mention['first_part_line']}." + ).report(state, self.args) + else: + Incident( + state=state, + testid='misplaced-mention-part', + message=f"Unexpected part of discontinuous mention '{beid}': this is part {ipart} but we do not have information about the previous parts." + ).report(state, self.args) + discontinuous_mention = {'last_ipart': ipart, 'npart': npart, + 'first_part_line': state.sentence_line+iline, + 'last_part_line': state.sentence_line+iline, + 'attributes': attrstring_to_match, + 'length': 0, 'span': []} + state.open_discontinuous_mentions[eidnpart] = [discontinuous_mention] + # Check all attributes of the entity, except those that must be examined at the closing bracket. + if eid in state.entity_ids_other_documents: + Incident( + state=state, + testid='entity-across-newdoc', + message=f"Same entity id should not occur in multiple documents; '{eid}' first seen on line {state.entity_ids_other_documents[eid]}, before the last newdoc." 
+ ).report(state, self.args) + elif not eid in state.entity_ids_this_document: + state.entity_ids_this_document[eid] = state.sentence_line+iline + etype = '' + identity = '' + if 'etype' in state.entity_attribute_index and len(attributes) >= state.entity_attribute_index['etype']+1: + etype = attributes[state.entity_attribute_index['etype']] + # For etype values tentatively approved for CorefUD 1.0, see + # https://github.com/ufal/corefUD/issues/13#issuecomment-1008447464 + if not re.match(r"^(person|place|organization|animal|plant|object|substance|time|number|abstract|event|other)?$", etype): + Incident( + state=state, + testid='spurious-entity-type', + message=f"Spurious entity type '{etype}'." + ).report(state, self.args) + if 'identity' in state.entity_attribute_index and len(attributes) >= state.entity_attribute_index['identity']+1: + identity = attributes[state.entity_attribute_index['identity']] + # Check the form of the head index now. + # The value will be checked at the end of the mention, + # when we know the mention length. + head = 0 + if 'head' in state.entity_attribute_index and len(attributes) >= state.entity_attribute_index['head']+1: + if not re.match(r"^[1-9][0-9]*$", attributes[state.entity_attribute_index['head']]): + Incident( + state=state, + testid='spurious-mention-head', + message=f"Entity head index '{attributes[state.entity_attribute_index['head']]}' must be a non-zero-starting integer." + ).report(state, self.args) + else: + head = int(attributes[state.entity_attribute_index['head']]) + # If this is the first mention of the entity, remember the values + # of the attributes that should be identical at all mentions. + if not eid in state.entity_types: + state.entity_types[eid] = (etype, identity, state.sentence_line+iline) + else: + # All mentions of one entity (cluster) must have the same entity type. + if etype != state.entity_types[eid][0]: + Incident( + state=state, + testid='entity-type-mismatch', + message=f"Entity '{eid}' cannot have type '{etype}' that does not match '{state.entity_types[eid][0]}' from the first mention on line {state.entity_types[eid][2]}." + ).report(state, self.args) + # All mentions of one entity (cluster) must have the same identity (Wikipedia link or similar). + if identity != state.entity_types[eid][1]: + Incident( + state=state, + testid='entity-identity-mismatch', + message=f"Entity '{eid}' cannot have identity '{identity}' that does not match '{state.entity_types[eid][1]}' from the first mention on line {state.entity_types[eid][2]}." + ).report(state, self.args) + # Remember the line where (the current part of) the entity mention starts. + mention = {'beid': beid, 'line': state.sentence_line+iline, + 'span': [cols[ID]], 'text': cols[FORM], + 'length': 1, 'head': head, 'attrstring': attrstring_to_match} + state.open_entity_mentions.append(mention) + # The set of mentions starting at the current line will be needed later when checking Bridge and SplitAnte statements. + if ipart == 1: + starting_mentions[eid] = True + + #-------------------------------------------------------------------------------------------------------------------------------- + # The code that we will have to execute at single-node continuous parts and at the closing brackets of multi-node continuous parts. + def closing_bracket(): + # Find the corresponding opening bracket and extract the information we need to know. 
+ mention_length = 0 + mention_span = [] + head = 0 + opening_line = 0 + if len(state.open_entity_mentions)==0: + Incident( + state=state, + testid='ill-nested-entities', + message=f"Cannot close entity '{beid}' because there are no open entities." + ).report(state, self.args) + return + else: + # If the closing bracket does not occur where expected, it is currently only a warning. + # We have crossing mention spans in CorefUD 1.0 and it has not been decided yet whether all of them should be illegal. + ###!!! Note that this will not catch ill-nested mentions whose only intersection is one node. The bracketing will + ###!!! not be a problem in such cases because one mention will be closed first, then the other will be opened. + if beid != state.open_entity_mentions[-1]['beid']: + Incident( + state=state, + testclass='Warning', + testid='ill-nested-entities-warning', + message=f"Entity mentions are not well nested: closing '{beid}' while the innermost open entity is '{state.open_entity_mentions[-1]['beid']}' from line {state.open_entity_mentions[-1]['line']}: {str(state.open_entity_mentions)}." + ).report(state, self.args) + # Try to find and close the entity whether or not it was well-nested. + for i in reversed(range(len(state.open_entity_mentions))): + if state.open_entity_mentions[i]['beid'] == beid: + mention_length = state.open_entity_mentions[i]['length'] + mention_span = state.open_entity_mentions[i]['span'] + head = state.open_entity_mentions[i]['head'] + opening_line = state.open_entity_mentions[i]['line'] + state.open_entity_mentions.pop(i) + break + else: + # If we did not find the entity to close, then the warning above was not enough and we have to make it a validation error. + Incident( + state=state, + testid='ill-nested-entities', + message=f"Cannot close entity '{beid}' because it was not found among open entities: {str(state.open_entity_mentions)}" + ).report(state, self.args) + return + # If this is a part of a discontinuous mention, update the information about the whole mention. + # We do this after reading the new part (and not when we see its opening bracket) so that nested + # discontinuous mentions of the same entity are possible. + if npart > 1: + # Update the attributes that have to be updated after each part. + if eidnpart in state.open_discontinuous_mentions: + discontinuous_mention = state.open_discontinuous_mentions[eidnpart][-1] + discontinuous_mention['last_ipart'] = ipart + discontinuous_mention['last_part_line'] = opening_line + discontinuous_mention['length'] += mention_length + discontinuous_mention['span'] += mention_span + else: + # This should have been taken care of at the opening bracket. + Incident( + state=state, + testclass='Internal', + testid='internal-error', + message="INTERNAL ERROR: at the closing bracket of a part of a discontinuous mention, still no record in state.open_discontinuous_mentions." + ).report(state, self.args) + discontinuous_mention = {'last_ipart': ipart, 'npart': npart, + 'first_part_line': opening_line, + 'last_part_line': opening_line, + 'attributes': '', 'length': mention_length, + 'span': mention_span} + state.open_discontinuous_mentions[eidnpart] = [discontinuous_mention] + # Update mention_length and mention_span to reflect the whole span up to this point rather than just the last part. 
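The crossing-mentions check a few lines below reduces to set algebra: two spans cross iff they intersect and neither contains the other. A minimal sketch:

    # Editorial sketch, not part of the patch.
    def spans_cross(a, b):
        a, b = set(a), set(b)
        return bool(a & b) and not a <= b and not b <= a

    # spans_cross(['3', '4', '5'], ['5', '6']) -> True   (overlap, no containment)
    # spans_cross(['3', '4', '5'], ['4'])      -> False  (nested)
    # spans_cross(['3', '4'], ['6', '7'])      -> False  (disjoint)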
+ mention_length = state.open_discontinuous_mentions[eidnpart][-1]['length'] + mention_span = state.open_discontinuous_mentions[eidnpart][-1]['span'] + # We need to know the length (number of nodes) of the mention to check whether the head attribute is within limits. + # We need to know the span (list of nodes) of the mention to check that no two mentions have the same span. + # We only check these requirements after the last part of the discontinuous span (or after the single part of a continuous one). + if ipart == npart: + if mention_length < head: + Incident( + state=state, + testid='mention-head-out-of-range', + message=f"Entity mention head was specified as {head} on line {opening_line} but the mention has only {mention_length} nodes." + ).report(state, self.args) + # Check that no two mentions have identical spans (only if this is the last part of a mention). + ending_mention_key = str(opening_line)+str(mention_span) + if ending_mention_key in ending_mentions: + Incident( + state=state, + testid='same-span-entity-mentions', + message=f"Entity mentions '{ending_mentions[ending_mention_key]}' and '{beid}' from line {opening_line} have the same span {str(mention_span)}." + ).report(state, self.args) + else: + ending_mentions[ending_mention_key] = beid + # Remember the span of the current mention so that we can later check whether it crosses the span of another mention. + # Use the current sentence id to partially qualify the node ids. It will not work well for mentions that span multiple + # sentences but we do not expect cross-sentence mentions to be frequent. + myset = set(mention_span) + # Check whether any other mention of the same entity has span that crosses the current one. + if eid in state.entity_mention_spans: + if sentid in state.entity_mention_spans[eid]: + for m in state.entity_mention_spans[eid][sentid]: + ms = state.entity_mention_spans[eid][sentid][m] + if ms.intersection(myset) and not ms.issubset(myset) and not myset.issubset(ms): + Incident( + state=state, + testid='crossing-mentions-same-entity', + message=f"Mentions of entity '{eid}' have crossing spans: {m} vs. {str(mention_span)}." + ).report(state, self.args) + else: + state.entity_mention_spans[eid][sentid] = {} + else: + state.entity_mention_spans[eid] = {} + state.entity_mention_spans[eid][sentid] = {} + state.entity_mention_spans[eid][sentid][str(mention_span)] = myset + # At the end of the last part of a discontinuous mention, remove the information about the mention. + if npart > 1 and ipart == npart: + if eidnpart in state.open_discontinuous_mentions: + if len(state.open_discontinuous_mentions[eidnpart]) > 1: + state.open_discontinuous_mentions[eidnpart].pop() + else: + state.open_discontinuous_mentions.pop(eidnpart) + #-------------------------------------------------------------------------------------------------------------------------------- + + # Now we know the beid, eid, as well as all other attributes. + # We can check the well-nestedness of brackets. + if b==0: + if seen2 and not seen1: + Incident( + state=state, + testid='spurious-entity-statement', + message=f"If there are no closing entity brackets, single-node entity must follow all opening entity brackets in '{entity[0]}'." + ).report(state, self.args) + if seen0 and seen2: + Incident( + state=state, + testid='spurious-entity-statement', + message=f"Single-node entity must either precede all closing entity brackets or follow all opening entity brackets in '{entity[0]}'." 
+ ).report(state, self.args) + seen0 = True + seen2 = False + opening_bracket() + elif b==2: + if seen1 and not seen0: + Incident( + state=state, + testid='spurious-entity-statement', + message=f"If there are no opening entity brackets, single-node entity must precede all closing entity brackets in '{entity[0]}'." + ).report(state, self.args) + seen2 = True + opening_bracket() + closing_bracket() + else: # b==1 + if seen0: + Incident( + state=state, + testid='spurious-entity-statement', + message=f"All closing entity brackets must precede all opening entity brackets in '{entity[0]}'." + ).report(state, self.args) + seen1 = True + closing_bracket() + # Now we are done with checking the 'Entity=' statement. + # If there are also 'Bridge=' or 'SplitAnte=' statements, check them too. + if len(bridge) > 0: + match = re.match(r"^Bridge=([^(< :>)]+<[^(< :>)]+(:[a-z]+)?(,[^(< :>)]+<[^(< :>)]+(:[a-z]+)?)*)$", bridge[0]) + if not match: + Incident( + state=state, + testid='spurious-bridge-statement', + message=f"Cannot parse the Bridge statement '{bridge[0]}'." + ).report(state, self.args) + else: + bridges = match.group(1).split(',') + # Hash the src<tgt pairs so that we can check that they are not repeated. + srctgt = {} + for b in bridges: + match = re.match(r"^([^(< :>)]+)<([^(< :>)]+)(?::([a-z]+))?$", b) + if match: + srceid = match.group(1) + tgteid = match.group(2) + relation = match.group(3) # optional + bridgekey = srceid+'<'+tgteid + if srceid == tgteid: + Incident( + state=state, + testid='spurious-bridge-relation', + message=f"Bridge must not point from an entity to itself: '{b}'." + ).report(state, self.args) + if not tgteid in starting_mentions: + Incident( + state=state, + testid='misplaced-bridge-statement', + message=f"Bridge relation '{b}' must be annotated at the beginning of a mention of entity '{tgteid}'." + ).report(state, self.args) + if bridgekey in srctgt: + Incident( + state=state, + testid='repeated-bridge-relation', + message=f"Bridge relation '{bridgekey}' must not be repeated in '{b}'." + ).report(state, self.args) + else: + srctgt[bridgekey] = True + # Check in the global dictionary whether this relation has been specified at another mention. + if bridgekey in state.entity_bridge_relations: + if relation != state.entity_bridge_relations[bridgekey]['relation']: + Incident( + state=state, + testid='bridge-relation-mismatch', + message=f"Bridge relation '{b}' type does not match '{state.entity_bridge_relations[bridgekey]['relation']}' specified earlier on line {state.entity_bridge_relations[bridgekey]['line']}." + ).report(state, self.args) + else: + state.entity_bridge_relations[bridgekey] = {'relation': relation, 'line': state.sentence_line+iline} + if len(splitante) > 0: + match = re.match(r"^SplitAnte=([^(< :>)]+<[^(< :>)]+(,[^(< :>)]+<[^(< :>)]+)*)$", splitante[0]) + if not match: + Incident( + state=state, + testid='spurious-splitante-statement', + message=f"Cannot parse the SplitAnte statement '{splitante[0]}'." + ).report(state, self.args) + else: + antecedents = match.group(1).split(',') + # Hash the src<tgt pairs so that we can check that they are not repeated. + srctgt = {} + # Hash the target entities so that we can check that each of them has at least two antecedents. + tgtante = {} + for a in antecedents: + match = re.match(r"^([^(< :>)]+)<([^(< :>)]+)$", a) + if match: + srceid = match.group(1) + tgteid = match.group(2) + if srceid == tgteid: + Incident( + state=state, + testid='spurious-splitante-relation', + message=f"SplitAnte must not point from an entity to itself: '{srceid}<{tgteid}'." + ).report(state, self.args) + elif not tgteid in starting_mentions: + Incident( + state=state, + testid='misplaced-splitante-statement', + message=f"SplitAnte relation '{a}' must be annotated at the beginning of a mention of entity '{tgteid}'."
+ ).report(state, self.args) + if srceid+'<'+tgteid in srctgt: + str_antecedents = ','.join(antecedents) + Incident( + state=state, + testid='repeated-splitante-relation', + message=f"SplitAnte relation '{srceid}<{tgteid}' must not be repeated in '{str_antecedents}'." + ).report(state, self.args) + else: + srctgt[srceid+'<'+tgteid] = True + if tgteid in tgtante: + tgtante[tgteid].append(srceid) + else: + tgtante[tgteid] = [srceid] + for tgteid in tgtante: + if len(tgtante[tgteid]) == 1: + str_antecedents = ','.join(antecedents) + Incident( + state=state, + testid='only-one-split-antecedent', + message=f"SplitAnte statement '{str_antecedents}' must specify at least two antecedents for entity '{tgteid}'." + ).report(state, self.args) + # Check in the global dictionary whether this relation has been specified at another mention. + # Compare string representations because that is what we store below. + tgtante[tgteid].sort() + if tgteid in state.entity_split_antecedents: + if str(tgtante[tgteid]) != state.entity_split_antecedents[tgteid]['antecedents']: + Incident( + state=state, + testid='split-antecedent-mismatch', + message=f"Split antecedent of entity '{tgteid}' does not match '{state.entity_split_antecedents[tgteid]['antecedents']}' specified earlier on line {state.entity_split_antecedents[tgteid]['line']}." + ).report(state, self.args) + else: + state.entity_split_antecedents[tgteid] = {'antecedents': str(tgtante[tgteid]), 'line': state.sentence_line+iline} + iline += 1 + if len(state.open_entity_mentions)>0: + Incident( + state=state, + testid='cross-sentence-mention', + message=f"Entity mentions must not cross sentence boundaries; still open at sentence end: {str(state.open_entity_mentions)}." + ).report(state, self.args) + # Close the mentions forcibly. Otherwise one omitted closing bracket would cause the error messages to explode because the words would be collected from the remainder of the file. + state.open_entity_mentions = [] + if len(state.open_discontinuous_mentions)>0: + Incident( + state=state, + testid='cross-sentence-mention', + message=f"Entity mentions must not cross sentence boundaries; still open at sentence end: {str(state.open_discontinuous_mentions)}." + ).report(state, self.args) + # Close the mentions forcibly. Otherwise one omission would cause the error messages to explode because the words would be collected from the remainder of the file. + state.open_discontinuous_mentions = {} + # Since we only test mentions within one sentence at present, we do not have to carry all mention spans until the end of the corpus. + for eid in state.entity_mention_spans: + if sentid in state.entity_mention_spans[eid]: + state.entity_mention_spans[eid].pop(sentid) + + + +#============================================================================== +# Main part. +#============================================================================== + # TODO: put in utils or similar + def build_tree_udapi(self, lines): + root = self.conllu_reader.read_tree_from_lines(lines) + return root + + def validate_file(self, state, inp): + """ + The main entry point for all validation tests applied to one input file. + It reads sentences from the input stream one by one; each sentence is + tested immediately. + + Parameters + ---------- + inp : open file handle + The CoNLL-U-formatted input stream. + """ + for all_lines, comments, sentence in self.OLD_next_sentence(state, inp): + linenos = utils.get_line_numbers_for_ids(state, sentence) + # The individual lines were validated already in next_sentence(). + # What follows are tests that need to see the whole tree.
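For reference, the per-item grammar used by the Bridge and SplitAnte checks in validate_misc_entity() above accepts items like 'e12<e13' and 'e12<e14:part'; a short usage example of the reconstructed item-level regex:

    # Editorial sketch, not part of the patch.
    import re

    BRIDGE_ITEM = re.compile(r"^([^(< :>)]+)<([^(< :>)]+)(?::([a-z]+))?$")

    for item in 'e12<e13,e12<e14:part'.split(','):
        m = BRIDGE_ITEM.match(item)
        print(m.group(1), m.group(2), m.group(3))
    # -> e12 e13 None
    # -> e12 e14 part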
+ # Note that low-level errors such as wrong number of columns would be + # reported in next_sentence() but then the lines would be thrown away + # and no tree lines would be yielded—meaning that we will not encounter + # such a mess here. + idseqok = self.OLD_validate_id_sequence(state, sentence) # level 1 + self.OLD_validate_token_ranges(state, sentence) # level 1 + # TODO: config file so that levels are not checked here + if self.args.level > 1: + idrefok = idseqok and self.OLD_validate_id_references(state, sentence) # level 2 + if not idrefok: + continue + treeok = self.validate_tree(state, sentence) # level 2 test: tree is single-rooted, connected, cycle-free + if not treeok: + continue + # Tests of individual nodes that operate on pre-Udapi data structures. + # Some of them (bad feature format) may lead to skipping Udapi completely. + colssafe = True + line = state.sentence_line - 1 + for cols in sentence: # TODO: cols->line + line += 1 + # Multiword tokens and empty nodes can or must have certain fields empty. + if utils.is_multiword_token(cols): + self.OLD_validate_mwt_empty_vals(state, cols, line) + if utils.is_empty_node(cols): # TODO: elif? + self.OLD_validate_empty_node_empty_vals(state, cols, line) # level 2 + if utils.is_word(cols) or utils.is_empty_node(cols): # TODO: elif? + self.OLD_validate_character_constraints(state, cols, line) # level 2 + self.OLD_validate_upos(state, cols, line) # level 2 + colssafe = colssafe and self.OLD_validate_features_level2(state, cols, line) # level 2 (level 4 tests will be called later) + self.OLD_validate_deps(state, cols, line) # level 2; must operate on pre-Udapi DEPS (to see order of relations) + self.OLD_validate_misc(state, cols, line) # level 2; must operate on pre-Udapi MISC + if not colssafe: + continue + # If we successfully passed all the tests above, it is probably + # safe to give the lines to Udapi and ask it to build the tree data + # structure for us. + tree = self.build_tree_udapi(all_lines) + self.validate_sent_id(state, comments, self.args.lang) # level 2 + self.validate_text_meta(state, comments, sentence) # level 2 + # Test that enhanced graphs exist either for all sentences or for + # none. As a side effect, get line numbers for all nodes including + # empty ones (here linenos is a dict indexed by cols[ID], i.e., a string). + # These line numbers are returned in any case, even if there are no + # enhanced dependencies, hence we can rely on them even with basic + # trees. + self.OLD_validate_deps_all_or_none(state, sentence) + # Tests of individual nodes with Udapi. + nodes = tree.descendants_and_empty + for node in nodes: + line = linenos[str(node.ord)] + self.OLD_validate_deprels(state, node, line) # level 2 and 4 + self.validate_root(state, node, line) # level 2: deprel root <=> head 0 + if self.args.level > 2: + self.validate_enhanced_orphan(state, node, line) # level 3 + if self.args.level > 3: + # To disallow words with spaces everywhere, use --lang ud. + self.validate_words_with_spaces(state, node, line, self.args.lang) # level 4 + self.validate_features_level4(state, node, line, self.args.lang) # level 4 + if self.args.level > 4: + self.validate_auxiliary_verbs(state, node, line, self.args.lang) # level 5 + self.validate_copula_lemmas(state, node, line, self.args.lang) # level 5 + # Tests on whole trees and enhanced graphs. 
+ if self.args.level > 2: + self.validate_annotation(state, tree, linenos) # level 3 + self.validate_egraph_connected(state, nodes, linenos) + if self.args.check_coref: + self.validate_misc_entity(state, comments, sentence) # optional for CorefUD treebanks + self.OLD_validate_newlines(state, inp) # level 1 + + + + def validate_end(self, state): + """ + Final tests after processing the entire treebank (possibly multiple files). + """ + # After reading the entire treebank (perhaps multiple files), check whether + # the DEPS annotation was not a mere copy of the basic trees. + if self.args.level>2 and state.seen_enhanced_graph and not state.seen_enhancement: + Incident( + state=state, + level=3, + testclass='Enhanced', + testid='edeps-identical-to-basic-trees', + message="Enhanced graphs are copies of basic trees in the entire dataset. This can happen for some simple sentences where there is nothing to enhance, but not for all sentences. If none of the enhancements from the guidelines (https://universaldependencies.org/u/overview/enhanced-syntax.html) are annotated, the DEPS should be left unspecified." + ).report(state, self.args) + + + def validate_files(self, filenames): + state = State() + try: + for fname in filenames: + state.current_file_name = fname + if fname == '-': + # Set PYTHONIOENCODING=utf-8 before starting Python. + # See https://docs.python.org/3/using/cmdline.html#envvar-PYTHONIOENCODING + # Otherwise ANSI will be read in Windows and + # locale-dependent encoding will be used elsewhere. + self.validate_file(state, sys.stdin) + else: + with io.open(fname, 'r', encoding='utf-8') as inp: + self.validate_file(state, inp) + self.validate_end(state) + except: + Incident( + state=state, + level=0, + testclass='Internal', + testid='exception', + message="Exception caught!" + ).report(state, self.args) + # If the output is used in an HTML page, it must be properly escaped + # because the traceback can contain e.g. "<module>". However, escaping + # is beyond the goal of validation, which can also be run in a console. + traceback.print_exc() + return state + + +#============================================================================== +# Argument processing. +#============================================================================== + + +def build_argparse(): + opt_parser = argparse.ArgumentParser(description="CoNLL-U validation script. Python 3 is needed to run it!") + + io_group = opt_parser.add_argument_group("Input / output options") + io_group.add_argument('--quiet', + dest="quiet", action="store_true", default=False, + help="""Do not print any error messages. + Exit with 0 on pass, non-zero on fail.""") + io_group.add_argument('--max-err', + action="store", type=int, default=20, + help="""How many errors to output before exiting? 0 for all. + Default: %(default)d.""") + io_group.add_argument('input', + nargs='*', + help="""Input file name(s), or "-" or nothing for standard input.""") + + list_group = opt_parser.add_argument_group("Tag sets", "Options relevant to checking tag sets.") + list_group.add_argument("--lang", + action="store", required=True, default=None, + help="""Which language are we checking? + If you specify this (as a two-letter code), the tags will be checked + using the language-specific files in the + data/ directory of the validator.""") + list_group.add_argument("--level", + action="store", type=int, default=5, dest="level", + help="""Level 1: Test only CoNLL-U backbone. + Level 2: UD format. + Level 3: UD contents. + Level 4: Language-specific labels.
+ Level 5: Language-specific contents.""") + + tree_group = opt_parser.add_argument_group("Tree constraints", + "Options for checking the validity of the tree.") + tree_group.add_argument("--multiple-roots", + action="store_false", default=True, dest="single_root", + help="""Allow trees with several root words + (single root required by default).""") + + coref_group = opt_parser.add_argument_group("Coreference / entity constraints", + "Options for checking coreference and entity annotation.") + coref_group.add_argument('--coref', + action='store_true', default=False, dest='check_coref', + help='Test coreference and entity-related annotation in MISC.') + return opt_parser + +def parse_args(): + opt_parser = build_argparse() + args = opt_parser.parse_args() # Parsed command-line arguments + + # Level of validation + if args.level < 1: + print(f'Option --level must not be less than 1; changing from {args.level} to 1', + file=sys.stderr) + args.level = 1 + # No language-specific tests for levels 1-3 + # Anyway, any Feature=Value pair should be allowed at level 3 (because it may be language-specific), + # and any word form or lemma can contain spaces (because language-specific guidelines may allow it). + # We can also test language 'ud' on level 4; then it will require that no language-specific features are present. + if args.level < 4: + args.lang = 'ud' + if args.input == []: + args.input.append('-') + return args + +def main(): + args = parse_args() + validator = Validator(args) + state = validator.validate_files(args.input) + + # Summarize the warnings and errors. + passed = True + nerror = 0 + if state.error_counter: + for k, v in sorted(state.error_counter.items()): + if k == 'Warning': + errors = 'Warnings' + else: + errors = k+' errors' + nerror += v + passed = False + if not args.quiet: + print(f'{errors}: {v}', file=sys.stderr) + # Print the final verdict and exit. + if passed: + if not args.quiet: + print('*** PASSED ***', file=sys.stderr) + return 0 + else: + if not args.quiet: + print(f'*** FAILED *** with {nerror} errors', file=sys.stderr) + return 1 + +if __name__=="__main__": + errcode = main() + sys.exit(errcode) + diff --git a/test-cases/README.md b/validator/tests/test-cases/README.md similarity index 100% rename from test-cases/README.md rename to validator/tests/test-cases/README.md diff --git a/validator/tests/test-cases/invalid-functions/columns-format-minimal.conllu b/validator/tests/test-cases/invalid-functions/columns-format-minimal.conllu new file mode 100644 index 000000000..62c9d7d7b --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/columns-format-minimal.conllu @@ -0,0 +1,5 @@ +# sent_id = tanl1 +# text = LONDRA . +1 LONDRA Lon dra NOUN SP _ 0 root _ _ +2 . . PUNCT FS _ 1 punct _ _ + diff --git a/validator/tests/test-cases/invalid-functions/columns-format.conllu b/validator/tests/test-cases/invalid-functions/columns-format.conllu new file mode 100644 index 000000000..e36f733c3 --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/columns-format.conllu @@ -0,0 +1,24 @@ +# errors = number-of-columns, empty-column, leading-whitespace, trailing-whitespace, repeated-whitespace, invalid-whitespace-mwt, invalid-whitespace +# sent_id = tanl1 +# text = LONDRA . +1 LONDRA Londra NOUN SP _ 0 root _ _ X +2 . . PUNCT FS _ 1 punct _ _ + +# sent_id = tanl2 +# text = Gas dalla statua .
+# this is comment +1 Gas gas NOUN S Gender=Masc|Number=Sing 0 root _ _ +2-3 da lla _ _ _ _ _ _ _ _ +2 da da ADP _ 4 case _ _ +3 la la DET RD Gender=Fem|Number=Sing|PronType=Art 4 det _ _ +4 statua statua NOUN S Gender=Fem|Number=Sing 1 nmod _ _ + 5 . . PUNCT FS _ 1 punct _ _ + +# sent_id = tanl3 +# text = Evacuata la Tate Gallery . +1 Evacuata evacuare VERB V Gender=Fem|Number=Sing 0 root _ _ +2 la il DE T RD Gender=Fem|Number=Sing|PronType=Art 3 det _ _ +3 Tate Tate PROPN SP _ 1 nsubj _ _ +4 Gallery Gallery PROPN SP _ 3 flat _ _ +5 . . PUNCT FS _ 1 punct _ _ + diff --git a/validator/tests/test-cases/invalid-functions/extra-empty-line.conllu b/validator/tests/test-cases/invalid-functions/extra-empty-line.conllu new file mode 100644 index 000000000..63e83c4b5 --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/extra-empty-line.conllu @@ -0,0 +1,13 @@ +# valid one-word sentence. +# sent_id = extra-empty-line1 +# text = valid +1 valid valid NOUN SP _ 0 root _ _ + + + +# format error: sentences must be separated by exactly one empty line +# valid one-word sentence. +# sent_id = extra-empty-line2 +# text = valid +1 valid valid NOUN SP _ 0 root _ _ + diff --git a/validator/tests/test-cases/invalid-functions/invalid-line.conllu b/validator/tests/test-cases/invalid-functions/invalid-line.conllu new file mode 100644 index 000000000..eb9e43765 --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/invalid-line.conllu @@ -0,0 +1,24 @@ +# errors = invalid-line +# sent_id = tanl1 +# text = LONDRA +1 LONDRA Londra NOUN SP _ 0 root _ _ +- . . PUNCT FS _ 1 punct _ _ + +# sent_id = tanl2 +# text = Gas dalla statua +# this is comment +1 Gas gas NOUN S Gender=Masc|Number=Sing 0 root _ _ +2-3 dalla _ _ _ _ _ _ _ _ +2 da da ADP EA _ 4 case _ _ +3 la la DET RD Gender=Fem|Number=Sing|PronType=Art 4 det _ _ +4 statua statua NOUN S Gender=Fem|Number=Sing 1 nmod _ _ + 5 . . PUNCT FS _ 1 punct _ _ + +# sent_id = tanl3 +# text = Evacuata la Tate Gallery . +1 Evacuata evacuare VERB V Gender=Fem|Number=Sing 0 root _ _ +2 la il DET RD Gender=Fem|Number=Sing|PronType=Art 3 det _ _ +3 Tate Tate PROPN SP _ 1 nsubj _ _ +4 Gallery Gallery PROPN SP _ 3 flat _ _ +5 . . PUNCT FS _ 1 punct _ _ + diff --git a/test-cases/invalid-level1-2/id-with-extra-0.conllu b/validator/tests/test-cases/invalid-functions/invalid-word-id.conllu similarity index 100% rename from test-cases/invalid-level1-2/id-with-extra-0.conllu rename to validator/tests/test-cases/invalid-functions/invalid-word-id.conllu diff --git a/test-cases/invalid-level1-2/invalid-range-format.conllu b/validator/tests/test-cases/invalid-functions/invalid-word-interval.conllu similarity index 100% rename from test-cases/invalid-level1-2/invalid-range-format.conllu rename to validator/tests/test-cases/invalid-functions/invalid-word-interval.conllu diff --git a/validator/tests/test-cases/invalid-functions/misplaced-comment.conllu b/validator/tests/test-cases/invalid-functions/misplaced-comment.conllu new file mode 100644 index 000000000..10bc347d0 --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/misplaced-comment.conllu @@ -0,0 +1,24 @@ +# sent_id = tanl1 +# text = LONDRA . +1 LONDRA Londra NOUN SP _ 0 root _ _ +# mispaced comment +2 . . PUNCT FS _ 1 punct _ _ + +# sent_id = tanl2 +# text = Gas dalla statua . 
+# this is comment +1 Gas gas NOUN S Gender=Masc|Number=Sing 0 root _ _ +2-3 dalla _ _ _ _ _ _ _ _ +2 da da ADP _ _ 4 case _ _ +3 la la DET RD Gender=Fem|Number=Sing|PronType=Art 4 det _ _ +4 statua statua NOUN S Gender=Fem|Number=Sing 1 nmod _ _ +5 . . PUNCT FS _ 1 punct _ _ + +# sent_id = tanl3 +# text = Evacuata la Tate Gallery . +1 Evacuata evacuare VERB V Gender=Fem|Number=Sing 0 root _ _ +2 la il DET RD Gender=Fem|Number=Sing|PronType=Art 3 det _ _ +3 Tate Tate PROPN SP _ 1 nsubj _ _ +4 Gallery Gallery PROPN SP _ 3 flat _ _ +5 . . PUNCT FS _ 1 punct _ _ + diff --git a/validator/tests/test-cases/invalid-functions/misplaced-empty-node-2.conllu b/validator/tests/test-cases/invalid-functions/misplaced-empty-node-2.conllu new file mode 100644 index 000000000..0aa39dec9 --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/misplaced-empty-node-2.conllu @@ -0,0 +1,13 @@ +# not valid: empty nodes must appear immediately after the word whose +# ID forms their integer part +# sent_id = misordered-empty +# text = I have haven't a clue PUNCT +1 I I PRON PRN Num=Sing|Per=1 2 nsubj _ _ +2-3 haven't _ _ _ _ _ _ _ _ +1.1 _ _ _ _ _ _ _ _ _ +2 have have VERB VB Tens=Pres 0 root _ _ +3 not not ADV RB _ 2 aux _ _ +4 a a DET DT _ 5 det _ _ +5 clue clue NOUN NN Num=Sing 2 obj _ _ +6 PUNCT . PUNCT PUNCT _ 2 punct _ _ + diff --git a/validator/tests/test-cases/invalid-functions/misplaced-empty-node.conllu b/validator/tests/test-cases/invalid-functions/misplaced-empty-node.conllu new file mode 100644 index 000000000..d95218cbe --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/misplaced-empty-node.conllu @@ -0,0 +1,12 @@ +# not valid: empty nodes must appear immediately after the word whose +# ID forms their integer part +# sent_id = misordered-empty +# text = I have haven't a clue PUNCT +1 I I PRON PRN Num=Sing|Per=1 2 nsubj _ _ +2 have have VERB VB Tens=Pres 0 root _ _ +1.1 _ _ _ _ _ _ _ _ _ +3 not not ADV RB _ 2 aux _ _ +4 a a DET DT _ 5 det _ _ +5 clue clue NOUN NN Num=Sing 2 obj _ _ +6 PUNCT .
PUNCT PUNCT _ 2 punct _ _ + diff --git a/test-cases/invalid-level1-2/misordered-multiword.conllu b/validator/tests/test-cases/invalid-functions/misplaced-word-interval.conllu similarity index 100% rename from test-cases/invalid-level1-2/misordered-multiword.conllu rename to validator/tests/test-cases/invalid-functions/misplaced-word-interval.conllu diff --git a/test-cases/invalid-level1-2/multiple-sent_id.conllu b/validator/tests/test-cases/invalid-functions/multiple-sent-id.conllu similarity index 100% rename from test-cases/invalid-level1-2/multiple-sent_id.conllu rename to validator/tests/test-cases/invalid-functions/multiple-sent-id.conllu diff --git a/validator/tests/test-cases/invalid-functions/mwt-nonempty-2.conllu b/validator/tests/test-cases/invalid-functions/mwt-nonempty-2.conllu new file mode 100644 index 000000000..695099aad --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/mwt-nonempty-2.conllu @@ -0,0 +1,7 @@ +# sent_id = empty-node +# text = This holes +# not valid: UPOSTAG must have format '[A-Z]+' or '_' in empty nodes +1 This this PRON _ _ 0 root 1.1:nsubj _ +1.1 _ _ VERB _ _ 2 dep 0:root _ +2 holes hole NOUN _ _ 1 orphan 1.1:obj _ + diff --git a/validator/tests/test-cases/invalid-functions/mwt-nonempty-field.conllu b/validator/tests/test-cases/invalid-functions/mwt-nonempty-field.conllu new file mode 100644 index 000000000..ff1a67ee3 --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/mwt-nonempty-field.conllu @@ -0,0 +1,12 @@ +# not valid: multiword tokens must have underscore ("_") for all fields +# except FORM, ID and MISC. +# sent_id = multiword-with-pos +# text = I haven't a clue PUNCT +1 I I PRON PRN Num=Sing|Per=1 2 nsubj _ _ +2-3 haven't _ VERB _ Typo=Yes 1 _ _ _ +2 have have VERB VB Tens=Pres 0 root _ _ +3 not not ADV RB _ 2 aux _ _ +4 a a DET DT _ 5 det _ _ +5 clue clue NOUN NN Num=Sing 2 obj _ _ +6 PUNCT . PUNCT PUNCT _ 2 punct _ _ + diff --git a/test-cases/invalid-level1-2/dos-newlines.conllu b/validator/tests/test-cases/invalid-functions/non-unix-newline.conllu similarity index 100% rename from test-cases/invalid-level1-2/dos-newlines.conllu rename to validator/tests/test-cases/invalid-functions/non-unix-newline.conllu diff --git a/validator/tests/test-cases/invalid-functions/overlapping-word-interval.conllu b/validator/tests/test-cases/invalid-functions/overlapping-word-interval.conllu new file mode 100644 index 000000000..ac3f07bc6 --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/overlapping-word-interval.conllu @@ -0,0 +1,12 @@ +# not valid: multiword token ranges must not overlap +# sent_id = invalid-range-format +# text = I haven't have not a clue PUNCT +1 I I PRON PRN Num=Sing|Per=1 2 nsubj _ _ +2-3 haven't _ _ _ _ _ _ _ _ +2 have have VERB VB Tens=Pres 0 root _ _ +3-4 nota _ _ _ _ _ _ _ _ +3 not not ADV RB _ 2 aux _ _ +4 a a DET DT _ 5 det _ _ +5 clue clue NOUN NN Num=Sing 2 obj _ _ +6 PUNCT . PUNCT PUNCT _ 2 punct _ _ + diff --git a/validator/tests/test-cases/invalid-functions/pseudo-empty-line.conllu b/validator/tests/test-cases/invalid-functions/pseudo-empty-line.conllu new file mode 100644 index 000000000..1226c478c --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/pseudo-empty-line.conllu @@ -0,0 +1,11 @@ +# valid one-word sentence. +# sent_id = extra-empty-line1 +# text = valid +1 valid valid NOUN SP _ 0 root _ _ + +# format error: sentences must be separated by exactly one empty line +# valid one-word sentence.
+# sent_id = extra-empty-line2 +# text = valid +1 valid valid NOUN SP _ 0 root _ _ + diff --git a/test-cases/invalid-level1-2/invalid-range.conllu b/validator/tests/test-cases/invalid-functions/reversed-word-interval.conllu similarity index 100% rename from test-cases/invalid-level1-2/invalid-range.conllu rename to validator/tests/test-cases/invalid-functions/reversed-word-interval.conllu diff --git a/test-cases/invalid-level1-2/unnormalized-unicode.conllu b/validator/tests/test-cases/invalid-functions/unicode-normalization.conllu similarity index 100% rename from test-cases/invalid-level1-2/unnormalized-unicode.conllu rename to validator/tests/test-cases/invalid-functions/unicode-normalization.conllu diff --git a/validator/tests/test-cases/invalid-functions/word-id-sequence-2.conllu b/validator/tests/test-cases/invalid-functions/word-id-sequence-2.conllu new file mode 100644 index 000000000..d310b6b54 --- /dev/null +++ b/validator/tests/test-cases/invalid-functions/word-id-sequence-2.conllu @@ -0,0 +1,6 @@ +# sent_id = nonsequential-id +# text = valid PUNCT +# not valid: IDs must be sequential integers (1, 2, ...) +2 valid valid NOUN SP _ 0 root _ _ +1 PUNCT . PUNCT FS _ 1 punct _ _ + diff --git a/test-cases/invalid-level1-2/nonsequential-id.conllu b/validator/tests/test-cases/invalid-functions/word-id-sequence.conllu similarity index 100% rename from test-cases/invalid-level1-2/nonsequential-id.conllu rename to validator/tests/test-cases/invalid-functions/word-id-sequence.conllu diff --git a/test-cases/invalid-level1-2/ambiguous-feature.conllu b/validator/tests/test-cases/invalid-level1-2/ambiguous-feature.conllu similarity index 100% rename from test-cases/invalid-level1-2/ambiguous-feature.conllu rename to validator/tests/test-cases/invalid-level1-2/ambiguous-feature.conllu diff --git a/test-cases/invalid-level1-2/cyclic-deps.conllu b/validator/tests/test-cases/invalid-level1-2/cyclic-deps.conllu similarity index 100% rename from test-cases/invalid-level1-2/cyclic-deps.conllu rename to validator/tests/test-cases/invalid-level1-2/cyclic-deps.conllu diff --git a/test-cases/invalid-level1-2/deprel-not-empty-in-empty.conllu b/validator/tests/test-cases/invalid-level1-2/deprel-not-empty-in-empty.conllu similarity index 100% rename from test-cases/invalid-level1-2/deprel-not-empty-in-empty.conllu rename to validator/tests/test-cases/invalid-level1-2/deprel-not-empty-in-empty.conllu diff --git a/validator/tests/test-cases/invalid-level1-2/dos-newlines.conllu b/validator/tests/test-cases/invalid-level1-2/dos-newlines.conllu new file mode 100644 index 000000000..b4d3a8d2a --- /dev/null +++ b/validator/tests/test-cases/invalid-level1-2/dos-newlines.conllu @@ -0,0 +1,23 @@ +# sent_id = dos-newlines1 +# text = LONDRA PUNCT +1 LONDRA Londra NOUN SP _ 0 root _ _ +2 PUNCT . PUNCT FS _ 1 punct _ _ + +# This is a comment +# sent_id = dos-newlines2 +# text = Gas dalla statua PUNCT +1 Gas gas NOUN S Gen=M|Num=N 0 root _ _ +2-3 dalla _ _ _ _ _ _ _ _ +2 da da ADP EA _ 1 nmod _ _ +3 la la DET RD Gen=F|Num=S 4 det _ _ +4 statua statua NOUN S Gen=F|Num=S 2 nmod _ _ +5 PUNCT . PUNCT FS _ 1 punct _ _ + +# sent_id = dos-newlines3 +# text = Evacuata la Tate Gallery PUNCT +1 Evacuata evacuare VERB V Gen=F|Mod=P|Num=S 3 advcl _ _ +2 la il DET RD Gen=F|Num=S 3 det _ _ +3 Tate Tate NOUN SP _ 0 root _ _ +4 Gallery Gallery NOUN SP _ 3 fixed _ _ +5 PUNCT . 
diff --git a/test-cases/invalid-level1-2/duplicate-feature.conllu b/validator/tests/test-cases/invalid-level1-2/duplicate-feature.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/duplicate-feature.conllu
rename to validator/tests/test-cases/invalid-level1-2/duplicate-feature.conllu
diff --git a/test-cases/invalid-level1-2/duplicate-id.conllu b/validator/tests/test-cases/invalid-level1-2/duplicate-id.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/duplicate-id.conllu
rename to validator/tests/test-cases/invalid-level1-2/duplicate-id.conllu
diff --git a/test-cases/invalid-level1-2/duplicate-layered-feature.conllu b/validator/tests/test-cases/invalid-level1-2/duplicate-layered-feature.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/duplicate-layered-feature.conllu
rename to validator/tests/test-cases/invalid-level1-2/duplicate-layered-feature.conllu
diff --git a/test-cases/invalid-level1-2/duplicate-value.conllu b/validator/tests/test-cases/invalid-level1-2/duplicate-value.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/duplicate-value.conllu
rename to validator/tests/test-cases/invalid-level1-2/duplicate-value.conllu
diff --git a/test-cases/invalid-level1-2/empty-field.conllu b/validator/tests/test-cases/invalid-level1-2/empty-field.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/empty-field.conllu
rename to validator/tests/test-cases/invalid-level1-2/empty-field.conllu
diff --git a/test-cases/invalid-level1-2/empty-head.conllu b/validator/tests/test-cases/invalid-level1-2/empty-head.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/empty-head.conllu
rename to validator/tests/test-cases/invalid-level1-2/empty-head.conllu
diff --git a/test-cases/invalid-level1-2/empty-sentence.conllu b/validator/tests/test-cases/invalid-level1-2/empty-sentence.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/empty-sentence.conllu
rename to validator/tests/test-cases/invalid-level1-2/empty-sentence.conllu
diff --git a/test-cases/invalid-level1-2/extra-empty-line.conllu b/validator/tests/test-cases/invalid-level1-2/extra-empty-line.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/extra-empty-line.conllu
rename to validator/tests/test-cases/invalid-level1-2/extra-empty-line.conllu
diff --git a/test-cases/invalid-level1-2/extra-field.conllu b/validator/tests/test-cases/invalid-level1-2/extra-field.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/extra-field.conllu
rename to validator/tests/test-cases/invalid-level1-2/extra-field.conllu
diff --git a/test-cases/invalid-level1-2/head-0-deprel-not-root.conllu b/validator/tests/test-cases/invalid-level1-2/head-0-deprel-not-root.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/head-0-deprel-not-root.conllu
rename to validator/tests/test-cases/invalid-level1-2/head-0-deprel-not-root.conllu
diff --git a/test-cases/invalid-level1-2/head-not-0-deprel-root.conllu b/validator/tests/test-cases/invalid-level1-2/head-not-0-deprel-root.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/head-not-0-deprel-root.conllu
rename to validator/tests/test-cases/invalid-level1-2/head-not-0-deprel-root.conllu
diff --git a/test-cases/invalid-level1-2/head-not-empty-in-empty.conllu b/validator/tests/test-cases/invalid-level1-2/head-not-empty-in-empty.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/head-not-empty-in-empty.conllu
rename to validator/tests/test-cases/invalid-level1-2/head-not-empty-in-empty.conllu
diff --git a/test-cases/invalid-level1-2/id-starting-from-2.conllu b/validator/tests/test-cases/invalid-level1-2/id-starting-from-2.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/id-starting-from-2.conllu
rename to validator/tests/test-cases/invalid-level1-2/id-starting-from-2.conllu
diff --git a/validator/tests/test-cases/invalid-level1-2/id-with-extra-0.conllu b/validator/tests/test-cases/invalid-level1-2/id-with-extra-0.conllu
new file mode 100644
index 000000000..7f0e08cb6
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/id-with-extra-0.conllu
@@ -0,0 +1,5 @@
+# not valid: word IDs must be integers 1, 2, 3, ...
+# text = nonvalid
+# sent_id = id-with-extra-0
+01 nonvalid nonvalid NOUN _ _ 0 root _ _
+
diff --git a/test-cases/invalid-level1-2/invalid-deps-id.conllu b/validator/tests/test-cases/invalid-level1-2/invalid-deps-id.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/invalid-deps-id.conllu
rename to validator/tests/test-cases/invalid-level1-2/invalid-deps-id.conllu
diff --git a/test-cases/invalid-level1-2/invalid-deps-order.conllu b/validator/tests/test-cases/invalid-level1-2/invalid-deps-order.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/invalid-deps-order.conllu
rename to validator/tests/test-cases/invalid-level1-2/invalid-deps-order.conllu
diff --git a/test-cases/invalid-level1-2/invalid-deps-syntax.conllu b/validator/tests/test-cases/invalid-level1-2/invalid-deps-syntax.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/invalid-deps-syntax.conllu
rename to validator/tests/test-cases/invalid-level1-2/invalid-deps-syntax.conllu
diff --git a/test-cases/invalid-level1-2/invalid-head.conllu b/validator/tests/test-cases/invalid-level1-2/invalid-head.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/invalid-head.conllu
rename to validator/tests/test-cases/invalid-level1-2/invalid-head.conllu
diff --git a/validator/tests/test-cases/invalid-level1-2/invalid-range-format.conllu b/validator/tests/test-cases/invalid-level1-2/invalid-range-format.conllu
new file mode 100644
index 000000000..75e44baa9
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/invalid-range-format.conllu
@@ -0,0 +1,11 @@
+# not valid: first and last in multiword ranges must be integers
+# sent_id = invalid-range-format
+# text = I haven't have not a clue PUNCT
+1 I I PRON PRN Num=Sing|Per=1 2 nsubj _ _
+2-X haven't _ _ _ _ _ _ _ _
+2 have have VERB VB Tens=Pres 0 root _ _
+3 not not ADV RB _ 2 aux _ _
+4 a a DET DT _ 5 det _ _
+5 clue clue NOUN NN Num=Sing 2 obj _ _
+6 PUNCT . PUNCT PUNCT _ 2 punct _ _
+
diff --git a/validator/tests/test-cases/invalid-level1-2/invalid-range.conllu b/validator/tests/test-cases/invalid-level1-2/invalid-range.conllu
new file mode 100644
index 000000000..5adb1f1cb
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/invalid-range.conllu
@@ -0,0 +1,11 @@
+# not valid: (first-last) multiword ranges must have first <= last
+# sent_id = invalid-range
+# text = I haven't have not a clue PUNCT
+1 I I PRON PRN Num=Sing|Per=1 2 nsubj _ _
+2-1 haven't _ _ _ _ _ _ _ _
+2 have have VERB VB Tens=Pres 0 root _ _
+3 not not ADV RB _ 2 aux _ _
+4 a a DET DT _ 5 det _ _
+5 clue clue NOUN NN Num=Sing 2 obj _ _
+6 PUNCT . PUNCT PUNCT _ 2 punct _ _
+
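invalid-range-format.conllu and invalid-range.conllu separate two constraints on a multiword token ID: it must match an integer-integer pattern, and the interval must not be reversed. A hedged sketch of both checks, using a pattern consistent with the mwtid tests later in this patch (a hypothetical definition, not the validator's code):

    import regex as re

    # Hypothetical pattern: two positive integers without leading zeros.
    mwtid = re.compile(r"[1-9][0-9]*-[1-9][0-9]*")

    def check_interval(token_id):
        if not mwtid.fullmatch(token_id):       # catches '2-X', '10-', '0-1'
            return "invalid-range-format"
        first, last = map(int, token_id.split("-"))
        if first > last:                        # catches '2-1'
            return "invalid-range"
        return None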
diff --git a/test-cases/invalid-level1-2/lowercase-feature-in-empty.conllu b/validator/tests/test-cases/invalid-level1-2/lowercase-feature-in-empty.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/lowercase-feature-in-empty.conllu
rename to validator/tests/test-cases/invalid-level1-2/lowercase-feature-in-empty.conllu
diff --git a/test-cases/invalid-level1-2/lowercase-feature-value-in-empty.conllu b/validator/tests/test-cases/invalid-level1-2/lowercase-feature-value-in-empty.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/lowercase-feature-value-in-empty.conllu
rename to validator/tests/test-cases/invalid-level1-2/lowercase-feature-value-in-empty.conllu
diff --git a/test-cases/invalid-level1-2/lowercase-feature.conllu b/validator/tests/test-cases/invalid-level1-2/lowercase-feature.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/lowercase-feature.conllu
rename to validator/tests/test-cases/invalid-level1-2/lowercase-feature.conllu
diff --git a/test-cases/invalid-level1-2/lowercase-postag-in-empty.conllu b/validator/tests/test-cases/invalid-level1-2/lowercase-postag-in-empty.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/lowercase-postag-in-empty.conllu
rename to validator/tests/test-cases/invalid-level1-2/lowercase-postag-in-empty.conllu
diff --git a/test-cases/invalid-level1-2/lowercase-postag.conllu b/validator/tests/test-cases/invalid-level1-2/lowercase-postag.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/lowercase-postag.conllu
rename to validator/tests/test-cases/invalid-level1-2/lowercase-postag.conllu
diff --git a/test-cases/invalid-level1-2/lowercase-value.conllu b/validator/tests/test-cases/invalid-level1-2/lowercase-value.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/lowercase-value.conllu
rename to validator/tests/test-cases/invalid-level1-2/lowercase-value.conllu
diff --git a/test-cases/invalid-level1-2/malformed_deps.conllu b/validator/tests/test-cases/invalid-level1-2/malformed_deps.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/malformed_deps.conllu
rename to validator/tests/test-cases/invalid-level1-2/malformed_deps.conllu
diff --git a/test-cases/invalid-level1-2/misindexed-empty-node.conllu b/validator/tests/test-cases/invalid-level1-2/misindexed-empty-node.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/misindexed-empty-node.conllu
rename to validator/tests/test-cases/invalid-level1-2/misindexed-empty-node.conllu
diff --git a/test-cases/invalid-level1-2/misordered-feature.conllu b/validator/tests/test-cases/invalid-level1-2/misordered-feature.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/misordered-feature.conllu
rename to validator/tests/test-cases/invalid-level1-2/misordered-feature.conllu
diff --git a/test-cases/invalid-level1-2/misordered-layered-feature.conllu b/validator/tests/test-cases/invalid-level1-2/misordered-layered-feature.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/misordered-layered-feature.conllu
rename to validator/tests/test-cases/invalid-level1-2/misordered-layered-feature.conllu
diff --git a/validator/tests/test-cases/invalid-level1-2/misordered-multiword.conllu b/validator/tests/test-cases/invalid-level1-2/misordered-multiword.conllu
new file mode 100644
index 000000000..3b40f244a
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/misordered-multiword.conllu
@@ -0,0 +1,12 @@
+# not valid: multiword tokens must appear before the first word in their
+# range
+# sent_id = misordered-multiword
+# text = I have haven't a clue PUNCT
+1 I I PRON PRN Num=Sing|Per=1 2 nsubj _ _
+2 have have VERB VB Tens=Pres 0 root _ _
+2-3 haven't _ _ _ _ _ _ _ _
+3 not not ADV RB _ 2 aux _ _
+4 a a DET DT _ 5 det _ _
+5 clue clue NOUN NN Num=Sing 2 obj _ _
+6 PUNCT . PUNCT PUNCT _ 2 punct _ _
+
diff --git a/test-cases/invalid-level1-2/misplaced-comment-end.conllu b/validator/tests/test-cases/invalid-level1-2/misplaced-comment-end.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/misplaced-comment-end.conllu
rename to validator/tests/test-cases/invalid-level1-2/misplaced-comment-end.conllu
diff --git a/test-cases/invalid-level1-2/misplaced-comment-mid.conllu b/validator/tests/test-cases/invalid-level1-2/misplaced-comment-mid.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/misplaced-comment-mid.conllu
rename to validator/tests/test-cases/invalid-level1-2/misplaced-comment-mid.conllu
diff --git a/validator/tests/test-cases/invalid-level1-2/misplaced-range.conllu b/validator/tests/test-cases/invalid-level1-2/misplaced-range.conllu
new file mode 100644
index 000000000..e8babeb46
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/misplaced-range.conllu
@@ -0,0 +1,10 @@
+# not valid: range is not before its first word
+# sent_id = misplaced-range
+# text = I haven't have not a clue PUNCT
+1 I I PRON PRN Num=Sing|Per=1 2 nsubj _ _
+2 have have VERB VB Tens=Pres 0 root _ _
+3 not not ADV RB _ 2 aux _ _
+2-3 haven't _ _ _ _ _ _ _ _
+4 a a DET DT _ 5 det _ _
+5 clue clue NOUN NN Num=Sing 2 obj _ _
+6 PUNCT . PUNCT PUNCT _ 2 punct _ _
diff --git a/test-cases/invalid-level1-2/missing-final-line.conllu b/validator/tests/test-cases/invalid-level1-2/missing-final-line.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/missing-final-line.conllu
rename to validator/tests/test-cases/invalid-level1-2/missing-final-line.conllu
diff --git a/test-cases/invalid-level1-2/missing-space-after.conllu b/validator/tests/test-cases/invalid-level1-2/missing-space-after.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/missing-space-after.conllu
rename to validator/tests/test-cases/invalid-level1-2/missing-space-after.conllu
diff --git a/validator/tests/test-cases/invalid-level1-2/multiple-sent_id.conllu b/validator/tests/test-cases/invalid-level1-2/multiple-sent_id.conllu
new file mode 100644
index 000000000..558c154ce
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/multiple-sent_id.conllu
@@ -0,0 +1,24 @@
+# sent_id = tanl1
+# text = LONDRA .
+1 LONDRA Londra NOUN SP _ 0 root _ _
+2 . . PUNCT FS _ 1 case _ _
+
+# sent_id = tanl2
+# text = Gas dalla statua .
+1 Gas gas NOUN S Gender=Masc|Number=Sing 0 root _ _
+2-3 dalla _ _ _ _ _ _ _ _
+2 da da ADP EA _ 1 nmod _ _
+3 la la DET RD Gender=Fem|Number=Sing 4 det _ _
+4 statua statua NOUN S Gender=Fem|Number=Sing 2 nmod _ _
+5 . . PUNCT FS _ 1 punct _ _
+
+# sent_id = tanl1
+# text = Evacuata la Tate Gallery .
+# sent_id = tanl2
+# This sentence contains two sent_ids which are also repeated.
+1 Evacuata evacuare VERB V Gender=Fem|Number=Sing 3 nmod _ _
+2 la il DET RD Gender=Fem|Number=Sing 3 det _ _
+3 Tate Tate NOUN SP _ 0 root _ _
+4 Gallery Gallery NOUN SP _ 3 fixed _ _
+5 . . PUNCT FS _ 3 punct _ _
+
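multiple-sent_id.conllu bundles two related problems: a sentence carrying more than one sent_id comment, and sent_id values reused across sentences. The cross-sentence half of the check is plain bookkeeping; an illustrative sketch (not the validator's code):

    # Illustrative only: collect sent_id values that occur more than once.
    def repeated_sent_ids(sent_ids):
        seen, repeated = set(), []
        for sid in sent_ids:
            if sid in seen and sid not in repeated:
                repeated.append(sid)
            seen.add(sid)
        return repeated

    # repeated_sent_ids(["tanl1", "tanl2", "tanl1", "tanl2"]) -> ["tanl1", "tanl2"]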
diff --git a/validator/tests/test-cases/invalid-level1-2/multiple_roots.conllu b/validator/tests/test-cases/invalid-level1-2/multiple_roots.conllu
new file mode 100644
index 000000000..25271b116
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/multiple_roots.conllu
@@ -0,0 +1,5 @@
+# not valid: multiple roots
+# sent_id = multiple-0
+1 word word NOUN _ _ 0 nsubj _ _
+2 word word NOUN _ _ 0 root _ _
+
diff --git a/test-cases/invalid-level1-2/multiword-with-pos.conllu b/validator/tests/test-cases/invalid-level1-2/multiword-with-pos.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/multiword-with-pos.conllu
rename to validator/tests/test-cases/invalid-level1-2/multiword-with-pos.conllu
diff --git a/validator/tests/test-cases/invalid-level1-2/nan-id.conllu b/validator/tests/test-cases/invalid-level1-2/nan-id.conllu
new file mode 100644
index 000000000..74365beb5
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/nan-id.conllu
@@ -0,0 +1,10 @@
+# sent_id = nan-id1
+# text = valid
+# valid one-word sentence.
+1 valid valid NOUN SP _ 0 root _ _
+
+# sent_id = nan-id2
+# text = valid
+# not valid: word ID must be an integer
+a valid valid NOUN SP _ 0 root _ _
+
diff --git a/test-cases/invalid-level1-2/no-sent_id.conllu b/validator/tests/test-cases/invalid-level1-2/no-sent_id.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/no-sent_id.conllu
rename to validator/tests/test-cases/invalid-level1-2/no-sent_id.conllu
diff --git a/test-cases/invalid-level1-2/nonsequential-empty-node-id.conllu b/validator/tests/test-cases/invalid-level1-2/nonsequential-empty-node-id.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/nonsequential-empty-node-id.conllu
rename to validator/tests/test-cases/invalid-level1-2/nonsequential-empty-node-id.conllu
diff --git a/validator/tests/test-cases/invalid-level1-2/nonsequential-id.conllu b/validator/tests/test-cases/invalid-level1-2/nonsequential-id.conllu
new file mode 100644
index 000000000..6933b33bd
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/nonsequential-id.conllu
@@ -0,0 +1,6 @@
+# sent_id = nonsequential-id
+# text = valid PUNCT
+# not valid: IDs must be sequential integers (1, 2, ...)
+1 valid valid NOUN SP _ 0 root _ _
+3 PUNCT . PUNCT FS _ 1 punct _ _
+
diff --git a/validator/tests/test-cases/invalid-level1-2/out-of-bounds-range.conllu b/validator/tests/test-cases/invalid-level1-2/out-of-bounds-range.conllu
new file mode 100644
index 000000000..e7b6714fa
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/out-of-bounds-range.conllu
@@ -0,0 +1,10 @@
+# not valid: range is out of bounds
+# sent_id = out-of-bounds-range
+# text = I haven't have not a clue PUNCT
+1 I I PRON PRN Num=Sing|Per=1 2 nsubj _ _
+2 have have VERB VB Tens=Pres 0 root _ _
+3 not not ADV RB _ 2 aux _ _
+2-7 haven't _ _ _ _ _ _ _ _
+4 a a DET DT _ 5 det _ _
+5 clue clue NOUN NN Num=Sing 2 obj _ _
+6 PUNCT . PUNCT PUNCT _ 2 punct _ _
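nan-id.conllu and nonsequential-id.conllu both target the same invariant: the basic word IDs of a sentence must read exactly 1, 2, ..., n. A compact sketch of that invariant (illustrative; leading zeros such as the '01' in id-with-extra-0.conllu are rejected separately, by the wordid pattern tested later in this patch):

    # Word IDs must be the exact sequence 1..n; non-numeric IDs fail int().
    def word_ids_sequential(ids):
        try:
            return [int(i) for i in ids] == list(range(1, len(ids) + 1))
        except ValueError:   # e.g. the 'a' in nan-id.conllu
            return False

    # word_ids_sequential(['1', '3']) -> False; word_ids_sequential(['a']) -> False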
diff --git a/test-cases/invalid-level1-2/overlapping-multiword.conllu b/validator/tests/test-cases/invalid-level1-2/overlapping-multiword.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/overlapping-multiword.conllu
rename to validator/tests/test-cases/invalid-level1-2/overlapping-multiword.conllu
diff --git a/validator/tests/test-cases/invalid-level1-2/overlapping-range.conllu b/validator/tests/test-cases/invalid-level1-2/overlapping-range.conllu
new file mode 100644
index 000000000..ac3f07bc6
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/overlapping-range.conllu
@@ -0,0 +1,12 @@
+# not valid: multiword token ranges must not overlap
+# sent_id = overlapping-range
+# text = I haven't have not a clue PUNCT
+1 I I PRON PRN Num=Sing|Per=1 2 nsubj _ _
+2-3 haven't _ _ _ _ _ _ _ _
+2 have have VERB VB Tens=Pres 0 root _ _
+3-4 nota _ _ _ _ _ _ _ _
+3 not not ADV RB _ 2 aux _ _
+4 a a DET DT _ 5 det _ _
+5 clue clue NOUN NN Num=Sing 2 obj _ _
+6 PUNCT . PUNCT PUNCT _ 2 punct _ _
+
diff --git a/test-cases/invalid-level1-2/seemingly-empty-line.conllu b/validator/tests/test-cases/invalid-level1-2/seemingly-empty-line.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/seemingly-empty-line.conllu
rename to validator/tests/test-cases/invalid-level1-2/seemingly-empty-line.conllu
diff --git a/test-cases/invalid-level1-2/self-cycle-deps.conllu b/validator/tests/test-cases/invalid-level1-2/self-cycle-deps.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/self-cycle-deps.conllu
rename to validator/tests/test-cases/invalid-level1-2/self-cycle-deps.conllu
diff --git a/test-cases/invalid-level1-2/self-cycle-head.conllu b/validator/tests/test-cases/invalid-level1-2/self-cycle-head.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/self-cycle-head.conllu
rename to validator/tests/test-cases/invalid-level1-2/self-cycle-head.conllu
diff --git a/test-cases/invalid-level1-2/space-in-field.conllu b/validator/tests/test-cases/invalid-level1-2/space-in-field.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/space-in-field.conllu
rename to validator/tests/test-cases/invalid-level1-2/space-in-field.conllu
diff --git a/test-cases/invalid-level1-2/tanl-broken.conllu b/validator/tests/test-cases/invalid-level1-2/tanl-broken.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/tanl-broken.conllu
rename to validator/tests/test-cases/invalid-level1-2/tanl-broken.conllu
diff --git a/test-cases/invalid-level1-2/token_with_cols_filled.conllu b/validator/tests/test-cases/invalid-level1-2/token_with_cols_filled.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/token_with_cols_filled.conllu
rename to validator/tests/test-cases/invalid-level1-2/token_with_cols_filled.conllu
diff --git a/test-cases/invalid-level1-2/trailing-tab.conllu b/validator/tests/test-cases/invalid-level1-2/trailing-tab.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/trailing-tab.conllu
rename to validator/tests/test-cases/invalid-level1-2/trailing-tab.conllu
diff --git a/validator/tests/test-cases/invalid-level1-2/unnormalized-unicode.conllu b/validator/tests/test-cases/invalid-level1-2/unnormalized-unicode.conllu
new file mode 100644
index 000000000..bca4ef37e
--- /dev/null
+++ b/validator/tests/test-cases/invalid-level1-2/unnormalized-unicode.conllu
@@ -0,0 +1,9 @@
+# Unicode has multiple ways to encode certain glyphs. UD data must always use the canonical way.
+# For example, 'č' is one character (U+010D = LATIN SMALL LETTER C WITH CARON) and it is OK because it is the canonical encoding of this character.
+# The non-canonical way is 'č' (two characters: U+0063 = LATIN SMALL LETTER C, U+030C = COMBINING CARON).
+# On the other hand, 'с̌' (two characters: U+0441 = CYRILLIC SMALL LETTER ES, U+030C = COMBINING CARON) looks the same but it cannot be normalized to a single character because it is supposed to be Cyrillic.
+# In the example below, we use the Cyrillic+Combining sequence in FORM and Latin+Combining in LEMMA. Only the latter should be reported as an error.
+# sent_id = 1
+# text = proс̌
+1 proс̌ proč ADV _ _ 0 root _ _
+
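The comments in unnormalized-unicode.conllu describe exactly the NFC criterion: a string is canonically encoded iff Unicode NFC normalization leaves it unchanged. A minimal check using the standard library:

    import unicodedata

    # A FORM/LEMMA value is canonical iff NFC does not change it.
    def is_nfc(text):
        return unicodedata.normalize("NFC", text) == text

    # Latin 'c' + U+030C normalizes to the single character U+010D, so
    # is_nfc() is False for it; Cyrillic 'с' + U+030C has no precomposed
    # form and passes unchanged, which is why only the LEMMA in the
    # fixture should be reported.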
diff --git a/test-cases/invalid-level1-2/uppercase-deprel.conllu b/validator/tests/test-cases/invalid-level1-2/uppercase-deprel.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/uppercase-deprel.conllu
rename to validator/tests/test-cases/invalid-level1-2/uppercase-deprel.conllu
diff --git a/test-cases/invalid-level1-2/uppercase-deps-deprel.conllu b/validator/tests/test-cases/invalid-level1-2/uppercase-deps-deprel.conllu
similarity index 100%
rename from test-cases/invalid-level1-2/uppercase-deps-deprel.conllu
rename to validator/tests/test-cases/invalid-level1-2/uppercase-deps-deprel.conllu
diff --git a/test-cases/invalid-level3/non-proj.conllu b/validator/tests/test-cases/invalid-level3/non-proj.conllu
similarity index 100%
rename from test-cases/invalid-level3/non-proj.conllu
rename to validator/tests/test-cases/invalid-level3/non-proj.conllu
diff --git a/test-cases/invalid-level3/rel-upos.conllu b/validator/tests/test-cases/invalid-level3/rel-upos.conllu
similarity index 100%
rename from test-cases/invalid-level3/rel-upos.conllu
rename to validator/tests/test-cases/invalid-level3/rel-upos.conllu
diff --git a/test-cases/invalid-level3/right-to-left.conllu b/validator/tests/test-cases/invalid-level3/right-to-left.conllu
similarity index 100%
rename from test-cases/invalid-level3/right-to-left.conllu
rename to validator/tests/test-cases/invalid-level3/right-to-left.conllu
diff --git a/test-cases/invalid-level3/too-many-subjects.conllu b/validator/tests/test-cases/invalid-level3/too-many-subjects.conllu
similarity index 100%
rename from test-cases/invalid-level3/too-many-subjects.conllu
rename to validator/tests/test-cases/invalid-level3/too-many-subjects.conllu
diff --git a/test-cases/invalid-level4-5/cs_pud-ud-test.conllu b/validator/tests/test-cases/invalid-level4-5/cs_pud-ud-test.conllu
similarity index 100%
rename from test-cases/invalid-level4-5/cs_pud-ud-test.conllu
rename to validator/tests/test-cases/invalid-level4-5/cs_pud-ud-test.conllu
diff --git a/test-cases/test.bat b/validator/tests/test-cases/test.bat
similarity index 100%
rename from test-cases/test.bat
rename to validator/tests/test-cases/test.bat
diff --git a/validator/tests/test-cases/valid/empty-file.conllu b/validator/tests/test-cases/valid/empty-file.conllu
new file mode 100644
index 000000000..e69de29bb
diff --git a/test-cases/valid/empty-nodes.conllu b/validator/tests/test-cases/valid/empty-nodes.conllu
similarity index 100%
rename from test-cases/valid/empty-nodes.conllu
rename to validator/tests/test-cases/valid/empty-nodes.conllu
diff --git a/test-cases/valid/layered-features.conllu b/validator/tests/test-cases/valid/layered-features.conllu
similarity index 100%
rename from test-cases/valid/layered-features.conllu
rename to validator/tests/test-cases/valid/layered-features.conllu
diff --git a/test-cases/valid/maximal-empty-node.conllu b/validator/tests/test-cases/valid/maximal-empty-node.conllu
similarity index 100%
rename from test-cases/valid/maximal-empty-node.conllu
rename to validator/tests/test-cases/valid/maximal-empty-node.conllu
diff --git a/test-cases/valid/minimal-empty-node.conllu b/validator/tests/test-cases/valid/minimal-empty-node.conllu
similarity index 100%
rename from test-cases/valid/minimal-empty-node.conllu
rename to validator/tests/test-cases/valid/minimal-empty-node.conllu
diff --git a/test-cases/valid/multiple-features.conllu b/validator/tests/test-cases/valid/multiple-features.conllu
similarity index 100%
rename from test-cases/valid/multiple-features.conllu
rename to validator/tests/test-cases/valid/multiple-features.conllu
diff --git a/test-cases/valid/tanl.conllu b/validator/tests/test-cases/valid/tanl.conllu
similarity index 100%
rename from test-cases/valid/tanl.conllu
rename to validator/tests/test-cases/valid/tanl.conllu
diff --git a/test-cases/valid/whitespace.conllu b/validator/tests/test-cases/valid/whitespace.conllu
similarity index 100%
rename from test-cases/valid/whitespace.conllu
rename to validator/tests/test-cases/valid/whitespace.conllu
diff --git a/validator/tests/test_cases.py b/validator/tests/test_cases.py
new file mode 100644
index 000000000..2e79645d8
--- /dev/null
+++ b/validator/tests/test_cases.py
@@ -0,0 +1,45 @@
+import os
+import argparse
+from validator import cli
+from validator.utils import THIS_DIR
+
+TEST_CASES_DIR = os.path.join(os.path.dirname(os.path.realpath(os.path.abspath(__file__))), "test-cases")
+
+CONFIG = {
+    "quiet": False,
+    "max_err": 20,
+    "single_root": True,
+    "check_tree_text": True,
+    "check_space_after": True,
+    "check_coref": False,
+    "data_folder": os.path.normpath(os.path.join(THIS_DIR, "../../../data")),
+    "format": "LOG",
+    "dest": "-",
+    "explanations": False,
+    "lines_content": False,
+    "config_file": os.path.realpath(os.path.join(THIS_DIR, "../../docs/example_working.yaml"))
+}
+
+def general_test_cases(folder_name, expected_value, level=1):
+    CONFIG["level"] = level
+    cases_dir = os.path.join(TEST_CASES_DIR, folder_name)
+    for case in os.listdir(cases_dir):
+        case_path = os.path.join(cases_dir, case)
+        if level > 3:
+            CONFIG["lang"] = case.split("_")[0]
+        else:
+            CONFIG["lang"] = "ud"
+        CONFIG["input"] = [case_path]
+        assert cli._validate(argparse.Namespace(**CONFIG)) == expected_value
+
+def test_valid_cases():
+    general_test_cases("valid", 0, level=2)
+
+def test_invalid_lv12_cases():
+    general_test_cases("invalid-level1-2", 1, level=2)
+
+#def test_invalid_lv3_cases():
+#    general_test_cases("invalid-level3", 1, level=3)
+#
+#def test_invalid_lv45_cases():
+#    general_test_cases("invalid-level4-5", 1, level=5)
diff --git a/validator/tests/test_checks.py b/validator/tests/test_checks.py
new file mode 100644
index 000000000..e69de29bb
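One hedged note on test_cases.py above: general_test_cases mutates the shared module-level CONFIG dict, so the individual tests are not fully isolated from one another. A defensive variant would work on a per-call copy (a sketch only; behavior is otherwise identical to the code above):

    # Sketch: build a per-call copy of CONFIG instead of mutating the shared dict.
    def general_test_cases(folder_name, expected_value, level=1):
        cases_dir = os.path.join(TEST_CASES_DIR, folder_name)
        for case in os.listdir(cases_dir):
            config = dict(CONFIG, level=level)
            config["lang"] = case.split("_")[0] if level > 3 else "ud"
            config["input"] = [os.path.join(cases_dir, case)]
            assert cli._validate(argparse.Namespace(**config)) == expected_value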
+ " "] + + for space in spaces_true: + match = crex.ws2.fullmatch(space) + if match: + assert True + else: + assert False + + for space in spaces_false: + match = crex.ws2.fullmatch(space) + if match: + assert False + else: + assert True + +def test_integer(): + match = crex.wordid.fullmatch('10') + if match: + assert True + else: + assert False + + match = crex.wordid.fullmatch('01') + if match: + assert False + else: + assert True + +def test_naturalnumber(): + match = crex.head.fullmatch('0') + if match: + assert True + else: + assert False + + match = crex.head.fullmatch('10') + if match: + assert True + else: + assert False + + match = crex.head.fullmatch('01') + if match: + assert False + else: + assert True + +def test_range(): + ranges_true = ["1-2", "9-10", "15-16"] + ranges_false = ["0-1", "1-0", "10-", "-10"] + + for range in ranges_true: + match = crex.mwtid.fullmatch(range) + if match: + assert True + else: + assert False + + for range in ranges_false: + match = crex.mwtid.fullmatch(range) + if match: + assert False + else: + assert True + +def test_decimal(): + decimal_true = ["0.1", "1.1", "3.10"] + decimal_false = ["1.0", "1.", ".1", "1.01"] + + for decimal in decimal_true: + match = crex.enodeid.fullmatch(decimal) + if match: + assert True + else: + assert False + + for decimal in decimal_false: + match = crex.enodeid.fullmatch(decimal) + if match: + assert False + else: + assert True + +def test_decimalwithzero(): + decimal_true = ["0", "0.1", "1.1", "3.10"] + decimal_false = ["1.0", "1.", ".1", "1.01"] + + for decimal in decimal_true: + match = crex.ehead.fullmatch(decimal) + if match: + assert True + else: + assert False + + for decimal in decimal_false: + match = crex.ehead.fullmatch(decimal) + if match: + assert False + else: + assert True + +def general_test_metadata(regex_name, meta_str, expected): + obj = getattr(crex, regex_name) + match = obj.fullmatch(meta_str) + + if (match and expected) or (match is None and not expected): + assert True + else: + assert False + +def test_newdoc(): + general_test_metadata("newdoc", "# newdoc", True) + general_test_metadata("newdoc", "# newdoc newdoc_name", True) # ! is this wanted? + general_test_metadata("newdoc", "# newdoc = newdoc_name", False) # ! is this wanted? + general_test_metadata("newdoc", "# newdoc newdoc_name ", False) + +def test_newpar(): + general_test_metadata("newpar", "# newpar", True) + general_test_metadata("newpar", "# newpar newpar_name", True) # ! is this wanted? + general_test_metadata("newpar", "# newpar = newpar_name", False) # ! is this wanted? + general_test_metadata("newpar", "# newpar newpar_name ", False) + +def test_sentid(): + general_test_metadata("sentid", "# sent_id", False) + general_test_metadata("sentid", "# sent_id id_sentence", False) + general_test_metadata("sentid", "# sent_id = new sent id", False) + general_test_metadata("sentid", "# sent_id = new_sent_id", True) + general_test_metadata("sentid", "# sent_id = 9", True) + +def test_text(): + general_test_metadata("text", "# text", False) + general_test_metadata("text", "# text Mary had a little lamb", False) + general_test_metadata("text", "# text = Mary had a little lamb", True) + general_test_metadata("text", "# text = Mary had a little lamb ", False) # ! isn't this too strict? 
diff --git a/validator/tests/test_utils.py b/validator/tests/test_utils.py
new file mode 100644
index 000000000..bb6e38183
--- /dev/null
+++ b/validator/tests/test_utils.py
@@ -0,0 +1,52 @@
+import os
+from validator.utils import *
+from udapi.core.node import Node
+
+TEST_CASES_DIR = os.path.join(os.path.dirname(os.path.realpath(os.path.abspath(__file__))), "test-cases")
+
+def test_parse_empty_node_id():
+    empty_node = ["1.2", "_", "_", "_", "_", "_", "_", "_", "_", "_"]
+    # TODO: update after removing assert in parse_empty_nodes_id
+    assert parse_empty_node_id(empty_node) == ("1", "2")
+
+def test_shorten():
+    short_str = "This is a short string"
+    long_str = "This is a string w more than twenty-five characters"
+    assert shorten(short_str) == short_str
+    assert len(shorten(long_str)) == 25
+
+def test_lspec2ud():
+    assert lspec2ud("nmod") == "nmod"
+    assert lspec2ud("nmod:poss") == "nmod"
+
+def test_formtl():
+    form = "ኧሁ"
+    tl = "'ăhu"
+    node_wo_tl = Node(0, form=form)
+    node_w_tl = Node(0, form=form, misc="Translit={}".format(tl))
+    assert formtl(node_wo_tl) == form
+    assert formtl(node_w_tl) == "{} {}".format(form, tl)
+
+def test_lemmatl():
+    lemma = "እኔ"
+    tl = "'əne"
+    node_wo_tl = Node(0, lemma=lemma)
+    node_w_tl = Node(0, lemma=lemma, misc="LTranslit={}".format(tl))
+    assert lemmatl(node_wo_tl) == lemma
+    assert lemmatl(node_w_tl) == "{} {}".format(lemma, tl)
+
+def test_get_alt_language():
+    lang = "en"
+    node_wo_lang = Node(0)
+    node_w_lang = Node(0, misc="Lang={}".format(lang))
+    assert get_alt_language(node_wo_lang) is None
+    assert get_alt_language(node_w_lang) == lang
+
+def test_deps_list():
+    line_wo_deps = ["_", "_", "_", "_", "_", "_", "_", "_", "_", "_"]
+    line_w_deps = ["_", "_", "_", "_", "_", "_", "_", "_", "0:root|2:conj", "_"]
+    assert deps_list(line_wo_deps) == []
+    assert deps_list(line_w_deps) == [["0", "root"], ["2", "conj"]]
+
+def test_get_line_numbers_for_ids():
+    pass  # TODO: only if State class changes
\ No newline at end of file
diff --git a/validator/tests/test_validate.py b/validator/tests/test_validate.py
new file mode 100644
index 000000000..522682f69
--- /dev/null
+++ b/validator/tests/test_validate.py
@@ -0,0 +1,53 @@
+import os
+
+from validator.validate import *
+from validator.specifications import UDSpecs
+from validator.utils import THIS_DIR
+
+Fspecs = UDSpecs(os.path.normpath(os.path.join(THIS_DIR, "../../../data")))
+
+def test_mwt_empty_vals():
+    assert check_mwt_empty_vals(['2-3','_','_','_','_','_','_','_','_','_']) == []
+    assert check_mwt_empty_vals(['2-3','_','_','_','_','_','_','_','_','Feat=Val']) == []
+    assert check_mwt_empty_vals(['2-3','_','_','_','_','Typo=Yes','_','_','_','Feat=Val']) == []
+
+    assert len(check_mwt_empty_vals(['2','_','_','_','_','_','_','_','_','Feat=Val'])) > 0
+    assert len(check_mwt_empty_vals(['2-3','_','_','_','_','Gender=Masc','_','_','_','Feat=Val'])) > 0
+    assert len(check_mwt_empty_vals(['2-3','_','_','ADJ','_','_','_','_','_','Feat=Val'])) > 0
+
+def test_empty_node_empty_vals():
+    assert len(check_empty_node_empty_vals(['1','_','_','_','_','_','_','_','_','_'])) == 1
+    assert len(check_empty_node_empty_vals(['2-3','_','_','_','_','_','_','_','_','Feat=Val'])) == 1
+
+    assert len(check_empty_node_empty_vals(['1.1','word','lemma','ADJ','X','Feat=Val','_','_','_','Feat=Val'])) == 0
+    assert len(check_empty_node_empty_vals(['1.1','word','lemma','ADJ','X','Feat=Val','2','_','_','Feat=Val'])) == 1
+    assert len(check_empty_node_empty_vals(['1.1','word','lemma','ADJ','X','Feat=Val','_','root','_','Feat=Val'])) == 1
+    assert len(check_empty_node_empty_vals(['1.1','word','lemma','ADJ','X','Feat=Val','0','root','_','Feat=Val'])) == 2
+
+
+def test_character_constraints():
+    assert len(check_character_constraints(['2-3','_','_','_','_','_','_','_','_','Feat=Val'])) == 0
+    assert len(check_character_constraints(['1.1','_','_','_','_','_','_','_','_','Feat=Val'])) == 0
+
+    assert len(check_character_constraints(['1','_','_','_','_','_','_','1','_','Feat=Val'])) == 1
+    assert len(check_character_constraints(['1','_','_','_','_','_','_','root','3','Feat=Val'])) == 1
+    assert len(check_character_constraints(['1','_','_','_','_','_','_','1','3','Feat=Val'])) == 2
+    assert len(check_character_constraints(['1','_','_','_','_','_','_','_','3','Feat=Val'])) == 2
+
+def test_upos():
+    assert len(check_upos(['2-3','_','_','_','_','_','_','_','_','Feat=Val'], Fspecs)) == 0
+    assert len(check_upos(['1.1','_','_','_','_','_','_','_','_','Feat=Val'], Fspecs)) == 0
+    assert len(check_upos(['1','_','_','adj','_','_','_','_','_','Feat=Val'], Fspecs)) == 1
+    assert len(check_upos(['1','_','_','ADJ','_','_','_','_','_','Feat=Val'], Fspecs)) == 0
+    assert len(check_upos(['1','_','_','PROP','_','_','_','_','_','Feat=Val'], Fspecs)) == 1
+
+def test_features_level2():
+    assert len(check_features_level2(['1','_','_','_','_','_','_','_','_','_'])) == 0
+    assert len(check_features_level2(['1','_','_','_','_','A=1|B=2','_','_','_','_'])) == 0
+    assert len(check_features_level2(['1','_','_','_','_','A=No,Yes|B=2','_','_','_','_'])) == 0
+    assert len(check_features_level2(['1','_','_','_','_','B=1|A=2','_','_','_','_'])) == 1
+    assert len(check_features_level2(['1','_','_','_','_','A=1|B=2|B=2','_','_','_','_'])) == 1
+    # TODO: this should raise 2 errors, not 1
+    assert len(check_features_level2(['1','_','_','_','_','A=1|B=2|B=3','_','_','_','_'])) == 1
+    assert len(check_features_level2(['1','_','_','_','_','A=Yes,No','_','_','_','_'])) == 1
+    assert len(check_features_level2(['1','_','_','_','_','A=Yes,Yes','_','_','_','_'])) == 1
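Regarding the TODO in test_features_level2 (A=1|B=2|B=3 currently yields one error where two are expected): one counting rule consistent with both the passing assert for A=1|B=2|B=2 and the TODO's expectation would treat a repeated attribute as one error and a conflicting value for that attribute as a second. A sketch of that rule (hypothetical; not the current implementation):

    # Hypothetical counting: 'B=2|B=2' -> 1 error (repeated attribute),
    # 'B=2|B=3' -> 2 errors (repeated attribute + conflicting value).
    def repeated_feature_errors(feats):
        values, errors = {}, 0
        for fv in feats.split("|"):
            name, value = fv.split("=", 1)
            if name in values:
                errors += 1                  # repeated attribute
                if value not in values[name]:
                    errors += 1              # conflicting value
                values[name].append(value)
            else:
                values[name] = [value]
        return errors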