Skip to content

Commit bd6b66c

Browse files
committed
Port sillsdev/machine#362; fix gloss inclusion
1 parent 7c3c23d commit bd6b66c

File tree

6 files changed

+57
-26
lines changed

6 files changed

+57
-26
lines changed

machine/corpora/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from .paratext_project_file_handler import ParatextProjectFileHandler
2727
from .paratext_project_settings import ParatextProjectSettings
2828
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
29-
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
29+
from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
3030
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
3131
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
3232
from .paratext_text_corpus import ParatextTextCorpus
@@ -115,6 +115,7 @@
115115
"FileParatextProjectVersificationErrorDetector",
116116
"flatten",
117117
"is_scripture",
118+
"KeyTerm",
118119
"lowercase",
119120
"MemoryAlignmentCollection",
120121
"MemoryStreamContainer",

machine/corpora/paratext_backup_terms_corpus.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
from typing import List, Sequence, Tuple
22
from zipfile import ZipFile
33

4-
from .key_term_row import KeyTerm
5-
64
from ..utils.typeshed import StrPath
75
from .dictionary_text_corpus import DictionaryTextCorpus
6+
from .key_term_row import KeyTerm
87
from .memory_text import MemoryText
98
from .text_row import TextRow
109
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser

machine/corpora/paratext_project_terms_parser_base.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,8 @@
88
from xml.etree import ElementTree
99

1010
from ..scripture.constants import ORIGINAL_VERSIFICATION
11-
1211
from ..scripture.verse_ref import VerseRef
13-
1412
from .key_term_row import KeyTerm
15-
1613
from .paratext_project_file_handler import ParatextProjectFileHandler
1714
from .paratext_project_settings import ParatextProjectSettings
1815
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
@@ -111,14 +108,17 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
111108
terms_glosses[id_].extend(glosses)
112109
if terms_glosses or terms_renderings:
113110
terms: List[KeyTerm] = []
114-
for id, renderings_patterns in terms_renderings.items():
111+
for id in sorted(set(terms_renderings.keys()).union(terms_glosses.keys())):
112+
renderings_patterns = terms_renderings.get(id, [])
115113
category = term_id_to_category_dict.get(id, "?")
116114
domain = term_id_to_domain_dict.get(id, "?")
117115
glosses = terms_glosses.get(id, [])
118116
references = term_id_to_references_dict.get(id, [])
119117
renderings = [r.replace("*", "") for r in renderings_patterns]
120118
if len(renderings) == 0:
121-
continue
119+
if len(glosses) == 0:
120+
continue
121+
renderings = glosses
122122
term = KeyTerm(
123123
id=id,
124124
category=category,

machine/corpora/usfm_versification_error_detector.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from machine.scripture import canon
55

66
from ..scripture.verse_ref import ValidStatus, VerseRef, Versification
7+
from .paratext_project_settings import ParatextProjectSettings
78
from .usfm_parser_handler import UsfmParserHandler
89
from .usfm_parser_state import UsfmParserState
910

@@ -25,6 +26,7 @@ def __init__(
2526
expected_verse: int,
2627
actual_chapter: int,
2728
actual_verse: int,
29+
project_name: str,
2830
verse_ref: Optional[VerseRef] = None,
2931
):
3032
self._book_num = book_num
@@ -34,11 +36,16 @@ def __init__(
3436
self._actual_verse = actual_verse
3537
self._verse_ref = verse_ref
3638
self._type: UsfmVersificationErrorType
39+
self._project_name = project_name
3740

3841
@property
3942
def type(self) -> UsfmVersificationErrorType:
4043
return self._type
4144

45+
@property
46+
def project_name(self) -> str:
47+
return self._project_name
48+
4249
def check_error(self) -> bool:
4350
"""Returns true if there is an error"""
4451
if self._expected_chapter > self._actual_chapter and self._expected_verse != 0:
@@ -71,15 +78,15 @@ def map(valid_status: ValidStatus) -> UsfmVersificationErrorType:
7178

7279
@property
7380
def expected_verse_ref(self) -> str:
81+
if self._type == UsfmVersificationErrorType.EXTRA_VERSE:
82+
return ""
7483
if (
7584
default_verse_ref := VerseRef.try_from_string(
76-
f"{self._book_num} {self._expected_chapter}:{self._expected_verse}"
85+
f"{canon.book_number_to_id(self._book_num)} {self._expected_chapter}:{self._expected_verse}"
7786
)
7887
is None
7988
):
8089
return self.default_verse(self._expected_chapter, self._expected_verse)
81-
if self._type == UsfmVersificationErrorType.EXTRA_VERSE:
82-
return ""
8390
if self._type == UsfmVersificationErrorType.MISSING_VERSE_SEGMENT:
8491
if (
8592
verse_ref_with_segment := VerseRef.try_from_string(
@@ -96,7 +103,7 @@ def expected_verse_ref(self) -> str:
96103
return str(first_verse)
97104
elif (
98105
corrected_verse_range_ref := VerseRef.try_from_string(
99-
f"{self._book_num} {self._expected_chapter}:{first_verse}-{last_verse}"
106+
f"{canon.book_number_to_id(self._book_num)} {self._expected_chapter}:{first_verse}-{last_verse}"
100107
)
101108
is not None
102109
):
@@ -119,8 +126,9 @@ def default_verse(self, chapter: int, verse: int):
119126

120127

121128
class UsfmVersificationErrorDetector(UsfmParserHandler):
122-
def __init__(self, versification: Versification):
123-
self._versification = versification
129+
def __init__(self, settings: ParatextProjectSettings):
130+
self._project_name = settings.name
131+
self._versification = settings.versification
124132
self._current_book = 0
125133
self._current_chapter = 0
126134
self._current_verse = VerseRef()
@@ -140,6 +148,7 @@ def end_usfm(self, state: UsfmParserState) -> None:
140148
),
141149
self._current_chapter,
142150
list(self._current_verse.all_verses())[-1].verse_num,
151+
self._project_name,
143152
)
144153
if versification_error.check_error():
145154
self._errors.append(versification_error)
@@ -159,6 +168,7 @@ def chapter(
159168
self._versification.get_last_verse(self._current_book, self._current_chapter),
160169
self._current_chapter,
161170
list(self._current_verse.all_verses())[-1].verse_num,
171+
self._project_name,
162172
)
163173
if versification_error.check_error():
164174
self._errors.append(versification_error)
@@ -173,6 +183,8 @@ def verse(
173183
list(self._current_verse.all_verses())[-1].verse_num,
174184
self._current_chapter,
175185
list(self._current_verse.all_verses())[-1].verse_num,
186+
self._project_name,
187+
self._current_verse,
176188
)
177189
if versification_error.check_error():
178190
self._errors.append(versification_error)

tests/corpora/test_paratext_project_terms_parser.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
from typing import Dict, List, Optional, Sequence, Tuple
22

3-
from ...machine.corpora.key_term_row import KeyTerm
43
from testutils.memory_paratext_project_file_handler import DefaultParatextProjectSettings
54
from testutils.memory_paratext_project_terms_parser import MemoryParatextProjectTermsParser
65

7-
from machine.corpora import ParatextProjectSettings, ParatextProjectTermsParserBase
8-
from machine.corpora.paratext_project_terms_parser_base import _get_glosses, _get_renderings, _strip_parens
6+
from machine.corpora import KeyTerm, ParatextProjectSettings, ParatextProjectTermsParserBase
7+
from machine.corpora.paratext_project_terms_parser_base import _get_glosses, _get_renderings_with_pattern, _strip_parens
98

109

1110
def test_get_key_terms_from_terms_renderings() -> None:
@@ -48,7 +47,7 @@ def test_get_key_terms_from_terms_localizations_no_term_renderings() -> None:
4847
assert len(terms) == 5726
4948

5049
glosses = terms[0].renderings
51-
assert str.join(" ", glosses) == "Abagtha"
50+
assert str.join(" ", glosses) == "Aaron"
5251

5352

5453
def test_get_key_terms_from_terms_localizations_no_term_renderings_do_not_use_term_glosses() -> None:
@@ -97,8 +96,8 @@ def test_get_key_terms_from_terms_localizations_term_renderings_exists_prefer_lo
9796

9897
terms_index1_glosses = terms[1].renderings
9998
terms_index2_glosses = terms[2].renderings
100-
assert str.join(" ", terms_index1_glosses) == "Abagtha"
101-
assert str.join(" ", terms_index2_glosses) == "Abi"
99+
assert str.join(" ", terms_index1_glosses) == "Obadiah"
100+
assert str.join(" ", terms_index2_glosses) == "Abagtha"
102101

103102

104103
def test_strip_parens() -> None:
@@ -116,11 +115,11 @@ def test_get_glosses() -> None:
116115

117116

118117
def test_get_renderings() -> None:
119-
assert _get_renderings("") == []
120-
assert _get_renderings("*Abba*") == ["Abba"]
121-
assert _get_renderings("Abba|| ") == ["Abba"]
122-
assert _get_renderings("Abba||Abbah") == ["Abba", "Abbah"]
123-
assert _get_renderings("Abba (note)") == ["Abba"]
118+
assert _get_renderings_with_pattern("") == []
119+
assert _get_renderings_with_pattern("*Abba*") == ["*Abba*"]
120+
assert _get_renderings_with_pattern("Abba|| ") == ["Abba"]
121+
assert _get_renderings_with_pattern("Abba||Abbah") == ["Abba", "Abbah"]
122+
assert _get_renderings_with_pattern("Abba (note)") == ["Abba"]
124123

125124

126125
class _TestEnvironment:

tests/corpora/test_usfm_versification_error_detector.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def get_usfm_versification_errors_no_errors():
3636
assert len(env.get_usfm_versification_errors()) == 0
3737

3838

39-
def get_usfm_versification_errors_missing_verses():
39+
def get_usfm_versification_errors_missing_verse():
4040
env = _TestEnvironment(
4141
files={
4242
"653JNTest.SFM": r"""\id 3JN
@@ -61,6 +61,8 @@ def get_usfm_versification_errors_missing_verses():
6161
errors = env.get_usfm_versification_errors()
6262
assert len(errors) == 1
6363
assert errors[0].type == UsfmVersificationErrorType.MISSING_VERSE
64+
assert errors[0].expected_verse_ref == "3JN 1:15"
65+
assert errors[0].actual_verse_ref == "3JN 1:14"
6466

6567

6668
def get_usfm_versification_missing_chapter():
@@ -73,6 +75,8 @@ def get_usfm_versification_missing_chapter():
7375
errors = env.get_usfm_versification_errors()
7476
assert len(errors) == 1
7577
assert errors[0].type == UsfmVersificationErrorType.MISSING_CHAPTER
78+
assert errors[0].expected_verse_ref == "3JN 1:15"
79+
assert errors[0].actual_verse_ref == "3JN 0:0"
7680

7781

7882
def get_usfm_versification_errors_extra_verse():
@@ -102,6 +106,8 @@ def get_usfm_versification_errors_extra_verse():
102106
errors = env.get_usfm_versification_errors()
103107
assert len(errors) == 1
104108
assert errors[0].type == UsfmVersificationErrorType.EXTRA_VERSE
109+
assert errors[0].expected_verse_ref == ""
110+
assert errors[0].actual_verse_ref == "3JN 1:16"
105111

106112

107113
def get_usfm_versification_errors_invalid_verse():
@@ -129,6 +135,8 @@ def get_usfm_versification_errors_invalid_verse():
129135
errors = env.get_usfm_versification_errors()
130136
assert len(errors) == 1
131137
assert errors[0].type == UsfmVersificationErrorType.INVALID_VERSE_RANGE
138+
assert errors[0].expected_verse_ref == "3JN 1:12-13"
139+
assert errors[0].actual_verse_ref == "3JN 1:13-12"
132140

133141

134142
def get_usfm_versification_errors_extra_verse_segment():
@@ -158,6 +166,8 @@ def get_usfm_versification_errors_extra_verse_segment():
158166
errors = env.get_usfm_versification_errors()
159167
assert len(errors) == 1
160168
assert errors[0].type == UsfmVersificationErrorType.EXTRA_VERSE_SEGMENT
169+
assert errors[0].expected_verse_ref == "3JN 1:14"
170+
assert errors[0].actual_verse_ref == "3JN 1:14a"
161171

162172

163173
def get_usfm_versification_errors_missing_verse_segments():
@@ -186,6 +196,8 @@ def get_usfm_versification_errors_missing_verse_segments():
186196
errors = env.get_usfm_versification_errors()
187197
assert len(errors) == 1
188198
assert errors[0].type == UsfmVersificationErrorType.MISSING_VERSE_SEGMENT
199+
assert errors[0].expected_verse_ref == "3JN 1:13a"
200+
assert errors[0].actual_verse_ref == "3JN 1:13"
189201

190202

191203
def get_usfm_versification_errors_ignore_noncanonicals():
@@ -227,6 +239,8 @@ def get_usfm_versification_errors_excluded_in_custom_vrs():
227239
errors = env.get_usfm_versification_errors()
228240
assert len(errors) == 1
229241
assert errors[0].type == UsfmVersificationErrorType.EXTRA_VERSE
242+
assert errors[0].expected_verse_ref == ""
243+
assert errors[0].actual_verse_ref == "3JN 1:13"
230244

231245

232246
def get_usfm_versification_errors_multiple_books():
@@ -270,6 +284,8 @@ def get_usfm_versification_errors_multiple_books():
270284
errors = env.get_usfm_versification_errors()
271285
assert len(errors) == 1
272286
assert errors[0].type == UsfmVersificationErrorType.MISSING_VERSE
287+
assert errors[0].expected_verse_ref == "2JN 1:13"
288+
assert errors[0].actual_verse_ref == "2JN 1:12"
273289

274290

275291
def get_usfm_versification_errors_multiple_chapters():
@@ -298,6 +314,10 @@ def get_usfm_versification_errors_multiple_chapters():
298314
assert len(errors) == 2
299315
assert errors[0].type == UsfmVersificationErrorType.MISSING_VERSE
300316
assert errors[1].type == UsfmVersificationErrorType.EXTRA_VERSE
317+
assert errors[0].expected_verse_ref == "2JN 1:13"
318+
assert errors[0].actual_verse_ref == "2JN 1:12"
319+
assert errors[1].expected_verse_ref == ""
320+
assert errors[1].actual_verse_ref == "2JN 2:1"
301321

302322

303323
class _TestEnvironment:

0 commit comments

Comments
 (0)