14,205 changes: 0 additions & 14,205 deletions machine/corpora/BiblicalTermsPt.xml

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion machine/corpora/__init__.py
@@ -9,6 +9,7 @@
from .dictionary_text_corpus import DictionaryTextCorpus
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .file_paratext_project_terms_parser import FileParatextProjectTermsParser
from .file_paratext_project_text_updater import FileParatextProjectTextUpdater
from .file_paratext_project_versification_error_detector import FileParatextProjectVersificationErrorDetector
from .flatten import flatten
@@ -25,7 +26,7 @@
from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase
from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
from .paratext_text_corpus import ParatextTextCorpus
@@ -109,10 +110,12 @@
"extract_scripture_corpus",
"FileParatextProjectFileHandler",
"FileParatextProjectSettingsParser",
"FileParatextProjectTermsParser",
"FileParatextProjectTextUpdater",
"FileParatextProjectVersificationErrorDetector",
"flatten",
"is_scripture",
"KeyTerm",
"lowercase",
"MemoryAlignmentCollection",
"MemoryStreamContainer",
11 changes: 11 additions & 0 deletions machine/corpora/file_paratext_project_terms_parser.py
@@ -0,0 +1,11 @@
from ..utils.typeshed import StrPath
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .paratext_project_terms_parser_base import ParatextProjectTermsParserBase


class FileParatextProjectTermsParser(ParatextProjectTermsParserBase):
def __init__(self, project_dir: StrPath) -> None:
super().__init__(
FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse()
)
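
Note (not part of the diff): a minimal usage sketch of the new file-based terms parser. The project directory name is hypothetical; the `parse` signature and the `KeyTerm` fields are taken from this PR.

```python
from machine.corpora import FileParatextProjectTermsParser

# Hypothetical Paratext project directory containing Settings.xml and TermRenderings.xml.
parser = FileParatextProjectTermsParser("MyProject")

# Restrict to the "PN" (proper-name) category and fall back to localized
# glosses for terms that have no renderings in the project.
key_terms = parser.parse(term_categories=["PN"], use_term_glosses=True)
for term in key_terms:
    print(term.id, term.category, term.domain, term.renderings)
```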
14 changes: 14 additions & 0 deletions machine/corpora/key_term_row.py
@@ -0,0 +1,14 @@
from dataclasses import dataclass
from typing import List

from ..scripture.verse_ref import VerseRef


@dataclass(frozen=True)
class KeyTerm:
id: str
category: str
domain: str
renderings: List[str]
references: List[VerseRef]
renderings_patterns: List[str]
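
Note (not part of the diff): a sketch of how the new frozen dataclass might be populated. All field values are made up; `1011026` is the BBBCCCVVV encoding of GEN 11:26, matching the `VerseRef.from_bbbcccvvv` call used later in this PR.

```python
from machine.corpora import KeyTerm
from machine.scripture import VerseRef

# Illustrative term only; renderings_patterns keeps the "*" wildcard,
# while renderings holds the cleaned forms.
term = KeyTerm(
    id="Abraham",
    category="PN",
    domain="person",
    renderings=["Abraham"],
    references=[VerseRef.from_bbbcccvvv(1011026)],  # GEN 11:26
    renderings_patterns=["Abraham*"],
)
print(term.renderings, [str(ref) for ref in term.references])
```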
7 changes: 4 additions & 3 deletions machine/corpora/paratext_backup_terms_corpus.py
@@ -1,8 +1,9 @@
from typing import List, Sequence, Tuple
from typing import Sequence
from zipfile import ZipFile

from ..utils.typeshed import StrPath
from .dictionary_text_corpus import DictionaryTextCorpus
from .key_term_row import KeyTerm
from .memory_text import MemoryText
from .text_row import TextRow
from .zip_paratext_project_settings_parser import ZipParatextProjectSettingsParser
@@ -15,7 +16,7 @@ def __init__(self, filename: StrPath, term_categories: Sequence[str], use_term_g

with ZipFile(filename, "r") as archive:
settings = ZipParatextProjectSettingsParser(archive).parse()
glosses: List[Tuple[str, List[str]]] = ZipParatextProjectTermsParser(archive, settings).parse(
key_terms: Sequence[KeyTerm] = ZipParatextProjectTermsParser(archive, settings).parse(
term_categories, use_term_glosses
)
text_id = (
@@ -24,5 +25,5 @@ def __init__(self, filename: StrPath, term_categories: Sequence[str], use_term_g
f"{settings.biblical_terms_file_name}"
)

text = MemoryText(text_id, [TextRow(text_id, kvp[0], kvp[1]) for kvp in glosses])
text = MemoryText(text_id, [TextRow(text_id, key_term.id, key_term.renderings) for key_term in key_terms])
self._add_text(text)
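
Note (not part of the diff): a sketch of building the terms corpus from a project backup, assuming the standard `TextCorpus.get_rows` iteration. The zip file name is hypothetical; the constructor arguments mirror the `__init__` shown above.

```python
from machine.corpora import ParatextBackupTermsCorpus

# Hypothetical backup zip; an empty category list keeps all term categories.
corpus = ParatextBackupTermsCorpus("MyProject.zip", term_categories=[], use_term_glosses=True)

# After this change, each row's ref is the KeyTerm id and its segment holds the renderings.
for row in corpus.get_rows():
    print(row.ref, row.segment)
```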
94 changes: 74 additions & 20 deletions machine/corpora/paratext_project_terms_parser_base.py
@@ -7,6 +7,9 @@
from typing import Dict, List, Optional, Sequence, Tuple, Union
from xml.etree import ElementTree

from ..scripture.constants import ORIGINAL_VERSIFICATION
from ..scripture.verse_ref import VerseRef
from .key_term_row import KeyTerm
from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
@@ -37,21 +40,27 @@ def __init__(
else:
self._settings = settings

def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -> List[Tuple[str, List[str]]]:
def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -> Sequence[KeyTerm]:
biblical_terms_doc = None
if self._settings.biblical_terms_list_type == "Project":
if self._paratext_project_file_handler.exists(self._settings.biblical_terms_file_name):
with self._paratext_project_file_handler.open(self._settings.biblical_terms_file_name) as stream:
biblical_terms_doc = ElementTree.parse(stream)
term_id_to_category_dict = _get_category_per_id(biblical_terms_doc)
term_id_to_category_dict = {}
term_id_to_domain_dict = {}
term_id_to_references_dict = {}
if self._settings.biblical_terms_list_type == "Project" and self._paratext_project_file_handler.exists(
self._settings.biblical_terms_file_name
):
with self._paratext_project_file_handler.open(self._settings.biblical_terms_file_name) as stream:
biblical_terms_doc = ElementTree.parse(stream)
term_id_to_category_dict, term_id_to_domain_dict, term_id_to_references_dict = _get_term_data(
biblical_terms_doc
)
elif self._settings.biblical_terms_list_type in _PREDEFINED_TERMS_LIST_TYPES:
with open_binary(
_SUPPORTED_LANGUAGE_TERMS_LOCALIZATION_XMLS_PACKAGE, self._settings.biblical_terms_file_name
) as stream:
biblical_terms_doc = ElementTree.parse(stream)
term_id_to_category_dict = _get_category_per_id(biblical_terms_doc)
else:
term_id_to_category_dict = {}
term_id_to_category_dict, term_id_to_domain_dict, term_id_to_references_dict = _get_term_data(
biblical_terms_doc
)

terms_glosses_doc: Optional[ElementTree.ElementTree[ElementTree.Element]] = None
resource_name = None
@@ -74,15 +83,18 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
if term_renderings_doc is not None:
for term in term_renderings_doc.findall(".//TermRendering"):
id = term.attrib["Id"]
if _is_in_category(id, term_categories, term_id_to_category_dict):
if (
_is_in_category(id, term_categories, term_id_to_category_dict)
and term.attrib.get("Guess", "false") == "false"
):
id_ = id.replace("\n", "&#xA")
renderings_element = term.find("Renderings")
rendering_text = (
renderings_element.text
if renderings_element is not None and renderings_element.text is not None
else ""
)
renderings = _get_renderings(rendering_text)
renderings = _get_renderings_with_pattern(rendering_text)
terms_renderings[id_].extend(renderings)

terms_glosses: Dict[str, List[str]] = defaultdict(list)
@@ -95,15 +107,38 @@ def parse(self, term_categories: Sequence[str], use_term_glosses: bool = True) -
glosses = _get_glosses(gloss)
terms_glosses[id_].extend(glosses)
if terms_glosses or terms_renderings:
combined = {**terms_renderings, **{k: v for k, v in terms_glosses.items() if k not in terms_renderings}}
return [(key, list(value)) for key, value in combined.items()]

terms: List[KeyTerm] = []
for id in sorted(set(terms_renderings.keys()).union(terms_glosses.keys())):
renderings_patterns = terms_renderings.get(id, [])
category = term_id_to_category_dict.get(id, "?")
domain = term_id_to_domain_dict.get(id, "?")
glosses = terms_glosses.get(id, [])
references = term_id_to_references_dict.get(id, [])
renderings = [r.replace("*", "") for r in renderings_patterns]
if len(renderings) == 0:
if len(glosses) == 0:
continue
renderings = glosses
term = KeyTerm(
id=id,
category=category,
domain=domain,
renderings=renderings,
references=references,
renderings_patterns=renderings_patterns,
)
terms.append(term)
return terms
return []


def _is_in_category(id: str, term_categories: Sequence[str], term_id_to_category_dict: Dict[str, str]) -> bool:
category = term_id_to_category_dict.get(id)
return not term_categories or (category is not None and category in term_categories)
return (
not term_categories
or (category is not None and category in term_categories)
or (len(term_id_to_category_dict) == 0)
)


def _clean_term(term: str):
@@ -127,9 +162,9 @@ def _get_glosses(gloss: str) -> List[str]:
return glosses


def _get_renderings(rendering: str) -> List[str]:
def _get_renderings_with_pattern(rendering: str):
renderings = re.split(r"\|\|", rendering.strip())
renderings = [_clean_term(rendering).strip().replace("*", "") for rendering in renderings]
renderings = [_clean_term(rendering).strip() for rendering in renderings]
return [rendering for rendering in renderings if rendering]


@@ -150,8 +185,12 @@ def _strip_parens(term_string: str, left: str = "(", right: str = ")") -> str:
return term_string


def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree[ElementTree.Element]) -> Dict[str, str]:
def _get_term_data(
biblical_terms_doc: ElementTree.ElementTree[ElementTree.Element],
) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, List[VerseRef]]]:
term_id_to_category_dict: Dict[str, str] = {}
term_id_to_domain_dict: Dict[str, str] = {}
term_id_to_references_dict: Dict[str, List[VerseRef]] = {}

for term in biblical_terms_doc.findall(".//Term"):
term_id = term.attrib["Id"]
@@ -160,5 +199,20 @@ def _get_category_per_id(biblical_terms_doc: ElementTree.ElementTree[ElementTree
term_id_to_category_dict[term_id] = (
category.text if category is not None and category.text is not None else ""
)

return term_id_to_category_dict
if term_id not in term_id_to_domain_dict:
domain = term.find("Domain")
term_id_to_domain_dict[term_id] = domain.text if domain is not None and domain.text is not None else ""
if term_id not in term_id_to_references_dict:
references_element = term.find("References")
references: List[VerseRef] = []
if references_element is not None:
for verse_element in references_element.findall("Verse"):
if verse_element.text is None:
continue
bbbcccvvv = int(verse_element.text[:9])
vref = VerseRef.from_bbbcccvvv(bbbcccvvv)
vref.change_versification(ORIGINAL_VERSIFICATION)
references.append(vref)
term_id_to_references_dict[term_id] = references

return term_id_to_category_dict, term_id_to_domain_dict, term_id_to_references_dict
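
Note (not part of the diff): a small worked sketch of the behavior change around renderings. Previously `_get_renderings` stripped the `*` wildcard while splitting; now `_get_renderings_with_pattern` keeps the wildcard in `renderings_patterns`, and the asterisk is removed only when `KeyTerm.renderings` is assembled. The rendering string below is made up, and the sketch ignores the parenthesis handling done by `_clean_term`.

```python
import re

# Hypothetical TermRendering text with two renderings and one "*" wildcard.
rendering_text = "siervo* || criado"

# What _get_renderings_with_pattern now returns (wildcards preserved) ...
patterns = [p.strip() for p in re.split(r"\|\|", rendering_text.strip()) if p.strip()]
# ... and what ends up in KeyTerm.renderings (wildcards removed).
renderings = [p.replace("*", "") for p in patterns]

assert patterns == ["siervo*", "criado"]
assert renderings == ["siervo", "criado"]
```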
Original file line number Diff line number Diff line change
@@ -23,7 +23,7 @@ def get_usfm_versification_errors(
self,
handler: Optional[UsfmVersificationErrorDetector] = None,
) -> List[UsfmVersificationError]:
handler = handler or UsfmVersificationErrorDetector(self._settings.versification)
handler = handler or UsfmVersificationErrorDetector(self._settings)
for file_name in self._settings.get_all_scripture_book_file_names():
if not self._paratext_project_file_handler.exists(file_name):
continue
44 changes: 31 additions & 13 deletions machine/corpora/usfm_versification_error_detector.py
@@ -3,7 +3,8 @@

from machine.scripture import canon

from ..scripture.verse_ref import ValidStatus, VerseRef, Versification
from ..scripture.verse_ref import ValidStatus, VerseRef
from .paratext_project_settings import ParatextProjectSettings
from .usfm_parser_handler import UsfmParserHandler
from .usfm_parser_state import UsfmParserState

@@ -25,6 +26,7 @@ def __init__(
expected_verse: int,
actual_chapter: int,
actual_verse: int,
project_name: str,
verse_ref: Optional[VerseRef] = None,
):
self._book_num = book_num
@@ -34,11 +36,16 @@
self._actual_verse = actual_verse
self._verse_ref = verse_ref
self._type: UsfmVersificationErrorType
self._project_name = project_name

@property
def type(self) -> UsfmVersificationErrorType:
return self._type

@property
def project_name(self) -> str:
return self._project_name

def check_error(self) -> bool:
"""Returns true if there is an error"""
if self._expected_chapter > self._actual_chapter and self._expected_verse != 0:
@@ -71,15 +78,15 @@ def map(valid_status: ValidStatus) -> UsfmVersificationErrorType:

@property
def expected_verse_ref(self) -> str:
if self._type == UsfmVersificationErrorType.EXTRA_VERSE:
return ""
if (
default_verse_ref := VerseRef.try_from_string(
f"{self._book_num} {self._expected_chapter}:{self._expected_verse}"
f"{canon.book_number_to_id(self._book_num)} {self._expected_chapter}:{self._expected_verse}"
)
is None
):
return ""
if self._type == UsfmVersificationErrorType.EXTRA_VERSE:
return ""
return self.default_verse(self._expected_chapter, self._expected_verse)
if self._type == UsfmVersificationErrorType.MISSING_VERSE_SEGMENT:
if (
verse_ref_with_segment := VerseRef.try_from_string(
@@ -96,7 +103,7 @@ def expected_verse_ref(self) -> str:
return str(first_verse)
elif (
corrected_verse_range_ref := VerseRef.try_from_string(
f"{self._book_num} {self._expected_chapter}:{first_verse}-{last_verse}"
f"{canon.book_number_to_id(self._book_num)} {self._expected_chapter}:{first_verse}-{last_verse}"
)
is not None
):
@@ -105,16 +112,23 @@

@property
def actual_verse_ref(self) -> str:
return (
str(self._verse_ref)
if self._verse_ref is not None
else str(VerseRef(self._book_num, self._actual_chapter, self._actual_verse))
)
if self._verse_ref is not None:
return str(self._verse_ref)
if actual_verse_ref := VerseRef.try_from_string(
f"{self._book_num} {self._actual_chapter}:{self._actual_verse}"
):
return str(actual_verse_ref)
return self.default_verse(self._actual_chapter, self._actual_verse)

def default_verse(self, chapter: int, verse: int):
verse_string = "" if self._actual_verse == -1 else str(verse)
return f"{canon.book_number_to_id(self._book_num)} {chapter}:{verse_string}"


class UsfmVersificationErrorDetector(UsfmParserHandler):
def __init__(self, versification: Versification):
self._versification = versification
def __init__(self, settings: ParatextProjectSettings):
self._project_name = settings.name
self._versification = settings.versification
self._current_book = 0
self._current_chapter = 0
self._current_verse = VerseRef()
@@ -134,6 +148,7 @@ def end_usfm(self, state: UsfmParserState) -> None:
),
self._current_chapter,
list(self._current_verse.all_verses())[-1].verse_num,
self._project_name,
)
if versification_error.check_error():
self._errors.append(versification_error)
@@ -153,6 +168,7 @@ def chapter(
self._versification.get_last_verse(self._current_book, self._current_chapter),
self._current_chapter,
list(self._current_verse.all_verses())[-1].verse_num,
self._project_name,
)
if versification_error.check_error():
self._errors.append(versification_error)
@@ -167,6 +183,8 @@ def verse(
list(self._current_verse.all_verses())[-1].verse_num,
self._current_chapter,
list(self._current_verse.all_verses())[-1].verse_num,
self._project_name,
self._current_verse,
)
if versification_error.check_error():
self._errors.append(versification_error)
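
Note (not part of the diff): a sketch of driving the updated error detection end to end. The project path is hypothetical, and I am assuming `FileParatextProjectVersificationErrorDetector` takes a project directory like the other `File*` helpers; the error properties (`project_name`, `type`, `expected_verse_ref`, `actual_verse_ref`) come from this change.

```python
from machine.corpora import FileParatextProjectVersificationErrorDetector

# Hypothetical project directory (assumed constructor argument).
detector = FileParatextProjectVersificationErrorDetector("MyProject")

for error in detector.get_usfm_versification_errors():
    # project_name is new in this change; the refs are formatted "BBB C:V".
    print(error.project_name, error.type, error.expected_verse_ref, "->", error.actual_verse_ref)
```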
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -32,7 +32,7 @@ reportMissingModuleSource = false

[tool.poetry]
name = "sil-machine"
version = "1.8.4"
version = "1.8.5"
description = "A natural language processing library that is focused on providing tools for resource-poor languages."
license = "MIT"
authors = ["SIL International"]