Skip to content

Commit 842c533

Browse files
Re-introduced legacy APIs; fixed bugs; improved capitalization in suggestions
1 parent 23b33a5 commit 842c533

File tree

6 files changed

+148
-29
lines changed

6 files changed

+148
-29
lines changed

src/reynir_correct/__init__.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,26 @@
3838
from .annotation import Annotation
3939

4040
# Grammar checking
41-
from .checker import AnnotatedSentence, GreynirCorrect
41+
from .checker import (
42+
AnnotatedSentence,
43+
GreynirCorrect,
44+
check,
45+
check_single,
46+
check_with_stats,
47+
check_tokens,
48+
)
4249

4350
# Token-level correction
4451
from .errtokenizer import Correct_TOK, CorrectionPipeline, CorrectToken, tokenize
4552
from .readability import FleschKincaidFeedback, FleschKincaidScorer, RareWordsFinder
4653
from .settings import Settings
47-
from .wrappers import CorrectedSentence, CorrectionResult, GreynirCorrectAPI, ParseResultStats, check_errors
54+
from .wrappers import (
55+
CorrectedSentence,
56+
CorrectionResult,
57+
GreynirCorrectAPI,
58+
ParseResultStats,
59+
check_errors,
60+
)
4861

4962
__author__ = "Miðeind ehf"
5063
__copyright__ = "© 2025 Miðeind ehf."
@@ -70,6 +83,10 @@
7083
"GreynirCorrectAPI",
7184
"CorrectionResult",
7285
"CorrectedSentence",
86+
"check",
87+
"check_single",
88+
"check_with_stats",
89+
"check_tokens",
7390
"check_errors",
7491
"AnnotatedSentence",
7592
"Annotation",

src/reynir_correct/annotation.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,14 @@ def __str__(self) -> str:
8585
orig_sugg = f" | '{self._original}' -> '{self._suggest}'"
8686
else:
8787
orig_sugg = ""
88-
return "{0:03}-{1:03}: {2:6} {3}{4} | {5}".format(
88+
sugg_list = f" | {self._suggestlist}" if self._suggestlist else ""
89+
return "{0:03}-{1:03}: {2:20} {3}{4}{5}".format(
8990
self._start,
9091
self._end,
9192
self._code,
9293
self._text,
9394
orig_sugg,
94-
self._suggestlist,
95+
sugg_list,
9596
)
9697

9798
@property

src/reynir_correct/checker.py

Lines changed: 88 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,20 +61,20 @@
6161
from types import ModuleType
6262

6363
from islenska.basics import Ksnid
64-
from reynir import TOK, Greynir, Sentence, TokenList, _Job, correct_spaces
64+
from reynir import TOK, Greynir, Paragraph, Sentence, TokenList, _Job, correct_spaces
6565
from reynir.binparser import BIN_Grammar, BIN_Parser, VariantHandler
6666
from reynir.bintokenizer import StringIterable
6767
from reynir.fastparser import ffi # type: ignore
6868
from reynir.fastparser import Fast_Parser
6969
from reynir.incparser import ICELANDIC_RATIO
7070
from reynir.reducer import Reducer
71-
from reynir.reynir import Job, ProgressFunc
71+
from reynir.reynir import Job, ProgressFunc, DEFAULT_MAX_SENT_TOKENS
7272
from tokenizer import Abbreviations, Tok
7373

7474
from .settings import Settings
7575
from .annotation import Annotation
7676
from .errfinder import ErrorDetectionToken, ErrorFinder
77-
from .errtokenizer import CorrectionPipeline, CorrectToken
77+
from .errtokenizer import CorrectionPipeline, CorrectToken, settings_or_default
7878
from .pattern import PatternMatcher
7979

8080
# Style mark from BÍN:
@@ -478,3 +478,88 @@ def parse_all_tokens(self, tokens: Iterable[Tok], *, progress_func: ProgressFunc
478478
ambiguity=job.ambiguity,
479479
parse_time=job.parse_time,
480480
)
481+
482+
483+
def check_single(
    sentence_text: str, rc: Optional[GreynirCorrect] = None, **options: Any
) -> Optional[AnnotatedSentence]:
    """Check and annotate a single sentence, given as plain text.

    If no checker instance `rc` is passed in, one is created from the
    cached default settings, and the remaining keyword options are handed
    to both the correction pipeline and the checker. When `rc` is given,
    only the `max_sent_tokens` option is used.

    Returns None if no sentence was parsed.
    """
    token_limit = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
    checker = rc
    if checker is None:
        # No checker supplied: build one on top of the default settings
        default_settings = settings_or_default()
        checker = GreynirCorrect(
            default_settings,
            CorrectionPipeline("", default_settings, **options),
            **options,
        )
    parsed = checker.parse_single(sentence_text, max_sent_tokens=token_limit)
    return cast(AnnotatedSentence, parsed)
494+
495+
496+
def check_tokens(
    tokens: Iterable[CorrectToken], rc: Optional[GreynirCorrect] = None, **options: Any
) -> Optional[Sentence]:
    """Check and annotate a single sentence, given as a token list.

    If no checker instance `rc` is passed in, one is created from the
    cached default settings, and the remaining keyword options are handed
    to both the correction pipeline and the checker. When `rc` is given,
    only the `max_sent_tokens` option is used.

    Returns None if no sentence was parsed.
    """
    token_limit = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
    checker = rc
    if checker is None:
        # No checker supplied: build one on top of the default settings
        default_settings = settings_or_default()
        checker = GreynirCorrect(
            default_settings,
            CorrectionPipeline("", default_settings, **options),
            **options,
        )
    return checker.parse_tokens(tokens, max_sent_tokens=token_limit)
507+
508+
509+
def check(
    text: str, rc: Optional[GreynirCorrect] = None, **options: Any
) -> Iterable[Paragraph]:
    """Yield the checked paragraphs of the given text.

    Each paragraph is in turn an iterable of checked sentences with
    annotations. The options `split_paragraphs` (default False) and
    `max_sent_tokens` are consumed here and passed to the parse job;
    any remaining options are only used when a new checker has to be
    created because `rc` was not given.

    This is a generator function, so no work is done until the
    result is iterated.
    """
    paragraph_split = options.pop("split_paragraphs", False)
    token_limit = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
    checker = rc
    if checker is None:
        # No checker supplied: build one on top of the default settings
        default_settings = settings_or_default()
        checker = GreynirCorrect(
            default_settings,
            CorrectionPipeline("", default_settings, **options),
            **options,
        )
    # Submit an asynchronous (on-demand) parse job and relay its paragraphs
    parse_job = checker.submit(
        text,
        parse=True,
        split_paragraphs=paragraph_split,
        max_sent_tokens=token_limit,
    )
    yield from parse_job.paragraphs()
529+
530+
531+
def check_with_stats(
    text: str,
    *,
    settings: Optional[Settings] = None,
    split_paragraphs: bool = False,
    progress_func: ProgressFunc = None,
    **options: Any,
) -> CheckResult:
    """Check the given text and return the annotated sentences
    along with parsing statistics.

    Keyword arguments:
        settings: the settings to use; if None, the cached default
            settings are used.
        split_paragraphs: passed on to the parse job.
        progress_func: passed on to the parse job.
        options: `max_sent_tokens` is consumed here and passed to the
            parse job; all remaining options are handed to both the
            correction pipeline and the checker.

    Returns a CheckResult holding the list of checked sentences and the
    job's token count, sentence count, parsed-sentence count, ambiguity
    and parse time.
    """
    settings = settings_or_default(settings)
    # Note: split_paragraphs is a named keyword-only parameter, so it can
    # never show up in **options. It must be used as given; popping it from
    # options would always yield the fallback value and silently discard
    # the caller's choice.
    max_sent_tokens = options.pop("max_sent_tokens", DEFAULT_MAX_SENT_TOKENS)
    pipeline = CorrectionPipeline("", settings, **options)
    rc = GreynirCorrect(settings, pipeline, **options)
    # This is an asynchronous (on-demand) parse job
    job = rc.submit(
        text,
        parse=True,
        split_paragraphs=split_paragraphs,
        progress_func=progress_func,
        max_sent_tokens=max_sent_tokens,
    )
    # Enumerating through the job's paragraphs and sentences causes them
    # to be parsed and their statistics collected, so this must happen
    # before the statistics are read from the job below
    sentences = [cast(AnnotatedSentence, sent) for pg in job.paragraphs() for sent in pg]
    return CheckResult(
        sentences=sentences,
        num_tokens=job.num_tokens,
        num_sentences=job.num_sentences,
        num_parsed=job.num_parsed,
        ambiguity=job.ambiguity,
        parse_time=job.parse_time,
    )

src/reynir_correct/errtokenizer.py

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,9 @@ class TemplateDict(TypedDict):
217217

218218
_ErrorClass = TypeVar("_ErrorClass", bound=ErrorType)
219219

220+
# Cached settings for simple (legacy) API
221+
_cached_settings: Optional[Settings] = None
222+
220223

221224
def load_config(tov_config_path: Optional[str] = None) -> Settings:
222225
"""Load the default configuration file and return a Settings object. Optionally load
@@ -239,10 +242,19 @@ def register_error_class(cls: _ErrorClass) -> _ErrorClass:
239242

240243
def emulate_case(s: str, *, template: str) -> str:
    """Return the string s but emulating the case of the template
    (lower/upper/capitalized), also for multi-word templates ('Hesturinn Skjóni').

    If s and the template consist of the same number (more than one) of
    whitespace-separated words, the case is emulated word by word and the
    result is joined with single spaces. Otherwise the template is
    treated as a whole: an all-uppercase template yields s.upper(), a
    template starting with an uppercase letter yields s.capitalize(),
    and any other template leaves s unchanged.
    """
    s_list = s.split()
    if len(s_list) > 1:
        template_list = template.split()
        if len(s_list) == len(template_list):
            # Multi-word case emulation
            all_caps = template.isupper()
            recased = []
            for word, template_word in zip(s_list, template_list):
                if len(template_word) == 1 and not all_caps:
                    # A lone capital letter ('Á', 'Í') within a mixed-case
                    # template marks a capitalized word, not an all-caps one;
                    # without this, 'af íslandi' given the template
                    # 'Á Íslandi' would come out as 'AF Íslandi'
                    recased.append(word.capitalize() if template_word.isupper() else word)
                else:
                    recased.append(emulate_case(word, template=template_word))
            return " ".join(recased)
    if template.isupper():
        return s.upper()
    if template and template[0].isupper():
        # The first letter of the template word is uppercase
        return s.capitalize()
    return s
248260

@@ -2677,7 +2689,6 @@ def late_fix_merges(
26772689

26782690

26792691
def create_template_dict(
2680-
settings: Settings,
26812692
explanation: str,
26822693
explanation_w_sugg: str,
26832694
error_warning: Type[ToneOfVoiceWarning] | Type[TabooWarning],
@@ -2700,15 +2711,12 @@ def check_wording(
27002711
"""Annotate words to be flagged, with warnings. Here we check for both taboo words and
27012712
tone of voice issues as determined by an additional config, if given."""
27022713
taboo_data = create_template_dict(
2703-
settings,
27042714
"Óheppilegt eða óviðurkvæmilegt orð",
27052715
"Óheppilegt eða óviðurkvæmilegt orð, skárra væri t.d. ",
27062716
TabooWarning,
27072717
settings.taboo_words.DICT,
27082718
)
2709-
27102719
tone_of_voice_data = create_template_dict(
2711-
settings,
27122720
"Orðið er ekki í samræmi við raddblæ okkar",
27132721
"Orðið er ekki í samræmi við raddblæ okkar, í staðinn gætirðu notað",
27142722
ToneOfVoiceWarning,
@@ -3058,7 +3066,6 @@ def check_spelling(self, stream: TokenIterator) -> TokenIterator:
30583066
err_codes = {"T001/w", "T001", "V001/w", "V001"}
30593067
if not only_ci and all(code not in ignore_rules for code in err_codes):
30603068
ct_stream = check_wording(ct_stream, self.settings, self._db, self._suggest_not_correct)
3061-
30623069
# Check context-independent style errors, indicated in BÍN
30633070
ct_stream = check_style(ct_stream, self._db, ignore_rules)
30643071
return ct_stream
@@ -3079,23 +3086,27 @@ def final_correct(self, stream: TokenIterator) -> TokenIterator:
30793086
self._suppress_suggestions,
30803087
self.settings,
30813088
)
3082-
30833089
ct_stream = late_fix_merges(ct_stream, self._ignore_wordlist, self._ignore_rules)
30843090
return ct_stream
30853091

3086-
_cached_settings: Optional[Settings] = None
3092+
3093+
def settings_or_default(settings: Optional[Settings] = None) -> Settings:
    """Return the given settings object, or the cached default
    settings if none is given. The default settings are loaded with
    load_config() on first use and kept in a module-level cache."""
    global _cached_settings
    if settings is None:
        if _cached_settings is None:
            # First request for the defaults: load and remember them
            _cached_settings = load_config()
        return _cached_settings
    # An explicit settings object always takes precedence
    return settings
3103+
30873104

30883105
def tokenize(
    text_or_gen: StringIterable, *, settings: Optional[Settings] = None, **options: Any
) -> Iterator[CorrectToken]:
    """Tokenize text using the correction pipeline,
    overriding a part of the default tokenization pipeline.
    If no settings are given, the cached default settings are used."""
    effective_settings = settings_or_default(settings)
    token_stream = CorrectionPipeline(text_or_gen, effective_settings, **options).tokenize()
    return cast(Iterator[CorrectToken], token_stream)

src/reynir_correct/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@
166166
def from_args(args: argparse.Namespace) -> Dict[str, Union[str, bool]]:
167167
"""Fill options with information from args"""
168168
format = args.format
169-
if args.json:
169+
if args.json or args.grammar: # The --grammar option implies --json
170170
format = "json"
171171
elif args.csv:
172172
format = "csv"

test.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,22 @@
1-
# type: ignore
1+
22

33
import sys
4+
from typing import cast
45
import reynir_correct as rc
6+
from reynir_correct.checker import AnnotatedSentence
57

68

9+
"""
710
from reynir_correct import check_single
811
sent = check_single("Páli, vini mínum, langaði að horfa á sjónnvarpið.")
9-
for annotation in sent.annotations:
10-
print("{0}".format(annotation))
12+
if sent:
13+
for annotation in sent.annotations:
14+
print("{0}".format(annotation))
1115
1216
sys.exit(0)
17+
"""
1318

14-
def display_annotations(sent):
19+
def display_annotations(sent: rc.AnnotatedSentence):
1520
print("\nSetning:")
1621
print(sent.text)
1722
print("\nNiðurstaða tókunar:")
@@ -37,7 +42,7 @@ def display_annotations(sent):
3742
print("\nUpphaflegur texti: '{0}'".format(txt))
3843
for pg in rc.check(txt, split_paragraphs=True):
3944
for sent in pg:
40-
display_annotations(sent)
45+
display_annotations(cast(AnnotatedSentence, sent))
4146
print("---")
4247

4348
sys.exit(0)
@@ -51,7 +56,7 @@ def display_annotations(sent):
5156
c = Corrector(db) # type: Corrector
5257

5358

54-
def test(c, word):
59+
def test(c: Corrector, word: str) -> None:
5560
t0 = time.time()
5661
result = list(c.subs(word))
5762
valid = [r for r in result if r in c]

0 commit comments

Comments
 (0)