Skip to content

Commit 2949c2b

Browse files
committed
Integrate unicode normalization
Closes #1067
1 parent 20ab969 commit 2949c2b

File tree

5 files changed

+24
-5
lines changed

5 files changed

+24
-5
lines changed

.env.docker

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ TEKST_DB__HOST=mongo
6464
# TEKST_DB__NAME=tekst
6565
# default: tekst
6666

67+
# TEKST_DB__UNICODE_NF=NFC
68+
# default: NFC
69+
6770

6871
# ================ SEARCH SERVER (Elasticsearch) CONFIG ================
6972

Tekst-API/.env.template

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@
6464
# TEKST_DB__NAME=tekst
6565
# default: tekst
6666

67+
# TEKST_DB__UNICODE_NF=NFC
68+
# default: NFC
69+
6770

6871
# ================ SEARCH SERVER (Elasticsearch) CONFIG ================
6972

Tekst-API/tekst/config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from os.path import realpath
66
from pathlib import Path
77
from secrets import token_hex
8-
from typing import Annotated, Any
8+
from typing import Annotated, Any, Literal
99
from urllib.parse import quote
1010
from uuid import uuid4
1111

@@ -96,6 +96,7 @@ class MongoDBConfig(ConfigSubSection):
9696
max_length=64,
9797
) = None
9898
name: str = "tekst"
99+
unicode_nf: Literal["NFC", "NFKC", "NFD", "NFKD"] | None = "NFC"
99100

100101
@field_validator("name", mode="before")
101102
@classmethod

Tekst-API/tekst/models/common.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
get_args,
88
get_origin,
99
) # noqa: UP035
10+
from unicodedata import normalize
1011

1112
from beanie import Document, PydanticObjectId
1213
from beanie.odm.utils.encoder import Encoder
@@ -21,6 +22,7 @@
2122
from pydantic.aliases import PydanticUndefined
2223
from pydantic.fields import FieldInfo
2324

25+
from tekst.config import get_config
2426
from tekst.types import (
2527
ExcludeFromModelVariants,
2628
SchemaOptionalNonNullable,
@@ -85,14 +87,21 @@ class NoAliasEncoder(Encoder):
8587
"""
8688

8789
def _iter_model_items(self, obj: BaseModel) -> Iterable[tuple[str, Any]]:
88-
exclude, keep_nulls = self.exclude, self.keep_nulls
8990
for key, value in obj.__iter__():
90-
if key not in exclude and (value is not None or keep_nulls):
91+
if key not in self.exclude and (value is not None or self.keep_nulls):
9192
# this is where we use "key" directly, without considering aliases
9293
yield key, value
9394

9495

9596
_no_alias_encoder = NoAliasEncoder(to_db=True, keep_nulls=False).encode
97+
_unicode_nf = get_config().db.unicode_nf
98+
99+
100+
def _apply_unicode_nf(str_v):
101+
if _unicode_nf is not None:
102+
return normalize(_unicode_nf, str_v)
103+
else: # pragma: no cover
104+
return str_v
96105

97106

98107
class DocumentBase(Document):
@@ -104,8 +113,8 @@ class Settings:
104113
validate_on_save = True
105114
keep_nulls = False
106115
bson_encoders = {
107-
# see docstring of NoAliasEncoder for rationale!
108-
BaseModel: _no_alias_encoder,
116+
BaseModel: _no_alias_encoder, # see docstring of NoAliasEncoder
117+
str: _apply_unicode_nf,
109118
}
110119

111120
def __init__(self, *args, **kwargs):

docs/content/setup/configuration.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ MongoDB password (String – default: _none_)
6868
### `TEKST_DB__NAME`
6969
MongoDB database name (String – default: `tekst`)
7070

71+
### `TEKST_DB__UNICODE_NF`
72+
Unicode normalization form applied to strings when they are written to the database (`NFC`, `NFKC`, `NFD`, `NFKD` or _none_ to disable normalization – default: `NFC`)
73+
7174

7275

7376
## Search Server (Elasticsearch)

0 commit comments

Comments
 (0)