Skip to content

Commit 0193401

Browse files
committed
perf(machinery): improve built-in machinery queries
This is an AI-assisted approach: - do an exact lookup first and skip the trigram lookup in case some matches are found (is there a problem with skipping other matches in this case?) - create split indexes and split the SQL query for translation memory; this seems to work better than forcing PostgreSQL to do the complex filtering on top of the trigram search
1 parent 0b4b0f2 commit 0193401

File tree

8 files changed

+246
-39
lines changed

8 files changed

+246
-39
lines changed

docs/changes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ Weblate 5.17
1414
* Improved logic for adding monolingual plurals in :doc:`/formats/gettext`.
1515
* Improved error messages in some of the :ref:`api` endpoints.
1616
* :envvar:`WEBLATE_COMMIT_PENDING_HOURS` is now available in Docker container.
17+
* Improved performance of Weblate translation memory and built-in machinery lookups.
1718

1819
.. rubric:: Bug fixes
1920

weblate/machinery/tests.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2934,6 +2934,20 @@ def test_exists(self) -> None:
29342934
results = machine.translate(unit, self.user)
29352935
self.assertNotEqual(results, [])
29362936

2937+
@patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
def test_exact_match_skips_trigram_threshold(self, adjust_threshold) -> None:
    """Check that an exact source match avoids the similarity threshold change.

    A second unit in the same translation is turned into a translated
    duplicate of the looked-up unit's source, so the lookup has an exact
    match available; the patched ``adjust_similarity_threshold`` must then
    never be called.
    """
    unit = Unit.objects.filter(translation__language_code="cs")[0]

    # Make a sibling unit an already translated exact duplicate of the source.
    duplicate = unit.translation.unit_set.exclude(pk=unit.pk)[0]
    duplicate.source = unit.source
    duplicate.target = "Preklad"
    duplicate.state = STATE_TRANSLATED
    duplicate.save()

    WeblateTranslation({}).translate(unit, self.user)

    adjust_threshold.assert_not_called()
2950+
29372951

29382952
class CyrTranslitTranslationTest(ViewTestCase, BaseMachineTranslationTest):
29392953
ENGLISH = "sr@latin"

weblate/machinery/weblatetm.py

Lines changed: 37 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from __future__ import annotations
66

7-
from typing import TYPE_CHECKING, Any
7+
from typing import TYPE_CHECKING
88

99
from django.conf import settings
1010
from django.db.models import Value
@@ -28,6 +28,7 @@ class WeblateTranslation(InternalMachineTranslation):
2828
cache_translations = True
2929
# Cache results for 1 hour to avoid frequent database hits
3030
cache_expiry = 3600
31+
candidate_limit = 100
3132

3233
def download_translations(
3334
self,
@@ -48,33 +49,16 @@ def download_translations(
4849
if "memory_db" in settings.DATABASES:
4950
base = base.using("memory_db")
5051

51-
# Build search query
52-
lookup: dict[str, Any] = {}
53-
if threshold < 100:
54-
# Full text search
55-
lookup["source__search"] = text
56-
else:
57-
# Utilize PostgreSQL index
58-
lookup["source__lower__md5"] = MD5(Lower(Value(text)))
59-
lookup["source"] = text
60-
61-
matching_units = (
52+
matching_units = self.get_matching_units(
6253
base.filter(
6354
translation__component__source_language=source_language,
6455
translation__language=target_language,
6556
state__gte=STATE_TRANSLATED,
66-
**lookup,
67-
)
68-
.exclude(
69-
# The read-only strings can be possibly blank
70-
target__lower__md5=MD5(Lower(Value("")))
71-
)
72-
.prefetch()
57+
),
58+
text,
59+
threshold,
7360
)
7461

75-
# We want only close matches here
76-
adjust_similarity_threshold(0.98)
77-
7862
for munit in matching_units:
7963
source = munit.source_string
8064
if "forbidden" in munit.all_flags:
@@ -91,3 +75,34 @@ def download_translations(
9175
"origin_url": munit.get_absolute_url(),
9276
"source": source,
9377
}
78+
79+
def get_matching_units(self, base, text: str, threshold: int):
    """Return up to ``candidate_limit`` units from ``base`` matching ``text``.

    An exact source match is tried first. The ``source__search`` lookup is
    only issued when the exact lookup found nothing and ``threshold`` is
    below 100.

    :param base: queryset of units already restricted by the caller
    :param text: source string to look up
    :param threshold: requested match quality; 100 or more means exact only
    :return: list of matching units (possibly empty)
    """
    limit = self.candidate_limit

    # Exact matches are common and are much cheaper than trigram search.
    exact_query = base.filter(
        source__lower__md5=MD5(Lower(Value(text))),
        source=text,
    )
    found = list(self.prepare_queryset(exact_query)[:limit])

    if not (found or threshold >= 100):
        # We want only close matches here.
        adjust_similarity_threshold(0.98)

        similar_query = base.filter(source__search=text)
        found = list(self.prepare_queryset(similar_query)[:limit])

    return found
100+
101+
def prepare_queryset(self, queryset):
    """Exclude units with an empty target and join the related objects.

    :param queryset: unit queryset to refine
    :return: queryset without ``target == ""`` rows, using ``select_related``
        for the source unit and the translation's language, plural,
        component and project
    """
    joined_relations = (
        "source_unit",
        "translation__language",
        "translation__plural",
        "translation__component",
        "translation__component__project",
    )
    with_target = queryset.exclude(target="")
    return with_target.select_related(*joined_relations)
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Copyright © Michal Čihař <michal@weblate.org>
2+
#
3+
# SPDX-License-Identifier: GPL-3.0-or-later
4+
5+
from django.db import migrations, models
6+
from django.db.models import Q
7+
8+
9+
class Migration(migrations.Migration):
    """Add partial language-pair indexes to ``memory``, one per lookup scope."""

    dependencies = [("memory", "0004_memory_status_and_context")]

    # Every index below is wrapped in an AddIndex operation, in this order.
    operations = [
        migrations.AddIndex(model_name="memory", index=scope_index)
        for scope_index in (
            # Rows flagged ``from_file``.
            models.Index(
                "source_language",
                "target_language",
                name="memory_file_lang_idx",
                condition=Q(from_file=True),
            ),
            # Non-shared rows that have a project.
            models.Index(
                "project",
                "source_language",
                "target_language",
                name="memory_project_lang_idx",
                condition=Q(project__isnull=False, shared=False),
            ),
            # Non-shared rows that have a user.
            models.Index(
                "user",
                "source_language",
                "target_language",
                name="memory_user_lang_idx",
                condition=Q(user__isnull=False, shared=False),
            ),
            # Shared rows.
            models.Index(
                "source_language",
                "target_language",
                name="memory_shared_lang_idx",
                condition=Q(shared=True),
            ),
        )
    ]

weblate/memory/models.py

Lines changed: 98 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -167,39 +167,94 @@ def lookup(
167167
use_shared,
168168
threshold: int = 75,
169169
):
170+
exact_results = self.lookup_scopes(
171+
source_language=source_language,
172+
target_language=target_language,
173+
text=text,
174+
user=user,
175+
project=project,
176+
use_shared=use_shared,
177+
exact=True,
178+
)
179+
if exact_results:
180+
return exact_results
181+
170182
# Adjust similarity based on string length to get more relevant matches
171183
# for long strings
172184
similarity_threshold = self.threshold_to_similarity(text, threshold)
173185

174-
results = self.none()
186+
results: list[Memory] = []
175187

176188
while len(results) == 0 and similarity_threshold > MIN_SIMILARITY_THRESHOLD:
177189
# Change PostgreSQL similarity threshold configuration
178190
adjust_similarity_threshold(similarity_threshold)
179191

180192
# Actual database query
181-
results = (
182-
self.prefetch_project()
183-
.filter_type(
184-
# Type filtering
185-
user=user,
186-
project=project,
187-
use_shared=use_shared,
188-
from_file=True,
189-
)
190-
.filter(
191-
# Full-text search on source
192-
source__search=text,
193-
# Language filtering
194-
source_language=source_language,
195-
target_language=target_language,
196-
)[:50]
193+
results = self.lookup_scopes(
194+
source_language=source_language,
195+
target_language=target_language,
196+
text=text,
197+
user=user,
198+
project=project,
199+
use_shared=use_shared,
197200
)
198201
# Decrease threshold in case no matches were found
199202
similarity_threshold -= 0.05
200203

201204
return results
202205

206+
def lookup_scopes(
    self,
    *,
    source_language,
    target_language,
    text: str,
    user,
    project,
    use_shared: bool,
    exact: bool = False,
    limit: int = 50,
) -> list[Memory]:
    """Collect memory entries by querying each lookup scope separately.

    One query is issued per scope returned by
    ``iter_lookup_scope_queries()``. Each query is capped at ``limit`` rows,
    and the merged, de-duplicated result is capped at ``limit`` entries too.

    :param source_language: source language to filter on
    :param target_language: target language to filter on
    :param text: source string to look up
    :param user: user whose scope is included when not ``None``
    :param project: project whose scope is included when not ``None``
    :param use_shared: whether the shared scope is included
    :param exact: match ``source`` exactly instead of using ``source__search``
    :param limit: maximum number of rows per query and entries returned
    :return: unique entries in the order they were first encountered
    """
    queryset = self.prefetch_project()

    source_lookup = "source" if exact else "source__search"
    filters = {
        "source_language": source_language,
        "target_language": target_language,
        source_lookup: text,
    }

    scopes = self.iter_lookup_scope_queries(
        user=user,
        project=project,
        use_shared=use_shared,
    )

    # Insertion-ordered mapping keeps the first occurrence of every primary key.
    unique: dict[int, Memory] = {}
    for scope in scopes:
        for entry in queryset.filter(scope, **filters)[:limit]:
            unique.setdefault(entry.pk, entry)
            if len(unique) >= limit:
                return list(unique.values())

    return list(unique.values())
245+
246+
def iter_lookup_scope_queries(
    self, *, user, project, use_shared: bool
) -> tuple[Q, ...]:
    """Build the per-scope filters used for translation memory lookups.

    The ``from_file`` scope is always present. The project, shared and user
    scopes follow in that order, each only when requested.

    :param user: adds a non-shared per-user scope when not ``None``
    :param project: adds a non-shared per-project scope when not ``None``
    :param use_shared: adds the shared scope when true
    :return: tuple of ``Q`` objects, one per scope to query
    """
    scopes: tuple[Q, ...] = (Q(from_file=True),)
    if project is not None:
        scopes += (Q(project=project, shared=False),)
    if use_shared:
        scopes += (Q(shared=True),)
    if user is not None:
        scopes += (Q(user=user, shared=False),)
    return scopes
257+
203258
def prefetch_lang(self):
204259
return self.prefetch_related("source_language", "target_language")
205260

@@ -601,6 +656,32 @@ class Meta:
601656
condition=Q(from_file=True),
602657
name="memory_from_file",
603658
),
659+
models.Index(
660+
"source_language",
661+
"target_language",
662+
condition=Q(from_file=True),
663+
name="memory_file_lang_idx",
664+
),
665+
models.Index(
666+
"project",
667+
"source_language",
668+
"target_language",
669+
condition=Q(project__isnull=False, shared=False),
670+
name="memory_project_lang_idx",
671+
),
672+
models.Index(
673+
"user",
674+
"source_language",
675+
"target_language",
676+
condition=Q(user__isnull=False, shared=False),
677+
name="memory_user_lang_idx",
678+
),
679+
models.Index(
680+
"source_language",
681+
"target_language",
682+
condition=Q(shared=True),
683+
name="memory_shared_lang_idx",
684+
),
604685
]
605686

606687
def __str__(self) -> str:

weblate/memory/tests.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import tempfile
99
from io import StringIO
1010
from typing import Any
11+
from unittest.mock import patch
1112

1213
from django.core.management import call_command
1314
from django.core.management.base import CommandError
@@ -93,6 +94,16 @@ def test_machine_batch(self) -> None:
9394
del machinery["origin"]
9495
self.assertEqual(machinery, {"quality": [100], "translation": ["Ahoj"]})
9596

97+
@patch("weblate.memory.models.adjust_similarity_threshold")
def test_exact_match_skips_similarity_adjustment(self, adjust_threshold) -> None:
    """Check that searching for "Hello" never adjusts the similarity threshold.

    Presumably ``add_document()`` stores a memory entry whose source is
    exactly "Hello", so the exact lookup succeeds - verify against the helper.
    """
    add_document()
    unit = self.get_unit()

    WeblateMemory({}).search(unit, "Hello", None)

    adjust_threshold.assert_not_called()
106+
96107
def test_machine_plurals(self) -> None:
97108
unit = self.get_unit("Orangutan has %d banana.\n")
98109

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright © Michal Čihař <michal@weblate.org>
2+
#
3+
# SPDX-License-Identifier: GPL-3.0-or-later
4+
5+
from django.contrib.postgres.indexes import GinIndex
6+
from django.db import migrations
7+
from django.db.models import Q
8+
9+
10+
class Migration(migrations.Migration):
    """Add a partial GIN trigram index on ``unit.source``.

    The index only covers rows with ``state >= 20`` and a non-empty target.
    """

    dependencies = [("trans", "0066_alter_variant_key")]

    operations = [
        migrations.AddIndex(
            model_name="unit",
            index=GinIndex(
                name="trans_unit_source_tm_idx",
                fields=["source"],
                opclasses=["gin_trgm_ops"],
                condition=Q(state__gte=20) & ~Q(target=""),
            ),
        ),
    ]

weblate/trans/models/unit.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import sentry_sdk
1313
from django.conf import settings
14+
from django.contrib.postgres.indexes import GinIndex
1415
from django.core.cache import cache
1516
from django.db import Error as DjangoDatabaseError
1617
from django.db import models, transaction
@@ -499,6 +500,12 @@ class Meta:
499500
models.Index(
500501
MD5(Lower("context")), "translation", name="trans_unit_context_md5"
501502
),
503+
GinIndex(
504+
fields=["source"],
505+
opclasses=["gin_trgm_ops"],
506+
condition=Q(state__gte=STATE_TRANSLATED) & ~Q(target=""),
507+
name="trans_unit_source_tm_idx",
508+
),
502509
]
503510

504511
def __str__(self) -> str:

0 commit comments

Comments
 (0)