Skip to content

Commit 685a8df

Browse files
committed
perf(machinery): improve built-in machinery performance
- add a focused index with the exact conditions we use
- cap the number of fetched results to avoid streaming too much data from the database
- do a cheaper exact lookup for 100% similarity
- order matches by similarity
1 parent a14600c commit 685a8df

File tree

6 files changed

+223
-41
lines changed

6 files changed

+223
-41
lines changed

docs/changes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Weblate 5.17
1717
* Improved error messages in some of the :ref:`api` endpoints.
1818
* Improved performance of project and category search result pages with very large match sets.
1919
* :envvar:`WEBLATE_COMMIT_PENDING_HOURS` is now available in Docker container.
20+
* Improved performance of built-in Weblate machinery lookups.
2021

2122
.. rubric:: Bug fixes
2223

weblate/machinery/tests.py

Lines changed: 122 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from botocore.stub import ANY, Stubber
2222
from django.core.management import call_command
2323
from django.core.management.base import CommandError
24-
from django.test import TestCase
24+
from django.test import SimpleTestCase, TestCase
2525
from django.urls import reverse
2626
from google.api_core import exceptions as google_api_exceptions
2727
from google.cloud.translate import (
@@ -2972,6 +2972,20 @@ def test_exists(self) -> None:
29722972
results = machine.translate(unit, self.user)
29732973
self.assertNotEqual(results, [])
29742974

2975+
@patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
def test_matches_still_probe_fuzzy_lookup(self, adjust_threshold) -> None:
    """Check the similarity threshold is still set when an identical source exists."""
    czech_units = Unit.objects.filter(translation__language_code="cs")
    unit = czech_units[0]

    # Turn a sibling unit into a translated string sharing the same source.
    sibling = unit.translation.unit_set.exclude(pk=unit.pk)[0]
    sibling.source = unit.source
    sibling.target = "Preklad"
    sibling.state = STATE_TRANSLATED
    sibling.save()

    WeblateTranslation({}).translate(unit, self.user)

    adjust_threshold.assert_called_once_with(0.98)
2988+
29752989

29762990
class CyrTranslitTranslationTest(ViewTestCase, BaseMachineTranslationTest):
29772991
ENGLISH = "sr@latin"
@@ -3318,6 +3332,113 @@ def test_configure_invalid(self) -> None:
33183332
)
33193333

33203334

3335+
class WeblateTranslationLookupTest(SimpleTestCase):
    """Database-free checks of the query construction in WeblateTranslation."""

    @patch("weblate.machinery.weblatetm.Unit.objects")
    @patch("weblate.machinery.weblatetm.Translation.objects")
    def test_get_base_queryset_uses_translation_subquery(
        self, translation_objects, unit_objects
    ) -> None:
        """Units have to be limited by a subquery of translation ids."""
        machine = WeblateTranslation({})
        user = MagicMock()

        # Walk the auto-created mock chain instead of wiring each step by hand.
        translations = translation_objects.using.return_value.all.return_value
        accessible = translations.filter_access.return_value
        language_pair = accessible.filter.return_value
        language_pair.values.return_value = "translation-subquery"
        units_using = unit_objects.using.return_value
        expected = units_using.filter.return_value

        result = machine.get_base_queryset(user, "en", "cs")

        self.assertEqual(result, expected)
        translation_objects.using.assert_called_once_with("default")
        translations.filter_access.assert_called_once_with(user)
        accessible.filter.assert_called_once_with(
            component__source_language="en",
            language="cs",
        )
        language_pair.values.assert_called_once_with("id")
        unit_objects.using.assert_called_once_with("default")
        units_using.filter.assert_called_once_with(
            state__gte=STATE_TRANSLATED,
            translation_id__in="translation-subquery",
        )

    @patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
    def test_get_matching_units_uses_fuzzy_lookup(self, adjust_threshold) -> None:
        """Below the full threshold the trigram search is used and ordered."""
        machine = WeblateTranslation({})
        base = MagicMock()
        filtered = base.filter.return_value
        annotated = filtered.annotate.return_value
        ordered = annotated.order_by.return_value
        fuzzy_match = MagicMock(pk=1)

        with patch.object(
            machine, "prepare_queryset", return_value=[fuzzy_match]
        ) as prepare_queryset:
            results = machine.get_matching_units(base, "Hello", 75)

        self.assertEqual(results, [fuzzy_match])
        base.filter.assert_called_once_with(source__trgm_search="Hello")
        filtered.annotate.assert_called_once()
        annotated.order_by.assert_called_once_with("-match_similarity", "pk")
        prepare_queryset.assert_called_once_with(ordered)
        adjust_threshold.assert_called_once_with(0.98)

    @patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
    def test_get_matching_units_orders_short_queries_before_slicing(
        self, adjust_threshold
    ) -> None:
        """Short texts are delegated to get_short_query_matches."""
        machine = WeblateTranslation({})
        base = MagicMock()
        short_matches = MagicMock()
        fuzzy_match = MagicMock(pk=1)

        with (
            patch.object(
                machine, "get_short_query_matches", return_value=short_matches
            ) as get_short_query_matches,
            patch.object(machine, "prepare_queryset", return_value=[fuzzy_match]),
        ):
            results = machine.get_matching_units(base, "id", 75)

        self.assertEqual(results, [fuzzy_match])
        get_short_query_matches.assert_called_once_with(base, "id")
        adjust_threshold.assert_called_once_with(0.98)

    @patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
    def test_get_matching_units_uses_exact_lookup_at_full_threshold(
        self, adjust_threshold
    ) -> None:
        """At threshold 100 the similarity threshold is left untouched."""
        machine = WeblateTranslation({})
        base = MagicMock()
        filtered = base.filter.return_value
        ordered = filtered.order_by.return_value
        exact_match = MagicMock(pk=1)

        with patch.object(
            machine, "prepare_queryset", return_value=[exact_match]
        ) as prepare_queryset:
            results = machine.get_matching_units(base, "Hello", 100)

        self.assertEqual(results, [exact_match])
        filtered.order_by.assert_called_once_with("pk")
        prepare_queryset.assert_called_once_with(ordered)
        adjust_threshold.assert_not_called()
3440+
3441+
33213442
class CommandTest(FixtureTestCase):
33223443
"""Test for management commands."""
33233444

weblate/machinery/weblatetm.py

Lines changed: 66 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@
44

55
from __future__ import annotations
66

7-
from typing import TYPE_CHECKING, Any
7+
from typing import TYPE_CHECKING
88

99
from django.conf import settings
10-
from django.db.models import Value
11-
from django.db.models.functions import MD5, Lower
10+
from django.contrib.postgres.search import TrigramSimilarity
11+
from django.db.models import Case, IntegerField, Value, When
12+
from django.db.models.functions import MD5, Length, Lower
1213

13-
from weblate.trans.models import Unit
14-
from weblate.utils.db import adjust_similarity_threshold
14+
from weblate.trans.models import Translation, Unit
15+
from weblate.utils.db import adjust_similarity_threshold, use_trgm_fallback
1516
from weblate.utils.state import STATE_TRANSLATED
1617

1718
from .base import InternalMachineTranslation
@@ -28,6 +29,7 @@ class WeblateTranslation(InternalMachineTranslation):
2829
cache_translations = True
2930
# Cache results for 1 hour to avoid frequent database hits
3031
cache_expiry = 3600
32+
candidate_limit = 50
3133

3234
def download_translations(
3335
self,
@@ -39,42 +41,12 @@ def download_translations(
3941
threshold: int = 10,
4042
) -> DownloadTranslations:
4143
"""Download list of possible translations from a service."""
42-
# Filter based on user access
43-
base = Unit.objects.filter_access(user) if user else Unit.objects.all()
44-
45-
# Use memory_db for the query in case it exists. This is supposed
46-
# to be a read-only replica for offloading expensive translation
47-
# queries.
48-
if "memory_db" in settings.DATABASES:
49-
base = base.using("memory_db")
50-
51-
# Build search query
52-
lookup: dict[str, Any] = {}
53-
if threshold < 100:
54-
# Full text search
55-
lookup["source__trgm_search"] = text
56-
else:
57-
# Utilize PostgreSQL index
58-
lookup["source__lower__md5"] = MD5(Lower(Value(text)))
59-
lookup["source"] = text
60-
61-
matching_units = (
62-
base.filter(
63-
translation__component__source_language=source_language,
64-
translation__language=target_language,
65-
state__gte=STATE_TRANSLATED,
66-
**lookup,
67-
)
68-
.exclude(
69-
# The read-only strings can be possibly blank
70-
target__lower__md5=MD5(Lower(Value("")))
71-
)
72-
.prefetch()
44+
matching_units = self.get_matching_units(
45+
self.get_base_queryset(user, source_language, target_language),
46+
text,
47+
threshold,
7348
)
7449

75-
# We want only close matches here
76-
adjust_similarity_threshold(0.98)
77-
7850
for munit in matching_units:
7951
source = munit.source_string
8052
if "forbidden" in munit.all_flags:
@@ -91,3 +63,58 @@ def download_translations(
9163
"origin_url": munit.get_absolute_url(),
9264
"source": source,
9365
}
66+
67+
def get_base_queryset(self, user, source_language, target_language):
    """
    Build the queryset of translated units for one language pair.

    The ``memory_db`` database alias is used when it is configured (it is
    supposed to be a read-only replica for offloading expensive translation
    queries), otherwise ``default`` is used. Translations are limited by
    ``filter_access`` when a user is given, and units are matched against
    them through a subquery of translation ids.
    """
    if "memory_db" in settings.DATABASES:
        alias = "memory_db"
    else:
        alias = "default"

    accessible = Translation.objects.using(alias).all()
    if user is not None:
        # Filter based on user access
        accessible = accessible.filter_access(user)

    language_pair = {
        "component__source_language": source_language,
        "language": target_language,
    }
    translation_ids = accessible.filter(**language_pair).values("id")

    units = Unit.objects.using(alias)
    return units.filter(
        state__gte=STATE_TRANSLATED,
        translation_id__in=translation_ids,
    )
83+
84+
def get_matching_units(self, base, text: str, threshold: int):
    """
    Return at most ``candidate_limit`` units from ``base`` matching ``text``.

    A threshold of 100 or more does an exact source lookup ordered by
    primary key. Lower thresholds set the similarity threshold to 0.98 and
    then either delegate to ``get_short_query_matches`` (when
    ``use_trgm_fallback`` says so) or run a trigram search ordered by
    descending similarity.
    """
    if threshold >= 100:
        # The MD5 condition is there to utilize the PostgreSQL index
        matches = base.filter(
            source__lower__md5=MD5(Lower(Value(text))),
            source=text,
        ).order_by("pk")
    else:
        # We want only close matches here
        adjust_similarity_threshold(0.98)
        if use_trgm_fallback(text):
            matches = self.get_short_query_matches(base, text)
        else:
            similarity = TrigramSimilarity("source", text)
            matches = (
                base.filter(source__trgm_search=text)
                .annotate(match_similarity=similarity)
                .order_by("-match_similarity", "pk")
            )

    # Slice before evaluating so that the limit is part of the query.
    limited = self.prepare_queryset(matches)[: self.candidate_limit]
    return list(limited)
101+
102+
def prepare_queryset(self, queryset):
    """Exclude units with an empty target and apply ``prefetch()``."""
    # The read-only strings can be possibly blank
    with_target = queryset.exclude(target="")
    return with_target.prefetch()
104+
105+
def get_short_query_matches(self, base, text: str):
    """
    Return ranked substring matches from ``base`` for a short ``text``.

    Sources containing ``text`` (case-insensitively) are kept when they are
    not longer than the larger of ``len(text) + 4``, ``len(text) * 2`` and 8.
    Case-insensitive exact matches rank first, prefix matches second and
    everything else last; ties are ordered by source length and then by
    primary key.
    """
    text_length = len(text)
    max_source_length = max(text_length + 4, text_length * 2, 8)

    rank = Case(
        When(source__iexact=text, then=Value(0)),
        When(source__istartswith=text, then=Value(1)),
        default=Value(2),
        output_field=IntegerField(),
    )

    candidates = base.filter(source__icontains=text).annotate(
        short_query_rank=rank,
        source_length=Length("source"),
    )
    candidates = candidates.filter(source_length__lte=max_source_length)
    return candidates.order_by("short_query_rank", "source_length", "pk")
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright © Michal Čihař <michal@weblate.org>
2+
#
3+
# SPDX-License-Identifier: GPL-3.0-or-later
4+
5+
from django.contrib.postgres import indexes as postgres_indexes
6+
from django.db import migrations, models
7+
from django.db.models import Q
8+
9+
10+
class Migration(migrations.Migration):
    """Add a partial trigram GIN index on the unit source."""

    dependencies = [("trans", "0067_componentlink_alter_component_links")]

    operations = [
        migrations.AddIndex(
            model_name="unit",
            index=postgres_indexes.GinIndex(
                postgres_indexes.OpClass(models.F("source"), name="gin_trgm_ops"),
                name="trans_unit_source_tm_idx",
                # Only units with state >= 20 and a non-empty target are
                # indexed. NOTE(review): 20 presumably equals
                # STATE_TRANSLATED, which the Unit model index of the same
                # name uses - confirm.
                condition=Q(state__gte=20) & ~Q(target=""),
            ),
        ),
    ]

weblate/trans/models/unit.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,11 @@ class Meta:
530530
models.F("translation"),
531531
name="unit_explanation_fulltext",
532532
),
533+
postgres_indexes.GinIndex(
534+
postgres_indexes.OpClass(models.F("source"), name="gin_trgm_ops"),
535+
condition=Q(state__gte=STATE_TRANSLATED) & ~Q(target=""),
536+
name="trans_unit_source_tm_idx",
537+
),
533538
]
534539

535540
def __str__(self) -> str:

weblate/utils/db.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ def count_alnum(string):
5555
return sum(map(str.isalnum, string))
5656

5757

58+
def use_trgm_fallback(string: str) -> bool:
    """Return whether ``string`` has at most three alphanumeric characters."""
    alnum_chars = count_alnum(string)
    return alnum_chars <= 3
60+
61+
5862
class PostgreSQLFallbackLookupMixin(Lookup):
5963
"""
6064
Mixin to block PostgreSQL from using trigram index.
@@ -79,7 +83,7 @@ def process_lhs(self, compiler, connection, lhs=None):
7983

8084
class PostgreSQLFallbackLookup(PostgreSQLFallbackLookupMixin, PatternLookup):
8185
def __init__(self, lhs, rhs) -> None:
82-
self._needs_fallback = isinstance(rhs, str) and count_alnum(rhs) <= 3
86+
self._needs_fallback = isinstance(rhs, str) and use_trgm_fallback(rhs)
8387
super().__init__(lhs, rhs)
8488

8589

0 commit comments

Comments
 (0)