Skip to content

Commit 199e51b

Browse files
committed
perf(machinery): improve built-in machinery performance
- add focused index with the exact conditions we use
- cap limit of fetched results to avoid streaming too much data from the database
- do cheaper exact lookup with 100% similarity
- order matches by similarity
1 parent a14600c commit 199e51b

File tree

6 files changed

+298
-41
lines changed

6 files changed

+298
-41
lines changed

docs/changes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Weblate 5.17
1717
* Improved error messages in some of the :ref:`api` endpoints.
1818
* Improved performance of project and category search result pages with very large match sets.
1919
* :envvar:`WEBLATE_COMMIT_PENDING_HOURS` is now available in Docker container.
20+
* Improved performance of :ref:`mt-weblate` lookups.
2021

2122
.. rubric:: Bug fixes
2223

weblate/machinery/tests.py

Lines changed: 193 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from botocore.stub import ANY, Stubber
2222
from django.core.management import call_command
2323
from django.core.management.base import CommandError
24-
from django.test import TestCase
24+
from django.test import SimpleTestCase, TestCase
2525
from django.urls import reverse
2626
from google.api_core import exceptions as google_api_exceptions
2727
from google.cloud.translate import (
@@ -2972,6 +2972,20 @@ def test_exists(self) -> None:
29722972
results = machine.translate(unit, self.user)
29732973
self.assertNotEqual(results, [])
29742974

2975+
@patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
def test_matches_still_probe_fuzzy_lookup(self, adjust_threshold) -> None:
    """Check the similarity threshold is adjusted despite a duplicate source.

    A sibling unit is turned into a translated copy of the same source
    string; translating the original unit must still call
    ``adjust_similarity_threshold(0.98)`` exactly once.
    """
    czech_units = Unit.objects.filter(translation__language_code="cs")
    unit = czech_units[0]

    # Make another unit of the same translation an exact, translated duplicate
    sibling = unit.translation.unit_set.exclude(pk=unit.pk)[0]
    sibling.source = unit.source
    sibling.target = "Preklad"
    sibling.state = STATE_TRANSLATED
    sibling.save()

    WeblateTranslation({}).translate(unit, self.user)

    adjust_threshold.assert_called_once_with(0.98)
2988+
29752989

29762990
class CyrTranslitTranslationTest(ViewTestCase, BaseMachineTranslationTest):
29772991
ENGLISH = "sr@latin"
@@ -3318,6 +3332,184 @@ def test_configure_invalid(self) -> None:
33183332
)
33193333

33203334

3335+
class WeblateTranslationLookupTest(SimpleTestCase):
    """Mock-based checks of how WeblateTranslation builds its lookups."""

    @patch("weblate.machinery.weblatetm.Unit.objects")
    @patch("weblate.machinery.weblatetm.Translation.objects")
    def test_get_base_queryset_uses_translation_subquery(
        self, translation_objects, unit_objects
    ) -> None:
        """Units are filtered by a subquery of access-filtered translation ids."""
        machine = WeblateTranslation({})
        requester = MagicMock()

        # Reach into the auto-created mock chain rather than wiring each link
        all_translations = translation_objects.using.return_value.all.return_value
        accessible = all_translations.filter_access.return_value
        language_pair = accessible.filter.return_value
        language_pair.values.return_value = "translation-subquery"
        unit_manager = unit_objects.using.return_value
        expected_units = unit_manager.filter.return_value

        result = machine.get_base_queryset(requester, "en", "cs")

        self.assertEqual(result, expected_units)
        translation_objects.using.assert_called_once_with("default")
        all_translations.filter_access.assert_called_once_with(requester)
        accessible.filter.assert_called_once_with(
            component__source_language="en",
            language="cs",
        )
        language_pair.values.assert_called_once_with("id")
        unit_objects.using.assert_called_once_with("default")
        unit_manager.filter.assert_called_once_with(
            state__gte=STATE_TRANSLATED,
            translation_id__in="translation-subquery",
        )

    @patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
    def test_get_matching_units_uses_fuzzy_lookup(self, adjust_threshold) -> None:
        """Below 100% the trigram search is annotated and ordered by similarity."""
        machine = WeblateTranslation({})
        candidate = MagicMock(pk=1)
        base = MagicMock()
        filtered = base.filter.return_value
        annotated = filtered.annotate.return_value
        ordered = annotated.order_by.return_value
        prepared = MagicMock()
        prepared.iterator.return_value = [candidate]

        with patch.object(
            machine, "prepare_queryset", return_value=prepared
        ) as prepare_queryset:
            results = machine.get_matching_units(base, "Hello", 75)

        self.assertEqual(results, [candidate])
        base.filter.assert_called_once_with(source__trgm_search="Hello")
        filtered.annotate.assert_called_once()
        annotated.order_by.assert_called_once_with("-match_similarity", "pk")
        prepare_queryset.assert_called_once_with(ordered)
        prepared.iterator.assert_called_once_with(chunk_size=machine.candidate_limit)
        adjust_threshold.assert_called_once_with(0.98)

    @patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
    def test_get_matching_units_orders_short_queries_before_slicing(
        self, adjust_threshold
    ) -> None:
        """The short-query queryset is handed to prepare_queryset unchanged."""
        machine = WeblateTranslation({})
        candidate = MagicMock(pk=1)
        base = MagicMock()
        short_matches = MagicMock()
        prepared = MagicMock()
        prepared.iterator.return_value = [candidate]

        with (
            patch.object(
                machine, "get_short_query_matches", return_value=short_matches
            ) as get_short_query_matches,
            patch.object(
                machine, "prepare_queryset", return_value=prepared
            ) as prepare_queryset,
        ):
            results = machine.get_matching_units(base, "id", 75)

        self.assertEqual(results, [candidate])
        get_short_query_matches.assert_called_once_with(base, "id")
        prepare_queryset.assert_called_once_with(short_matches)
        prepared.iterator.assert_called_once_with(chunk_size=machine.candidate_limit)
        adjust_threshold.assert_called_once_with(0.98)

    @patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
    def test_get_matching_units_uses_exact_lookup_at_full_threshold(
        self, adjust_threshold
    ) -> None:
        """At threshold 100 the lookup is ordered by pk without threshold tuning."""
        machine = WeblateTranslation({})
        candidate = MagicMock(pk=1)
        base = MagicMock()
        filtered = base.filter.return_value
        ordered = filtered.order_by.return_value
        prepared = MagicMock()
        prepared.iterator.return_value = [candidate]

        with patch.object(
            machine, "prepare_queryset", return_value=prepared
        ) as prepare_queryset:
            results = machine.get_matching_units(base, "Hello", 100)

        self.assertEqual(results, [candidate])
        filtered.order_by.assert_called_once_with("pk")
        prepare_queryset.assert_called_once_with(ordered)
        prepared.iterator.assert_called_once_with(chunk_size=machine.candidate_limit)
        adjust_threshold.assert_not_called()

    def test_download_translations_limits_after_filtering(self) -> None:
        """Forbidden candidates are skipped and output stops at candidate_limit."""

        def build_match(source: str, target: str, url: str) -> MagicMock:
            # Candidate without flags that exposes everything a result needs
            match = MagicMock(source_string=source, all_flags=set())
            match.get_target_plurals.return_value = [target]
            match.translation.component = "Component"
            match.get_absolute_url.return_value = url
            return match

        machine = WeblateTranslation({})
        machine.candidate_limit = 2
        machine.comparer = MagicMock()
        machine.comparer.similarity.side_effect = [95, 90, 85]

        forbidden = MagicMock(source_string="ignored", all_flags={"forbidden"})
        candidates = [
            forbidden,
            build_match("first", "First", "/first/"),
            build_match("second", "Second", "/second/"),
            build_match("third", "Third", "/third/"),
        ]

        with (
            patch.object(machine, "get_base_queryset", return_value=MagicMock()),
            patch.object(machine, "get_matching_units", return_value=candidates),
        ):
            results = list(
                machine.download_translations(
                    "en",
                    "cs",
                    "Hello",
                    unit=None,
                    user=None,
                    threshold=10,
                )
            )

        texts = [item["text"] for item in results]
        self.assertEqual(texts, ["First", "Second"])
        self.assertEqual(machine.comparer.similarity.call_count, 2)
3511+
3512+
33213513
class CommandTest(FixtureTestCase):
33223514
"""Test for management commands."""
33233515

weblate/machinery/weblatetm.py

Lines changed: 70 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@
44

55
from __future__ import annotations
66

7-
from typing import TYPE_CHECKING, Any
7+
from typing import TYPE_CHECKING
88

99
from django.conf import settings
10-
from django.db.models import Value
11-
from django.db.models.functions import MD5, Lower
10+
from django.contrib.postgres.search import TrigramSimilarity
11+
from django.db.models import Case, IntegerField, Value, When
12+
from django.db.models.functions import MD5, Length, Lower
1213

13-
from weblate.trans.models import Unit
14-
from weblate.utils.db import adjust_similarity_threshold
14+
from weblate.trans.models import Translation, Unit
15+
from weblate.utils.db import adjust_similarity_threshold, use_trgm_fallback
1516
from weblate.utils.state import STATE_TRANSLATED
1617

1718
from .base import InternalMachineTranslation
@@ -28,6 +29,7 @@ class WeblateTranslation(InternalMachineTranslation):
2829
cache_translations = True
2930
# Cache results for 1 hour to avoid frequent database hits
3031
cache_expiry = 3600
32+
candidate_limit = 50
3133

3234
def download_translations(
3335
self,
@@ -39,42 +41,13 @@ def download_translations(
3941
threshold: int = 10,
4042
) -> DownloadTranslations:
4143
"""Download list of possible translations from a service."""
42-
# Filter based on user access
43-
base = Unit.objects.filter_access(user) if user else Unit.objects.all()
44-
45-
# Use memory_db for the query in case it exists. This is supposed
46-
# to be a read-only replica for offloading expensive translation
47-
# queries.
48-
if "memory_db" in settings.DATABASES:
49-
base = base.using("memory_db")
50-
51-
# Build search query
52-
lookup: dict[str, Any] = {}
53-
if threshold < 100:
54-
# Full text search
55-
lookup["source__trgm_search"] = text
56-
else:
57-
# Utilize PostgreSQL index
58-
lookup["source__lower__md5"] = MD5(Lower(Value(text)))
59-
lookup["source"] = text
60-
61-
matching_units = (
62-
base.filter(
63-
translation__component__source_language=source_language,
64-
translation__language=target_language,
65-
state__gte=STATE_TRANSLATED,
66-
**lookup,
67-
)
68-
.exclude(
69-
# The read-only strings can be possibly blank
70-
target__lower__md5=MD5(Lower(Value("")))
71-
)
72-
.prefetch()
44+
yielded = 0
45+
matching_units = self.get_matching_units(
46+
self.get_base_queryset(user, source_language, target_language),
47+
text,
48+
threshold,
7349
)
7450

75-
# We want only close matches here
76-
adjust_similarity_threshold(0.98)
77-
7851
for munit in matching_units:
7952
source = munit.source_string
8053
if "forbidden" in munit.all_flags:
@@ -91,3 +64,61 @@ def download_translations(
9164
"origin_url": munit.get_absolute_url(),
9265
"source": source,
9366
}
67+
yielded += 1
68+
if yielded >= self.candidate_limit:
69+
break
70+
71+
def get_base_queryset(self, user, source_language, target_language):
72+
alias = "memory_db" if "memory_db" in settings.DATABASES else "default"
73+
74+
translations = Translation.objects.using(alias).all()
75+
if user is not None:
76+
translations = translations.filter_access(user)
77+
78+
translation_ids = translations.filter(
79+
component__source_language=source_language,
80+
language=target_language,
81+
).values("id")
82+
83+
return Unit.objects.using(alias).filter(
84+
state__gte=STATE_TRANSLATED,
85+
translation_id__in=translation_ids,
86+
)
87+
88+
def get_matching_units(self, base, text: str, threshold: int):
    """Return an iterator over units from ``base`` whose source matches ``text``.

    With ``threshold`` of 100 or more an exact lookup ordered by ``pk`` is
    used. Otherwise the similarity threshold is adjusted to 0.98 and either
    ``get_short_query_matches`` (when ``use_trgm_fallback(text)`` is true)
    or a trigram search ordered by descending similarity is used.

    The chosen queryset goes through ``prepare_queryset`` and is iterated
    in chunks of ``candidate_limit``.
    """
    if threshold >= 100:
        # Exact lookup, filtering on both the lowercased MD5 and the source
        exact = base.filter(
            source__lower__md5=MD5(Lower(Value(text))),
            source=text,
        )
        candidates = exact.order_by("pk")
    else:
        # We want only close matches here
        adjust_similarity_threshold(0.98)
        if use_trgm_fallback(text):
            # The helper already applies its own ordering
            candidates = self.get_short_query_matches(base, text)
        else:
            similarity = TrigramSimilarity("source", text)
            fuzzy = base.filter(source__trgm_search=text)
            ranked = fuzzy.annotate(match_similarity=similarity)
            candidates = ranked.order_by("-match_similarity", "pk")

    prepared = self.prepare_queryset(candidates)
    return prepared.iterator(chunk_size=self.candidate_limit)
105+
106+
def prepare_queryset(self, queryset):
    """Exclude units with an empty target and call the queryset's ``prefetch()``."""
    with_target = queryset.exclude(target="")
    return with_target.prefetch()
108+
109+
def get_short_query_matches(self, base, text: str):
    """Return ranked trigram matches for ``text`` with a bounded source length.

    Matches are annotated with ``short_query_rank`` (0 for a
    case-insensitive exact match, 1 for a case-insensitive prefix match,
    2 otherwise) and ``source_length``. Sources longer than
    ``max(len(text) + 4, len(text) * 2, 8)`` characters are dropped, and
    the result is ordered by rank, then source length, then ``pk``.
    """
    text_length = len(text)
    length_cap = max(text_length + 4, text_length * 2, 8)

    rank = Case(
        When(source__iexact=text, then=Value(0)),
        When(source__istartswith=text, then=Value(1)),
        default=Value(2),
        output_field=IntegerField(),
    )

    annotated = base.filter(source__trgm_search=text).annotate(
        short_query_rank=rank,
        source_length=Length("source"),
    )
    bounded = annotated.filter(source_length__lte=length_cap)
    return bounded.order_by("short_query_rank", "source_length", "pk")
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright © Michal Čihař <michal@weblate.org>
2+
#
3+
# SPDX-License-Identifier: GPL-3.0-or-later
4+
5+
from django.contrib.postgres import indexes as postgres_indexes
6+
from django.db import migrations, models
7+
from django.db.models import Q
8+
9+
10+
class Migration(migrations.Migration):
    """Add the partial trigram GIN index ``trans_unit_source_tm_idx`` on units.

    The index covers ``source`` with the ``gin_trgm_ops`` operator class and
    only includes rows with ``state >= 20`` and a non-empty ``target``.
    """

    dependencies = [("trans", "0067_componentlink_alter_component_links")]

    operations = [
        migrations.AddIndex(
            model_name="unit",
            index=postgres_indexes.GinIndex(
                postgres_indexes.OpClass(
                    models.F("source"),
                    name="gin_trgm_ops",
                ),
                name="trans_unit_source_tm_idx",
                # NOTE(review): 20 presumably equals STATE_TRANSLATED - confirm
                condition=Q(state__gte=20) & ~Q(target=""),
            ),
        ),
    ]

weblate/trans/models/unit.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,11 @@ class Meta:
530530
models.F("translation"),
531531
name="unit_explanation_fulltext",
532532
),
533+
postgres_indexes.GinIndex(
534+
postgres_indexes.OpClass(models.F("source"), name="gin_trgm_ops"),
535+
condition=Q(state__gte=STATE_TRANSLATED) & ~Q(target=""),
536+
name="trans_unit_source_tm_idx",
537+
),
533538
]
534539

535540
def __str__(self) -> str:

0 commit comments

Comments
 (0)