Skip to content

Commit c44d936

Browse files
committed
perf(machinery): improve built-in machinery performance
- add a focused index that matches the exact conditions the lookup uses
- cap the number of fetched results to avoid streaming too much data from the database
- use a cheaper exact lookup when 100% similarity is requested
1 parent a14600c commit c44d936

File tree

6 files changed

+198
-41
lines changed

6 files changed

+198
-41
lines changed

docs/changes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Weblate 5.17
1717
* Improved error messages in some of the :ref:`api` endpoints.
1818
* Improved performance of project and category search result pages with very large match sets.
1919
* :envvar:`WEBLATE_COMMIT_PENDING_HOURS` is now available in Docker container.
20+
* Improved performance of built-in Weblate machinery lookups.
2021

2122
.. rubric:: Bug fixes
2223

weblate/machinery/tests.py

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from botocore.stub import ANY, Stubber
2222
from django.core.management import call_command
2323
from django.core.management.base import CommandError
24-
from django.test import TestCase
24+
from django.test import SimpleTestCase, TestCase
2525
from django.urls import reverse
2626
from google.api_core import exceptions as google_api_exceptions
2727
from google.cloud.translate import (
@@ -2972,6 +2972,20 @@ def test_exists(self) -> None:
29722972
results = machine.translate(unit, self.user)
29732973
self.assertNotEqual(results, [])
29742974

2975+
@patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
def test_matches_still_probe_fuzzy_lookup(self, adjust_threshold) -> None:
    """Check the similarity threshold is adjusted even when a match exists."""
    czech_units = Unit.objects.filter(translation__language_code="cs")
    unit = czech_units[0]

    # Turn a sibling unit into a translated duplicate of the same source so
    # the default-threshold lookup has something to return.
    duplicate = unit.translation.unit_set.exclude(pk=unit.pk)[0]
    duplicate.source = unit.source
    duplicate.target = "Preklad"
    duplicate.state = STATE_TRANSLATED
    duplicate.save()

    WeblateTranslation({}).translate(unit, self.user)

    adjust_threshold.assert_called_once_with(0.98)
2988+
29752989

29762990
class CyrTranslitTranslationTest(ViewTestCase, BaseMachineTranslationTest):
29772991
ENGLISH = "sr@latin"
@@ -3318,6 +3332,92 @@ def test_configure_invalid(self) -> None:
33183332
)
33193333

33203334

3335+
class WeblateTranslationLookupTest(SimpleTestCase):
    """Mock-based checks of the WeblateTranslation query-building helpers."""

    @patch("weblate.machinery.weblatetm.Unit.objects.filter")
    @patch("weblate.machinery.weblatetm.Translation.objects")
    def test_get_base_queryset_uses_translation_subquery(
        self, translation_objects, unit_filter
    ) -> None:
        user = MagicMock()
        all_translations = MagicMock()
        accessible = MagicMock()
        narrowed = MagicMock()
        units = MagicMock()

        # Wire up Translation.objects.all().filter_access().filter().values()
        translation_objects.all.return_value = all_translations
        all_translations.filter_access.return_value = accessible
        accessible.filter.return_value = narrowed
        narrowed.values.return_value = "translation-subquery"
        unit_filter.return_value = units

        result = WeblateTranslation({}).get_base_queryset(user, "en", "cs")

        self.assertEqual(result, units)
        all_translations.filter_access.assert_called_once_with(user)
        accessible.filter.assert_called_once_with(
            component__source_language="en",
            language="cs",
        )
        narrowed.values.assert_called_once_with("id")
        unit_filter.assert_called_once_with(
            state__gte=STATE_TRANSLATED,
            translation_id__in="translation-subquery",
        )

    @patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
    def test_get_matching_units_uses_fuzzy_lookup(self, adjust_threshold) -> None:
        machine = WeblateTranslation({})
        hit = MagicMock(pk=1)
        base = MagicMock()
        base.filter.return_value = MagicMock()

        # prepare_queryset is stubbed so slicing happens on a plain list.
        with patch.object(machine, "prepare_queryset", return_value=[hit]):
            found = machine.get_matching_units(base, "Hello", 75)

        self.assertEqual(found, [hit])
        base.filter.assert_called_once_with(source__trgm_search="Hello")
        adjust_threshold.assert_called_once_with(0.98)

    @patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
    def test_get_matching_units_orders_short_queries_before_slicing(
        self, adjust_threshold
    ) -> None:
        machine = WeblateTranslation({})
        hit = MagicMock(pk=1)
        base = MagicMock()
        ranked = MagicMock()

        # "id" has only two alphanumeric characters, so the short-query
        # path is expected instead of the trigram search.
        with (
            patch.object(
                machine, "get_short_query_matches", return_value=ranked
            ) as short_lookup,
            patch.object(machine, "prepare_queryset", return_value=[hit]),
        ):
            found = machine.get_matching_units(base, "id", 75)

        self.assertEqual(found, [hit])
        short_lookup.assert_called_once_with(base, "id")
        adjust_threshold.assert_called_once_with(0.98)

    @patch("weblate.machinery.weblatetm.adjust_similarity_threshold")
    def test_get_matching_units_uses_exact_lookup_at_full_threshold(
        self, adjust_threshold
    ) -> None:
        machine = WeblateTranslation({})
        hit = MagicMock(pk=1)
        base = MagicMock()
        base.filter.return_value = MagicMock()

        with patch.object(machine, "prepare_queryset", return_value=[hit]):
            found = machine.get_matching_units(base, "Hello", 100)

        self.assertEqual(found, [hit])
        # A threshold of 100 must not touch the trigram similarity setting.
        adjust_threshold.assert_not_called()
3419+
3420+
33213421
class CommandTest(FixtureTestCase):
33223422
"""Test for management commands."""
33233423

weblate/machinery/weblatetm.py

Lines changed: 62 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@
44

55
from __future__ import annotations
66

7-
from typing import TYPE_CHECKING, Any
7+
from typing import TYPE_CHECKING
88

99
from django.conf import settings
10-
from django.db.models import Value
11-
from django.db.models.functions import MD5, Lower
10+
from django.db.models import Case, IntegerField, Value, When
11+
from django.db.models.functions import MD5, Length, Lower
1212

13-
from weblate.trans.models import Unit
14-
from weblate.utils.db import adjust_similarity_threshold
13+
from weblate.trans.models import Translation, Unit
14+
from weblate.utils.db import adjust_similarity_threshold, use_trgm_fallback
1515
from weblate.utils.state import STATE_TRANSLATED
1616

1717
from .base import InternalMachineTranslation
@@ -28,6 +28,7 @@ class WeblateTranslation(InternalMachineTranslation):
2828
cache_translations = True
2929
# Cache results for 1 hour to avoid frequent database hits
3030
cache_expiry = 3600
31+
candidate_limit = 100
3132

3233
def download_translations(
3334
self,
@@ -39,42 +40,12 @@ def download_translations(
3940
threshold: int = 10,
4041
) -> DownloadTranslations:
4142
"""Download list of possible translations from a service."""
42-
# Filter based on user access
43-
base = Unit.objects.filter_access(user) if user else Unit.objects.all()
44-
45-
# Use memory_db for the query in case it exists. This is supposed
46-
# to be a read-only replica for offloading expensive translation
47-
# queries.
48-
if "memory_db" in settings.DATABASES:
49-
base = base.using("memory_db")
50-
51-
# Build search query
52-
lookup: dict[str, Any] = {}
53-
if threshold < 100:
54-
# Full text search
55-
lookup["source__trgm_search"] = text
56-
else:
57-
# Utilize PostgreSQL index
58-
lookup["source__lower__md5"] = MD5(Lower(Value(text)))
59-
lookup["source"] = text
60-
61-
matching_units = (
62-
base.filter(
63-
translation__component__source_language=source_language,
64-
translation__language=target_language,
65-
state__gte=STATE_TRANSLATED,
66-
**lookup,
67-
)
68-
.exclude(
69-
# The read-only strings can be possibly blank
70-
target__lower__md5=MD5(Lower(Value("")))
71-
)
72-
.prefetch()
43+
matching_units = self.get_matching_units(
44+
self.get_base_queryset(user, source_language, target_language),
45+
text,
46+
threshold,
7347
)
7448

75-
# We want only close matches here
76-
adjust_similarity_threshold(0.98)
77-
7849
for munit in matching_units:
7950
source = munit.source_string
8051
if "forbidden" in munit.all_flags:
@@ -91,3 +62,55 @@ def download_translations(
9162
"origin_url": munit.get_absolute_url(),
9263
"source": source,
9364
}
65+
66+
def get_base_queryset(self, user, source_language, target_language):
    """
    Build the queryset of translated units that a lookup may search.

    The language pair and (when a user is given) access filtering are
    resolved on translations and passed to the unit query as an ``id``
    subquery.

    :param user: User whose access limits the translations; ``None``
        skips access filtering.
    :param source_language: Source language of the component.
    :param target_language: Language of the translation.
    :return: Unit queryset restricted to ``state >= STATE_TRANSLATED``.
    """
    translations = Translation.objects.all()
    if user is not None:
        translations = translations.filter_access(user)

    # Left unpinned on purpose: the subquery is compiled as part of the
    # outer unit query, so it runs on whichever database that query uses.
    translation_ids = translations.filter(
        component__source_language=source_language,
        language=target_language,
    ).values("id")

    base = Unit.objects.filter(
        state__gte=STATE_TRANSLATED,
        translation_id__in=translation_ids,
    )

    # Use memory_db for the query in case it exists. Only pin the database
    # then: an unconditional .using("default") would override any
    # configured database router for installations without memory_db.
    if "memory_db" in settings.DATABASES:
        return base.using("memory_db")
    return base
82+
83+
def get_matching_units(self, base, text: str, threshold: int):
    """
    Return up to ``candidate_limit`` units from *base* matching *text*.

    A threshold of 100 or more uses an exact source lookup. Anything lower
    sets the similarity threshold to 0.98 and then runs either the
    short-query lookup or the trigram search.
    """
    if threshold >= 100:
        # Exact match: compare the MD5 of the lowercased source as well as
        # the source text itself.
        candidates = base.filter(
            source__lower__md5=MD5(Lower(Value(text))),
            source=text,
        )
    else:
        # We want only close matches here
        adjust_similarity_threshold(0.98)
        candidates = (
            self.get_short_query_matches(base, text)
            if use_trgm_fallback(text)
            else base.filter(source__trgm_search=text)
        )

    limited = self.prepare_queryset(candidates)[: self.candidate_limit]
    return list(limited)
97+
98+
def prepare_queryset(self, queryset):
    """Exclude units with an empty target, then call ``prefetch()``."""
    with_target = queryset.exclude(target="")
    return with_target.prefetch()
100+
101+
def get_short_query_matches(self, base, text: str):
    """
    Return units from *base* whose source contains *text*, best first.

    Sources longer than a bound derived from the query length are dropped.
    The rest are ordered by match rank (case-insensitive exact, then
    prefix, then any other containment), then source length, then primary
    key.
    """
    text_length = len(text)
    max_source_length = max(text_length + 4, text_length * 2, 8)

    rank = Case(
        When(source__iexact=text, then=Value(0)),
        When(source__istartswith=text, then=Value(1)),
        default=Value(2),
        output_field=IntegerField(),
    )
    candidates = base.filter(source__icontains=text).annotate(
        short_query_rank=rank,
        source_length=Length("source"),
    )
    bounded = candidates.filter(source_length__lte=max_source_length)
    return bounded.order_by("short_query_rank", "source_length", "pk")
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright © Michal Čihař <michal@weblate.org>
2+
#
3+
# SPDX-License-Identifier: GPL-3.0-or-later
4+
5+
from django.contrib.postgres import indexes as postgres_indexes
6+
from django.db import migrations, models
7+
from django.db.models import Q
8+
9+
10+
class Migration(migrations.Migration):
    """Add a partial trigram GIN index on ``Unit.source``."""

    dependencies = [("trans", "0067_componentlink_alter_component_links")]

    operations = [
        migrations.AddIndex(
            model_name="unit",
            index=postgres_indexes.GinIndex(
                postgres_indexes.OpClass(models.F("source"), name="gin_trgm_ops"),
                name="trans_unit_source_tm_idx",
                # Literal state value is kept here instead of a constant;
                # presumably 20 is STATE_TRANSLATED - confirm against
                # weblate.utils.state.
                condition=models.Q(state__gte=20) & ~models.Q(target=""),
            ),
        ),
    ]

weblate/trans/models/unit.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,11 @@ class Meta:
530530
models.F("translation"),
531531
name="unit_explanation_fulltext",
532532
),
533+
postgres_indexes.GinIndex(
534+
postgres_indexes.OpClass(models.F("source"), name="gin_trgm_ops"),
535+
condition=Q(state__gte=STATE_TRANSLATED) & ~Q(target=""),
536+
name="trans_unit_source_tm_idx",
537+
),
533538
]
534539

535540
def __str__(self) -> str:

weblate/utils/db.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ def count_alnum(string):
5555
return sum(map(str.isalnum, string))
5656

5757

58+
def use_trgm_fallback(string: str) -> bool:
    """Return whether *string* has at most three alphanumeric characters."""
    alnum_total = count_alnum(string)
    return alnum_total <= 3
60+
61+
5862
class PostgreSQLFallbackLookupMixin(Lookup):
5963
"""
6064
Mixin to block PostgreSQL from using trigram index.
@@ -79,7 +83,7 @@ def process_lhs(self, compiler, connection, lhs=None):
7983

8084
class PostgreSQLFallbackLookup(PostgreSQLFallbackLookupMixin, PatternLookup):
8185
def __init__(self, lhs, rhs) -> None:
82-
self._needs_fallback = isinstance(rhs, str) and count_alnum(rhs) <= 3
86+
self._needs_fallback = isinstance(rhs, str) and use_trgm_fallback(rhs)
8387
super().__init__(lhs, rhs)
8488

8589

0 commit comments

Comments
 (0)