Skip to content

Commit 9a43da9

Browse files
sarahboyce and pauloxnet committed
Made typos in docs searches return some results.
Co-authored-by: Paolo Melchiorre <[email protected]>
1 parent 9767504 commit 9a43da9

File tree

3 files changed

+79
-19
lines changed

3 files changed

+79
-19
lines changed

docs/models.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -250,8 +250,7 @@ def search(self, query_text, release):
250250
query_text, config=models.F("config"), search_type="websearch"
251251
)
252252
search_rank = SearchRank(models.F("search"), search_query)
253-
similarity = TrigramSimilarity("title", query_text)
254-
return (
253+
base_qs = (
255254
self.prefetch_related(
256255
Prefetch(
257256
"release",
@@ -261,12 +260,8 @@ def search(self, query_text, release):
261260
"release__release", queryset=Release.objects.only("version")
262261
),
263262
)
264-
.filter(
265-
release_id=release.id,
266-
search=search_query,
267-
)
263+
.filter(release_id=release.id)
268264
.annotate(
269-
rank=search_rank + similarity,
270265
headline=SearchHeadline(
271266
"title",
272267
search_query,
@@ -283,12 +278,28 @@ def search(self, query_text, release):
283278
),
284279
breadcrumbs=models.F("metadata__breadcrumbs"),
285280
)
286-
.order_by("-rank")
287281
.only(
288282
"path",
289283
"release",
290284
)
291285
)
286+
vector_qs = (
287+
base_qs.alias(rank=search_rank)
288+
.filter(search=search_query)
289+
.order_by("-rank")
290+
)
291+
if not vector_qs:
292+
return (
293+
base_qs.alias(
294+
similarity=TrigramSimilarity(
295+
"title", utils.sanitize_for_trigram(query_text)
296+
)
297+
)
298+
.filter(similarity__gt=0.3)
299+
.order_by("-similarity")
300+
)
301+
else:
302+
return vector_qs
292303
else:
293304
return self.none()
294305

docs/tests.py

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from .models import DOCUMENT_SEARCH_VECTOR, Document, DocumentRelease
2121
from .sitemaps import DocsSitemap
2222
from .templatetags.docs import get_all_doc_versions
23-
from .utils import get_doc_path
23+
from .utils import get_doc_path, sanitize_for_trigram
2424

2525

2626
class ModelsTests(TestCase):
@@ -262,6 +262,30 @@ def test_get_doc_path(self):
262262
path, filename = __file__.rsplit(os.path.sep, 1)
263263
self.assertEqual(get_doc_path(Path(path), filename), None)
264264

265+
def test_sanitize_for_trigram(self):
    # Each pair is (raw websearch-style query, expected sanitized text).
    # The cases cover negated words, negated quoted phrases, plain quoted
    # phrases and accented input in several languages.
    cases = (
        ("simple search", "simple search"),
        ("Python Django -Flask", "Python Django"),
        ('Python "Django Framework" -Flask', "Python Django Framework"),
        ("Développement -'Framework Django' web", "Developpement web"),
        (
            "Γλώσσα προγραμματισμού Python -'Flask και Django'",
            "Γλωσσα προγραμματισμου Python",
        ),
        (
            "Pemrograman Python -'Flask dan Django' backend",
            "Pemrograman Python backend",
        ),
        (
            "Programmazione 'Python e Django' -Flask",
            "Programmazione Python e Django",
        ),
        ("Linguagem Python -'Django e Flask' web", "Linguagem Python web"),
        ("Desarrollo Python -'Django y Flask' rápido", "Desarrollo Python rapido"),
    )
    for raw_query, expected in cases:
        # subTest reports every failing pair instead of stopping at the first.
        with self.subTest(query=raw_query):
            actual = sanitize_for_trigram(raw_query)
            self.assertEqual(actual, expected)
288+
265289

266290
class UpdateDocTests(TestCase):
267291
@classmethod
@@ -548,18 +572,16 @@ def setUp(self):
548572
def test_search(self):
549573
expected_list = [
550574
(
551-
0.96982837,
552575
"releases/1.2.1",
553-
"<mark>Django</mark> 1.2.1 release notes",
576+
"<mark>Django</mark> 1.2.1 release notes", # Ranked: 0.96982837.
554577
(
555578
"<mark>Django</mark> 1.2.1 release notes ¶ \n "
556579
"<mark>Django</mark> 1.2.1 was released almost immediately after 1.2.0 to correct two small"
557580
),
558581
),
559582
(
560-
0.9490876,
561583
"releases/1.9.4",
562-
"<mark>Django</mark> 1.9.4 release notes",
584+
"<mark>Django</mark> 1.9.4 release notes", # Ranked: 0.9490876.
563585
(
564586
"<mark>Django</mark> 1.9.4 release notes ¶ \n "
565587
"March 5, 2016 \n "
@@ -570,24 +592,24 @@ def test_search(self):
570592
self.assertQuerySetEqual(
571593
Document.objects.search("django", self.release),
572594
expected_list,
573-
transform=attrgetter("rank", "path", "headline", "highlight"),
595+
transform=attrgetter("path", "headline", "highlight"),
574596
)
575597

576598
def test_websearch(self):
577599
self.assertQuerySetEqual(
578600
Document.objects.search('django "release notes" -packaging', self.release),
579-
[("Django 1.9.4 release notes", 1.5675676)],
580-
transform=attrgetter("title", "rank"),
601+
["Django 1.9.4 release notes"],
602+
transform=attrgetter("title"),
581603
)
582604

583605
def test_multilingual_search(self):
584606
self.assertQuerySetEqual(
585607
Document.objects.search("publication", self.release_fr),
586608
[
587-
("Notes de publication de Django 1.2.1", 1.0693262),
588-
("Notes de publication de Django 1.9.4", 1.0458658),
609+
"Notes de publication de Django 1.2.1", # Ranked: 1.0693262.
610+
"Notes de publication de Django 1.9.4", # Ranked: 1.0458658.
589611
],
590-
transform=attrgetter("title", "rank"),
612+
transform=attrgetter("title"),
591613
)
592614

593615
def test_empty_search(self):
@@ -642,6 +664,15 @@ def test_search_highlight_stemmed(self):
642664
transform=attrgetter("headline", "highlight"),
643665
)
644666

667+
def test_search_title(self):
    # "viewss" is a deliberate typo; the expected result is the document
    # whose headline is "Generic views".
    # The search call stays outside the query-count block so only the
    # evaluation of the returned queryset is counted.
    results = Document.objects.search("viewss", self.release)
    expected_headlines = ["Generic views"]
    with self.assertNumQueries(2):
        self.assertQuerySetEqual(
            results,
            expected_headlines,
            transform=attrgetter("headline"),
        )
675+
645676

646677
class TemplateTestCase(TestCase):
647678
def _assertOGTitleEqual(self, doc, expected):

docs/utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import re
2+
import unicodedata
3+
14
from django.conf import settings
25
from django.http import Http404
36

@@ -39,3 +42,18 @@ def get_doc_path_or_404(docroot, subpath):
3942
if doc is None:
4043
raise Http404(doc)
4144
return doc
45+
46+
47+
def sanitize_for_trigram(text):
    """
    Sanitize a websearch-style query for PostgreSQL trigram matching.

    - Removes negated terms: a '-' at the start of the text or right after
      whitespace, followed by either a bare word or a phrase wrapped in
      matching single or double quotes
    - Normalizes Unicode characters (NFKD) and drops combining marks, so
      accented letters are reduced to their base letter
    - Replaces every remaining character that is neither a word character
      (letter, digit, underscore) nor whitespace with a space
    - Collapses runs of whitespace and trims

    Returns the sanitized string, which may be empty.
    """
    # The backreference makes the closing quote match the opening one, so a
    # negated phrase such as -"it's fine" is removed as a whole instead of
    # being cut at the apostrophe. The match is replaced by a space so the
    # surrounding words never get glued together.
    text = re.sub(
        r'(?:\s|^)-(?:(["\'])(?:(?!\1).)+\1|[^\s"\']+)',
        " ",
        text,
        flags=re.DOTALL,
    )
    text = unicodedata.normalize("NFKD", text)
    # NFKD splits "é" into "e" plus a combining accent; the mark is dropped
    # without inserting a space so the word stays in one piece.
    text = "".join(
        char for char in text if not unicodedata.category(char).startswith("M")
    )
    # Punctuation becomes a separator rather than vanishing, so
    # "django.contrib.auth" yields three words instead of one fused token.
    text = re.sub(r"[^\w\s]", " ", text)
    return " ".join(text.split())

0 commit comments

Comments
 (0)