Skip to content

Commit dc4f109

Browse files
authored
Merge pull request #6 from govtechmy/feat/search-relevancy
feat: add AutocompleteView for search suggestions and improve author …
2 parents 4ada70b + 6083a6d commit dc4f109

File tree

2 files changed

+126
-0
lines changed

2 files changed

+126
-0
lines changed

src/api/urls.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@
99
SpeechView,
1010
AuthorHistoryView,
1111
AuthorView,
12+
AutocompleteView,
1213
health_check,
1314
)
1415

1516
urlpatterns = [
1617
path("catalogue/", CatalogueView.as_view(), name="catalogue"),
1718
path("search/", SearchResultsList.as_view(), name="search"),
1819
path("search-plot/", SearchPlotView.as_view(), name="search-plot"),
20+
path("autocomplete/", AutocompleteView.as_view(), name="autocomplete"),
1921
path("sitting/", HansardView.as_view(), name="sitting"),
2022
path("speech/", SpeechView.as_view(), name="speech"),
2123
path("attendance/", AttendanceView.as_view(), name="attendance"),

src/api/views.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,130 @@ def get(self, request) -> JsonResponse:
552552
return JsonResponse(res)
553553

554554

555+
class AutocompleteView(APIView):
    """
    API endpoint for search autocomplete functionality.

    Returns keyword suggestions extracted from speech content, filtered only
    by house.

    Query parameters:
        q: Text typed so far. It is stripped and lower-cased; anything
            shorter than ``MIN_QUERY_LENGTH`` characters yields no suggestions.
        limit: Maximum number of suggestions. Missing or non-numeric values
            fall back to ``DEFAULT_LIMIT``; the value is clamped to
            ``0..MAX_LIMIT``.
        house: House identifier handed to
            ``ParliamentaryCycle.get_integer_value`` (default "dewan-rakyat").

    Response body: ``{"suggestions": [...], "query": "<normalised q>"}``.
    Short queries and internal errors return only ``{"suggestions": []}``.
    """

    DEFAULT_LIMIT = 8  # used when "limit" is absent or not an integer
    MAX_LIMIT = 50  # upper bound accepted for "limit"
    MIN_QUERY_LENGTH = 2  # shorter queries are answered with no suggestions
    SPEECH_SAMPLE_SIZE = 100  # speeches fetched per full-text query
    OVERSAMPLE_FACTOR = 3  # collect up to limit * factor words before sorting

    # NOTE(review): cache_page stores the whole response per URL, including
    # the empty fallback returned after an error — confirm that is intended
    # alongside the explicit cache below.
    @method_decorator(cache_page(60 * 15))  # Cache for 15 minutes
    def get(self, request) -> JsonResponse:
        """Return autocomplete suggestions for the ``q`` query parameter."""
        import hashlib

        from django.core.cache import cache

        autocomplete_query = request.query_params.get("q", "").strip().lower()
        if len(autocomplete_query) < self.MIN_QUERY_LENGTH:
            return JsonResponse({"suggestions": []})

        max_suggestions = self._parse_limit(request.query_params.get("limit"))
        house = request.query_params.get("house", "dewan-rakyat")

        # The key is built from user input, which may contain whitespace or be
        # arbitrarily long; hashing keeps it valid for every cache backend.
        key_material = f"{house}:{autocomplete_query}:{max_suggestions}"
        key_digest = hashlib.sha256(key_material.encode("utf-8")).hexdigest()
        cache_key = f"autocomplete:{key_digest}"

        cached_result = cache.get(cache_key)
        if cached_result is not None:
            return JsonResponse(cached_result)

        # Simple house filtering only
        house_int = ParliamentaryCycle.get_integer_value(house)

        try:
            suggestions = self._get_prefix_suggestions(
                autocomplete_query, house_int, max_suggestions
            )
        except Exception as exc:
            # Best effort: autocomplete must never break the search box, so
            # log the failure (with traceback) and answer with no suggestions.
            logger.exception("Autocomplete error: %s", exc)
            return JsonResponse({"suggestions": []})

        # Remove duplicates while preserving order; also drop suggestions that
        # merely echo the query or are a single character long.
        unique_suggestions = []
        seen = set()
        for suggestion in suggestions:
            stripped = suggestion.strip()
            normalised = stripped.lower()
            if (
                normalised in seen
                or normalised == autocomplete_query
                or len(normalised) <= 1
            ):
                continue
            seen.add(normalised)
            unique_suggestions.append(stripped)

        result = {
            "suggestions": unique_suggestions[:max_suggestions],
            "query": autocomplete_query,
        }

        # Queries that produce several suggestions are cached for an hour,
        # sparse ones for 15 minutes.
        cache_timeout = 3600 if len(unique_suggestions) > 3 else 900
        cache.set(cache_key, result, cache_timeout)

        return JsonResponse(result)

    def _parse_limit(self, raw_limit):
        """Convert the raw ``limit`` parameter to an int within 0..MAX_LIMIT."""
        try:
            limit = int(raw_limit)
        except (TypeError, ValueError):
            # Absent (None) or non-numeric input: use the default instead of
            # failing the request.
            return self.DEFAULT_LIMIT
        return max(0, min(limit, self.MAX_LIMIT))

    def _fetch_speech_texts(self, search_query, house_int):
        """
        Return the text of up to SPEECH_SAMPLE_SIZE non-annotation speeches of
        the given house whose ``speech_vector`` matches ``search_query``.

        No explicit ordering is applied here, so the sample is not ranked by
        relevance.
        """
        return list(
            Speech.objects.filter(
                sitting__cycle__house=house_int,
                is_annotation=False,
                speech_vector=search_query,
            ).values_list("speech", flat=True)[: self.SPEECH_SAMPLE_SIZE]
        )

    def _get_prefix_suggestions(self, query, house_int, limit):
        """
        Suggest words that start with ``query``, taken from matching speeches.

        Speeches are looked up through PostgreSQL full-text search on
        ``speech_vector``: first with ``query`` as a plain search and, if that
        finds nothing, as a word-prefix search (``query:*``). Each speech is
        then split on whitespace, every word is reduced to its alphabetic
        characters, and words longer than two characters that start with the
        query are collected.

        Args:
            query: Text to complete. Only purely alphabetic queries can
                produce suggestions, so multi-word or punctuated input
                returns an empty list.
            house_int: Integer house value used to filter speeches.
            limit: Maximum number of words to return.

        Returns:
            Up to ``limit`` matching words, lower-cased and sorted
            alphabetically.
        """
        from django.contrib.postgres.search import SearchQuery

        query_lower = query.lower()
        max_words = limit * self.OVERSAMPLE_FACTOR

        # Candidate words contain alphabetic characters only, so a query with
        # any other character can never be a prefix of one. Returning early
        # skips two pointless database queries and guarantees that only plain
        # letters are interpolated into the raw tsquery below.
        if max_words <= 0 or not query_lower.isalpha():
            return []

        speeches = self._fetch_speech_texts(SearchQuery(query), house_int)

        # If no results with exact search, try prefix search using raw query
        if not speeches:
            # NOTE(review): this fallback pins config="english" while the
            # first query uses the default config — confirm which one matches
            # how speech_vector is built.
            prefix_query = SearchQuery(
                f"{query_lower}:*", search_type="raw", config="english"
            )
            speeches = self._fetch_speech_texts(prefix_query, house_int)

        # Extract words from speeches in Python
        words = set()
        for speech in speeches:
            if not speech:
                continue
            for word in speech.lower().split():
                # Clean word (remove punctuation and digits)
                clean_word = "".join(c for c in word if c.isalpha())
                if len(clean_word) > 2 and clean_word.startswith(query_lower):
                    words.add(clean_word)
                    # Collect more than needed, then stop scanning: nothing
                    # further would be added once the cap is reached.
                    if len(words) >= max_words:
                        return sorted(words)[:limit]

        return sorted(words)[:limit]
677+
678+
555679
class CatalogueView(View):
556680
"""
557681
API endpoint that returns a list of sittings by house.

0 commit comments

Comments
 (0)