Merge pull request #6 from govtechmy/feat/search-relevancy

shenghann · web-flow · commit dc4f10984e81 · 2025-06-13T01:07:25.000+08:00
feat: add AutocompleteView for search suggestions and improve author …
diff --git a/src/api/urls.py b/src/api/urls.py
@@ -9,13 +9,15 @@
     SpeechView,
     AuthorHistoryView,
     AuthorView,
+    AutocompleteView,
     health_check,
 )
 
 urlpatterns = [
     path("catalogue/", CatalogueView.as_view(), name="catalogue"),
     path("search/", SearchResultsList.as_view(), name="search"),
     path("search-plot/", SearchPlotView.as_view(), name="search-plot"),
+    path("autocomplete/", AutocompleteView.as_view(), name="autocomplete"),
     path("sitting/", HansardView.as_view(), name="sitting"),
     path("speech/", SpeechView.as_view(), name="speech"),
     path("attendance/", AttendanceView.as_view(), name="attendance"),
diff --git a/src/api/views.py b/src/api/views.py
@@ -552,6 +552,130 @@ def get(self, request) -> JsonResponse:
         return JsonResponse(res)
 
 
+class AutocompleteView(APIView):
+    """
+    API endpoint for search autocomplete functionality.
+
+    Returns single-word keyword suggestions taken from speech text, filtered
+    only by house.
+
+    Query parameters:
+        q: text to complete; stripped and lowercased. Fewer than
+            MIN_QUERY_LENGTH characters yields an empty suggestion list.
+        limit: maximum number of suggestions (default DEFAULT_SUGGESTIONS).
+            Non-numeric values fall back to the default and the value is
+            clamped to 1..MAX_SUGGESTIONS.
+        house: house identifier (default "dewan-rakyat").
+
+    Response JSON: {"suggestions": [...], "query": "<normalised q>"}. The
+    short-query and error paths return only {"suggestions": []}.
+    """
+
+    DEFAULT_SUGGESTIONS = 8
+    MAX_SUGGESTIONS = 50
+    MIN_QUERY_LENGTH = 2
+    # Upper bound on the number of speeches scanned for candidate words.
+    SPEECH_SAMPLE_SIZE = 100
+
+    # NOTE(review): cache_page stores the whole response per URL for 15
+    # minutes on top of the per-key cache used inside get(), so the empty
+    # error fallback below is cached as well. Kept as-is so response caching
+    # headers do not change; consider dropping one of the two layers.
+    @method_decorator(cache_page(60 * 15))
+    def get(self, request) -> JsonResponse:
+        """Return cached or freshly computed suggestions for the ``q`` parameter."""
+        from hashlib import sha256
+
+        from django.core.cache import cache
+
+        autocomplete_query = request.query_params.get("q", "").strip().lower()
+        max_suggestions = self._parse_limit(request.query_params.get("limit"))
+
+        if len(autocomplete_query) < self.MIN_QUERY_LENGTH:
+            return JsonResponse({"suggestions": []})
+
+        house = request.query_params.get("house", "dewan-rakyat")
+
+        # Hash the user-controlled parts: raw query text can contain spaces or
+        # be arbitrarily long, which memcached-style backends reject as keys.
+        key_material = f"{house}:{autocomplete_query}:{max_suggestions}"
+        cache_key = "autocomplete:" + sha256(key_material.encode("utf-8")).hexdigest()
+
+        cached_result = cache.get(cache_key)
+        if cached_result is not None:
+            return JsonResponse(cached_result)
+
+        # Simple house filtering only
+        house_int = ParliamentaryCycle.get_integer_value(house)
+
+        try:
+            suggestions = self._get_prefix_suggestions(
+                autocomplete_query, house_int, max_suggestions
+            )
+        except Exception as e:
+            # Best-effort endpoint: log with traceback and degrade to an empty
+            # list instead of failing the request.
+            logger.exception("Autocomplete error: %s", e)
+            return JsonResponse({"suggestions": []})
+
+        # Remove duplicates while preserving order; also drop the query itself
+        # and single-character entries.
+        unique_suggestions = []
+        seen = set()
+        for suggestion in suggestions:
+            stripped = suggestion.strip()
+            normalised = stripped.lower()
+            if (
+                len(normalised) <= 1
+                or normalised == autocomplete_query
+                or normalised in seen
+            ):
+                continue
+            seen.add(normalised)
+            unique_suggestions.append(stripped)
+
+        result = {
+            "suggestions": unique_suggestions[:max_suggestions],
+            "query": autocomplete_query,
+        }
+
+        # Keep richer result sets for an hour, sparse ones for 15 minutes.
+        cache_timeout = 3600 if len(unique_suggestions) > 3 else 900
+        cache.set(cache_key, result, cache_timeout)
+
+        return JsonResponse(result)
+
+    def _parse_limit(self, raw_limit):
+        """Convert the raw ``limit`` parameter to an int in 1..MAX_SUGGESTIONS."""
+        if raw_limit is None:
+            return self.DEFAULT_SUGGESTIONS
+        try:
+            limit = int(raw_limit)
+        except (TypeError, ValueError):
+            # Malformed input must not turn into an HTTP 500.
+            return self.DEFAULT_SUGGESTIONS
+        return max(1, min(limit, self.MAX_SUGGESTIONS))
+
+    def _get_prefix_suggestions(self, query, house_int, limit):
+        """
+        Return up to ``limit`` words, sorted alphabetically, that start with ``query``.
+
+        Speeches are selected with a full-text match on ``speech_vector``; if
+        that finds nothing, a ``:*`` prefix match is tried. Candidate words are
+        then extracted in Python from at most SPEECH_SAMPLE_SIZE speeches.
+
+        Args:
+            query: the text to complete.
+            house_int: value matched against ``sitting__cycle__house``.
+            limit: maximum number of words to return.
+
+        Returns:
+            A list of lowercase, letters-only words longer than two characters.
+        """
+        from django.contrib.postgres.search import SearchQuery
+
+        query_lower = query.lower()
+
+        # Candidate words are reduced to alphabetic characters before the
+        # prefix test, so a query containing anything else can never match.
+        # Returning early skips the database and guarantees the raw tsquery
+        # below is built from letters only (no operators, no syntax errors).
+        if limit <= 0 or not query_lower.isalpha():
+            return []
+
+        def sample_speeches(search_query):
+            return Speech.objects.filter(
+                sitting__cycle__house=house_int,
+                is_annotation=False,
+                speech_vector=search_query,
+            ).values_list("speech", flat=True)[: self.SPEECH_SAMPLE_SIZE]
+
+        # NOTE(review): this query uses the default search configuration while
+        # the prefix fallback pins "english" - confirm which one matches how
+        # speech_vector is built.
+        speeches = sample_speeches(SearchQuery(query))
+
+        # If nothing matches the whole word, fall back to word-prefix matching
+        # with the :* operator.
+        if not speeches:
+            speeches = sample_speeches(
+                SearchQuery(f"{query_lower}:*", search_type="raw", config="english")
+            )
+
+        # Collect more candidates than needed, then stop scanning.
+        max_candidates = limit * 3
+        words = set()
+
+        for speech in speeches:
+            if not speech:
+                continue
+            for word in speech.lower().split():
+                # Clean word (remove punctuation and digits)
+                clean_word = "".join(c for c in word if c.isalpha())
+                if len(clean_word) > 2 and clean_word.startswith(query_lower):
+                    words.add(clean_word)
+                    if len(words) >= max_candidates:
+                        return sorted(words)[:limit]
+
+        return sorted(words)[:limit]
+
+
 class CatalogueView(View):
     """
     API endpoint that returns a list of sittings by house.