Modified word_score to give absolute score and threshold scores; removed idf

ksgr5566 · ksgr5566 · commit 2804b9fdf9e6 · 2023-09-19T23:39:36.000+05:30
diff --git a/src/search/word_score/local/README.md b/src/search/word_score/local/README.md
@@ -4,10 +4,10 @@
 
 - Git clone the repo and cd to the project location.
 - cd to `local`, i.e., `cd ./src/search/word_score/local`.
-- Replace the file in `./content` with a csv file of your choice, but the data column should be named `tags` column.
+- Replace the link in the Dockerfile with a link to a downloadable CSV file of your choice; the data column must be named `tags`.
 - Start your docker engine and `docker build -t word_score .`.
 - Do `docker run -p 8000:8000 word_score`.
-- `curl -X POST -H "Content-Type: application/json" -d '{"query": "seed procurement district", "n": "5", "search_category" : "seed"}' http://localhost:8000/`. <br> Replace `seed procurement district` with a query you want to search and `5` with the number of rows you want to retrieve.
+- `curl -X POST -H "Content-Type: application/json" -d '{"query": "seed procurement district", "n": "5", "search_category" : "seed", "threshold": "0.8", "k": "6"}' http://localhost:8000/`. <br> Replace `seed procurement district` with the query you want to search and `5` with the number of rows you want to retrieve. Set `threshold` (a value from 0 to 1) to retrieve only documents whose top-`k` score exceeds that threshold. `k` is the number of best-matching query words whose scores are averaged to produce the score used for thresholding.
 - The reponse for above would be: <br>
 `
 {
diff --git a/src/search/word_score/local/model.py b/src/search/word_score/local/model.py
@@ -1,15 +1,12 @@
 from request import ModelRequest
-import pandas as pd
-from math import log
 from thefuzz import fuzz
 import numpy as np
 from cache import AsyncTTL
 from tqdm import tqdm
-import os   
 
 
 class Model:
-    def __init__(self, seed_df,pesticide_df, fertilizer_df, global_df, request: ModelRequest, search_categoty= 'others' ):
+    def __init__(self, seed_df,pesticide_df, fertilizer_df, global_df, request: ModelRequest, search_category= 'others'):
         self.search_category =  request.search_category
         if self.search_category == 'seed':
             self.df = seed_df
@@ -19,54 +16,49 @@ def __init__(self, seed_df,pesticide_df, fertilizer_df, global_df, request: Mode
             self.df = pesticide_df
         else :
             self.df = global_df
-        self.idf_dict = self.__compute_idf(self.df)
     
-    @staticmethod
-    def __compute_idf(df):
-        N = len(df)
-        all_tags = df['tags'].str.split().explode()
-        df_count_series = all_tags.drop_duplicates().value_counts()
-        idf_dict = {tag: log(N / (df_count + 1)) for tag, df_count in df_count_series.items()}
-        return idf_dict
 
-    def __fuzzy_match(self, query_tokens, doc_tokens):
-        weighted_fuzzy_scores = []
+    def __fuzzy_match(self, query_tokens, doc_tokens, k):
+        fuzzy_scores = []
         query_set = set(query_tokens)
         doc_set = set(doc_tokens)
 
         for q_token in query_set:
             max_ratio = None
-            max_token = None
+            # max_token = None
             for token in doc_set:
                 ratio = fuzz.ratio(token, q_token)
                 if max_ratio == None or ratio > max_ratio:
                    max_ratio = ratio
-                   max_token = token
+                #    max_token = token
 
-            
-            idf_weight = self.idf_dict.get(max_token, 0.0)
-            weighted_fuzzy_scores.append((max_ratio / 100) * idf_weight)
+            fuzzy_scores.append((max_ratio / 100))
 
-        return np.mean(weighted_fuzzy_scores)
+        fuzzy_scores = sorted(fuzzy_scores, reverse=True)
+
+        return np.mean(fuzzy_scores), np.mean(fuzzy_scores[:k]) 
 
 
     @AsyncTTL(time_to_live=600000, maxsize=1024)
     async def inference(self, request: ModelRequest):
         scores = []
+        top_k_scores = []
         query = request.query
-        n = int(request.n)
+        threshold = float(request.threshold)
+        k = int(request.k) # k is the number of top k words to consider for the score
+        n = int(request.n) # n is the number of documents to return
         query_tokens = query.lower().split()
 
         for _, row in tqdm(self.df.iterrows()):
             doc_tokens = str(row['tags']).split()
-            fuzzy_score = self.__fuzzy_match(query_tokens, doc_tokens)
+            fuzzy_score, top_k_score = self.__fuzzy_match(query_tokens, doc_tokens, k)
             scores.append(fuzzy_score)
-
-        max_score = max(scores) if scores else 1
-        scores = [score / max_score for score in scores]
+            top_k_scores.append(top_k_score)
 
         new_df = self.df.copy(deep=True)
         new_df['scores'] = scores
+        new_df['top_k_scores'] = top_k_scores
+        new_df = new_df[new_df['top_k_scores'] > threshold]
         new_df_sorted = new_df.sort_values(by=['scores'], ascending=False).head(n)
         return {"docs": new_df_sorted['tags'].to_list()}
     
diff --git a/src/search/word_score/local/request.py b/src/search/word_score/local/request.py
@@ -2,10 +2,12 @@
 
 
 class ModelRequest():
-    def __init__(self, query, n, search_category):
+    def __init__(self, query, n, search_category, threshold, k=5):
         self.query = query
         self.n = n
         self.search_category =  search_category
+        self.threshold = threshold
+        self.k = k
 
     def to_json(self):
         return json.dumps(self, default=lambda o: o.__dict__,

Original file line number	Diff line number	Diff line change
`@@ -4,10 +4,10 @@`
`4`	`4`
`5`	`5`	`- Git clone the repo and cd to the project location.`
`6`	`6`	- cd to `local`, i.e., `cd ./src/search/word_score/local`.
`7`		-- Replace the file in `./content` with a csv file of your choice, but the data column should be named `tags` column.
	`7`	+- Replace the link in the Dockerfile with a link to a downloadable CSV file of your choice; the data column must be named `tags`.
`8`	`8`	- Start your docker engine and `docker build -t word_score .`.
`9`	`9`	- Do `docker run -p 8000:8000 word_score`.
`10`		-- `curl -X POST -H "Content-Type: application/json" -d '{"query": "seed procurement district", "n": "5", "search_category" : "seed"}' http://localhost:8000/`. <br> Replace `seed procurement district` with a query you want to search and `5` with the number of rows you want to retrieve.
	`10`	+- `curl -X POST -H "Content-Type: application/json" -d '{"query": "seed procurement district", "n": "5", "search_category" : "seed", "threshold": "0.8", "k": "6"}' http://localhost:8000/`. <br> Replace `seed procurement district` with the query you want to search and `5` with the number of rows you want to retrieve. Set `threshold` (a value from 0 to 1) to retrieve only documents whose top-`k` score exceeds that threshold. `k` is the number of best-matching query words whose scores are averaged to produce the score used for thresholding.
`11`	`11`	`- The reponse for above would be: <br>`
`12`	`12`	`
`13`	`13`	`{`