
Commit 32e8179

Merge pull request Samagra-Development#255 from Samagra-Development/fuzzy_search

added right csv and filtered

2 parents: 5a011be + 26157fc

File tree: 4 files changed (+46, −21 lines)

src/search/word_score/local/Dockerfile

Lines changed: 7 additions & 0 deletions

@@ -7,8 +7,15 @@ WORKDIR /app
 COPY requirements.txt requirements.txt
 RUN pip3 install -r requirements.txt
 
+# Download the CSV from Google Drive and store it in the "content" directory
+RUN apt-get update && apt-get install -y curl && \
+    mkdir content && \
+    curl -L 'https://drive.google.com/uc?export=download&id=1Ka6cyCCHbRy6h8Ej075_Nk9mMICp_xS6' -o content/data.csv && \
+    apt-get remove -y curl && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*
+
 # Copy the rest of the application code to the working directory
 COPY . /app/
 EXPOSE 8000
+
 # Set the entrypoint for the container
 CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]

src/search/word_score/local/api.py

Lines changed: 19 additions & 6 deletions

@@ -2,25 +2,38 @@
 from request import ModelRequest
 from quart import Quart, request
 import aiohttp
+import os
+import pandas as pd
 
-#from fastapi import FastAPI, Body
 app = Quart(__name__)
-#app.client = aiohttp.ClientSession()
-#app = FastAPI()
+
+# Global variable for the dataframe
+global_df = None
 
 @app.before_serving
 async def startup():
     app.client = aiohttp.ClientSession()
+
+    # Load the dataframe during startup
+    global global_df
+    global seed_df
+    global pesticide_df
+    global fertilizer_df
+    files = os.listdir("./content")
+    global_df = pd.read_csv(os.path.join("./content", files[0]))
+    global_df['tags'] = global_df['tags'].str.lower()
+    seed_df = global_df.loc[global_df.category == 'seed', :]
+    pesticide_df = global_df.loc[global_df.category == 'pesticide', :]
+    fertilizer_df = global_df.loc[global_df.category == 'fertilizer', :]
 
 @app.route('/', methods=['POST'])
 async def translate():
     data = await request.get_json()
     req = ModelRequest(**data)
-    model = Model(app)
+    # Pass the dataframes as arguments to the Model class
+    model = Model(seed_df, pesticide_df, fertilizer_df, global_df, req)
     return await model.inference(req)
 
 @app.route('/', methods=['GET'])
 async def hi():
     return "hi"
-
-
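
For reference, a request that exercises the new category routing might look like the sketch below; the payload keys mirror ModelRequest(query, n, search_category), the port comes from the Dockerfile's EXPOSE 8000, and the host and field values are illustrative:

# Illustrative client call against the running service; values are made up.
import asyncio
import aiohttp

async def main():
    payload = {"query": "wheat seeds", "n": 5, "search_category": "seed"}
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8000/", json=payload) as resp:
            print(await resp.text())

asyncio.run(main())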

src/search/word_score/local/model.py

Lines changed: 18 additions & 14 deletions

@@ -5,27 +5,30 @@
 import numpy as np
 from cache import AsyncTTL
 from tqdm import tqdm
-import os
+import os
 
 
 class Model:
-    def __new__(cls, context):
-        cls.context = context
-        if not hasattr(cls, 'instance'):
-            files = os.listdir("./content")
-            cls.df = pd.read_csv(os.path.join("./content", files[0]))
-            cls.idf_dict = cls._Model__compute_idf(cls.df)
-            cls.instance = super(Model, cls).__new__(cls)
-        return cls.instance
+    def __init__(self, seed_df, pesticide_df, fertilizer_df, global_df, request: ModelRequest, search_category='others'):
+        self.search_category = request.search_category
+        if self.search_category == 'seed':
+            self.df = seed_df
+        elif self.search_category == 'fertilizer':
+            self.df = fertilizer_df
+        elif self.search_category == 'pesticide':
+            self.df = pesticide_df
+        else:
+            self.df = global_df
+        self.idf_dict = self.__compute_idf(self.df)
 
     @staticmethod
     def __compute_idf(df):
         N = len(df)
-        all_tags = df['tags'].str.lower().str.split().explode()
+        all_tags = df['tags'].str.split().explode()
         df_count_series = all_tags.drop_duplicates().value_counts()
         idf_dict = {tag: log(N / (df_count + 1)) for tag, df_count in df_count_series.items()}
         return idf_dict
-
+
     def __fuzzy_match(self, query_tokens, doc_tokens):
         weighted_fuzzy_scores = []
         query_set = set(query_tokens)

@@ -40,7 +43,8 @@ def __fuzzy_match(self, query_tokens, doc_tokens):
                 max_ratio = ratio
                 max_token = token
 
-        idf_weight = self.idf_dict.get(max_token)
+
+        idf_weight = self.idf_dict.get(max_token, 0.0)
         weighted_fuzzy_scores.append((max_ratio / 100) * idf_weight)
 
         return np.mean(weighted_fuzzy_scores)

@@ -50,11 +54,11 @@ def __fuzzy_match(self, query_tokens, doc_tokens):
     async def inference(self, request: ModelRequest):
         scores = []
         query = request.query
-        n = request.n
+        n = int(request.n)
         query_tokens = query.lower().split()
 
         for _, row in tqdm(self.df.iterrows()):
-            doc_tokens = row['tags'].lower().split()
+            doc_tokens = str(row['tags']).split()
             fuzzy_score = self.__fuzzy_match(query_tokens, doc_tokens)
             scores.append(fuzzy_score)
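
For intuition: __fuzzy_match pairs each query token with its best fuzzy match among a document's tags and weights that match by the tag's IDF, which is why unseen tokens now default to 0.0. A minimal standalone sketch follows; it assumes thefuzz for the ratio function (the actual import sits above the hunks shown) and computes document frequencies directly rather than through pandas:

# Standalone sketch of IDF-weighted fuzzy scoring; `thefuzz` is an assumption,
# since the real import is outside the diff hunks above.
from math import log
import numpy as np
from thefuzz import fuzz

def compute_idf(docs):
    # docs: list of token lists, analogous to the split 'tags' column
    N = len(docs)
    df_count = {}
    for tokens in docs:
        for tok in set(tokens):
            df_count[tok] = df_count.get(tok, 0) + 1
    return {tok: log(N / (c + 1)) for tok, c in df_count.items()}

def fuzzy_score(query_tokens, doc_tokens, idf):
    weighted = []
    for q in set(query_tokens):
        best = max(doc_tokens, key=lambda t: fuzz.ratio(q, t))
        # Unseen tokens default to 0.0, mirroring idf_dict.get(max_token, 0.0)
        weighted.append((fuzz.ratio(q, best) / 100) * idf.get(best, 0.0))
    return np.mean(weighted)

docs = [["hybrid", "wheat", "seed"], ["urea", "fertilizer"], ["neem", "pesticide"]]
idf = compute_idf(docs)
print(fuzzy_score("wheat seeds".lower().split(), docs[0], idf))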

src/search/word_score/local/request.py

Lines changed: 2 additions & 1 deletion

@@ -2,9 +2,10 @@
 
 
 class ModelRequest():
-    def __init__(self, query, n):
+    def __init__(self, query, n, search_category):
         self.query = query
         self.n = n
+        self.search_category = search_category
 
     def to_json(self):
         return json.dumps(self, default=lambda o: o.__dict__,
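
A usage sketch for the extended request (the query and category values are illustrative):

# Illustrative construction of the extended request; values are made up.
from request import ModelRequest

req = ModelRequest(query="urea fertilizer", n=3, search_category="fertilizer")
print(req.to_json())  # serializes query, n, and search_category to JSON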
