Merge pull request Samagra-Development#254 from ksgr5566/search

Gautam-Rajeev · web-flow · commit 5a011be19739 · 2023-08-31T10:26:20.000+05:30
Added word_score
diff --git a/config.json b/config.json
@@ -78,6 +78,14 @@
       "environment": {},
       "nginx": []
     },
+    {
+      "serviceName": "word_score",
+      "modelBasePath": "src/search/word_score/local/.",
+      "apiBasePath": "/search/word_score/local",
+      "containerPort": 8000,
+      "environment": {},
+      "nginx": []
+    },
     {
       "serviceName": "text_translation_bhashini",
       "modelBasePath": "src/text_translation/bhashini/remote/.",
diff --git a/repository_data.json b/repository_data.json
@@ -120,6 +120,16 @@
                     "request_class": "ModelRequest"
                 }
             }
+        },
+        "search": {
+            "word_score": {
+                "local": {
+                    "__is_async": true,
+                     "__is_base": true,
+                    "model_class": "Model",
+                    "request_class": "ModelRequest"
+                }
+            }
         }
     }
 }
diff --git a/src/search/README.md b/src/search/README.md
@@ -0,0 +1,3 @@
+# Purpose
+
+Common folder for scoring methods required for augmenting search and retrieval of documents.
diff --git a/src/search/__init__.py b/src/search/__init__.py
@@ -0,0 +1 @@
+from word_score import *
diff --git a/src/search/word_score/README.md b/src/search/word_score/README.md
@@ -0,0 +1,5 @@
+# Word Score
+
+This folder consists of an API that scores documents based on an approach that combines IDF and Fuzzy word matching.
+
+For a given query, it computes a fuzzy-matching score for each word in the query (the best match against any word in the row), weights each score with IDF, and averages the weighted scores over all query words to get a single score for the row. Rows are then sorted by score and the top n matches are returned.
diff --git a/src/search/word_score/__init__.py b/src/search/word_score/__init__.py
@@ -0,0 +1 @@
+from .local import *
diff --git a/src/search/word_score/local/Dockerfile b/src/search/word_score/local/Dockerfile
@@ -0,0 +1,14 @@
+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+
+WORKDIR /app
+
+#install requirements
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+
+# Copy the rest of the application code to the working directory
+COPY . /app/
+EXPOSE 8000
+# Set the entrypoint for the container
+CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
diff --git a/src/search/word_score/local/README.md b/src/search/word_score/local/README.md
@@ -0,0 +1,18 @@
+# Word Score
+
+## Test Deployment
+
+- Git clone the repo and cd to the project location.
+- cd to `local`, i.e., `cd ./src/search/word_score/local`.
+- Replace the file in `./content` with a CSV file of your choice; the data column must be named `tags`.
+- Start your docker engine and `docker build -t word_score .`.
+- Do `docker run -p 8000:8000 word_score`.
+- `curl -X POST -H "Content-Type: application/json" -d '{"query": QUERY, "n": N}' http://0.0.0.0:8000`. <br> Replace `QUERY` with a query and `N` with the number of rows you want to retrieve.
+- The response for the above would be:
+```
+{
+    "docs": ["row1", "row2", ... , "rowN"]
+}
+```
+The list of strings contains the top N rows.
+
diff --git a/src/search/word_score/local/__init__.py b/src/search/word_score/local/__init__.py
@@ -0,0 +1,2 @@
+from .request import *
+from .model import *
diff --git a/src/search/word_score/local/api.py b/src/search/word_score/local/api.py
@@ -0,0 +1,26 @@
+from model import Model
+from request import ModelRequest
+from quart import Quart, request
+import aiohttp
+
+#from fastapi import FastAPI, Body
+app = Quart(__name__)
+#app.client = aiohttp.ClientSession()
+#app = FastAPI()
+
+@app.before_serving
+async def startup():
+    """Create the aiohttp client session and attach it to the app as ``app.client``."""
+    app.client = aiohttp.ClientSession()
+
+@app.route('/', methods=['POST'])
+async def translate():
+    data = await request.get_json()
+    req = ModelRequest(**data)
+    model = Model(app)
+    return await model.inference(req)
+
+@app.route('/', methods=['GET'])
+async def hi():
+    """Answer ``GET /`` with the plain string ``"hi"``."""
+    return "hi"
+
+
diff --git a/src/search/word_score/local/model.py b/src/search/word_score/local/model.py
@@ -0,0 +1,68 @@
+from request import ModelRequest
+import pandas as pd
+from math import log
+from thefuzz import fuzz
+import numpy as np
+from cache import AsyncTTL
+from tqdm import tqdm
+import os
+
+
+class Model:
+    def __new__(cls, context):
+        cls.context = context
+        if not hasattr(cls, 'instance'):
+            files = os.listdir("./content")
+            cls.df = pd.read_csv(os.path.join("./content", files[0]))
+            cls.idf_dict = cls._Model__compute_idf(cls.df)
+            cls.instance = super(Model, cls).__new__(cls)
+        return cls.instance
+    
+    @staticmethod
+    def __compute_idf(df):
+        N = len(df)
+        all_tags = df['tags'].str.lower().str.split().explode()
+        df_count_series = all_tags.drop_duplicates().value_counts()
+        idf_dict = {tag: log(N / (df_count + 1)) for tag, df_count in df_count_series.items()}
+        return idf_dict
+    
+    def __fuzzy_match(self, query_tokens, doc_tokens):
+        weighted_fuzzy_scores = []
+        query_set = set(query_tokens)
+        doc_set = set(doc_tokens)
+
+        for q_token in query_set:
+            max_ratio = None
+            max_token = None
+            for token in doc_set:
+                ratio = fuzz.ratio(token, q_token)
+                if max_ratio == None or ratio > max_ratio:
+                   max_ratio = ratio
+                   max_token = token
+
+            idf_weight = self.idf_dict.get(max_token)
+            weighted_fuzzy_scores.append((max_ratio / 100) * idf_weight)
+
+        return np.mean(weighted_fuzzy_scores)
+
+
+    @AsyncTTL(time_to_live=600000, maxsize=1024)
+    async def inference(self, request: ModelRequest):
+        scores = []
+        query = request.query
+        n = request.n
+        query_tokens = query.lower().split()
+
+        for _, row in tqdm(self.df.iterrows()):
+            doc_tokens = row['tags'].lower().split()
+            fuzzy_score = self.__fuzzy_match(query_tokens, doc_tokens)
+            scores.append(fuzzy_score)
+
+        max_score = max(scores) if scores else 1
+        scores = [score / max_score for score in scores]
+
+        new_df = self.df.copy(deep=True)
+        new_df['scores'] = scores
+        new_df_sorted = new_df.sort_values(by=['scores'], ascending=False).head(n)
+        return {"docs": new_df_sorted['tags'].to_list()}
+    
diff --git a/src/search/word_score/local/request.py b/src/search/word_score/local/request.py
@@ -0,0 +1,11 @@
+import json
+
+
+class ModelRequest():
+    def __init__(self, query, n):
+        self.query = query
+        self.n = n
+
+    def to_json(self):
+        return json.dumps(self, default=lambda o: o.__dict__,
+                          sort_keys=True, indent=4)
diff --git a/src/search/word_score/local/requirements.txt b/src/search/word_score/local/requirements.txt
@@ -0,0 +1,6 @@
+thefuzz
+quart
+aiohttp
+async-cache==1.1.1
+pandas
+tqdm

Original file line number	Diff line number	Diff line change
`@@ -120,6 +120,16 @@`
`120`	`120`	`"request_class": "ModelRequest"`
`121`	`121`	`}`
`122`	`122`	`}`
	`123`	`+ },`
	`124`	`+ "search": {`
	`125`	`+ "word_score": {`
	`126`	`+ "local": {`
	`127`	`+ "__is_async": true,`
	`128`	`+ "__is_base": true,`
	`129`	`+ "model_class": "Model",`
	`130`	`+ "request_class": "ModelRequest"`
	`131`	`+ }`
	`132`	`+ }`
`123`	`133`	`}`
`124`	`134`	`}`
`125`	`135`	`}`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Purpose`
	`2`	`+`
	`3`	`+Common folder for scoring methods required for augmenting search and retrieval of documents.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from .request import *`
	`2`	`+from .model import *`