
Commit 32e8179

Merge pull request Samagra-Development#255 from Samagra-Development/fuzzy_search

added right csv and filtered

2 parents: 5a011be + 26157fc

File tree: 4 files changed (+46, −21 lines)

src/search/word_score/local/Dockerfile

Lines changed: 7 additions & 0 deletions

@@ -7,8 +7,15 @@ WORKDIR /app
 COPY requirements.txt requirements.txt
 RUN pip3 install -r requirements.txt
 
+# Download the CSV from Google Drive and store it in the "content" directory
+RUN apt-get update && apt-get install -y curl && \
+    mkdir content && \
+    curl -L 'https://drive.google.com/uc?export=download&id=1Ka6cyCCHbRy6h8Ej075_Nk9mMICp_xS6' -o content/data.csv && \
+    apt-get remove -y curl && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*
+
 # Copy the rest of the application code to the working directory
 COPY . /app/
 EXPOSE 8000
+
 # Set the entrypoint for the container
 CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]

src/search/word_score/local/api.py

Lines changed: 19 additions & 6 deletions

@@ -2,25 +2,38 @@
 from request import ModelRequest
 from quart import Quart, request
 import aiohttp
+import os
+import pandas as pd
 
-#from fastapi import FastAPI, Body
 app = Quart(__name__)
-#app.client = aiohttp.ClientSession()
-#app = FastAPI()
+
+# Global variable for the dataframe
+global_df = None
 
 @app.before_serving
 async def startup():
     app.client = aiohttp.ClientSession()
+
+    # Load the dataframe during startup
+    global global_df
+    global seed_df
+    global pesticide_df
+    global fertilizer_df
+    files = os.listdir("./content")
+    global_df = pd.read_csv(os.path.join("./content", files[0]))
+    global_df['tags'] = global_df['tags'].str.lower()
+    seed_df = global_df.loc[global_df.category == 'seed', :]
+    pesticide_df = global_df.loc[global_df.category == 'pesticide', :]
+    fertilizer_df = global_df.loc[global_df.category == 'fertilizer', :]
 
 @app.route('/', methods=['POST'])
 async def translate():
     data = await request.get_json()
     req = ModelRequest(**data)
-    model = Model(app)
+    # Pass the dataframes as arguments to the Model class
+    model = Model(seed_df, pesticide_df, fertilizer_df, global_df, req)
     return await model.inference(req)
 
 @app.route('/', methods=['GET'])
 async def hi():
     return "hi"
-
-
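
For reference, a request that exercises the new category routing might look like the sketch below; the payload keys mirror ModelRequest(query, n, search_category), the port comes from the Dockerfile's EXPOSE 8000, and the host and field values are illustrative:

# Illustrative client call against the running service; values are made up.
import asyncio
import aiohttp

async def main():
    payload = {"query": "wheat seeds", "n": 5, "search_category": "seed"}
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8000/", json=payload) as resp:
            print(await resp.text())

asyncio.run(main())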

src/search/word_score/local/model.py

Lines changed: 18 additions & 14 deletions

@@ -5,27 +5,30 @@
 import numpy as np
 from cache import AsyncTTL
 from tqdm import tqdm
-import os
+import os
 
 
 class Model:
-    def __new__(cls, context):
-        cls.context = context
-        if not hasattr(cls, 'instance'):
-            files = os.listdir("./content")
-            cls.df = pd.read_csv(os.path.join("./content", files[0]))
-            cls.idf_dict = cls._Model__compute_idf(cls.df)
-            cls.instance = super(Model, cls).__new__(cls)
-        return cls.instance
+    def __init__(self, seed_df, pesticide_df, fertilizer_df, global_df, request: ModelRequest, search_category='others'):
+        self.search_category = request.search_category
+        if self.search_category == 'seed':
+            self.df = seed_df
+        elif self.search_category == 'fertilizer':
+            self.df = fertilizer_df
+        elif self.search_category == 'pesticide':
+            self.df = pesticide_df
+        else:
+            self.df = global_df
+        self.idf_dict = self.__compute_idf(self.df)
 
     @staticmethod
     def __compute_idf(df):
         N = len(df)
-        all_tags = df['tags'].str.lower().str.split().explode()
+        all_tags = df['tags'].str.split().explode()
         df_count_series = all_tags.drop_duplicates().value_counts()
         idf_dict = {tag: log(N / (df_count + 1)) for tag, df_count in df_count_series.items()}
         return idf_dict
-
+
     def __fuzzy_match(self, query_tokens, doc_tokens):
         weighted_fuzzy_scores = []
         query_set = set(query_tokens)

@@ -40,7 +43,8 @@ def __fuzzy_match(self, query_tokens, doc_tokens):
                 max_ratio = ratio
                 max_token = token
 
-        idf_weight = self.idf_dict.get(max_token)
+
+        idf_weight = self.idf_dict.get(max_token, 0.0)
         weighted_fuzzy_scores.append((max_ratio / 100) * idf_weight)
 
         return np.mean(weighted_fuzzy_scores)

@@ -50,11 +54,11 @@ def __fuzzy_match(self, query_tokens, doc_tokens):
     async def inference(self, request: ModelRequest):
         scores = []
         query = request.query
-        n = request.n
+        n = int(request.n)
         query_tokens = query.lower().split()
 
         for _, row in tqdm(self.df.iterrows()):
-            doc_tokens = row['tags'].lower().split()
+            doc_tokens = str(row['tags']).split()
             fuzzy_score = self.__fuzzy_match(query_tokens, doc_tokens)
             scores.append(fuzzy_score)
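
For intuition: __fuzzy_match pairs each query token with its best fuzzy match among a document's tags and weights that match by the tag's IDF, which is why unseen tokens now default to 0.0. A minimal standalone sketch follows; it assumes thefuzz for the ratio function (the actual import sits above the hunks shown) and computes document frequencies directly rather than through pandas:

# Standalone sketch of IDF-weighted fuzzy scoring; `thefuzz` is an assumption,
# since the real import is outside the diff hunks above.
from math import log
import numpy as np
from thefuzz import fuzz

def compute_idf(docs):
    # docs: list of token lists, analogous to the split 'tags' column
    N = len(docs)
    df_count = {}
    for tokens in docs:
        for tok in set(tokens):
            df_count[tok] = df_count.get(tok, 0) + 1
    return {tok: log(N / (c + 1)) for tok, c in df_count.items()}

def fuzzy_score(query_tokens, doc_tokens, idf):
    weighted = []
    for q in set(query_tokens):
        best = max(doc_tokens, key=lambda t: fuzz.ratio(q, t))
        # Unseen tokens default to 0.0, mirroring idf_dict.get(max_token, 0.0)
        weighted.append((fuzz.ratio(q, best) / 100) * idf.get(best, 0.0))
    return np.mean(weighted)

docs = [["hybrid", "wheat", "seed"], ["urea", "fertilizer"], ["neem", "pesticide"]]
idf = compute_idf(docs)
print(fuzzy_score("wheat seeds".lower().split(), docs[0], idf))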

src/search/word_score/local/request.py

Lines changed: 2 additions & 1 deletion

@@ -2,9 +2,10 @@
 
 
 class ModelRequest():
-    def __init__(self, query, n):
+    def __init__(self, query, n, search_category):
         self.query = query
         self.n = n
+        self.search_category = search_category
 
     def to_json(self):
         return json.dumps(self, default=lambda o: o.__dict__,
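
A usage sketch for the extended request (the query and category values are illustrative):

# Illustrative construction of the extended request; values are made up.
from request import ModelRequest

req = ModelRequest(query="urea fertilizer", n=3, search_category="fertilizer")
print(req.to_json())  # serializes query, n, and search_category to JSON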
