Skip to content

Commit 2804b9f

Browse files
committed
Modified word_score to give absolute score and threshold scores; removed idf
1 parent 3f054a4 commit 2804b9f

File tree

3 files changed

+22
-28
lines changed

3 files changed

+22
-28
lines changed

src/search/word_score/local/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44

55
- Git clone the repo and cd to the project location.
66
- cd to `local`, i.e., `cd ./src/search/word_score/local`.
7-
- Replace the file in `./content` with a csv file of your choice, but the data column should be named `tags` column.
7+
- Replace the link in the Dockerfile with a link to a downloadable CSV file of your choice; the data column must be named `tags`.
88
- Start your docker engine and `docker build -t word_score .`.
99
- Do `docker run -p 8000:8000 word_score`.
10-
- `curl -X POST -H "Content-Type: application/json" -d '{"query": "seed procurement district", "n": "5", "search_category" : "seed"}' http://localhost:8000/`. <br> Replace `seed procurement district` with a query you want to search and `5` with the number of rows you want to retrieve.
10+
- `curl -X POST -H "Content-Type: application/json" -d '{"query": "seed procurement district", "n": "5", "search_category" : "seed", "threshold": "0.8", "k": "6"}' http://localhost:8000/`. <br> Replace `seed procurement district` with the query you want to search and `5` with the number of rows you want to retrieve. Set `threshold` (a value between 0 and 1) to retrieve only documents whose top-`k` score exceeds that threshold. `k` is the number of best-matching query words whose fuzzy-match scores are averaged to produce the score that is compared against the threshold.
1111
- The response for the above would be: <br>
1212
`
1313
{
Lines changed: 17 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,12 @@
11
from request import ModelRequest
2-
import pandas as pd
3-
from math import log
42
from thefuzz import fuzz
53
import numpy as np
64
from cache import AsyncTTL
75
from tqdm import tqdm
8-
import os
96

107

118
class Model:
12-
def __init__(self, seed_df,pesticide_df, fertilizer_df, global_df, request: ModelRequest, search_categoty= 'others' ):
9+
def __init__(self, seed_df,pesticide_df, fertilizer_df, global_df, request: ModelRequest, search_category= 'others'):
1310
self.search_category = request.search_category
1411
if self.search_category == 'seed':
1512
self.df = seed_df
@@ -19,54 +16,49 @@ def __init__(self, seed_df,pesticide_df, fertilizer_df, global_df, request: Mode
1916
self.df = pesticide_df
2017
else :
2118
self.df = global_df
22-
self.idf_dict = self.__compute_idf(self.df)
2319

24-
@staticmethod
25-
def __compute_idf(df):
26-
N = len(df)
27-
all_tags = df['tags'].str.split().explode()
28-
df_count_series = all_tags.drop_duplicates().value_counts()
29-
idf_dict = {tag: log(N / (df_count + 1)) for tag, df_count in df_count_series.items()}
30-
return idf_dict
3120

32-
def __fuzzy_match(self, query_tokens, doc_tokens):
33-
weighted_fuzzy_scores = []
21+
def __fuzzy_match(self, query_tokens, doc_tokens, k):
22+
fuzzy_scores = []
3423
query_set = set(query_tokens)
3524
doc_set = set(doc_tokens)
3625

3726
for q_token in query_set:
3827
max_ratio = None
39-
max_token = None
28+
# max_token = None
4029
for token in doc_set:
4130
ratio = fuzz.ratio(token, q_token)
4231
if max_ratio == None or ratio > max_ratio:
4332
max_ratio = ratio
44-
max_token = token
33+
# max_token = token
4534

46-
47-
idf_weight = self.idf_dict.get(max_token, 0.0)
48-
weighted_fuzzy_scores.append((max_ratio / 100) * idf_weight)
35+
fuzzy_scores.append((max_ratio / 100))
4936

50-
return np.mean(weighted_fuzzy_scores)
37+
fuzzy_scores = sorted(fuzzy_scores, reverse=True)
38+
39+
return np.mean(fuzzy_scores), np.mean(fuzzy_scores[:k])
5140

5241

5342
@AsyncTTL(time_to_live=600000, maxsize=1024)
5443
async def inference(self, request: ModelRequest):
5544
scores = []
45+
top_k_scores = []
5646
query = request.query
57-
n = int(request.n)
47+
threshold = float(request.threshold)
48+
k = int(request.k) # k is the number of top k words to consider for the score
49+
n = int(request.n) # n is the number of documents to return
5850
query_tokens = query.lower().split()
5951

6052
for _, row in tqdm(self.df.iterrows()):
6153
doc_tokens = str(row['tags']).split()
62-
fuzzy_score = self.__fuzzy_match(query_tokens, doc_tokens)
54+
fuzzy_score, top_k_score = self.__fuzzy_match(query_tokens, doc_tokens, k)
6355
scores.append(fuzzy_score)
64-
65-
max_score = max(scores) if scores else 1
66-
scores = [score / max_score for score in scores]
56+
top_k_scores.append(top_k_score)
6757

6858
new_df = self.df.copy(deep=True)
6959
new_df['scores'] = scores
60+
new_df['top_k_scores'] = top_k_scores
61+
new_df = new_df[new_df['top_k_scores'] > threshold]
7062
new_df_sorted = new_df.sort_values(by=['scores'], ascending=False).head(n)
7163
return {"docs": new_df_sorted['tags'].to_list()}
7264

src/search/word_score/local/request.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22

33

44
class ModelRequest():
5-
def __init__(self, query, n, search_category):
5+
def __init__(self, query, n, search_category, threshold, k=5):
66
self.query = query
77
self.n = n
88
self.search_category = search_category
9+
self.threshold = threshold
10+
self.k = k
911

1012
def to_json(self):
1113
return json.dumps(self, default=lambda o: o.__dict__,

0 commit comments

Comments
 (0)