Skip to content

Commit 10d1768

Browse files
committed
idaka
1 parent 05c1053 commit 10d1768

File tree

114 files changed

+44490
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

114 files changed

+44490
-1
lines changed

Back-end/.gitignore

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Logs
2+
logs
3+
*.log
4+
npm-debug.log*
5+
yarn-debug.log*
6+
yarn-error.log*
7+
8+
# Runtime data
9+
pids
10+
*.pid
11+
*.seed
12+
*.pid.lock
13+
14+
# Directory for instrumented libs generated by jscoverage/JSCover
15+
lib-cov
16+
17+
# Coverage directory used by tools like istanbul
18+
coverage
19+
20+
# nyc test coverage
21+
.nyc_output
22+
23+
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
24+
.grunt
25+
26+
# Bower dependency directory (https://bower.io/)
27+
bower_components
28+
29+
# node-waf configuration
30+
.lock-wscript
31+
32+
# Compiled binary addons (https://nodejs.org/api/addons.html)
33+
build/Release
34+
35+
# Dependency directories
36+
node_modules/
37+
jspm_packages/
38+
39+
# Typescript v1 declaration files
40+
typings/
41+
42+
# Optional npm cache directory
43+
.npm
44+
45+
# Optional eslint cache
46+
.eslintcache
47+
48+
# Optional REPL history
49+
.node_repl_history
50+
51+
# Output of 'npm pack'
52+
*.tgz
53+
54+
# Yarn Integrity file
55+
.yarn-integrity
56+
57+
# dotenv environment variables file
58+
.env
59+
60+
# next.js build output
61+
.next

Back-end/BM25.py

Lines changed: 336 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Mon Oct 10 08:41:59 2022
4+
5+
@author: Laura
6+
"""
7+
"BM25 --https://www.quora.com/How-does-BM25-work"
8+
9+
from math import log
10+
import collections
11+
""" import matplotlib.pyplot as plt """
12+
import re
13+
import numpy as np
14+
import string
15+
import nltk
16+
from nltk.corpus import stopwords
17+
from nltk.stem import WordNetLemmatizer
18+
""" import docx """
19+
import json
20+
21+
""" import openpyxl """
22+
""" from Query_expansion import expansion """
23+
sw=stopwords.words("english")
24+
def BM25(q,docs,vectorizer, original,b,k1):
    """Rank *docs* against the query *q* with the BM25 scoring function.

    Parameters
    ----------
    q : str            -- preprocessed query (space-separated terms)
    docs : list[str]   -- preprocessed corpus (space-separated terms each)
    vectorizer         -- fitted object exposing ``vocabulary_`` and ``idf_``
                          (e.g. a scikit-learn TfidfVectorizer) used for IDF
    original           -- original (uncleaned) documents; kept for the
                          commented-out debug prints and interface stability
    b, k1 : float      -- BM25 length-normalisation / saturation parameters

    Returns
    -------
    str -- JSON object ``{"practices": [...], "scores": [...]}`` with the
    indices of positively-scored documents sorted by descending score.
    """
    # k2/k3 are fixed here: k2=0 disables the query-length correction term,
    # k3=1000 effectively removes query-term-frequency saturation.
    k2 = 0
    k3 = 1000

    avgdl = average_dl(docs)
    # Score every document; enumerate replaces the original manual counter
    # (the unused second counter has been dropped).
    scores = {idx: BM25_doc(q, doc, avgdl, k1, k2, k3, b, vectorizer)
              for idx, doc in enumerate(docs)}
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    ans = {"practices": [], "scores": []}
    for doc_idx, score in ranked:
        if score > 0:  # keep only documents with a positive BM25 score
            ans["practices"].append(doc_idx)
            ans["scores"].append(score)
            # print(original[doc_idx])
            # print("Similarity:")
            # print(score)
    return json.dumps(ans)
47+
48+
49+
def BM25_doc(q,doc, avgdl,k1,k2,k3,b,vectorizer):
    """Return the BM25 score of one document *doc* for the query *q*.

    *avgdl* is the average document length over the corpus; k1/k2/k3/b are
    the usual BM25 parameters; *vectorizer* supplies IDF values via idf().
    """
    query_terms = q.split(' ')
    doc_len = len(doc.split(' '))
    K = calculateK(doc_len, avgdl, b, k1)
    score = 0
    for term in query_terms:
        term_tf = tf(term, doc)
        query_tf = tf(term, q)
        # Document-side saturated term frequency.
        doc_part = ((k1 + 1) * term_tf) / (K + term_tf)
        # Query-side saturated term frequency (k3 large => nearly linear).
        query_part = ((k3 + 1) * query_tf) / (k3 + query_tf)
        # Query-length correction; with k2 == 0 this contributes nothing.
        length_part = k2 * len(query_terms) * ((avgdl - doc_len) / (avgdl + doc_len))
        score += idf(term, vectorizer) * doc_part * query_part + length_part
    return score
57+
def calculateK(dl,avgdl,b,k1):
    """Length-normalisation factor K = k1 * ((1-b) + b*dl/avgdl) for BM25."""
    normalisation = (1 - b) + (b * dl / avgdl)
    return k1 * normalisation
59+
60+
def idf(term,vectorizer):
    """Return the inverse document frequency of *term*.

    Looks *term* up in the fitted vectorizer's vocabulary and returns the
    matching entry of ``idf_``; out-of-vocabulary terms contribute 0.
    (Fixes the ``!= None`` anti-idiom and the duplicated dictionary lookup
    of the original.)
    """
    index = vectorizer.vocabulary_.get(term)
    if index is None:
        return 0
    return vectorizer.idf_[index]
65+
66+
def tf(term, sentence):
    """Return the term frequency of *term* in *sentence*.

    Counts whole space-separated tokens. The original used
    ``sentence.count(term)``, which counts raw substring occurrences and so
    also matches *term* inside longer words (e.g. "cat" inside "category"),
    inflating frequencies; token counting is consistent with the document
    length computed via ``split(' ')`` elsewhere in this module.
    """
    return sentence.split(' ').count(term)
68+
def average_dl(docs):
69+
tot=0
70+
for doc in docs:
71+
tot+=len(doc.split(' '))
72+
return tot/len(docs)
73+
def precision(relevant,total):
    """Fraction of the *total* retrieved documents that are *relevant*."""
    return relevant / total
75+
76+
def tune_hyperparameters_BM25(docs,vectorizer, practices,original_paraphrased):
    """Grid-search the BM25 hyperparameters b and k1 against ground-truth rankings.

    Reads the relevant documents per query from 'Ranked queries.xlsx',
    evaluates precision/recall at n = 1..10 for every (k1, b) pair, writes
    the aggregated results to 'PR1a10BM25.docx' and plots a
    precision-vs-recall curve.

    NOTE(review): this function references openpyxl, docx, plt and
    expansion, but the corresponding imports at the top of the module are
    commented out (wrapped in bare strings) -- as written it would raise
    NameError at runtime; confirm the imports.
    NOTE(review): the indentation below was reconstructed from a
    whitespace-mangled source; double-check the nesting of the
    accumulation steps against the original file.
    """

    # Map each query (row 1 of a spreadsheet column) to the list of
    # documents judged relevant for it (the rows below it).
    true_docs={}

    excel_document = openpyxl.load_workbook('Ranked queries.xlsx')
    sheet = excel_document['Hoja1']
    all_columns = sheet.columns
    for col in all_columns:
        if col[1].value is not None:
            true_docs[col[1].value]=[]
            for row in col[2:]:
                if row.value is not None:
                    true_docs[col[1].value].append(row.value)

    def precision(relevant,total):
        # Fraction of retrieved documents that are relevant.
        return relevant/total

    def recall(relevant, tot_relevant):
        # Fraction of the relevant documents that were retrieved.
        return relevant/tot_relevant

    def is_relevant(doc,query):
        # True when *doc* appears in the ground-truth list for *query*.
        if doc in true_docs[query]:
            return True
        else:
            return False
    def removing_original_duplicates(ranking):
        # Collapse paraphrased practices to their original practice,
        # preserving rank order.
        # NOTE(review): *ranking* receives the JSON string returned by
        # BM25(), on which .keys() would fail -- presumably BM25 used to
        # return a dict here; confirm against the original version.
        top=[]
        for doc in ranking.keys():
            original=get_original(practices[doc])
            if original not in top:
                top.append(original)
        # print(original)
        # print(ranking[doc])
        return top

    def get_original(doc):
        # Reverse lookup: the original practice whose paraphrase list
        # contains *doc* (returns None when no match is found).
        for k in original_paraphrased:
            if doc in original_paraphrased[k]:
                return k
    "Return the list of tokens of the document"
    def tokenizer(doc):
        tokenized_doc=nltk.word_tokenize(doc)
        return tokenized_doc

    "Return the stemmed version of the tokens"
    def stemmer(tokens):
        porter_stemmer=nltk.stem.PorterStemmer()
        stemmed=[]
        for word in tokens:
            stemmed.append(porter_stemmer.stem(word))
        return stemmed

    "Return the lemmatized version of the tokens"
    def lemmatizer(tokens):
        token_lemmatizer = WordNetLemmatizer()
        lemmatized=[]
        for word in tokens:
            lemmatized.append(token_lemmatizer.lemmatize(word))
        return lemmatized

    "Return tokens of the doc that are not stop words"
    def remove_stopwords(tokens):
        cleaned_practice = []
        for token in tokens:
            if token not in sw:
                cleaned_practice.append(token)
        return cleaned_practice
    def cleaning(docs, uni=True, lc=True,pun=True,num=True,sp=True,sw=True,st=True,lm=False):
        # Normalise every document; each boolean flag toggles one step:
        # unicode stripping, lowercasing, punctuation/number/extra-space
        # removal, stop-word removal, stemming, lemmatisation.
        # NOTE(review): the *sw* parameter shadows the module-level
        # stop-word list of the same name; remove_stopwords still uses
        # the global list.
        cleaned_practices = []
        all_tokens=[]
        for doc in docs:
            #Remove Unicode
            if uni:
                document_test = re.sub(r'[^\x00-\x7F]+', ' ', doc)
            # Lowercase the document
            if lc:
                document_test = document_test.lower()
            # Remove punctuations
            if pun:
                document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
            # Remove the numbers
            if num:
                document_test = re.sub(r'[0-9]', '', document_test)
            # Remove the doubled space
            if sp:
                document_test = re.sub(r'\s{2,}', ' ', document_test)
            tokens=tokenizer(document_test)
            if sw:
                tokens=remove_stopwords(tokens)
            if st:
                tokens=stemmer(tokens)
            if lm:
                tokens=lemmatizer(tokens)
            joined_doc= ' '.join(tokens)
            cleaned_practices.append(joined_doc)
            all_tokens.append(tokens)
        return cleaned_practices, all_tokens
        # return cleaned_practices

    queries=list(true_docs.keys())
    # Hyperparameter grid: b in [0, 1) step 0.25, k1 in [0, 3) step 0.2.
    bs=np.arange(0.0, 1.0, 0.25)
    k1s=np.arange(0.0, 3.0, 0.2)
    b_result=[]
    k1_result=[]
    preci=[]
    R=[]
    # Should receive n = precision-at-n here  TODO
    at_k=range(1,11)
    av_precision=[]
    av_recall=[]
    tot_precision=[0] * 10
    tot_recall=[0] * 10
    # tot_q[n-1] counts the queries that have at least n relevant docs.
    tot_q=[0,0,0,0,0,0,0,0,0,0]
    cants=[]
    for k1 in k1s:
        for b in bs:
            for q in queries:
                # Expand the query and clean it the same way as the corpus.
                query=q+" "+expansion(q)
                query = cleaning([query])[0][0]
                print(q)
                bm25=BM25(query,docs,vectorizer, practices,b,k1)#vsm,lda,bm25
                bm25=removing_original_duplicates(bm25)
                tot_relevant=len(true_docs[q])
                if len(bm25):
                    for n in at_k:
                        # Only score queries that have >= n relevant docs.
                        if tot_relevant>=n:
                            tot_q[n-1]+=1
                            print(tot_q)
                            rlv_retrieved=0
                            # Top-n retrieved documents.
                            bm25a=list(bm25)[:n]
                            print('ans---------------a',bm25a)
                            for r in bm25a:
                                print('ans---------------',r)
                                if is_relevant(r, q):
                                    rlv_retrieved+=1
                            print(true_docs[q])
                            precision_n=precision(rlv_retrieved,len(bm25a))
                            recall_n=recall(rlv_retrieved,tot_relevant)
                            print("Precision"+str(precision_n)+" at "+str(n))
                            print("Recall"+str(recall_n)+" at "+str(n))
                            tot_precision[n-1]+=precision_n
                            tot_recall[n-1]+=recall_n
            cants.append("cantidad de queries validas para n="+str(tot_q))
            print("tot_q",tot_q)
            # Average the per-n totals over the number of valid queries.
            for j in at_k:
                av_precision.append(tot_precision[j-1]/tot_q[j-1])
                av_recall.append(tot_recall[j-1]/tot_q[j-1])
            b_result.append(b)
            k1_result.append(k1)
            preci.append(sum(av_precision)/len(av_precision))
            R.append(sum(av_recall)/len(av_recall))

    print("CANTS-----------------",cants)
    print("b,k1"+str(b_result)+str(k1_result))
    # Dump the aggregated precision/recall numbers to a Word document and
    # plot the precision-vs-recall curve.
    newDocument = docx.Document()
    fig, ax = plt.subplots()
    print("BM25")
    newDocument.add_paragraph('------Presicion BM25-------')
    newDocument.add_paragraph(str(preci))
    newDocument.add_paragraph('------Recall BM25-------')
    newDocument.add_paragraph(str(R))
    newDocument.add_paragraph('------k1s BM25-------')
    newDocument.add_paragraph(str(k1_result))
    newDocument.add_paragraph('------bs BM25-------')
    newDocument.add_paragraph(str(b_result))
    print("precision: "+str(preci))
    print("recall: "+str(R))
    ax.plot(av_recall,av_precision, color='tab:orange',marker = 'o')
    # p=0
    # for j in range(0,len(av_recall)):
    #     x=av_recall[j]
    #     y=av_precision[j]
    #     ax.annotate(str(at_k[p]),xy=(x,y),xytext =(x, y))
    #     p+=1
    ax.set_title("Precision vs Recall")
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    plt.show()

    newDocument.save('PR1a10BM25.docx')
    # Commented-out plotting experiments (VSM / LDA / BM25 comparisons)
    # kept from the original file:
    # # for i in range(1,13):
    # # doc = docx.Document("PR1a10.docx")
    # # paragraphs=doc.paragraphs
    # # resultspyr=[]#VSM,LDA,BM25
    # # for paragraph in paragraphs:
    # # if not paragraph.text.startswith("-"):
    # # print(paragraph.text)
    # # temp=paragraph.text.replace("[","").replace("]","").replace(" ","").split(",")
    # # temp=[float(i) for i in temp]
    # # resultspyr.append(temp)
    # # fig, ax = plt.subplots()
    # # ax.plot(range(1,11),resultspyr[0], color='tab:purple',marker = 'o')
    # # ax.plot(range(1,11),resultspyr[1], color='tab:orange',marker = 'o')

    # # ax.set_title("Precision and Recall vs n (VSM)")
    # # ax.set_ylabel("Precision and Recall")
    # # ax.set_xlabel("n")
    # # ax.set_ylim(0,1)
    # # plt.legend(["Precision", "Recall"], loc ="lower right")
    # # plt.show()
    # # fig1, ax1 = plt.subplots()
    # # ax1.plot(range(1,11),resultspyr[2], color='tab:purple',marker = 'o')
    # # ax1.plot(range(1,11),resultspyr[3], color='tab:orange',marker = 'o')

    # # ax1.set_title("Precision and Recall vs n (LDA)")
    # # ax1.set_ylabel("Precision and Recall")
    # # ax1.set_xlabel("n")
    # # ax1.set_ylim(0,1)
    # # plt.legend(["Precision", "Recall"], loc ="upper right")
    # # plt.show()
    # # fig2, ax2 = plt.subplots()
    # # ax2.plot(range(1,11),resultspyr[4], color='tab:purple',marker = 'o')
    # # ax2.plot(range(1,11),resultspyr[5],color='tab:orange',marker = 'o')

    # # ax2.set_title("Precision and Recall vs n (BM25)")
    # # ax2.set_ylabel("Precision and Recall")
    # # ax2.set_xlabel("n")
    # # ax2.set_ylim(0,1)
    # # plt.legend(["Precision", "Recall"], loc ="lower right")
    # # plt.show()

    # # fig, ax = plt.subplots()
    # # ax.plot(resultspyr[1],resultspyr[0], color='tab:purple',marker = 'o')
    # # ax.plot(resultspyr[3],resultspyr[2], color='tab:orange',marker = 'o')
    # # ax.plot(resultspyr[5],resultspyr[4], color='tab:blue',marker = 'o')
    # # ax.set_title("Precision vs Recall")
    # # ax.set_ylabel("Precision")
    # # ax.set_xlabel("Recall")
    # # ax.set_ylim(0,1)
    # # ax.set_xlim(0,1)
    # # plt.legend(["VSM", "LDA", "BM25"], loc ="upper right")
    # # plt.show()

    # # fig, ax = plt.subplots()
    # # ax.plot(range(1,11),resultspyr[0], color='tab:purple',marker = 'o')
    # # ax.plot(range(1,11),resultspyr[2], color='tab:orange',marker = 'o')
    # # ax.plot(range(1,11),resultspyr[4], color='tab:blue',marker = 'o')
    # # ax.set_title("Precision")
    # # ax.set_ylabel("Precision")
    # # ax.set_xlabel("n")
    # # ax.set_ylim(0,1)
    # # plt.legend(["VSM", "LDA", "BM25"], loc ="upper right")
    # # plt.show()

    # # fig, ax = plt.subplots()
    # # ax.plot(range(1,11),resultspyr[1], color='tab:purple',marker = 'o')
    # # ax.plot(range(1,11),resultspyr[3], color='tab:orange',marker = 'o')
    # # ax.plot(range(1,11),resultspyr[5], color='tab:blue',marker = 'o')
    # # ax.set_title("Recall")
    # # ax.set_ylabel("Precision")
    # # ax.set_xlabel("Recall")
    # # ax.set_ylim(0,1)
    # # plt.legend(["VSM", "LDA", "BM25"], loc ="upper right")
    # # plt.show()
    # # # precision_recall()
332+
333+
334+
335+
336+

0 commit comments

Comments
 (0)