-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataframe Fuzzy String Matching.py
More file actions
43 lines (35 loc) · 1.53 KB
/
Dataframe Fuzzy String Matching.py
File metadata and controls
43 lines (35 loc) · 1.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# This code implements fuzzy text string matching between two dataframes.
# The assumption is that the two dataframes have the same number of rows;
# otherwise the diagonal of the score matrix is a misleading pairing.
import pandas as pd
from fuzzywuzzy import fuzz, process
import numpy as np

# pd.read_excel accepts a file path directly; wrapping it in open() leaks a
# file handle that is never closed. The keyword is `sheet_name` in modern
# pandas (the old `sheetname` spelling was deprecated and later removed).
df1 = pd.read_excel('/Users/michael/Desktop/GE_pipeline/data.xlsx', sheet_name=0)
df2 = pd.read_excel('/Users/michael/Desktop/GE_pipeline/data.xlsx', sheet_name=1)
# print df1.head()
# print df2.head()

# Score matrix: one row per record of df1, one column per record of df2.
df3 = pd.DataFrame(index=df1.index, columns=df2.index)
# print df3
def lv1_match(master, source, threshold=0):
    """Fill the module-level score matrix ``df3`` with fuzzy-match scores.

    For every row label i of df3's index and every column label j of df3's
    columns, computes ``fuzz.token_sort_ratio`` between ``master``'s 'Orig'
    value at i and ``source``'s 'Changed' value at j, storing the score at
    df3[i, j].

    Parameters
    ----------
    master : pandas.DataFrame
        Must contain an 'Orig' column indexed compatibly with df3.index.
    source : pandas.DataFrame
        Must contain a 'Changed' column indexed compatibly with df3.columns.
    threshold : int, optional
        Accepted for API compatibility; currently unused by this function.

    Returns
    -------
    pandas.DataFrame
        The (mutated) module-level ``df3`` score matrix.
    """
    # NOTE(review): iterates the global df3's axes, so master/source are
    # expected to be the same frames df3 was built from (df1/df2) — confirm
    # with callers before reusing this with other inputs.
    for i in df3.index:
        for j in df3.columns:
            # .get_value/.set_value were deprecated and removed in pandas 1.0;
            # .at is the supported fast scalar accessor with the same effect.
            vi = master.at[i, 'Orig']
            vj = source.at[j, 'Changed']
            df3.at[i, j] = fuzz.token_sort_ratio(vi, vj)
    return df3
#print(df3)
#threshold = df3.max(1) > 90
#a = pd.Series(np.diag(df3), index=[df3.index, df3.columns])
#print a
# Compute the full score matrix, then take its diagonal: row i of df1 vs
# row i of df2. This pairing is only meaningful when both frames have the
# same number of rows (see the header note above the imports).
a = lv1_match(df1, df2)
# lv1_match returns the module-level df3, so `a` is that same frame.
b = pd.Series(np.diag(a), index=[a.index, a.columns])
# print(x) is valid in both Python 2 (parenthesized expression) and Python 3;
# the bare `print b` statement form is a syntax error on Python 3.
print(b)
#This part is not tested
#idxmax = df3.idxmax(1)
#df['PROD_ID'] = np.where(threshold, df2.loc[idxmax, 'PROD_ID'].values, np.nan)
#df['PROD_DESCRIPTION'] = np.where(threshold, df2.loc[idxmax, 'PROD_DESCRIPTION'].values, np.nan)
#df
#REFERENCES
#https://stackoverflow.com/questions/43938672/efficient-string-matching-in-apache-spark
#https://github.com/MrPowers/spark-stringmetric.git
#https://stackoverflow.com/questions/41455093/searching-one-python-dataframe-dictionary-for-fuzzy-matches-in-another-datafra