-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataframe Fuzzy String Matching.py
More file actions
43 lines (35 loc) · 1.53 KB
/
Dataframe Fuzzy String Matching.py
File metadata and controls
43 lines (35 loc) · 1.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# This code implements fuzzy text string matching between two dataframes.
# The assumption is that the two dataframes have the same number of rows;
# otherwise the diagonal of the score matrix is a misleading pairing.
import pandas as pd
from fuzzywuzzy import fuzz, process
import numpy as np

# pd.read_excel accepts a file path directly; wrapping it in open() leaks a
# file handle that is never closed. The keyword is `sheet_name` in modern
# pandas (the old `sheetname` spelling was deprecated and later removed).
df1 = pd.read_excel('/Users/michael/Desktop/GE_pipeline/data.xlsx', sheet_name=0)
df2 = pd.read_excel('/Users/michael/Desktop/GE_pipeline/data.xlsx', sheet_name=1)
# print df1.head()
# print df2.head()

# Score matrix: one row per record of df1, one column per record of df2.
df3 = pd.DataFrame(index=df1.index, columns=df2.index)
# print df3
def lv1_match(master, source, threshold=0):
    """Fill the module-level score matrix ``df3`` with fuzzy-match scores.

    For every row label i of df3's index and every column label j of df3's
    columns, computes ``fuzz.token_sort_ratio`` between ``master``'s 'Orig'
    value at i and ``source``'s 'Changed' value at j, storing the score at
    df3[i, j].

    Parameters
    ----------
    master : pandas.DataFrame
        Must contain an 'Orig' column indexed compatibly with df3.index.
    source : pandas.DataFrame
        Must contain a 'Changed' column indexed compatibly with df3.columns.
    threshold : int, optional
        Accepted for API compatibility; currently unused by this function.

    Returns
    -------
    pandas.DataFrame
        The (mutated) module-level ``df3`` score matrix.
    """
    # NOTE(review): iterates the global df3's axes, so master/source are
    # expected to be the same frames df3 was built from (df1/df2) — confirm
    # with callers before reusing this with other inputs.
    for i in df3.index:
        for j in df3.columns:
            # .get_value/.set_value were deprecated and removed in pandas 1.0;
            # .at is the supported fast scalar accessor with the same effect.
            vi = master.at[i, 'Orig']
            vj = source.at[j, 'Changed']
            df3.at[i, j] = fuzz.token_sort_ratio(vi, vj)
    return df3
#print(df3)
#threshold = df3.max(1) > 90
#a = pd.Series(np.diag(df3), index=[df3.index, df3.columns])
#print a
# Compute the full score matrix, then take its diagonal: row i of df1 vs
# row i of df2. This pairing is only meaningful when both frames have the
# same number of rows (see the header note above the imports).
a = lv1_match(df1, df2)
# lv1_match returns the module-level df3, so `a` is that same frame.
b = pd.Series(np.diag(a), index=[a.index, a.columns])
# print(x) is valid in both Python 2 (parenthesized expression) and Python 3;
# the bare `print b` statement form is a syntax error on Python 3.
print(b)
#This part is not tested
#idxmax = df3.idxmax(1)
#df['PROD_ID'] = np.where(threshold, df2.loc[idxmax, 'PROD_ID'].values, np.nan)
#df['PROD_DESCRIPTION'] = np.where(threshold, df2.loc[idxmax, 'PROD_DESCRIPTION'].values, np.nan)
#df
#REFERENCES
#https://stackoverflow.com/questions/43938672/efficient-string-matching-in-apache-spark
#https://github.com/MrPowers/spark-stringmetric.git
#https://stackoverflow.com/questions/41455093/searching-one-python-dataframe-dictionary-for-fuzzy-matches-in-another-datafra