File tree Expand file tree Collapse file tree 1 file changed +31
-0
lines changed Expand file tree Collapse file tree 1 file changed +31
-0
lines changed Original file line number Diff line number Diff line change 2
2
import pandas as pd
3
3
import json
4
4
from datetime import date
5
+ from difflib import SequenceMatcher
5
6
import squarify
6
7
import matplotlib .pyplot as plt
7
8
@@ -164,6 +165,36 @@ def __get__percentage_of_metadata_version_1(df):
164
165
"""
165
166
return len (df [df ["metadata_version" ] == 1 ]) / len (df )
166
167
168
def __get_similar_columns(df, column, threshold=0.85):
    """
    Return pairs of near-duplicate values found in a column.

    Null entries in ``column`` are dropped, the remaining unique values are
    compared pairwise (case-insensitively) with ``difflib.SequenceMatcher``,
    and every pair whose similarity ratio is strictly greater than
    ``threshold`` is reported. Each unordered pair is compared exactly once,
    with the value that appears first in the column listed first.

    For example, the "affiliation" column might yield

        [['University of California, Los Angeles',
          'University of California, Los Angeles (UCLA)',
          0.9135802469135802]]

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column whose values are compared. The non-null
            values must provide ``.lower()`` (i.e. be strings).
        threshold: Similarity ratio a pair must exceed to be reported.

    Returns:
        A list of ``[value, compare_value, similarity]`` lists; empty when
        fewer than two unique non-null values exist.
    """
    # Function-scope import so this block does not depend on a file-level
    # import that may not exist.
    from itertools import combinations

    df = df.dropna(subset=[column])  # drop null values
    unique_values = df[column].unique()

    # Nothing to compare; also avoids touching a lone value at all.
    if len(unique_values) < 2:
        return []

    # Lowercase every value once instead of once per comparison.
    lowered = [(value, value.lower()) for value in unique_values]

    similar_pairs = []
    # combinations() yields each unordered pair once, in first-appearance
    # order, so no bookkeeping of already-compared pairs is needed.
    for (value, value_key), (compare_value, compare_key) in combinations(lowered, 2):
        similarity = SequenceMatcher(None, value_key, compare_key).ratio()
        if similarity > threshold:
            similar_pairs.append([value, compare_value, similarity])

    return similar_pairs
196
+
197
+
167
198
168
199
def report ():
169
200
# Get today's date
You can’t perform that action at this time.
0 commit comments