Skip to content

Commit b5013f9

Browse files
authored
Add __get_similar_columns
1 parent 28be05f commit b5013f9

File tree

1 file changed

+31
-0
lines changed

1 file changed

+31
-0
lines changed

braininventory/get.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import pandas as pd
33
import json
44
from datetime import date
5+
from difflib import SequenceMatcher
56
import squarify
67
import matplotlib.pyplot as plt
78

@@ -164,6 +165,36 @@ def __get__percentage_of_metadata_version_1(df):
164165
"""
165166
return len(df[df["metadata_version"] == 1]) / len(df)
166167

168+
def __get_similar_columns(df, column, threshold=0.85):
    """
    Return pairs of distinct values in ``column`` that look nearly identical.

    Useful for spotting inconsistent spellings of the same entity. For
    example, the "affiliation" column might yield

        [['University of California, Los Angeles',
          'University of California, Los Angeles (UCLA)',
          0.9135802469135802]]

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column whose values are compared.
        threshold: A pair is reported only when its similarity ratio is
            strictly greater than this value (default 0.85).

    Returns:
        A list of ``[value, other_value, similarity]`` entries. Each unordered
        pair of distinct values is compared once, in order of first
        appearance; ``similarity`` is ``difflib.SequenceMatcher.ratio()`` of
        the lower-cased values.
    """
    # Null entries cannot be compared as text, so discard them up front.
    unique_values = df[column].dropna().unique()

    # Normalise each value once instead of once per comparison. str() keeps
    # non-string values (e.g. numbers) from raising on .lower().
    candidates = [(value, str(value).lower()) for value in unique_values]

    similar_pairs = []
    for index, (value, value_key) in enumerate(candidates):
        # Only look at later values: each unordered pair is visited exactly
        # once, with no bookkeeping list of already-compared pairs.
        for compare_value, compare_key in candidates[index + 1:]:
            similarity = SequenceMatcher(None, value_key, compare_key).ratio()
            if similarity > threshold:
                similar_pairs.append([value, compare_value, similarity])

    return similar_pairs
196+
197+
167198

168199
def report():
169200
# Get today's date

0 commit comments

Comments
 (0)