Skip to content

Commit a510fa0

Browse files
authored
Merge pull request #73 from brain-image-library/72-similar-columns-metric
Add __get_similar_columns
2 parents 2dc0bac + c588c13 commit a510fa0

File tree

1 file changed

+42
-9
lines changed

1 file changed

+42
-9
lines changed

braininventory/get.py

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import random
44
from datetime import date
5+
from difflib import SequenceMatcher
56
from pathlib import Path
67

78
import folium
@@ -1277,6 +1278,36 @@ def __get__percentage_of_metadata_version_1(df):
12771278
return len(df[df["metadata_version"] == 1]) / len(df)
12781279

12791280

1281+
def __get_similar_columns(df, column):
    """
    Return pairs of near-duplicate values found in a column.

    Every unordered pair of distinct, non-null values in `column` is compared
    case-insensitively with difflib.SequenceMatcher, and pairs whose similarity
    ratio is strictly greater than 0.85 are reported. For example, the
    "affiliation" column might yield

    [['University of California, Los Angeles',
      'University of California, Los Angeles (UCLA)',
      0.9135802469135802]]

    Input parameters: a DataFrame and the name of the column to inspect
    Output: list of [value, compare_value, similarity] lists, where `value`
            comes before `compare_value` in the order returned by
            `df[column].unique()`
    """
    df = df.dropna(subset=[column])  # drop null values
    unique_values = df[column].unique()

    # Normalize once instead of once per pair; str() keeps non-text values
    # (numbers, dates, ...) from raising on .lower().
    normalized = [str(value).lower() for value in unique_values]

    similar_pairs = []
    for index, value in enumerate(unique_values):
        # Only compare against later values so that each unordered pair is
        # scored exactly once, without tracking already-completed pairs.
        for other_index in range(index + 1, len(unique_values)):
            similarity = SequenceMatcher(
                None, normalized[index], normalized[other_index]
            ).ratio()
            if similarity > 0.85:
                similar_pairs.append(
                    [value, unique_values[other_index], similarity]
                )

    return similar_pairs
1309+
1310+
12801311
def __get__percentage_of_metadata_version_2(df):
12811312
"""
12821313
Calculates the percentage of rows in the DataFrame that have 'metadata_version' equal to 2.
@@ -1353,27 +1384,29 @@ def report():
13531384

13541385
return report
13551386

1387+
13561388
def create_tree_map(frequency_dict, width, height):
    """
    Build a treemap of projects, save it as a PNG file and display it

    Input parameters:
        frequency_dict: dictionary; its keys become the tile labels and its
                        values become the tile values
        width, height: forwarded to the figure layout
    Output: writes treemap-YYYYMMDD.png (today's date) to a relative path and
            shows the figure; nothing is returned
    """
    # NOTE(review): `go` is presumably plotly.graph_objects, imported at the
    # top of the file - confirm.
    treemap = go.Treemap(
        labels=list(frequency_dict.keys()),
        parents=[""] * len(frequency_dict),  # every tile sits at the root
        values=list(frequency_dict.values()),
        textinfo="label+value",
    )
    fig = go.Figure(treemap)
    fig.update_layout(title="Projects", width=width, height=height)

    date_stamp = date.today().strftime("%Y%m%d")
    output_path = f"treemap-{date_stamp}.png"
    fig.write_image(output_path)
    fig.show()
1379-

0 commit comments

Comments
 (0)