|
2 | 2 | import json
|
3 | 3 | import random
|
4 | 4 | from datetime import date
|
| 5 | +from difflib import SequenceMatcher |
5 | 6 | from pathlib import Path
|
6 | 7 |
|
7 | 8 | import folium
|
@@ -1277,6 +1278,36 @@ def __get__percentage_of_metadata_version_1(df):
|
1277 | 1278 | return len(df[df["metadata_version"] == 1]) / len(df)
|
1278 | 1279 |
|
1279 | 1280 |
|
def __get_similar_columns(df, column, threshold=0.85):
    """
    Find pairs of near-duplicate values in a DataFrame column.

    Every unordered pair of distinct, non-null values in ``column`` is compared
    case-insensitively with ``difflib.SequenceMatcher``. Pairs whose similarity
    ratio is strictly greater than ``threshold`` are returned. For example, the
    "affiliation" column might yield

        [['University of California, Los Angeles',
          'University of California, Los Angeles (UCLA)',
          0.9135802469135802]]

    Input parameters:
        df: DataFrame containing ``column``
        column: name of the column to inspect
        threshold: exclusive lower bound on the similarity ratio (default 0.85)
    Output: list of ``[value, compare_value, similarity]`` lists, in the order
        the values are returned by ``unique()``
    """

    unique_values = df[column].dropna().unique()  # drop null values

    # Lower-case each value once instead of once per comparison. str() lets
    # non-string values (e.g. numbers) be compared rather than raising
    # AttributeError on .lower(); the returned pairs keep the original values.
    lowered = [str(value).lower() for value in unique_values]

    similar_pairs = []
    for index, value in enumerate(unique_values):
        # Only look at values after the current one so each unordered pair is
        # compared exactly once, without tracking already-completed pairs.
        for other_index in range(index + 1, len(unique_values)):
            similarity = SequenceMatcher(
                None, lowered[index], lowered[other_index]
            ).ratio()
            if similarity > threshold:
                similar_pairs.append(
                    [value, unique_values[other_index], similarity]
                )

    return similar_pairs
| 1309 | + |
| 1310 | + |
1280 | 1311 | def __get__percentage_of_metadata_version_2(df):
|
1281 | 1312 | """
|
1282 | 1313 | Calculates the percentage of rows in the DataFrame that have 'metadata_version' equal to 2.
|
@@ -1353,27 +1384,29 @@ def report():
|
1353 | 1384 |
|
1354 | 1385 | return report
|
1355 | 1386 |
|
| 1387 | + |
def create_tree_map(frequency_dict, width, height):
    """
    Render a treemap of projects, save it as a dated PNG and display it.

    Input parameters:
        frequency_dict: dictionary; keys are used as tile labels, values as tile sizes
        width: figure width passed to the layout
        height: figure height passed to the layout
    Output: treemap image written to 'treemap-YYYYMMDD.png' (today's date)
    """
    names = list(frequency_dict.keys())
    sizes = list(frequency_dict.values())

    # One empty-string parent per label (plotly treats these as root-level items).
    treemap = go.Treemap(
        labels=names,
        parents=["" for _ in names],
        values=sizes,
        textinfo="label+value",
    )
    figure = go.Figure(treemap)
    figure.update_layout(title="Projects", width=width, height=height)

    # The output file name carries today's date.
    today = date.today()
    output_path = f'treemap-{today.strftime("%Y%m%d")}.png'
    figure.write_image(output_path)
    figure.show()
|
1379 |
| - |
|
0 commit comments