Skip to content

Commit 372345d

Browse files
committed
Restructuring of file input/output
I've added an aggregation function, and changed the way the files are saved coming out of xDeepDive. We now save files by "resource" so it's easier to manually check the results and apply regex or other search filters. I've added the journal title to the CSV files, in case we want to begin to search that way. There is now an aggregation function that allows us to aggregate resources by DOI to help build the bipartite network.
1 parent 6cccb88 commit 372345d

File tree

4 files changed

+40
-5
lines changed

4 files changed

+40
-5
lines changed

src/interop_agg.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import json
2+
import os
3+
import csv
4+
import re
5+
from collections import defaultdict
6+
7+
def interop_agg(path: str) -> list:
    """Aggregate resource lists by DOI.

    Reads every CSV file in ``path`` (one file per resource, with the DOI in
    the first column), pairs each DOI with the resource name derived from the
    filename, and groups the pairs by DOI.  Only DOIs that appear in more
    than one resource are kept.  The result is also written to
    ``data/doi_joined.json`` as a side effect.

    Args:
        path (str): A relative or absolute path to the directory of CSV files.

    Returns:
        list: A list of dict objects structured with keys `doi` and `resources`:
            {"doi": "XXX", "resources": ["XXX", "YYY"]}
    """
    resources = []
    for name in os.listdir(path):
        # os.path.join handles both relative and absolute `path`;
        # the previous f"{os.getcwd()}/{path}/{name}" broke for absolute paths.
        filename = os.path.join(path, name)
        # Resource name = filename minus the .csv extension ($ anchors the
        # substitution to the end so ".csv" elsewhere in a name is untouched).
        resource = re.sub(r'\.csv$', '', name)
        with open(filename, "r", encoding="utf-8") as file:
            for row in csv.reader(file):
                if not row:
                    # Skip blank lines instead of raising IndexError on row[0].
                    continue
                resources.append((row[0].strip(), resource))
    # Deduplicate (doi, resource) pairs, then group resource names per DOI.
    grouped = defaultdict(set)
    for doi, resource in set(resources):
        grouped[doi].add(resource)
    # Keep only DOIs seen in more than one resource; sort both the DOIs and
    # each resource list so the output is deterministic.
    cleaned = [{"doi": doi, "resources": sorted(found)}
               for doi, found in sorted(grouped.items())
               if len(found) > 1]
    with open('data/doi_joined.json', 'w', encoding='utf-8') as file:
        json.dump(cleaned, file)
    return cleaned

src/interop_dd.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
import csv
22
import interoperability_deepdive as iod
3+
from interop_agg import interop_agg
34

45
# This will generate a large-ish number of papers and grants.
56

6-
with open('./data/term_records.csv', 'r') as terms:
7-
repositories = terms.read().splitlines()
8-
97
dbs = []
108

119
with open('./data/merged_records.csv', 'r', encoding='UTF-8') as terms:
@@ -19,8 +17,10 @@
1917
try:
2018
full_list = full_list + iod.gdd_snippets(i[0])
2119
if len(full_list) > 0:
22-
with open(f'./data/{i[1]}.csv', 'a', encoding='utf-8', newline="") as file:
20+
with open(f'./data/resources/{i[1]}.csv', 'a', encoding='utf-8', newline="") as file:
2321
dictwriter = csv.DictWriter(file, full_list[0].keys())
2422
dictwriter.writerows(full_list)
2523
except Exception as e:
2624
print(e)
25+
26+
aggregate = interop_agg('data/resources')
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
from .gddURLcall import gddURLcall as gddURLcall
22
from .gdd_snippets import gdd_snippets as gdd_snippets
3-
from .process_hits import process_hits as process_hits
3+
from .process_hits import process_hits as process_hits

src/interoperability_deepdive/process_hits.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ def process_hits(input: dict) -> dict:
77
doi = i.get('doi', '') or ''
88
snippet = j or ''
99
title = i.get('title', '') or ''
10+
journal = i.get('pubname', '') or ''
1011
response.append({'doi': doi,
1112
'snippet': re.sub('\n', '', snippet),
13+
'pubname': re.sub('\n', '', journal),
1214
'title': re.sub('\n', '', title)})
1315
return response

0 commit comments

Comments
 (0)