Skip to content

Commit 372345d

Browse files
committed
Restructuring of file input/output
I've added an aggregation function, and changed the way the files are saved coming out of xDeepDive. We now save files by "resource" so it's easier to manually check the results and apply regex or other search filters. I've added the journal title to the CSV files, in case we want to begin to search that way. There is now an aggregation function that allows us to aggregate resources by DOI to help build the bipartite network.
1 parent 6cccb88 commit 372345d

File tree

4 files changed

+40
-5
lines changed

4 files changed

+40
-5
lines changed

src/interop_agg.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import json
2+
import os
3+
import csv
4+
import re
5+
from collections import defaultdict
6+
7+
def interop_agg(path: str) -> list:
    """Aggregate resource lists by DOI.

    Reads every CSV file in ``path`` (one file per resource, with the DOI in
    the first column), pairs each DOI with the resource name derived from the
    filename, and groups the pairs by DOI.  Only DOIs that appear in more
    than one resource are kept.  The result is also written to
    ``data/doi_joined.json`` as a side effect.

    Args:
        path (str): A relative or absolute path to the directory of CSV files.

    Returns:
        list: A list of dict objects structured with keys `doi` and `resources`:
            {"doi": "XXX", "resources": ["XXX", "YYY"]}
    """
    resources = []
    for name in os.listdir(path):
        # os.path.join handles both relative and absolute `path`;
        # the previous f"{os.getcwd()}/{path}/{name}" broke for absolute paths.
        filename = os.path.join(path, name)
        # Resource name = filename minus the .csv extension ($ anchors the
        # substitution to the end so ".csv" elsewhere in a name is untouched).
        resource = re.sub(r'\.csv$', '', name)
        with open(filename, "r", encoding="utf-8") as file:
            for row in csv.reader(file):
                if not row:
                    # Skip blank lines instead of raising IndexError on row[0].
                    continue
                resources.append((row[0].strip(), resource))
    # Deduplicate (doi, resource) pairs, then group resource names per DOI.
    grouped = defaultdict(set)
    for doi, resource in set(resources):
        grouped[doi].add(resource)
    # Keep only DOIs seen in more than one resource; sort both the DOIs and
    # each resource list so the output is deterministic.
    cleaned = [{"doi": doi, "resources": sorted(found)}
               for doi, found in sorted(grouped.items())
               if len(found) > 1]
    with open('data/doi_joined.json', 'w', encoding='utf-8') as file:
        json.dump(cleaned, file)
    return cleaned

src/interop_dd.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
import csv
22
import interoperability_deepdive as iod
3+
from interop_agg import interop_agg
34

45
# This will generate a large-ish number of papers and grants.
56

6-
with open('./data/term_records.csv', 'r') as terms:
7-
repositories = terms.read().splitlines()
8-
97
dbs = []
108

119
with open('./data/merged_records.csv', 'r', encoding='UTF-8') as terms:
@@ -19,8 +17,10 @@
1917
try:
2018
full_list = full_list + iod.gdd_snippets(i[0])
2119
if len(full_list) > 0:
22-
with open(f'./data/{i[1]}.csv', 'a', encoding='utf-8', newline="") as file:
20+
with open(f'./data/resources/{i[1]}.csv', 'a', encoding='utf-8', newline="") as file:
2321
dictwriter = csv.DictWriter(file, full_list[0].keys())
2422
dictwriter.writerows(full_list)
2523
except Exception as e:
2624
print(e)
25+
26+
aggregate = interop_agg('data/resources')
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
from .gddURLcall import gddURLcall as gddURLcall
22
from .gdd_snippets import gdd_snippets as gdd_snippets
3-
from .process_hits import process_hits as process_hits
3+
from .process_hits import process_hits as process_hits

src/interoperability_deepdive/process_hits.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@ def process_hits(input: dict) -> dict:
77
doi = i.get('doi', '') or ''
88
snippet = j or ''
99
title = i.get('title', '') or ''
10+
journal = i.get('pubname', '') or ''
1011
response.append({'doi': doi,
1112
'snippet': re.sub('\n', '', snippet),
13+
'pubname': re.sub('\n', '', journal),
1214
'title': re.sub('\n', '', title)})
1315
return response

0 commit comments

Comments
 (0)