Skip to content

Commit d0b1719

Browse files
authored
Create kg2-to-db.py
1 parent 7b5514a commit d0b1719

File tree

1 file changed

+94
-0
lines changed

1 file changed

+94
-0
lines changed

kg2-to-db.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import json
2+
import ast
3+
import psycopg2
4+
from psycopg2.extras import execute_batch
5+
# Database connection details.
# NOTE(review): the password literal in the original was an unterminated
# string (`password=",`), which made the whole script unparsable. It is
# restored here as an empty string; supply the real credential (and host)
# before running, preferably without committing it.
DB_CONFIG = {
    "dbname": "biomedical",
    "user": "postgres",
    "password": "",
    "host": "",
    "port": "5432",
}

# File paths
edges_file_path = 'kg2c-2.8.4-edges.jsonl'
nodes_file_path = 'kg2c-2.8.4-nodes.jsonl'

# Rows are flushed to the database once at least this many are pending.
batch_size = 10000  # Adjust this size according to your available memory

# Only edges with this primary knowledge source are exported.
SEMMEDDB_SOURCE = 'infores:semmeddb'

# Single definition of the insert statement, shared by every flush.
INSERT_SQL = """
    INSERT INTO public."tblbiomedicalfactcheck_new" ("nodeDataID", "triple", "sentence")
    VALUES (%s, %s, %s)
"""


def _node_display_name(node_data):
    """Return the node's 'name', else its first 'all_names' entry, else "Unknown"."""
    name = node_data.get('name')
    if name:
        return name
    all_names = node_data.get('all_names')
    return all_names[0] if all_names else "Unknown"


def load_nodes(lines):
    """Build the name lookup tables from an iterable of JSONL node records.

    Args:
        lines: Iterable of strings, each holding one JSON object with at
            least an 'id' key. Blank lines are skipped.

    Returns:
        A ``(nodes, equivalent_curies_map)`` tuple. ``nodes`` maps each node
        id to its display name; ``equivalent_curies_map`` maps every entry of
        a node's 'equivalent_curies' list to that same display name (a later
        node overwrites an earlier one that shares a curie).
    """
    nodes = {}
    equivalent_curies_map = {}
    for line in lines:
        if not line.strip():
            continue
        node_data = json.loads(line)
        name = _node_display_name(node_data)
        nodes[node_data['id']] = name
        # `or []` also covers an explicit JSON null, which .get()'s default
        # would not.
        for curie in node_data.get('equivalent_curies') or []:
            equivalent_curies_map[curie] = name
    return nodes, equivalent_curies_map


def _parse_publications_info(raw):
    """Return the edge's publications_info as a dict, or {} if unusable.

    The value is accepted either as an already-decoded dict or as the text of
    a Python literal. Anything that cannot be turned into a dict is reported
    on stdout and replaced by an empty dict so the edge is still exported.
    """
    if isinstance(raw, dict):
        # Already decoded by json.loads; literal_eval would reject a dict
        # object with ValueError and the sentences would be lost.
        return raw
    if not raw:
        return {}
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError) as e:
        # literal_eval raises SyntaxError (not ValueError) on malformed text.
        print(f"Error parsing publications_info: {raw} with error: {e}")
        return {}
    if not isinstance(parsed, dict):
        print(f"Error parsing publications_info: {raw} with error: not a dict")
        return {}
    return parsed


def _extract_sentences(publications_info):
    """Collect every non-empty 'sentence'; return [''] when there is none."""
    sentences = [
        info['sentence']
        for info in publications_info.values()
        if isinstance(info, dict) and info.get('sentence')
    ]
    # An empty sentence keeps the triple recorded even without evidence text.
    return sentences or ['']


def _resolve_name(curie, nodes, equivalent_curies_map):
    """Look a curie up in nodes, then in the equivalents map, else echo it."""
    if curie in nodes:
        return nodes[curie]
    return equivalent_curies_map.get(curie, curie)


def build_rows(edge, nodes, equivalent_curies_map):
    """Turn one decoded edge record into rows for the insert statement.

    Args:
        edge: Dict decoded from one line of the edges file. Must contain
            'id', 'subject', 'object' and 'predicate' when it is a SemMedDB
            edge.
        nodes: Mapping of node id to display name (see ``load_nodes``).
        equivalent_curies_map: Mapping of equivalent curie to display name.

    Returns:
        A list of ``(edge_id, "subject predicate object", sentence)`` tuples,
        one per sentence found in 'publications_info' (a single row with an
        empty sentence if none is found). Edges whose
        'primary_knowledge_source' is not 'infores:semmeddb' yield ``[]``.

    Raises:
        KeyError: If a SemMedDB edge lacks one of the required keys.
    """
    if edge.get('primary_knowledge_source') != SEMMEDDB_SOURCE:
        return []

    publications_info = _parse_publications_info(edge.get('publications_info', '{}'))
    sentences = _extract_sentences(publications_info)

    subject_name = _resolve_name(edge['subject'], nodes, equivalent_curies_map)
    object_name = _resolve_name(edge['object'], nodes, equivalent_curies_map)
    predicate_name = _resolve_name(edge['predicate'], nodes, equivalent_curies_map)
    fact = f"{subject_name} {predicate_name} {object_name}"

    # Each sentence becomes its own row carrying the same triple.
    return [(edge['id'], fact, sentence) for sentence in sentences]


def _flush(conn, cursor, rows):
    """Insert the pending rows, commit, and empty the list in place."""
    execute_batch(cursor, INSERT_SQL, rows)
    conn.commit()
    rows.clear()


def main():
    """Load the nodes, stream the edges, and insert the rows in batches."""
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cursor:
            # Load all nodes into dictionaries for quick access by ID.
            with open(nodes_file_path, 'r', encoding='utf-8') as nodes_file:
                nodes, equivalent_curies_map = load_nodes(nodes_file)

            insert_data = []
            with open(edges_file_path, 'r', encoding='utf-8') as edges_file:
                for line in edges_file:
                    if not line.strip():
                        continue
                    insert_data.extend(
                        build_rows(json.loads(line), nodes, equivalent_curies_map)
                    )
                    if len(insert_data) >= batch_size:
                        _flush(conn, cursor, insert_data)

            # Insert remaining data
            if insert_data:
                _flush(conn, cursor, insert_data)
    finally:
        # Release the connection even when parsing or an insert fails.
        conn.close()

    print("Data has been inserted into the database successfully.")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)