Skip to content

Commit 5c8408b

Browse files
author
Justin Reese
committed
Add notebook for pruning KG-Alzheimers data for RAG applications
1 parent dbdd7e9 commit 5c8408b

File tree

1 file changed

+285
-0
lines changed

1 file changed

+285
-0
lines changed

notebooks/kg_pruning_for_rag.ipynb

Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# KG-Alzheimers Processing Notebook\n",
8+
"\n",
9+
"This notebook:\n",
10+
"1. Downloads the KG-Alzheimers dataset from kghub.io\n",
11+
"2. Extracts the tar.gz file\n",
12+
"3. Prunes the nodes and edges TSV files to include only specific columns\n",
13+
"4. Repackages the pruned data into a new tar.gz file"
14+
]
15+
},
16+
{
17+
"cell_type": "code",
18+
"execution_count": null,
19+
"metadata": {},
20+
"source": [
21+
"# Import necessary libraries\n",
22+
"import os\n",
23+
"import requests\n",
24+
"import pandas as pd\n",
25+
"import tarfile\n",
26+
"import shutil\n",
27+
"from tqdm.notebook import tqdm"
28+
]
29+
},
30+
{
31+
"cell_type": "code",
32+
"execution_count": null,
33+
"metadata": {},
34+
"source": [
35+
"# Set up constants\n",
36+
"URL = \"https://kghub.io/kg-alzheimers/20250317/kg-alzheimers.tar.gz\"\n",
37+
"DOWNLOAD_PATH = \"kg-alzheimers.tar.gz\"\n",
38+
"EXTRACT_DIR = \"kg-alzheimers-extracted\"\n",
39+
"PRUNED_DIR = \"kg-alzheimers-pruned\"\n",
40+
"OUTPUT_FILE = \"kg-alzheimers-pruned.tar.gz\""
41+
]
42+
},
43+
{
44+
"cell_type": "code",
45+
"execution_count": null,
46+
"metadata": {},
47+
"source": [
48+
"# Define a helper that downloads a file with a progress bar\n",
49+
"def download_file(url, save_path, chunk_size=1024 * 1024, timeout=60):\n",
"    \"\"\"Stream ``url`` to ``save_path`` while showing a progress bar.\n",
"\n",
"    Nothing is downloaded when ``save_path`` already exists. Data is written to\n",
"    ``save_path + '.part'`` and renamed only once the transfer has finished, so\n",
"    an interrupted run cannot leave a truncated archive that a later run would\n",
"    mistake for a complete download.\n",
"\n",
"    Args:\n",
"        url: HTTP(S) URL to fetch.\n",
"        save_path: Destination path on disk.\n",
"        chunk_size: Bytes requested per read from the response body.\n",
"        timeout: Seconds passed to ``requests.get`` as its ``timeout``.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If the server responds with an error status.\n",
"    \"\"\"\n",
"    if os.path.exists(save_path):\n",
"        print(f\"File already exists at {save_path}. Skipping download.\")\n",
"        return\n",
"\n",
"    print(f\"Downloading {url} to {save_path}...\")\n",
"    partial_path = f\"{save_path}.part\"\n",
"\n",
"    try:\n",
"        # The context manager releases the connection even if writing fails\n",
"        with requests.get(url, stream=True, timeout=timeout) as response:\n",
"            response.raise_for_status()  # Raise an exception for HTTP errors\n",
"\n",
"            # Content-Length may be absent; tqdm then shows a running byte\n",
"            # count instead of a percentage\n",
"            total_size = int(response.headers.get('content-length', 0)) or None\n",
"\n",
"            with open(partial_path, 'wb') as file, tqdm(\n",
"                desc=save_path,\n",
"                total=total_size,\n",
"                unit='iB',\n",
"                unit_scale=True,\n",
"                unit_divisor=1024,\n",
"            ) as bar:\n",
"                for data in response.iter_content(chunk_size):\n",
"                    file.write(data)\n",
"                    bar.update(len(data))\n",
"    except BaseException:\n",
"        # Do not leave a half-written file behind on errors or interrupts\n",
"        if os.path.exists(partial_path):\n",
"            os.remove(partial_path)\n",
"        raise\n",
"\n",
"    os.replace(partial_path, save_path)\n",
"    print(\"Download complete!\")"
74+
]
75+
},
76+
{
77+
"cell_type": "code",
78+
"execution_count": null,
79+
"metadata": {},
80+
"source": [
81+
"# Download the dataset\n",
82+
"download_file(URL, DOWNLOAD_PATH)"
83+
]
84+
},
85+
{
86+
"cell_type": "code",
87+
"execution_count": null,
88+
"metadata": {},
89+
"source": [
90+
"# Define a helper that extracts a tar.gz archive into a fresh directory\n",
91+
"def extract_tarfile(tarfile_path, extract_dir):\n",
"    \"\"\"Extract a gzip-compressed tar archive into a freshly created directory.\n",
"\n",
"    An existing ``extract_dir`` is removed first. Because the archive comes\n",
"    from the network, members that would be written outside ``extract_dir``\n",
"    are rejected instead of being extracted blindly.\n",
"\n",
"    Args:\n",
"        tarfile_path: Path to the ``.tar.gz`` archive.\n",
"        extract_dir: Directory to (re)create and extract into.\n",
"\n",
"    Raises:\n",
"        tarfile.FilterError: If the ``'data'`` extraction filter rejects a\n",
"            member (Python versions that provide extraction filters).\n",
"        ValueError: If a member path escapes ``extract_dir`` (fallback check\n",
"            on Python versions without extraction filters).\n",
"    \"\"\"\n",
"    print(f\"Extracting {tarfile_path} to {extract_dir}...\")\n",
"    if os.path.exists(extract_dir):\n",
"        print(f\"Directory {extract_dir} already exists. Removing it.\")\n",
"        shutil.rmtree(extract_dir)\n",
"\n",
"    os.makedirs(extract_dir, exist_ok=True)\n",
"\n",
"    with tarfile.open(tarfile_path, 'r:gz') as tar:\n",
"        if hasattr(tarfile, 'data_filter'):\n",
"            # Built-in safe extraction where the running Python supports it\n",
"            tar.extractall(path=extract_dir, filter='data')\n",
"        else:\n",
"            # Fallback: validate member names only (link targets are not checked)\n",
"            root = os.path.realpath(extract_dir)\n",
"            for member in tar.getmembers():\n",
"                target = os.path.realpath(os.path.join(root, member.name))\n",
"                if os.path.commonpath([root, target]) != root:\n",
"                    raise ValueError(f\"Unsafe path in archive: {member.name}\")\n",
"            tar.extractall(path=extract_dir)\n",
"\n",
"    print(\"Extraction complete!\")"
103+
]
104+
},
105+
{
106+
"cell_type": "code",
107+
"execution_count": null,
108+
"metadata": {},
109+
"source": [
110+
"# Extract the dataset\n",
111+
"extract_tarfile(DOWNLOAD_PATH, EXTRACT_DIR)"
112+
]
113+
},
114+
{
115+
"cell_type": "code",
116+
"execution_count": null,
117+
"metadata": {},
118+
"source": [
119+
"# List files in the extracted directory\n",
120+
"os.listdir(EXTRACT_DIR)"
121+
]
122+
},
123+
{
124+
"cell_type": "code",
125+
"execution_count": null,
126+
"metadata": {},
127+
"source": [
128+
"# Define which columns to keep in the nodes and edges files\n",
129+
"# Update these according to your requirements\n",
130+
"NODE_COLUMNS_TO_KEEP = ['id', 'name', 'category', 'xref', 'description', 'synonym', 'full_name', 'in_taxon_label']\n",
131+
"EDGE_COLUMNS_TO_KEEP = ['subject', 'predicate', 'object', 'category']"
132+
]
133+
},
134+
{
135+
"cell_type": "code",
136+
"execution_count": null,
137+
"metadata": {},
138+
"source": [
139+
"# Function to prune TSV files\n",
140+
"def prune_tsv_file(input_file, output_file, columns_to_keep):\n",
"    \"\"\"Write a copy of a TSV file that contains only the requested columns.\n",
"\n",
"    Columns are written in the order given by ``columns_to_keep``. Requested\n",
"    columns that the input lacks are reported and skipped. Every value is read\n",
"    as a string with NA detection disabled, so identifiers and literal tokens\n",
"    such as 'NA' or 'null' are not coerced into floats or blanks.\n",
"\n",
"    Args:\n",
"        input_file: Path of the tab-separated file to read.\n",
"        output_file: Path of the tab-separated file to write.\n",
"        columns_to_keep: Column names to retain, in output order.\n",
"    \"\"\"\n",
"    print(f\"Pruning {input_file} to {output_file}...\")\n",
"\n",
"    columns_to_keep = list(columns_to_keep)\n",
"    wanted = set(columns_to_keep)\n",
"\n",
"    # Read the TSV file, loading only the wanted columns to limit memory use\n",
"    df = pd.read_csv(\n",
"        input_file,\n",
"        sep='\\t',\n",
"        usecols=lambda col: col in wanted,\n",
"        dtype=str,\n",
"        keep_default_na=False,\n",
"    )\n",
"\n",
"    # Keep only the specified columns that exist in the dataset\n",
"    present = [col for col in columns_to_keep if col in df.columns]\n",
"    missing = [col for col in columns_to_keep if col not in df.columns]\n",
"    if missing:\n",
"        print(f\"Warning: columns not found in {input_file}: {missing}\")\n",
"    df_pruned = df[present]\n",
"\n",
"    # Create the output directory if needed; a bare filename has no directory\n",
"    # part, and os.makedirs('') would raise\n",
"    output_dir = os.path.dirname(output_file)\n",
"    if output_dir:\n",
"        os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"    # Save the pruned dataframe to a TSV file\n",
"    df_pruned.to_csv(output_file, sep='\\t', index=False)\n",
"\n",
"    print(f\"Pruned file saved to {output_file}\")"
157+
]
158+
},
159+
{
160+
"cell_type": "code",
161+
"execution_count": null,
162+
"metadata": {},
163+
"source": [
164+
"# Locate the node and edge TSV files in the extracted directory\n",
"nodes_file = None\n",
"edges_file = None\n",
"\n",
"# os.listdir returns names in arbitrary order; sort and keep the first match\n",
"# so the same file is picked on every run when several names qualify\n",
"for filename in sorted(os.listdir(EXTRACT_DIR)):\n",
"    if not filename.endswith('.tsv'):\n",
"        continue\n",
"    lowered = filename.lower()\n",
"    if 'nodes' in lowered:\n",
"        if nodes_file is None:\n",
"            nodes_file = os.path.join(EXTRACT_DIR, filename)\n",
"    elif 'edges' in lowered:\n",
"        if edges_file is None:\n",
"            edges_file = os.path.join(EXTRACT_DIR, filename)\n",
"\n",
"print(f\"Found nodes file: {nodes_file}\")\n",
"print(f\"Found edges file: {edges_file}\")\n",
"\n",
"# Make a missing input visible instead of silently producing a partial archive\n",
"if nodes_file is None or edges_file is None:\n",
"    print(f\"Warning: expected both a nodes and an edges TSV file in {EXTRACT_DIR}\")"
176+
]
177+
},
178+
{
179+
"cell_type": "code",
180+
"execution_count": null,
181+
"metadata": {},
182+
"source": [
183+
"# Recreate the output directory from scratch\n",
"if os.path.exists(PRUNED_DIR):\n",
"    print(f\"Directory {PRUNED_DIR} already exists. Removing it.\")\n",
"    shutil.rmtree(PRUNED_DIR)\n",
"os.makedirs(PRUNED_DIR, exist_ok=True)\n",
"\n",
"# Prune each located file into the output directory under its original name\n",
"if nodes_file:\n",
"    pruned_nodes_file = os.path.join(PRUNED_DIR, os.path.split(nodes_file)[1])\n",
"    prune_tsv_file(nodes_file, pruned_nodes_file, NODE_COLUMNS_TO_KEEP)\n",
"\n",
"if edges_file:\n",
"    pruned_edges_file = os.path.join(PRUNED_DIR, os.path.split(edges_file)[1])\n",
"    prune_tsv_file(edges_file, pruned_edges_file, EDGE_COLUMNS_TO_KEEP)"
198+
]
199+
},
200+
{
201+
"cell_type": "code",
202+
"execution_count": null,
203+
"metadata": {},
204+
"source": [
205+
"# Define a helper that packs a directory into a tar.gz archive\n",
206+
"def create_tarfile(source_dir, output_file):\n",
"    \"\"\"Pack the entries of ``source_dir`` into a gzip-compressed tar archive.\n",
"\n",
"    Each entry is stored under its own name, without the ``source_dir``\n",
"    prefix. Entries are added in sorted order because ``os.listdir`` returns\n",
"    names in arbitrary order, which would otherwise make the member order\n",
"    differ between runs.\n",
"\n",
"    Args:\n",
"        source_dir: Directory whose entries are archived.\n",
"        output_file: Path of the ``.tar.gz`` file to create.\n",
"    \"\"\"\n",
"    print(f\"Creating {output_file} from {source_dir}...\")\n",
"    with tarfile.open(output_file, \"w:gz\") as tar:\n",
"        for name in sorted(os.listdir(source_dir)):\n",
"            tar.add(os.path.join(source_dir, name), arcname=name)\n",
"    print(f\"Created {output_file} successfully!\")"
213+
]
214+
},
215+
{
216+
"cell_type": "code",
217+
"execution_count": null,
218+
"metadata": {},
219+
"source": [
220+
"# Create the final tar.gz file\n",
221+
"create_tarfile(PRUNED_DIR, OUTPUT_FILE)"
222+
]
223+
},
224+
{
225+
"cell_type": "code",
226+
"execution_count": null,
227+
"metadata": {},
228+
"source": [
229+
"# Cleanup temporary files and directories\n",
230+
"def cleanup():\n",
"    \"\"\"Delete the extraction and pruning working directories if they exist.\"\"\"\n",
"    print(\"Cleaning up temporary files and directories...\")\n",
"    for working_dir in (EXTRACT_DIR, PRUNED_DIR):\n",
"        if os.path.exists(working_dir):\n",
"            shutil.rmtree(working_dir)\n",
"    print(\"Cleanup complete!\")"
237+
]
238+
},
239+
{
240+
"cell_type": "code",
241+
"execution_count": null,
242+
"metadata": {},
243+
"source": [
244+
"# Uncomment to clean up\n",
245+
"# cleanup()"
246+
]
247+
},
248+
{
249+
"cell_type": "markdown",
250+
"metadata": {},
251+
"source": [
252+
"## Summary\n",
253+
"\n",
254+
"This notebook has:\n",
255+
"1. Downloaded the KG-Alzheimers dataset\n",
256+
"2. Extracted the tar.gz file\n",
257+
"3. Pruned the nodes and edges TSV files to include only specified columns\n",
258+
"4. Created a new tar.gz file with the pruned data\n",
259+
"\n",
260+
"The pruned dataset is available at: `kg-alzheimers-pruned.tar.gz`"
261+
]
262+
}
263+
],
264+
"metadata": {
265+
"kernelspec": {
266+
"display_name": "Python 3",
267+
"language": "python",
268+
"name": "python3"
269+
},
270+
"language_info": {
271+
"codemirror_mode": {
272+
"name": "ipython",
273+
"version": 3
274+
},
275+
"file_extension": ".py",
276+
"mimetype": "text/x-python",
277+
"name": "python",
278+
"nbconvert_exporter": "python",
279+
"pygments_lexer": "ipython3",
280+
"version": "3.8.10"
281+
}
282+
},
283+
"nbformat": 4,
284+
"nbformat_minor": 4
285+
}

0 commit comments

Comments
 (0)