Add notebook for pruning KG-Alzheimers data for RAG applications

Justin Reese · Justin Reese · commit 5c8408b1cf5e · 2025-03-26T15:50:16.000-04:00
diff --git a/notebooks/kg_pruning_for_rag.ipynb b/notebooks/kg_pruning_for_rag.ipynb
@@ -0,0 +1,285 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# KG-Alzheimers Processing Notebook\n",
+    "\n",
+    "This notebook:\n",
+    "1. Downloads the KG-Alzheimers dataset from kghub.io\n",
+    "2. Extracts the tar.gz file\n",
+    "3. Prunes the nodes and edges TSV files to include only specific columns\n",
+    "4. Repackages the pruned data into a new tar.gz file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Import necessary libraries\n",
+    "import os\n",
+    "import requests\n",
+    "import pandas as pd\n",
+    "import tarfile\n",
+    "import shutil\n",
+    "from tqdm.notebook import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Set up constants\n",
+    "URL = \"https://kghub.io/kg-alzheimers/20250317/kg-alzheimers.tar.gz\"\n",
+    "DOWNLOAD_PATH = \"kg-alzheimers.tar.gz\"\n",
+    "EXTRACT_DIR = \"kg-alzheimers-extracted\"\n",
+    "PRUNED_DIR = \"kg-alzheimers-pruned\"\n",
+    "OUTPUT_FILE = \"kg-alzheimers-pruned.tar.gz\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Download the dataset\n",
+    "def download_file(url, save_path):\n",
+    "    print(f\"Downloading {url} to {save_path}...\")\n",
+    "    if os.path.exists(save_path):\n",
+    "        print(f\"File already exists at {save_path}. Skipping download.\")\n",
+    "        return\n",
+    "    \n",
+    "    response = requests.get(url, stream=True)\n",
+    "    response.raise_for_status()  # Raise an exception for HTTP errors\n",
+    "    \n",
+    "    # Get file size for progress bar\n",
+    "    total_size = int(response.headers.get('content-length', 0))\n",
+    "    block_size = 1024  # 1 Kibibyte\n",
+    "    \n",
+    "    with open(save_path, 'wb') as file, tqdm(\n",
+    "        desc=save_path,\n",
+    "        total=total_size,\n",
+    "        unit='iB',\n",
+    "        unit_scale=True,\n",
+    "        unit_divisor=1024,\n",
+    "    ) as bar:\n",
+    "        for data in response.iter_content(block_size):\n",
+    "            file.write(data)\n",
+    "            bar.update(len(data))\n",
+    "    \n",
+    "    print(\"Download complete!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Download the dataset\n",
+    "download_file(URL, DOWNLOAD_PATH)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Extract the tar.gz file\n",
+    "def extract_tarfile(tarfile_path, extract_dir):\n",
+    "    print(f\"Extracting {tarfile_path} to {extract_dir}...\")\n",
+    "    if os.path.exists(extract_dir):\n",
+    "        print(f\"Directory {extract_dir} already exists. Removing it.\")\n",
+    "        shutil.rmtree(extract_dir)\n",
+    "    \n",
+    "    os.makedirs(extract_dir, exist_ok=True)\n",
+    "    \n",
+    "    with tarfile.open(tarfile_path, 'r:gz') as tar:\n",
+    "        tar.extractall(path=extract_dir)\n",
+    "    \n",
+    "    print(\"Extraction complete!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Extract the dataset\n",
+    "extract_tarfile(DOWNLOAD_PATH, EXTRACT_DIR)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# List files in the extracted directory\n",
+    "os.listdir(EXTRACT_DIR)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Define which columns to keep in the nodes and edges files\n",
+    "# Update these according to your requirements\n",
+    "NODE_COLUMNS_TO_KEEP = ['id', 'name', 'category', 'xref', 'description', 'synonym', 'full_name', 'in_taxon_label']\n",
+    "EDGE_COLUMNS_TO_KEEP = ['subject', 'predicate', 'object', 'category']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Function to prune TSV files\n",
+    "def prune_tsv_file(input_file, output_file, columns_to_keep):\n",
+    "    print(f\"Pruning {input_file} to {output_file}...\")\n",
+    "    \n",
+    "    # Read the TSV file\n",
+    "    df = pd.read_csv(input_file, sep='\\t')\n",
+    "    \n",
+    "    # Keep only the specified columns that exist in the dataset\n",
+    "    columns_to_keep = [col for col in columns_to_keep if col in df.columns]\n",
+    "    df_pruned = df[columns_to_keep]\n",
+    "    \n",
+    "    # Create the output directory if it doesn't exist\n",
+    "    os.makedirs(os.path.dirname(output_file), exist_ok=True)\n",
+    "    \n",
+    "    # Save the pruned dataframe to a TSV file\n",
+    "    df_pruned.to_csv(output_file, sep='\\t', index=False)\n",
+    "    \n",
+    "    print(f\"Pruned file saved to {output_file}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Find and prune node and edge TSV files\n",
+    "nodes_file = None\n",
+    "edges_file = None\n",
+    "\n",
+    "for file in os.listdir(EXTRACT_DIR):\n",
+    "    if 'nodes' in file.lower() and file.endswith('.tsv'):\n",
+    "        nodes_file = os.path.join(EXTRACT_DIR, file)\n",
+    "    elif 'edges' in file.lower() and file.endswith('.tsv'):\n",
+    "        edges_file = os.path.join(EXTRACT_DIR, file)\n",
+    "\n",
+    "print(f\"Found nodes file: {nodes_file}\")\n",
+    "print(f\"Found edges file: {edges_file}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Create the pruned directory\n",
+    "if os.path.exists(PRUNED_DIR):\n",
+    "    print(f\"Directory {PRUNED_DIR} already exists. Removing it.\")\n",
+    "    shutil.rmtree(PRUNED_DIR)\n",
+    "\n",
+    "os.makedirs(PRUNED_DIR, exist_ok=True)\n",
+    "\n",
+    "# Prune nodes and edges files if found\n",
+    "if nodes_file:\n",
+    "    pruned_nodes_file = os.path.join(PRUNED_DIR, os.path.basename(nodes_file))\n",
+    "    prune_tsv_file(nodes_file, pruned_nodes_file, NODE_COLUMNS_TO_KEEP)\n",
+    "\n",
+    "if edges_file:\n",
+    "    pruned_edges_file = os.path.join(PRUNED_DIR, os.path.basename(edges_file))\n",
+    "    prune_tsv_file(edges_file, pruned_edges_file, EDGE_COLUMNS_TO_KEEP)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Create a new tar.gz file with the pruned data\n",
+    "def create_tarfile(source_dir, output_file):\n",
+    "    print(f\"Creating {output_file} from {source_dir}...\")\n",
+    "    with tarfile.open(output_file, \"w:gz\") as tar:\n",
+    "        for file in os.listdir(source_dir):\n",
+    "            file_path = os.path.join(source_dir, file)\n",
+    "            tar.add(file_path, arcname=file)\n",
+    "    print(f\"Created {output_file} successfully!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Create the final tar.gz file\n",
+    "create_tarfile(PRUNED_DIR, OUTPUT_FILE)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Cleanup temporary files and directories\n",
+    "def cleanup():\n",
+    "    print(\"Cleaning up temporary files and directories...\")\n",
+    "    if os.path.exists(EXTRACT_DIR):\n",
+    "        shutil.rmtree(EXTRACT_DIR)\n",
+    "    if os.path.exists(PRUNED_DIR):\n",
+    "        shutil.rmtree(PRUNED_DIR)\n",
+    "    print(\"Cleanup complete!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# Uncomment to clean up\n",
+    "# cleanup()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Summary\n",
+    "\n",
+    "This notebook has:\n",
+    "1. Downloaded the KG-Alzheimers dataset\n",
+    "2. Extracted the tar.gz file\n",
+    "3. Pruned the nodes and edges TSV files to include only specified columns\n",
+    "4. Created a new tar.gz file with the pruned data\n",
+    "\n",
+    "The pruned dataset is available at: `kg-alzheimers-pruned.tar.gz`"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}