Commit c030acf

Create v1-to-v2 migration notebook
1 parent fce1a13 commit c030acf

2 files changed: +165 −18 lines changed

docs/examples_notebooks/index_migration.ipynb renamed to docs/examples_notebooks/index_migration_to_v1.ipynb

Lines changed: 25 additions & 18 deletions
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": 62,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -14,9 +14,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Index Migration\n",
+    "## Index Migration (pre-v1 to v1)\n",
     "\n",
-    "This notebook is used to maintain data model parity with older indexes for the latest versions of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n",
+    "This notebook is used to maintain data model parity with older indexes for version 1.0 of GraphRAG. If you have a pre-1.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment.\n",
     "\n",
     "NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. This also ensures that you have default vector store config, which is now required or indexing will fail.\n",
     "\n",
@@ -25,40 +25,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 67,
+   "execution_count": 63,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# This is the directory that has your settings.yml\n",
+    "# This is the directory that has your settings.yaml\n",
     "# NOTE: much older indexes may have been output with a timestamped directory\n",
-    "# if this is the case, you will need to make sure the storage.base_dir in settings.yml points to it correctly\n",
+    "# if this is the case, you will need to make sure the storage.base_dir in settings.yaml points to it correctly\n",
     "PROJECT_DIRECTORY = \"<your project directory>\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 64,
    "metadata": {},
    "outputs": [],
    "source": [
     "from pathlib import Path\n",
     "\n",
     "from graphrag.config.load_config import load_config\n",
-    "from graphrag.config.resolve_path import resolve_paths\n",
     "from graphrag.storage.factory import StorageFactory\n",
     "\n",
-    "# This first block does some config loading, path resolution, and translation that is normally done by the CLI/API when running a full workflow\n",
     "config = load_config(Path(PROJECT_DIRECTORY))\n",
-    "resolve_paths(config)\n",
-    "storage_config = config.storage.model_dump()  # type: ignore\n",
+    "storage_config = config.output.model_dump()\n",
     "storage = StorageFactory().create_storage(\n",
-    "    storage_type=storage_config[\"type\"], kwargs=storage_config\n",
+    "    storage_type=storage_config[\"type\"],\n",
+    "    kwargs=storage_config,\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 65,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -69,7 +67,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": 66,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -98,7 +96,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 67,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -134,7 +132,7 @@
     "if \"name\" in final_entities.columns:\n",
     "    final_entities.rename(columns={\"name\": \"title\"}, inplace=True)\n",
     "remove_columns(\n",
-    "    final_entities, [\"mname_embedding\", \"graph_embedding\", \"description_embedding\"]\n",
+    "    final_entities, [\"name_embedding\", \"graph_embedding\", \"description_embedding\"]\n",
     ")\n",
     "\n",
     "# Final nodes uses community for joins, which is now an int everywhere\n",
@@ -168,6 +166,15 @@
     "    final_communities[\"id\"] = [str(uuid4()) for _ in range(len(final_communities))]\n",
     "if \"parent\" not in final_communities.columns:\n",
     "    final_communities = final_communities.merge(parent_df, on=\"community\", how=\"left\")\n",
+    "if \"entity_ids\" not in final_communities.columns:\n",
+    "    node_mapping = (\n",
+    "        final_nodes.loc[:, [\"community\", \"id\"]]\n",
+    "        .groupby(\"community\")\n",
+    "        .agg(entity_ids=(\"id\", list))\n",
+    "    )\n",
+    "    final_communities = final_communities.merge(\n",
+    "        node_mapping, on=\"community\", how=\"left\"\n",
+    "    )\n",
     "remove_columns(final_communities, [\"raw_community\"])\n",
     "\n",
     "# We need int for community and the human_readable_id copy for consistency\n",
@@ -197,7 +204,7 @@
    "source": [
     "from graphrag.cache.factory import CacheFactory\n",
     "from graphrag.callbacks.noop_workflow_callbacks import NoopWorkflowCallbacks\n",
-    "from graphrag.index.config.embeddings import get_embedded_fields, get_embedding_settings\n",
+    "from graphrag.config.embeddings import get_embedded_fields, get_embedding_settings\n",
     "from graphrag.index.flows.generate_text_embeddings import generate_text_embeddings\n",
     "\n",
     "# We only need to re-run the embeddings workflow, to ensure that embeddings for all required search fields are in place\n",
Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright (c) 2024 Microsoft Corporation.\n",
+    "# Licensed under the MIT License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Index Migration (v1 to v2)\n",
+    "\n",
+    "This notebook is used to maintain data model parity with older indexes for version 2.0 of GraphRAG. If you have a pre-2.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment. If you have a pre-1.0 index, please run the v1 migration notebook first!\n",
+    "\n",
+    "NOTE: we recommend regenerating your settings.yml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration. This also ensures that you have default vector store config, which is now required or indexing will fail.\n",
+    "\n",
+    "WARNING: This will overwrite your parquet files, you may want to make a backup!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This is the directory that has your settings.yaml\n",
+    "PROJECT_DIRECTORY = \"<your project directory>\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "from graphrag.config.load_config import load_config\n",
+    "from graphrag.storage.factory import StorageFactory\n",
+    "\n",
+    "config = load_config(Path(PROJECT_DIRECTORY))\n",
+    "storage_config = config.output.model_dump()\n",
+    "storage = StorageFactory().create_storage(\n",
+    "    storage_type=storage_config[\"type\"],\n",
+    "    kwargs=storage_config,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def remove_columns(df, columns):\n",
+    "    \"\"\"Remove columns from a DataFrame, suppressing errors.\"\"\"\n",
+    "    df.drop(labels=columns, axis=1, errors=\"ignore\", inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from graphrag.utils.storage import (\n",
+    "    delete_table_from_storage,\n",
+    "    load_table_from_storage,\n",
+    "    write_table_to_storage,\n",
+    ")\n",
+    "\n",
+    "final_documents = await load_table_from_storage(\"create_final_documents\", storage)\n",
+    "final_text_units = await load_table_from_storage(\"create_final_text_units\", storage)\n",
+    "final_entities = await load_table_from_storage(\"create_final_entities\", storage)\n",
+    "final_nodes = await load_table_from_storage(\"create_final_nodes\", storage)\n",
+    "final_relationships = await load_table_from_storage(\n",
+    "    \"create_final_relationships\", storage\n",
+    ")\n",
+    "final_communities = await load_table_from_storage(\"create_final_communities\", storage)\n",
+    "final_community_reports = await load_table_from_storage(\n",
+    "    \"create_final_community_reports\", storage\n",
+    ")\n",
+    "\n",
+    "# we've renamed document attributes as metadata\n",
+    "if \"attributes\" in final_documents.columns:\n",
+    "    final_documents.rename(columns={\"attributes\": \"metadata\"}, inplace=True)\n",
+    "\n",
+    "# we're removing the nodes table, so we need to copy the graph columns into entities\n",
+    "graph_props = (\n",
+    "    final_nodes.loc[:, [\"id\", \"degree\", \"x\", \"y\"]].groupby(\"id\").first().reset_index()\n",
+    ")\n",
+    "final_entities = final_entities.merge(graph_props, on=\"id\", how=\"left\")\n",
+    "\n",
+    "# we renamed all the output files for better clarity now that we don't have workflow naming constraints from DataShaper\n",
+    "await write_table_to_storage(final_documents, \"documents\", storage)\n",
+    "await write_table_to_storage(final_text_units, \"text_units\", storage)\n",
+    "await write_table_to_storage(final_entities, \"entities\", storage)\n",
+    "await write_table_to_storage(final_relationships, \"relationships\", storage)\n",
+    "await write_table_to_storage(final_communities, \"communities\", storage)\n",
+    "await write_table_to_storage(final_community_reports, \"community_reports\", storage)\n",
+    "\n",
+    "# delete all the old versions\n",
+    "await delete_table_from_storage(\"create_final_documents\", storage)\n",
+    "await delete_table_from_storage(\"create_final_text_units\", storage)\n",
+    "await delete_table_from_storage(\"create_final_entities\", storage)\n",
+    "await delete_table_from_storage(\"create_final_nodes\", storage)\n",
+    "await delete_table_from_storage(\"create_final_relationships\", storage)\n",
+    "await delete_table_from_storage(\"create_final_communities\", storage)\n",
+    "await delete_table_from_storage(\"create_final_community_reports\", storage)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
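A toy version of the "fold nodes into entities" step in the new notebook above. The v1 nodes table can hold one row per entity per community level, so ids repeat; `groupby("id").first()` collapses the duplicates before the merge copies `degree`, `x`, and `y` onto the entities (assuming, as the notebook does, that these values are identical across levels):

    import pandas as pd

    final_nodes = pd.DataFrame({
        "id": ["e1", "e1", "e2"],  # e1 appears at two community levels
        "degree": [3, 3, 1],
        "x": [0.0, 0.0, 1.0],
        "y": [0.5, 0.5, 0.2],
    })
    final_entities = pd.DataFrame({"id": ["e1", "e2"], "title": ["A", "B"]})

    graph_props = (
        final_nodes.loc[:, ["id", "degree", "x", "y"]].groupby("id").first().reset_index()
    )
    final_entities = final_entities.merge(graph_props, on="id", how="left")
    print(final_entities)
    #    id title  degree    x    y
    # 0  e1     A       3  0.0  0.5
    # 1  e2     B       1  1.0  0.2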
