|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 4, |
| 6 | + "metadata": {}, |
| 7 | + "outputs": [], |
| 8 | + "source": [ |
| 9 | + "# Copyright (c) 2024 Microsoft Corporation.\n", |
| 10 | + "# Licensed under the MIT License." |
| 11 | + ] |
| 12 | + }, |
| 13 | + { |
| 14 | + "cell_type": "markdown", |
| 15 | + "metadata": {}, |
| 16 | + "source": [ |
| 17 | + "## Index Migration (v2 to v3)\n", |
| 18 | + "\n", |
| 19 | + "This notebook is used to maintain data model parity with older indexes for version 3.0 of GraphRAG. If you have a pre-3.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment. If you have a pre-2.0 index, please run the v2 migration notebook first!\n", |
| 20 | + "\n", |
| 21 | + "NOTE: we recommend regenerating your settings.yaml with the latest version of GraphRAG using `graphrag init`. Copy your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration.\n", |
| 22 | + "\n", |
| 23 | + "This notebook will also update your settings.yaml to ensure compatibility with our newer vector store collection naming scheme in order to avoid re-ingesting.\n", |
| 24 | + "\n", |
| 25 | + "WARNING: This will overwrite your parquet files and your settings.yaml. You may want to make a backup first!" |
| 26 | + ] |
| 27 | + }, |
| 28 | + { |
| 29 | + "cell_type": "code", |
| 30 | + "execution_count": 7, |
| 31 | + "metadata": {}, |
| 32 | + "outputs": [], |
| 33 | + "source": [ |
| 34 | + "# This is the directory that has your settings.yaml\n", |
| 35 | + "PROJECT_DIRECTORY = \"/Users/naevans/graphrag/working/migration\"" |
| 36 | + ] |
| 37 | + }, |
| 38 | + { |
| 39 | + "cell_type": "code", |
| 40 | + "execution_count": 15, |
| 41 | + "metadata": {}, |
| 42 | + "outputs": [], |
| 43 | + "source": [ |
| 44 | + "from pathlib import Path\n", |
| 45 | + "\n", |
| 46 | + "from graphrag.config.load_config import load_config\n", |
| 47 | + "from graphrag.storage.factory import StorageFactory\n", |
| 48 | + "\n", |
| 49 | + "config = load_config(Path(PROJECT_DIRECTORY))\n", |
| 50 | + "storage_config = config.output.model_dump()\n", |
| 51 | + "storage = StorageFactory().create_storage(\n", |
| 52 | + " storage_type=storage_config[\"type\"],\n", |
| 53 | + " kwargs=storage_config,\n", |
| 54 | + ")" |
| 55 | + ] |
| 56 | + }, |
| 57 | + { |
| 58 | + "cell_type": "code", |
| 59 | + "execution_count": 7, |
| 60 | + "metadata": {}, |
| 61 | + "outputs": [], |
| 62 | + "source": [ |
| 63 | + "def remove_columns(df, columns):\n", |
| 64 | + " \"\"\"Remove columns from a DataFrame, suppressing errors.\"\"\"\n", |
| 65 | + " df.drop(labels=columns, axis=1, errors=\"ignore\", inplace=True)" |
| 66 | + ] |
| 67 | + }, |
| 68 | + { |
| 69 | + "cell_type": "code", |
| 70 | + "execution_count": 8, |
| 71 | + "metadata": {}, |
| 72 | + "outputs": [], |
| 73 | + "source": [ |
| 74 | + "from graphrag.utils.storage import (\n", |
| 75 | + " load_table_from_storage,\n", |
| 76 | + " write_table_to_storage,\n", |
| 77 | + ")\n", |
| 78 | + "\n", |
| 79 | + "text_units = await load_table_from_storage(\"text_units\", storage)\n", |
| 80 | + "\n", |
| 81 | + "text_units[\"document_id\"] = text_units[\"document_ids\"].apply(lambda ids: ids[0])\n", |
| 82 | + "remove_columns(text_units, [\"document_ids\"])\n", |
| 83 | + "\n", |
| 84 | + "await write_table_to_storage(text_units, \"text_units\", storage)" |
| 85 | + ] |
| 86 | + }, |
| 87 | + { |
| 88 | + "cell_type": "markdown", |
| 89 | + "metadata": {}, |
| 90 | + "source": [ |
| 91 | + "## Update settings.yaml\n", |
| 92 | + "This next section will attempt to insert index names for each vector index using our new schema structure. It depends on most things being default. If you have already customized your vector store schema it may not be necessary.\n", |
| 93 | + "\n", |
| 94 | + "The primary goal is to align v2 indexes using our old default naming schema with the new customizability. If you don't need this done, or you have a more complicated config, comment it out and update your config manually to ensure each index name is set.\n", |
| 95 | + "\n", |
| 96 | + "Old default index names:\n", |
| 97 | + "- default-text_unit-text\n", |
| 98 | + "- default-entity-description\n", |
| 99 | + "- default-community-full_content\n", |
| 100 | + "\n", |
| 101 | + "v3 versions are:\n", |
| 102 | + "- text_unit_text\n", |
| 103 | + "- entity_description\n", |
| 104 | + "- community_full_content\n", |
| 105 | + "\n", |
| 106 | + "Therefore, with a v2 index we will explicitly set the old index names so it connects correctly.\n", |
| 107 | + "\n", |
| 108 | + "NOTE: we are also setting the default vector_size for each index, under the assumption that you are using a prior default with 1536 dimensions. Our new default of text-embedding-3-large has 3072 dimensions, which will be populated as the default if unset. Again, if you have a more complicated situation you may want to manually configure this.\n" |
| 109 | + ] |
| 110 | + }, |
| 111 | + { |
| 112 | + "cell_type": "code", |
| 113 | + "execution_count": null, |
| 114 | + "metadata": {}, |
| 115 | + "outputs": [], |
| 116 | + "source": [ |
| 117 | + "import yaml\n", |
| 118 | + "\n", |
| 119 | + "EMBEDDING_DIMENSIONS = 1536\n", |
| 120 | + "\n", |
| 121 | + "settings = Path(PROJECT_DIRECTORY) / \"settings.yaml\"\n", |
| 122 | + "with Path.open(settings) as f:\n", |
| 123 | + " conf = yaml.safe_load(f)\n", |
| 124 | + "\n", |
| 125 | + "vector_store = conf.get(\"vector_store\", {})\n", |
| 126 | + "container_name = vector_store.get(\"container_name\", \"default\")\n", |
| 127 | + "embeddings_schema = vector_store.get(\"embeddings_schema\", {})\n", |
| 128 | + "text_unit_schema = embeddings_schema.get(\"text_unit.text\", {})\n", |
| 129 | + "if \"index_name\" not in text_unit_schema:\n", |
| 130 | + " text_unit_schema[\"index_name\"] = f\"{container_name}-text_unit-text\"\n", |
| 131 | + "if \"vector_size\" not in text_unit_schema:\n", |
| 132 | + " text_unit_schema[\"vector_size\"] = EMBEDDING_DIMENSIONS\n", |
| 133 | + "embeddings_schema[\"text_unit.text\"] = text_unit_schema\n", |
| 134 | + "entity_schema = embeddings_schema.get(\"entity.description\", {})\n", |
| 135 | + "if \"index_name\" not in entity_schema:\n", |
| 136 | + " entity_schema[\"index_name\"] = f\"{container_name}-entity-description\"\n", |
| 137 | + "if \"vector_size\" not in entity_schema:\n", |
| 138 | + " entity_schema[\"vector_size\"] = EMBEDDING_DIMENSIONS\n", |
| 139 | + "embeddings_schema[\"entity.description\"] = entity_schema\n", |
| 140 | + "community_schema = embeddings_schema.get(\"community.full_content\", {})\n", |
| 141 | + "if \"index_name\" not in community_schema:\n", |
| 142 | + " community_schema[\"index_name\"] = f\"{container_name}-community-full_content\"\n", |
| 143 | + "if \"vector_size\" not in community_schema:\n", |
| 144 | + " community_schema[\"vector_size\"] = EMBEDDING_DIMENSIONS\n", |
| 145 | + "embeddings_schema[\"community.full_content\"] = community_schema\n", |
| 146 | + "vector_store[\"embeddings_schema\"] = embeddings_schema\n", |
| 147 | + "conf[\"vector_store\"] = vector_store\n", |
| 148 | + "\n", |
| 149 | + "with Path.open(settings, \"w\") as f:\n", |
| 150 | + " yaml.safe_dump(conf, f)" |
| 151 | + ] |
| 152 | + } |
| 153 | + ], |
| 154 | + "metadata": { |
| 155 | + "kernelspec": { |
| 156 | + "display_name": "graphrag", |
| 157 | + "language": "python", |
| 158 | + "name": "python3" |
| 159 | + }, |
| 160 | + "language_info": { |
| 161 | + "codemirror_mode": { |
| 162 | + "name": "ipython", |
| 163 | + "version": 3 |
| 164 | + }, |
| 165 | + "file_extension": ".py", |
| 166 | + "mimetype": "text/x-python", |
| 167 | + "name": "python", |
| 168 | + "nbconvert_exporter": "python", |
| 169 | + "pygments_lexer": "ipython3", |
| 170 | + "version": "3.12.10" |
| 171 | + } |
| 172 | + }, |
| 173 | + "nbformat": 4, |
| 174 | + "nbformat_minor": 2 |
| 175 | +} |
0 commit comments