Commit d61e131

Merge branch 'main' into add-cosmosdb-to-storage
2 parents: 231ad57 + 9836643

File tree: 111 files changed (+2351, −1418 lines)


.semversioner/1.0.0.json

Lines changed: 26 additions & 0 deletions

```diff
@@ -0,0 +1,26 @@
+{
+    "changes": [
+        {
+            "description": "Add Parent id to communities data model",
+            "type": "patch"
+        },
+        {
+            "description": "Add migration notebook.",
+            "type": "patch"
+        },
+        {
+            "description": "Create separate community workflow, collapse subflows.",
+            "type": "patch"
+        },
+        {
+            "description": "Dependency Updates",
+            "type": "patch"
+        },
+        {
+            "description": "cleanup and refactor factory classes.",
+            "type": "patch"
+        }
+    ],
+    "created_at": "2024-12-11T21:41:49+00:00",
+    "version": "1.0.0"
+}
```
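The change file added above is plain JSON, so the release bump it implies can be derived mechanically from the change types. A minimal stdlib-only sketch of that idea (this is not semversioner's actual implementation; the `release_bump` helper is hypothetical):

```python
import json

# A change file shaped like the .semversioner/1.0.0.json shown above.
change_file = json.loads("""
{
    "changes": [
        {"description": "Add Parent id to communities data model", "type": "patch"},
        {"description": "Dependency Updates", "type": "patch"}
    ],
    "created_at": "2024-12-11T21:41:49+00:00",
    "version": "1.0.0"
}
""")

# Rank change types; the most significant one decides the release bump.
SEVERITY = {"patch": 0, "minor": 1, "major": 2}

def release_bump(changes: list[dict]) -> str:
    """Return the most significant change type present in the list."""
    return max((c["type"] for c in changes), key=SEVERITY.__getitem__)

print(release_bump(change_file["changes"]))  # patch
```

With only `patch` entries, as in this commit, the aggregate bump stays a patch release.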

CHANGELOG.md

Lines changed: 8 additions & 0 deletions

```diff
@@ -1,6 +1,14 @@
 # Changelog
 Note: version releases in the 0.x.y range may introduce breaking changes.
 
+## 1.0.0
+
+- patch: Add Parent id to communities data model
+- patch: Add migration notebook.
+- patch: Create separate community workflow, collapse subflows.
+- patch: Dependency Updates
+- patch: cleanup and refactor factory classes.
+
 ## 0.9.0
 
 - minor: Refactor graph creation.
```

DEVELOPING.md

Lines changed: 40 additions & 8 deletions

````diff
@@ -10,24 +10,56 @@
 # Getting Started
 
 ## Install Dependencies
-
-```sh
-# Install Python dependencies.
+```shell
+# install python dependencies
 poetry install
 ```
 
-## Executing the Indexing Engine
-
-```sh
+## Execute the indexing engine
+```shell
 poetry run poe index <...args>
 ```
 
-## Executing Queries
+## Execute prompt tuning
+```shell
+poetry run poe prompt_tune <...args>
+```
 
-```sh
+## Execute Queries
+```shell
 poetry run poe query <...args>
 ```
 
+## Repository Structure
+An overview of the repository's top-level folder structure is provided below, detailing the overall design and purpose.
+We leverage a factory design pattern where possible, enabling a variety of implementations for each core component of graphrag.
+
+```shell
+graphrag
+├── api            # library API definitions
+├── cache          # cache module supporting several options
+│   └─ factory.py  #   └─ main entrypoint to create a cache
+├── callbacks      # a collection of commonly used callback functions
+├── cli            # library CLI
+│   └─ main.py     #   └─ primary CLI entrypoint
+├── config         # configuration management
+├── index          # indexing engine
+│   └─ run/run.py  #   main entrypoint to build an index
+├── llm            # generic llm interfaces
+├── logger         # logger module supporting several options
+│   └─ factory.py  #   └─ main entrypoint to create a logger
+├── model          # data model definitions associated with the knowledge graph
+├── prompt_tune    # prompt tuning module
+├── prompts        # a collection of all the system prompts used by graphrag
+├── query          # query engine
+├── storage        # storage module supporting several options
+│   └─ factory.py  #   └─ main entrypoint to create/load a storage endpoint
+├── utils          # helper functions used throughout the library
+└── vector_stores  # vector store module containing a few options
+    └─ factory.py  #   └─ main entrypoint to create a vector store
+```
+Where appropriate, the factories expose a registration method for users to provide their own custom implementations if desired.
+
 ## Versioning
 
 We use [semversioner](https://github.com/raulgomis/semversioner) to automate and enforce semantic versioning in the release process. Our CI/CD pipeline checks that all PR's include a json file generated by semversioner. When submitting a PR, please run:
````
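The registration hook that the new DEVELOPING.md text describes can be illustrated with a small sketch. All names here (`StorageFactory`, `MemoryStorage`, `register`, `create`) are hypothetical stand-ins for the pattern, not graphrag's actual API:

```python
from typing import Callable

class StorageFactory:
    """Registry mapping a storage type name to a constructor."""

    _registry: dict[str, Callable[..., object]] = {}

    @classmethod
    def register(cls, storage_type: str, creator: Callable[..., object]) -> None:
        """Allow users to plug in their own implementation under a name."""
        cls._registry[storage_type] = creator

    @classmethod
    def create(cls, storage_type: str, **kwargs) -> object:
        """Instantiate the implementation registered under storage_type."""
        if storage_type not in cls._registry:
            raise ValueError(f"Unknown storage type: {storage_type}")
        return cls._registry[storage_type](**kwargs)

class MemoryStorage:
    """Trivial in-memory storage used to demonstrate registration."""

    def __init__(self, **kwargs):
        self.data: dict[str, bytes] = {}

# Register a custom implementation, then create it by name.
StorageFactory.register("memory", MemoryStorage)
storage = StorageFactory.create("memory")
```

The design choice is that callers only name a backend; the factory owns construction, so adding a backend never touches call sites.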

docs/config/env_vars.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -156,7 +156,7 @@ This section controls the storage mechanism used by the pipeline used for export
 
 | Parameter | Description | Type | Required or Optional | Default |
 | --- | --- | --- | --- | --- |
-| `GRAPHRAG_STORAGE_TYPE` | The type of reporter to use. Options are `file`, `memory`, or `blob` | `str` | optional | `file` |
+| `GRAPHRAG_STORAGE_TYPE` | The type of storage to use. Options are `file`, `memory`, or `blob` | `str` | optional | `file` |
 | `GRAPHRAG_STORAGE_STORAGE_ACCOUNT_BLOB_URL` | The Azure Storage blob endpoint to use when in `blob` mode and using managed identity. Will have the format `https://<storage_account_name>.blob.core.windows.net` | `str` | optional | None |
 | `GRAPHRAG_STORAGE_CONNECTION_STRING` | The Azure Storage connection string to use when in `blob` mode. | `str` | optional | None |
 | `GRAPHRAG_STORAGE_CONTAINER_NAME` | The Azure Storage container name to use when in `blob` mode. | `str` | optional | None |
```
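Resolving `GRAPHRAG_STORAGE_TYPE` with the documented `file` default can be sketched as follows (the `storage_type_from_env` helper is hypothetical, for illustration only; only the variable name and the `file`/`memory`/`blob` options come from the table above):

```python
import os

VALID_STORAGE_TYPES = {"file", "memory", "blob"}

def storage_type_from_env() -> str:
    """Read GRAPHRAG_STORAGE_TYPE, defaulting to "file" as the table documents."""
    value = os.environ.get("GRAPHRAG_STORAGE_TYPE", "file")
    if value not in VALID_STORAGE_TYPES:
        raise ValueError(f"unsupported GRAPHRAG_STORAGE_TYPE: {value!r}")
    return value

# With the variable unset, the documented default applies.
os.environ.pop("GRAPHRAG_STORAGE_TYPE", None)
print(storage_type_from_env())  # file
```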

docs/examples_notebooks/drift_search.ipynb

Lines changed: 6 additions & 33 deletions

```diff
@@ -171,9 +171,6 @@
 " read_indexer_reports,\n",
 " read_indexer_text_units,\n",
 ")\n",
-"from graphrag.query.input.loaders.dfs import (\n",
-" store_entity_semantic_embeddings,\n",
-")\n",
 "from graphrag.query.llm.oai.chat_openai import ChatOpenAI\n",
 "from graphrag.query.llm.oai.embedding import OpenAIEmbedding\n",
 "from graphrag.query.llm.oai.typing import OpenaiApiType\n",
@@ -207,9 +204,6 @@
 " collection_name=\"default-entity-description\",\n",
 ")\n",
 "description_embedding_store.connect(db_uri=LANCEDB_URI)\n",
-"entity_description_embeddings = store_entity_semantic_embeddings(\n",
-" entities=entities, vectorstore=description_embedding_store\n",
-")\n",
 "\n",
 "print(f\"Entity count: {len(entity_df)}\")\n",
 "entity_df.head()\n",
@@ -270,37 +264,16 @@
 }
 ],
 "source": [
-"def embed_community_reports(\n",
+"def read_community_reports(\n",
 " input_dir: str,\n",
-" embedder: OpenAIEmbedding,\n",
 " community_report_table: str = COMMUNITY_REPORT_TABLE,\n",
 "):\n",
 " \"\"\"Embeds the full content of the community reports and saves the DataFrame with embeddings to the output path.\"\"\"\n",
 " input_path = Path(input_dir) / f\"{community_report_table}.parquet\"\n",
-" output_path = Path(input_dir) / f\"{community_report_table}_with_embeddings.parquet\"\n",
-"\n",
-" if not Path(output_path).exists():\n",
-" print(\"Embedding file not found. Computing community report embeddings...\")\n",
-"\n",
-" report_df = pd.read_parquet(input_path)\n",
-"\n",
-" if \"full_content\" not in report_df.columns:\n",
-" error_msg = f\"'full_content' column not found in {input_path}\"\n",
-" raise ValueError(error_msg)\n",
-"\n",
-" report_df[\"full_content_embeddings\"] = report_df.loc[:, \"full_content\"].apply(\n",
-" lambda x: embedder.embed(x)\n",
-" )\n",
-"\n",
-" # Save the DataFrame with embeddings to the output path\n",
-" report_df.to_parquet(output_path)\n",
-" print(f\"Embeddings saved to {output_path}\")\n",
-" return report_df\n",
-" print(f\"Embeddings file already exists at {output_path}\")\n",
-" return pd.read_parquet(output_path)\n",
+" return pd.read_parquet(input_path)\n",
 "\n",
 "\n",
-"report_df = embed_community_reports(INPUT_DIR, text_embedder)\n",
+"report_df = read_community_reports(INPUT_DIR)\n",
 "reports = read_indexer_reports(\n",
 " report_df,\n",
 " entity_df,\n",
@@ -321,7 +294,7 @@
 " entities=entities,\n",
 " relationships=relationships,\n",
 " reports=reports,\n",
-" entity_text_embeddings=entity_description_embeddings,\n",
+" entity_text_embeddings=description_embedding_store,\n",
 " text_units=text_units,\n",
 ")\n",
 "\n",
@@ -3172,7 +3145,7 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "graphrag-ta_-cxM1-py3.10",
+"display_name": ".venv",
 "language": "python",
 "name": "python3"
 },
@@ -3186,7 +3159,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.12"
+"version": "3.11.9"
 }
 },
 "nbformat": 4,
```
