remove optional embeddings

Gaudy Blanco · Gaudy Blanco · commit b2403c6fceb7 · 2025-11-11T15:55:59.000-06:00
diff --git a/docs/config/yaml.md b/docs/config/yaml.md
@@ -182,14 +182,9 @@ Where to put all vectors for the system. Configured for lancedb by default. This
 
 The supported embeddings are:
 
-- `text_unit.text`
-- `document.text`
-- `entity.title`
-- `entity.description`
-- `relationship.description`
-- `community.title`
-- `community.summary`
-- `community.full_content`
+- `text_unit_text`
+- `entity_description`
+- `community_full_content`
 
 For example:
 
@@ -199,12 +194,12 @@ vector_store:
   db_uri: output/lancedb
   index_prefix: "christmas-carol"
   embeddings_schema:
-    text_unit.text:
+    text_unit_text:
       index_name: "text-unit-embeddings"
       id_field: "id_custom"
       vector_field: "vector_custom"
       vector_size: 3072
-    entity.description:
+    entity_description:
       id_field: "id_custom"
 
 ```
@@ -224,14 +219,9 @@ By default, the GraphRAG indexer will only export embeddings required for our qu
 
 Supported embeddings names are:
 
-- `text_unit.text`
-- `document.text`
-- `entity.title`
-- `entity.description`
-- `relationship.description`
-- `community.title`
-- `community.summary`
-- `community.full_content`
+- `text_unit_text`
+- `entity_description`
+- `community_full_content`
 
 #### Fields
 
diff --git a/docs/examples_notebooks/api_overview.ipynb b/docs/examples_notebooks/api_overview.ipynb
@@ -28,10 +28,11 @@
     "from pathlib import Path\n",
     "from pprint import pprint\n",
     "\n",
-    "import graphrag.api as api\n",
     "import pandas as pd\n",
     "from graphrag.config.load_config import load_config\n",
-    "from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
+    "from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n",
+    "\n",
+    "import graphrag.api as api"
    ]
   },
   {
diff --git a/docs/examples_notebooks/index_migration_to_v1.ipynb b/docs/examples_notebooks/index_migration_to_v1.ipynb
@@ -229,8 +229,6 @@
     "tokenizer = get_tokenizer(model_config)\n",
     "\n",
     "await generate_text_embeddings(\n",
-    "    documents=None,\n",
-    "    relationships=None,\n",
     "    text_units=final_text_units,\n",
     "    entities=final_entities,\n",
     "    community_reports=final_community_reports,\n",
diff --git a/docs/examples_notebooks/input_documents.ipynb b/docs/examples_notebooks/input_documents.ipynb
@@ -30,10 +30,11 @@
     "from pathlib import Path\n",
     "from pprint import pprint\n",
     "\n",
-    "import graphrag.api as api\n",
     "import pandas as pd\n",
     "from graphrag.config.load_config import load_config\n",
-    "from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
+    "from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n",
+    "\n",
+    "import graphrag.api as api"
    ]
   },
   {
diff --git a/packages/graphrag/graphrag/config/embeddings.py b/packages/graphrag/graphrag/config/embeddings.py
@@ -3,22 +3,12 @@
 
 """A module containing embeddings values."""
 
-entity_title_embedding = "entity.title"
-entity_description_embedding = "entity.description"
-relationship_description_embedding = "relationship.description"
-document_text_embedding = "document.text"
-community_title_embedding = "community.title"
-community_summary_embedding = "community.summary"
-community_full_content_embedding = "community.full_content"
-text_unit_text_embedding = "text_unit.text"
+entity_description_embedding = "entity_description"
+community_full_content_embedding = "community_full_content"
+text_unit_text_embedding = "text_unit_text"
 
 all_embeddings: set[str] = {
-    entity_title_embedding,
     entity_description_embedding,
-    relationship_description_embedding,
-    document_text_embedding,
-    community_title_embedding,
-    community_summary_embedding,
     community_full_content_embedding,
     text_unit_text_embedding,
 }
diff --git a/packages/graphrag/graphrag/index/workflows/generate_text_embeddings.py b/packages/graphrag/graphrag/index/workflows/generate_text_embeddings.py
@@ -10,13 +10,8 @@
 from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
 from graphrag.config.embeddings import (
     community_full_content_embedding,
-    community_summary_embedding,
-    community_title_embedding,
     create_index_name,
-    document_text_embedding,
     entity_description_embedding,
-    entity_title_embedding,
-    relationship_description_embedding,
     text_unit_text_embedding,
 )
 from graphrag.config.models.graph_rag_config import GraphRagConfig
@@ -47,29 +42,14 @@ async def run_workflow(
     logger.info("Workflow started: generate_text_embeddings")
     embedded_fields = config.embed_text.names
     logger.info("Embedding the following fields: %s", embedded_fields)
-    documents = None
-    relationships = None
     text_units = None
     entities = None
     community_reports = None
-    if document_text_embedding in embedded_fields:
-        documents = await load_table_from_storage("documents", context.output_storage)
-    if relationship_description_embedding in embedded_fields:
-        relationships = await load_table_from_storage(
-            "relationships", context.output_storage
-        )
     if text_unit_text_embedding in embedded_fields:
         text_units = await load_table_from_storage("text_units", context.output_storage)
-    if (
-        entity_title_embedding in embedded_fields
-        or entity_description_embedding in embedded_fields
-    ):
+    if entity_description_embedding in embedded_fields:
         entities = await load_table_from_storage("entities", context.output_storage)
-    if (
-        community_title_embedding in embedded_fields
-        or community_summary_embedding in embedded_fields
-        or community_full_content_embedding in embedded_fields
-    ):
+    if community_full_content_embedding in embedded_fields:
         community_reports = await load_table_from_storage(
             "community_reports", context.output_storage
         )
@@ -87,8 +67,6 @@ async def run_workflow(
     tokenizer = get_tokenizer(model_config)
 
     output = await generate_text_embeddings(
-        documents=documents,
-        relationships=relationships,
         text_units=text_units,
         entities=entities,
         community_reports=community_reports,
@@ -115,8 +93,6 @@ async def run_workflow(
 
 
 async def generate_text_embeddings(
-    documents: pd.DataFrame | None,
-    relationships: pd.DataFrame | None,
     text_units: pd.DataFrame | None,
     entities: pd.DataFrame | None,
     community_reports: pd.DataFrame | None,
@@ -131,26 +107,12 @@ async def generate_text_embeddings(
 ) -> dict[str, pd.DataFrame]:
     """All the steps to generate all embeddings."""
     embedding_param_map = {
-        document_text_embedding: {
-            "data": documents.loc[:, ["id", "text"]] if documents is not None else None,
-            "embed_column": "text",
-        },
-        relationship_description_embedding: {
-            "data": relationships.loc[:, ["id", "description"]]
-            if relationships is not None
-            else None,
-            "embed_column": "description",
-        },
         text_unit_text_embedding: {
             "data": text_units.loc[:, ["id", "text"]]
             if text_units is not None
             else None,
             "embed_column": "text",
         },
-        entity_title_embedding: {
-            "data": entities.loc[:, ["id", "title"]] if entities is not None else None,
-            "embed_column": "title",
-        },
         entity_description_embedding: {
             "data": entities.loc[:, ["id", "title", "description"]].assign(
                 title_description=lambda df: df["title"] + ":" + df["description"]
@@ -159,18 +121,6 @@ async def generate_text_embeddings(
             else None,
             "embed_column": "title_description",
         },
-        community_title_embedding: {
-            "data": community_reports.loc[:, ["id", "title"]]
-            if community_reports is not None
-            else None,
-            "embed_column": "title",
-        },
-        community_summary_embedding: {
-            "data": community_reports.loc[:, ["id", "summary"]]
-            if community_reports is not None
-            else None,
-            "embed_column": "summary",
-        },
         community_full_content_embedding: {
             "data": community_reports.loc[:, ["id", "full_content"]]
             if community_reports is not None
diff --git a/packages/graphrag/graphrag/index/workflows/update_text_embeddings.py b/packages/graphrag/graphrag/index/workflows/update_text_embeddings.py
@@ -26,9 +26,6 @@ async def run_workflow(
     output_storage, _, _ = get_update_storages(
         config, context.state["update_timestamp"]
     )
-
-    final_documents_df = context.state["incremental_update_final_documents"]
-    merged_relationships_df = context.state["incremental_update_merged_relationships"]
     merged_text_units = context.state["incremental_update_merged_text_units"]
     merged_entities_df = context.state["incremental_update_merged_entities"]
     merged_community_reports = context.state[
@@ -50,8 +47,6 @@ async def run_workflow(
     tokenizer = get_tokenizer(model_config)
 
     result = await generate_text_embeddings(
-        documents=final_documents_df,
-        relationships=merged_relationships_df,
         text_units=merged_text_units,
         entities=merged_entities_df,
         community_reports=merged_community_reports,
diff --git a/tests/verbs/test_create_community_reports.py b/tests/verbs/test_create_community_reports.py
@@ -4,15 +4,16 @@
 
 from graphrag.config.models.graph_rag_config import GraphRagConfig
 from graphrag.data_model.schemas import COMMUNITY_REPORTS_FINAL_COLUMNS
-from graphrag.index.operations.summarize_communities.community_reports_extractor import (
-    CommunityReportResponse,
-    FindingModel,
-)
 from graphrag.index.workflows.create_community_reports import (
     run_workflow,
 )
 from graphrag.utils.storage import load_table_from_storage
 
+from graphrag.index.operations.summarize_communities.community_reports_extractor import (
+    CommunityReportResponse,
+    FindingModel,
+)
+
 from .util import (
     DEFAULT_MODEL_CONFIG,
     compare_outputs,
diff --git a/unified-search-app/app/app_logic.py b/unified-search-app/app/app_logic.py
@@ -7,7 +7,6 @@
 import logging
 from typing import TYPE_CHECKING
 
-import graphrag.api as api
 import streamlit as st
 from knowledge_loader.data_sources.loader import (
     create_datasource,
@@ -18,6 +17,8 @@
 from state.session_variables import SessionVariables
 from ui.search import display_search_result
 
+import graphrag.api as api
+
 if TYPE_CHECKING:
     import pandas as pd
 

Original file line number	Diff line number	Diff line change
`@@ -28,10 +28,11 @@`
`28`	`28`	`"from pathlib import Path\n",`
`29`	`29`	`"from pprint import pprint\n",`
`30`	`30`	`"\n",`
`31`		`- "import graphrag.api as api\n",`
`32`	`31`	`"import pandas as pd\n",`
`33`	`32`	`"from graphrag.config.load_config import load_config\n",`
`34`		`- "from graphrag.index.typing.pipeline_run_result import PipelineRunResult"`
	`33`	`+ "from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n",`
	`34`	`+ "\n",`
	`35`	`+ "import graphrag.api as api"`
`35`	`36`	`]`
`36`	`37`	`},`
`37`	`38`	`{`
Original file line number	Diff line number	Diff line change
`@@ -30,10 +30,11 @@`
`30`	`30`	`"from pathlib import Path\n",`
`31`	`31`	`"from pprint import pprint\n",`
`32`	`32`	`"\n",`
`33`		`- "import graphrag.api as api\n",`
`34`	`33`	`"import pandas as pd\n",`
`35`	`34`	`"from graphrag.config.load_config import load_config\n",`
`36`		`- "from graphrag.index.typing.pipeline_run_result import PipelineRunResult"`
	`35`	`+ "from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n",`
	`36`	`+ "\n",`
	`37`	`+ "import graphrag.api as api"`
`37`	`38`	`]`
`38`	`39`	`},`
`39`	`40`	`{`