microsoft
diff --git a/.semversioner/next-release/patch-20250206203219915745.json b/.semversioner/next-release/patch-20250206203219915745.json
Lines changed: 4 additions & 0 deletions
diff --git a/.semversioner/next-release/patch-20250212004406773499.json b/.semversioner/next-release/patch-20250212004406773499.json
Lines changed: 4 additions & 0 deletions
diff --git a/graphrag/cli/initialize.py b/graphrag/cli/initialize.py
Lines changed: 5 additions & 1 deletion
diff --git a/graphrag/config/defaults.py b/graphrag/config/defaults.py
Lines changed: 2 additions & 0 deletions
diff --git a/graphrag/config/init_content.py b/graphrag/config/init_content.py
Lines changed: 2 additions & 1 deletion
diff --git a/graphrag/config/models/chunking_config.py b/graphrag/config/models/chunking_config.py
Lines changed: 8 additions & 0 deletions
diff --git a/graphrag/config/models/community_reports_config.py b/graphrag/config/models/community_reports_config.py
Lines changed: 14 additions & 4 deletions
diff --git a/graphrag/index/flows/create_base_text_units.py b/graphrag/index/flows/create_base_text_units.py
Lines changed: 56 additions & 11 deletions
diff --git a/graphrag/index/flows/create_community_reports.py b/graphrag/index/flows/create_community_reports.py
Lines changed: 2 additions & 0 deletions
diff --git a/graphrag/index/flows/create_community_reports_text.py b/graphrag/index/flows/create_community_reports_text.py
Lines changed: 1 addition & 5 deletions
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "add option to prepend metadata into chunks"
+}
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Export NLP community reports prompt."
+}
@@ -10,6 +10,9 @@
 from graphrag.prompts.index.community_report import (
     COMMUNITY_REPORT_PROMPT,
 )
+from graphrag.prompts.index.community_report_text_units import (
+    COMMUNITY_REPORT_TEXT_PROMPT,
+)
 from graphrag.prompts.index.extract_claims import EXTRACT_CLAIMS_PROMPT
 from graphrag.prompts.index.extract_graph import GRAPH_EXTRACTION_PROMPT
 from graphrag.prompts.index.summarize_descriptions import SUMMARIZE_PROMPT
@@ -72,7 +75,8 @@ def initialize_project_at(path: Path, force: bool) -> None:
         "extract_graph": GRAPH_EXTRACTION_PROMPT,
         "summarize_descriptions": SUMMARIZE_PROMPT,
         "extract_claims": EXTRACT_CLAIMS_PROMPT,
-        "community_report": COMMUNITY_REPORT_PROMPT,
+        "community_report_graph": COMMUNITY_REPORT_PROMPT,
+        "community_report_text": COMMUNITY_REPORT_TEXT_PROMPT,
         "drift_search_system_prompt": DRIFT_LOCAL_SYSTEM_PROMPT,
         "drift_reduce_prompt": DRIFT_REDUCE_PROMPT,
         "global_search_map_system_prompt": MAP_SYSTEM_PROMPT,
 
@@ -64,6 +64,8 @@
 CHUNK_OVERLAP = 100
 CHUNK_GROUP_BY_COLUMNS = ["id"]
 CHUNK_STRATEGY = ChunkStrategyType.tokens
+CHUNK_PREPEND_METADATA = False
+CHUNK_SIZE_INCLUDES_METADATA = False
 
 # Claim extraction
 DESCRIPTION = "Any claims or facts that could be relevant to information discovery."
 
@@ -123,7 +123,8 @@
 
 community_reports:
   model_id: {defs.COMMUNITY_REPORT_MODEL_ID}
-  prompt: "prompts/community_report.txt"
+  graph_prompt: "prompts/community_report_graph.txt"
+  text_prompt: "prompts/community_report_text.txt"
   max_length: {defs.COMMUNITY_REPORT_MAX_LENGTH}
   max_input_length: {defs.COMMUNITY_REPORT_MAX_INPUT_LENGTH}
 
 
@@ -26,3 +26,11 @@ class ChunkingConfig(BaseModel):
     encoding_model: str = Field(
         description="The encoding model to use.", default=defs.ENCODING_MODEL
     )
+    prepend_metadata: bool = Field(
+        description="Prepend metadata into each chunk.",
+        default=defs.CHUNK_PREPEND_METADATA,
+    )
+    chunk_size_includes_metadata: bool = Field(
+        description="Count metadata in max tokens.",
+        default=defs.CHUNK_SIZE_INCLUDES_METADATA,
+    )
@@ -14,8 +14,13 @@
 class CommunityReportsConfig(BaseModel):
     """Configuration section for community reports."""
 
-    prompt: str | None = Field(
-        description="The community report extraction prompt to use.", default=None
+    graph_prompt: str | None = Field(
+        description="The community report extraction prompt to use for graph-based summarization.",
+        default=None,
+    )
+    text_prompt: str | None = Field(
+        description="The community report extraction prompt to use for text-based summarization.",
+        default=None,
     )
     max_length: int = Field(
         description="The community report maximum length in tokens.",
@@ -45,10 +50,15 @@ def resolved_strategy(
             "type": CreateCommunityReportsStrategyType.graph_intelligence,
             "llm": model_config.model_dump(),
             "num_threads": model_config.concurrent_requests,
-            "extraction_prompt": (Path(root_dir) / self.prompt).read_text(
+            "graph_prompt": (Path(root_dir) / self.graph_prompt).read_text(
+                encoding="utf-8"
+            )
+            if self.graph_prompt
+            else None,
+            "text_prompt": (Path(root_dir) / self.text_prompt).read_text(
                 encoding="utf-8"
             )
-            if self.prompt
+            if self.text_prompt
             else None,
             "max_report_length": self.max_length,
             "max_input_length": self.max_input_length,
 
@@ -3,13 +3,15 @@
 
 """All the steps to transform base text_units."""
 
-from typing import cast
+import json
+from typing import Any, cast
 
 import pandas as pd
 
 from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
 from graphrag.config.models.chunking_config import ChunkStrategyType
 from graphrag.index.operations.chunk_text.chunk_text import chunk_text
+from graphrag.index.operations.chunk_text.strategies import get_encoding_fn
 from graphrag.index.utils.hashing import gen_sha512_hash
 from graphrag.logger.progress import Progress
 
@@ -22,6 +24,8 @@ def create_base_text_units(
     overlap: int,
     encoding_model: str,
     strategy: ChunkStrategyType,
+    prepend_metadata: bool = False,
+    chunk_size_includes_metadata: bool = False,
 ) -> pd.DataFrame:
     """All the steps to transform base text_units."""
     sort = documents.sort_values(by=["id"], ascending=[True])
@@ -32,25 +36,66 @@ def create_base_text_units(
 
     callbacks.progress(Progress(percent=0))
 
+    agg_dict = {"text_with_ids": list}
+    if "metadata" in documents:
+        agg_dict["metadata"] = "first"  # type: ignore
+
     aggregated = (
         (
             sort.groupby(group_by_columns, sort=False)
             if len(group_by_columns) > 0
             else sort.groupby(lambda _x: True)
         )
-        .agg(texts=("text_with_ids", list))
+        .agg(agg_dict)
         .reset_index()
     )
+    aggregated.rename(columns={"text_with_ids": "texts"}, inplace=True)
 
-    aggregated["chunks"] = chunk_text(
-        aggregated,
-        column="texts",
-        size=size,
-        overlap=overlap,
-        encoding_model=encoding_model,
-        strategy=strategy,
-        callbacks=callbacks,
-    )
+    def chunker(row: dict[str, Any]) -> Any:
+        line_delimiter = ".\n"
+        metadata_str = ""
+        metadata_tokens = 0
+
+        if prepend_metadata and "metadata" in row:
+            metadata = row["metadata"]
+            if isinstance(metadata, str):
+                metadata = json.loads(metadata)
+            if isinstance(metadata, dict):
+                metadata_str = (
+                    line_delimiter.join(f"{k}: {v}" for k, v in metadata.items())
+                    + line_delimiter
+                )
+
+            if chunk_size_includes_metadata:
+                encode, _ = get_encoding_fn(encoding_model)
+                metadata_tokens = len(encode(metadata_str))
+                if metadata_tokens >= size:
+                    message = "Metadata tokens exceeds the maximum tokens per chunk. Please increase the tokens per chunk."
+                    raise ValueError(message)
+
+        chunked = chunk_text(
+            pd.DataFrame([row]).reset_index(drop=True),
+            column="texts",
+            size=size - metadata_tokens,
+            overlap=overlap,
+            encoding_model=encoding_model,
+            strategy=strategy,
+            callbacks=callbacks,
+        )[0]
+
+        if prepend_metadata:
+            for index, chunk in enumerate(chunked):
+                if isinstance(chunk, str):
+                    chunked[index] = metadata_str + chunk
+                else:
+                    chunked[index] = (
+                        (chunk[0], metadata_str + chunk[1], chunk[2]) if chunk else None
+                    )
+
+        row["chunks"] = chunked
+        return row
+
+    aggregated = aggregated.apply(lambda row: chunker(row), axis=1)
 
     aggregated = cast("pd.DataFrame", aggregated[[*group_by_columns, "chunks"]])
     aggregated = aggregated.explode("chunks")
 
@@ -46,6 +46,8 @@ async def create_community_reports(
     if claims_input is not None:
         claims = _prep_claims(claims_input)
 
+    summarization_strategy["extraction_prompt"] = summarization_strategy["graph_prompt"]
+
     max_input_length = summarization_strategy.get(
         "max_input_length", defaults.COMMUNITY_REPORT_MAX_INPUT_LENGTH
     )
 
@@ -24,9 +24,6 @@
     build_level_context,
     build_local_context,
 )
-from graphrag.prompts.index.community_report_text_units import (
-    COMMUNITY_REPORT_PROMPT,
-)
 
 log = logging.getLogger(__name__)
 
@@ -44,8 +41,7 @@ async def create_community_reports_text(
     """All the steps to transform community reports."""
     nodes = explode_communities(communities, entities)
 
-    # TEMP: forcing override of the prompt until we can put it into config
-    summarization_strategy["extraction_prompt"] = COMMUNITY_REPORT_PROMPT
+    summarization_strategy["extraction_prompt"] = summarization_strategy["text_prompt"]
 
     max_input_length = summarization_strategy.get(
         "max_input_length", defaults.COMMUNITY_REPORT_MAX_INPUT_LENGTH
Original file line number	Diff line change
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "add option to prepend metadata into chunks"
+}
Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,8 @@ async def create_community_reports(`
`46`	`46`	`if claims_input is not None:`
`47`	`47`	`claims = _prep_claims(claims_input)`
`48`	`48`
	`49`	`+ summarization_strategy["extraction_prompt"] = summarization_strategy["graph_prompt"]`
	`50`	`+`
`49`	`51`	`max_input_length = summarization_strategy.get(`
`50`	`52`	`"max_input_length", defaults.COMMUNITY_REPORT_MAX_INPUT_LENGTH`
`51`	`53`	`)`