Commit 97704ab

Remove text unit grouping (#2052)
* Remove text unit group_by_columns
* Semver
* Fix default token split test
* Fix models in config test samples
* Fix token length in context sort test
* Fix document sort
1 parent 978e798 commit 97704ab

32 files changed: +60, -93 lines changed
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "type": "major",
+    "description": "Remove text unit group-by ability."
+}

docs/config/yaml.md

Lines changed: 0 additions & 1 deletion
@@ -99,7 +99,6 @@ These settings configure how we parse documents into text chunks. This is necess
 
 - `size` **int** - The max chunk size in tokens.
 - `overlap` **int** - The chunk overlap in tokens.
-- `group_by_columns` **list[str]** - Group documents by these fields before chunking.
 - `strategy` **str**[tokens|sentences] - How to chunk the text.
 - `encoding_model` **str** - The text encoding model to use for splitting on token boundaries.
 - `prepend_metadata` **bool** - Determines if metadata values should be added at the beginning of each chunk. Default=`False`.
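
The remaining keys map one-to-one onto the `ChunkingConfig` model changed later in this commit. A minimal sketch of wiring the documented settings into that model, assuming the module path mirrors the file path `graphrag/config/models/chunking_config.py` and that PyYAML is available; the sample values are illustrative, not a prescribed configuration:

```python
# Sketch: feed the documented `chunks` keys (group_by_columns removed) into
# ChunkingConfig. Import path and sample values are assumptions.
import yaml

from graphrag.config.models.chunking_config import ChunkingConfig

raw = yaml.safe_load(
    """
chunks:
  size: 1200
  overlap: 100
  encoding_model: o200k_base
  prepend_metadata: false
"""
)

chunks_config = ChunkingConfig(**raw["chunks"])
print(chunks_config.size, chunks_config.overlap)  # 1200 100
```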

docs/index/default_dataflow.md

Lines changed: 1 addition & 3 deletions
@@ -59,9 +59,7 @@ flowchart TB
 
 The first phase of the default-configuration workflow is to transform input documents into _TextUnits_. A _TextUnit_ is a chunk of text that is used for our graph extraction techniques. They are also used as source-references by extracted knowledge items in order to empower breadcrumbs and provenance by concepts back to their original source text.
 
-The chunk size (counted in tokens), is user-configurable. By default this is set to 300 tokens, although we've had positive experience with 1200-token chunks using a single "glean" step. (A "glean" step is a follow-on extraction). Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time.
-
-The group-by configuration is also user-configurable. By default, we align our chunks to document boundaries, meaning that there is a strict 1-to-many relationship between Documents and TextUnits. In rare cases, this can be turned into a many-to-many relationship. This is useful when the documents are very short and we need several of them to compose a meaningful analysis unit (e.g. Tweets or a chat log)
+The chunk size (counted in tokens), is user-configurable. By default this is set to 1200 tokens. Larger chunks result in lower-fidelity output and less meaningful reference texts; however, using larger chunks can result in much faster processing time.
 
 ```mermaid
 ---
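
For readers who want to see what a 1200-token chunk with a 100-token overlap means in practice, here is a small standalone sketch using `tiktoken` directly. It is not GraphRAG's own `chunk_text`; the `o200k_base` encoding name is taken from the defaults change in this commit.

```python
# Standalone sketch of token-window chunking: windows of `size` tokens that
# overlap by `overlap` tokens. Illustrative only.
import tiktoken


def chunk_by_tokens(text: str, size: int = 1200, overlap: int = 100) -> list[str]:
    encoding = tiktoken.get_encoding("o200k_base")
    tokens = encoding.encode(text)
    step = size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(encoding.decode(tokens[start : start + size]))
        if start + size >= len(tokens):
            break
    return chunks


sample = "GraphRAG turns input documents into TextUnits for graph extraction. " * 400
chunks = chunk_by_tokens(sample)
encoding = tiktoken.get_encoding("o200k_base")
print(len(chunks), "chunks; first chunk is", len(encoding.encode(chunks[0])), "tokens")
```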

docs/index/outputs.md

Lines changed: 1 addition & 1 deletion
@@ -102,7 +102,7 @@ List of all text chunks parsed from the input documents.
 | ----------------- | ----- | ----------- |
 | text | str | Raw full text of the chunk. |
 | n_tokens | int | Number of tokens in the chunk. This should normally match the `chunk_size` config parameter, except for the last chunk which is often shorter. |
-| document_ids | str[] | List of document IDs the chunk came from. This is normally only 1 due to our default groupby, but for very short text documents (e.g., microblogs) it can be configured so text units span multiple documents. |
+| document_id | str | ID of the document the chunk came from. |
 | entity_ids | str[] | List of entities found in the text unit. |
 | relationships_ids | str[] | List of relationships found in the text unit. |
 | covariate_ids | str[] | Optional list of covariates found in the text unit. |
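
A quick post-upgrade sanity check on the text unit table; a sketch that assumes the default parquet output location (the actual path depends on your configured output directory and file names):

```python
# Sketch: confirm the new text unit schema — a scalar document_id column
# instead of the old document_ids list column. The parquet path is an
# assumption based on the default "output" directory.
import pandas as pd

text_units = pd.read_parquet("output/text_units.parquet")

print(text_units.columns.tolist())
assert "document_id" in text_units.columns
assert "document_ids" not in text_units.columns
print(text_units[["id", "document_id", "n_tokens"]].head())
```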

graphrag/config/defaults.py

Lines changed: 4 additions & 5 deletions
@@ -27,15 +27,15 @@
 DEFAULT_OUTPUT_BASE_DIR = "output"
 DEFAULT_CHAT_MODEL_ID = "default_chat_model"
 DEFAULT_CHAT_MODEL_TYPE = ModelType.OpenAIChat
-DEFAULT_CHAT_MODEL = "gpt-4-turbo-preview"
+DEFAULT_CHAT_MODEL = "gpt-4o"
 DEFAULT_CHAT_MODEL_AUTH_TYPE = AuthType.APIKey
 DEFAULT_EMBEDDING_MODEL_ID = "default_embedding_model"
 DEFAULT_EMBEDDING_MODEL_TYPE = ModelType.OpenAIEmbedding
-DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small"
+DEFAULT_EMBEDDING_MODEL = "text-embedding-ada-002"
 DEFAULT_EMBEDDING_MODEL_AUTH_TYPE = AuthType.APIKey
 DEFAULT_VECTOR_STORE_ID = "default_vector_store"
 
-ENCODING_MODEL = "cl100k_base"
+ENCODING_MODEL = "o200k_base"
 COGNITIVE_SERVICES_AUDIENCE = "https://cognitiveservices.azure.com/.default"
 
 
@@ -68,9 +68,8 @@ class ChunksDefaults:
 
     size: int = 1200
     overlap: int = 100
-    group_by_columns: list[str] = field(default_factory=lambda: ["id"])
    strategy: ClassVar[ChunkStrategyType] = ChunkStrategyType.tokens
-    encoding_model: str = "cl100k_base"
+    encoding_model: str = ENCODING_MODEL
     prepend_metadata: bool = False
     chunk_size_includes_metadata: bool = False
 
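
The `ENCODING_MODEL` switch tracks the new chat model default: `o200k_base` is the tokenizer `tiktoken` pairs with `gpt-4o`, while `cl100k_base` pairs with the `gpt-4-turbo` family. A quick check, assuming a `tiktoken` release recent enough to know about `gpt-4o`:

```python
# Sketch: show which tokenizer tiktoken associates with each chat model default.
import tiktoken

print(tiktoken.encoding_for_model("gpt-4o").name)               # o200k_base
print(tiktoken.encoding_for_model("gpt-4-turbo-preview").name)  # cl100k_base
```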

graphrag/config/init_content.py

Lines changed: 0 additions & 1 deletion
@@ -67,7 +67,6 @@
 chunks:
   size: {graphrag_config_defaults.chunks.size}
   overlap: {graphrag_config_defaults.chunks.overlap}
-  group_by_columns: [{",".join(graphrag_config_defaults.chunks.group_by_columns)}]
 
 ### Output/storage settings ###
 ## If blob storage is specified in the following four sections,

graphrag/config/models/chunking_config.py

Lines changed: 0 additions & 4 deletions
@@ -20,10 +20,6 @@ class ChunkingConfig(BaseModel):
         description="The chunk overlap to use.",
         default=graphrag_config_defaults.chunks.overlap,
     )
-    group_by_columns: list[str] = Field(
-        description="The chunk by columns to use.",
-        default=graphrag_config_defaults.chunks.group_by_columns,
-    )
     strategy: ChunkStrategyType = Field(
         description="The chunking strategy to use.",
         default=graphrag_config_defaults.chunks.strategy,
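
After this change the field is simply gone from the model, so `group_by_columns` is no longer represented in programmatic config at all. A small sketch using the pydantic v2 introspection API, with the import path assumed from the file location:

```python
# Sketch: group_by_columns is no longer a declared field; the remaining
# fields fall back to graphrag_config_defaults.chunks.
from graphrag.config.models.chunking_config import ChunkingConfig

config = ChunkingConfig()  # every remaining field has a default

print("group_by_columns" in ChunkingConfig.model_fields)  # False
print(config.size, config.overlap, config.encoding_model)
```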

graphrag/data_model/schemas.py

Lines changed: 2 additions & 2 deletions
@@ -52,7 +52,7 @@
 RELATIONSHIP_IDS = "relationship_ids"
 TEXT_UNIT_IDS = "text_unit_ids"
 COVARIATE_IDS = "covariate_ids"
-DOCUMENT_IDS = "document_ids"
+DOCUMENT_ID = "document_id"
 
 PERIOD = "period"
 SIZE = "size"
@@ -142,7 +142,7 @@
     SHORT_ID,
     TEXT,
     N_TOKENS,
-    DOCUMENT_IDS,
+    DOCUMENT_ID,
     ENTITY_IDS,
     RELATIONSHIP_IDS,
     COVARIATE_IDS,
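
Tables produced before this commit carry a `document_ids` list column; since the default grouping was by document id anyway, those lists normally hold exactly one element. A hypothetical migration sketch (not part of graphrag itself) that collapses them into the new scalar `document_id` and refuses to guess when a unit really did span multiple documents:

```python
# Hypothetical migration sketch: collapse one-element document_ids lists into
# the new scalar document_id column.
import pandas as pd

old = pd.DataFrame({
    "id": ["tu-1", "tu-2"],
    "text": ["first chunk", "second chunk"],
    "document_ids": [["doc-a"], ["doc-b"]],
})

if (old["document_ids"].str.len() > 1).any():
    msg = "text units spanning multiple documents have no 1:1 equivalent"
    raise ValueError(msg)

migrated = old.drop(columns=["document_ids"]).assign(
    document_id=old["document_ids"].str[0]
)
print(migrated)
```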

graphrag/data_model/text_unit.py

Lines changed: 4 additions & 4 deletions
@@ -28,8 +28,8 @@ class TextUnit(Identified):
     n_tokens: int | None = None
     """The number of tokens in the text (optional)."""
 
-    document_ids: list[str] | None = None
-    """List of document IDs in which the text unit appears (optional)."""
+    document_id: str | None = None
+    """ID of the document in which the text unit appears (optional)."""
 
     attributes: dict[str, Any] | None = None
     """A dictionary of additional attributes associated with the text unit (optional)."""
@@ -45,7 +45,7 @@ def from_dict(
         relationships_key: str = "relationship_ids",
         covariates_key: str = "covariate_ids",
         n_tokens_key: str = "n_tokens",
-        document_ids_key: str = "document_ids",
+        document_id_key: str = "document_id",
         attributes_key: str = "attributes",
     ) -> "TextUnit":
         """Create a new text unit from the dict data."""
@@ -57,6 +57,6 @@ def from_dict(
             relationship_ids=d.get(relationships_key),
             covariate_ids=d.get(covariates_key),
             n_tokens=d.get(n_tokens_key),
-            document_ids=d.get(document_ids_key),
+            document_id=d.get(document_id_key),
             attributes=d.get(attributes_key),
         )
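
Constructing the updated model directly; a minimal sketch in which every field requirement not visible in the diff above (for example that `short_id` must be supplied) is an assumption:

```python
# Sketch: a TextUnit now references a single source document via document_id.
# Field requirements other than those shown in the diff are assumptions.
from graphrag.data_model.text_unit import TextUnit

unit = TextUnit(
    id="tu-001",
    short_id="1",  # human-readable id; may be optional depending on version
    text="A chunk of source text.",
    document_id="doc-001",
    n_tokens=6,
)
print(unit.document_id)
```

`from_dict` reads the same information from a `document_id` key by default, per the new `document_id_key` parameter shown above.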

graphrag/index/workflows/create_base_text_units.py

Lines changed: 16 additions & 37 deletions
@@ -35,7 +35,6 @@ async def run_workflow(
     output = create_base_text_units(
         documents,
         context.callbacks,
-        chunks.group_by_columns,
         chunks.size,
         chunks.overlap,
         chunks.encoding_model,
@@ -53,7 +52,6 @@
 def create_base_text_units(
     documents: pd.DataFrame,
     callbacks: WorkflowCallbacks,
-    group_by_columns: list[str],
     size: int,
     overlap: int,
     encoding_model: str,
@@ -62,26 +60,9 @@ def create_base_text_units(
     chunk_size_includes_metadata: bool = False,
 ) -> pd.DataFrame:
     """All the steps to transform base text_units."""
-    sort = documents.sort_values(by=["id"], ascending=[True])
+    documents.sort_values(by=["id"], ascending=[True], inplace=True)
 
-    sort["text_with_ids"] = list(
-        zip(*[sort[col] for col in ["id", "text"]], strict=True)
-    )
-
-    agg_dict = {"text_with_ids": list}
-    if "metadata" in documents:
-        agg_dict["metadata"] = "first"  # type: ignore
-
-    aggregated = (
-        (
-            sort.groupby(group_by_columns, sort=False)
-            if len(group_by_columns) > 0
-            else sort.groupby(lambda _x: True)
-        )
-        .agg(agg_dict)
-        .reset_index()
-    )
-    aggregated.rename(columns={"text_with_ids": "texts"}, inplace=True)
+    encode, _ = get_encoding_fn(encoding_model)
 
     def chunker(row: pd.Series) -> Any:
         line_delimiter = ".\n"
@@ -99,15 +80,14 @@ def chunker(row: pd.Series) -> Any:
         )
 
         if chunk_size_includes_metadata:
-            encode, _ = get_encoding_fn(encoding_model)
             metadata_tokens = len(encode(metadata_str))
             if metadata_tokens >= size:
                 message = "Metadata tokens exceeds the maximum tokens per chunk. Please increase the tokens per chunk."
                 raise ValueError(message)
 
         chunked = chunk_text(
             pd.DataFrame([row]).reset_index(drop=True),
-            column="texts",
+            column="text",
             size=size - metadata_tokens,
             overlap=overlap,
             encoding_model=encoding_model,
@@ -128,7 +108,7 @@ def chunker(row: pd.Series) -> Any:
         return row
 
     # Track progress of row-wise apply operation
-    total_rows = len(aggregated)
+    total_rows = len(documents)
     logger.info("Starting chunking process for %d documents", total_rows)
 
     def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
@@ -137,27 +117,26 @@ def chunker_with_logging(row: pd.Series, row_index: int) -> Any:
         logger.info("chunker progress: %d/%d", row_index + 1, total_rows)
         return result
 
-    aggregated = aggregated.apply(
+    text_units = documents.apply(
         lambda row: chunker_with_logging(row, row.name), axis=1
     )
 
-    aggregated = cast("pd.DataFrame", aggregated[[*group_by_columns, "chunks"]])
-    aggregated = aggregated.explode("chunks")
-    aggregated.rename(
+    text_units = cast("pd.DataFrame", text_units[["id", "chunks"]])
+    text_units = text_units.explode("chunks")
+    text_units.rename(
         columns={
-            "chunks": "chunk",
+            "id": "document_id",
+            "chunks": "text",
         },
         inplace=True,
     )
-    aggregated["id"] = aggregated.apply(
-        lambda row: gen_sha512_hash(row, ["chunk"]), axis=1
-    )
-    aggregated[["document_ids", "chunk", "n_tokens"]] = pd.DataFrame(
-        aggregated["chunk"].tolist(), index=aggregated.index
+
+    text_units["id"] = text_units.apply(
+        lambda row: gen_sha512_hash(row, ["text"]), axis=1
     )
-    # rename for downstream consumption
-    aggregated.rename(columns={"chunk": "text"}, inplace=True)
+    # get a final token measurement
+    text_units["n_tokens"] = text_units["text"].apply(lambda x: len(encode(x)))
 
     return cast(
-        "pd.DataFrame", aggregated[aggregated["text"].notna()].reset_index(drop=True)
+        "pd.DataFrame", text_units[text_units["text"].notna()].reset_index(drop=True)
     )