Merged

31 commits
99aea52
Update input factory to match other factories
natoverse Jan 6, 2026
efaaa1f
Move input config alongside input readers
natoverse Jan 6, 2026
2b89384
Move file pattern logic into InputReader
natoverse Jan 6, 2026
c73263d
Set encoding default
natoverse Jan 6, 2026
b265612
Clean up optional column configs
natoverse Jan 6, 2026
f066080
Combine structured data extraction
natoverse Jan 6, 2026
2b83d66
Remove pandas from input loading
natoverse Jan 6, 2026
a03df1b
Throw if empty documents
natoverse Jan 6, 2026
8b45208
Add json lines (jsonl) input support
natoverse Jan 6, 2026
6ac0b58
Store raw data
natoverse Jan 6, 2026
8e3c717
Merge branch 'v3/main' into input-factory
natoverse Jan 6, 2026
fb9a924
Fix merge imports
natoverse Jan 6, 2026
e2395e9
Move metadata handling entirely to chunking
natoverse Jan 7, 2026
36b7be7
Nicer automatic title
natoverse Jan 7, 2026
9d161bd
Typo
natoverse Jan 7, 2026
164c5e1
Add get_property utility for nested dictionary access with dot notation
natoverse Jan 8, 2026
868fde1
Update structured_file_reader to use get_property utility
natoverse Jan 8, 2026
e8e316f
Extract input module into new graphrag-input monorepo package
natoverse Jan 8, 2026
39125b2
Rename ChunkResult to TextChunk and add transformer support
natoverse Jan 8, 2026
2f6d075
Back-compat comment
natoverse Jan 8, 2026
a671aa4
Align input config type name with other factory configs
natoverse Jan 8, 2026
6d5076a
Add MarkItDown support
natoverse Jan 9, 2026
6fbf26c
Remove pattern default from MarkItDown reader
natoverse Jan 9, 2026
e19501d
Remove plugins flag (implicit disabled)
natoverse Jan 9, 2026
6fba8d0
Format
natoverse Jan 9, 2026
c974970
Update verb tests
natoverse Jan 9, 2026
7ce1030
Separate storage from input config
natoverse Jan 10, 2026
e170124
Add empty objects for NaN raw_data
natoverse Jan 10, 2026
ade3a6f
Fix smoke tests
natoverse Jan 12, 2026
89a5223
Fix BOM in csv smoke
natoverse Jan 12, 2026
ad76163
Format
natoverse Jan 12, 2026
4 changes: 2 additions & 2 deletions docs/config/yaml.md

@@ -87,9 +87,9 @@ Our pipeline can ingest .csv, .txt, or .json data from an input location. See th
 - `container_name` **str** - (blob/cosmosdb only) The Azure Storage container name.
 - `storage_account_blob_url` **str** - (blob only) The storage account blob URL to use.
 - `cosmosdb_account_blob_url` **str** - (cosmosdb only) The CosmosDB account blob URL to use.
-- `file_type` **text|csv|json** - The type of input data to load. Default is `text`
+- `type` **text|csv|json** - The type of input data to load. Default is `text`
 - `encoding` **str** - The encoding of the input file. Default is `utf-8`
-- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$`, `.*\.txt$`, or `.*\.json$` depending on the specified `file_type`, but you can customize it if needed.
+- `file_pattern` **str** - A regex to match input files. Default is `.*\.csv$`, `.*\.txt$`, or `.*\.json$` depending on the specified `type`, but you can customize it if needed.
 - `text_column` **str** - (CSV/JSON only) The text column name. If unset we expect a column named `text`.
 - `title_column` **str** - (CSV/JSON only) The title column name, filename will be used if unset.
 - `metadata` **list[str]** - (CSV/JSON only) The additional document attributes fields to keep.

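For reference, a minimal `settings.yaml` sketch using the renamed `type` key (the column names here are illustrative, not defaults):

```yaml
input:
  type: csv
  encoding: utf-8
  file_pattern: ".*\\.csv$"
  text_column: content    # defaults to `text` if unset
  title_column: headline  # filename is used if unset
```
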
5 changes: 2 additions & 3 deletions docs/examples_notebooks/api_overview.ipynb

@@ -28,11 +28,10 @@
 "from pathlib import Path\n",
 "from pprint import pprint\n",
 "\n",
+"import graphrag.api as api\n",
 "import pandas as pd\n",
 "from graphrag.config.load_config import load_config\n",
-"from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n",
-"\n",
-"import graphrag.api as api"
+"from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
 ]
 },
 {

5 changes: 2 additions & 3 deletions docs/examples_notebooks/input_documents.ipynb

@@ -30,11 +30,10 @@
 "from pathlib import Path\n",
 "from pprint import pprint\n",
 "\n",
+"import graphrag.api as api\n",
 "import pandas as pd\n",
 "from graphrag.config.load_config import load_config\n",
-"from graphrag.index.typing.pipeline_run_result import PipelineRunResult\n",
-"\n",
-"import graphrag.api as api"
+"from graphrag.index.typing.pipeline_run_result import PipelineRunResult"
 ]
 },
 {

4 changes: 2 additions & 2 deletions docs/index/inputs.md

@@ -116,7 +116,7 @@ settings.yaml
 
 ```yaml
 input:
-  file_type: text
+  type: text
   metadata: [title]
 
 chunks:

@@ -194,7 +194,7 @@ settings.yaml
 
 ```yaml
 input:
-  file_type: json
+  type: json
   title_column: headline
   text_column: content
 

2 changes: 1 addition & 1 deletion packages/graphrag-cache/graphrag_cache/cache_factory.py

@@ -26,7 +26,7 @@ def register_cache(
     cache_initializer: Callable[..., Cache],
     scope: ServiceScope = "transient",
 ) -> None:
-    """Register a custom storage implementation.
+    """Register a custom cache implementation.
 
     Args
     ----

4 changes: 2 additions & 2 deletions packages/graphrag-cache/pyproject.toml

@@ -23,12 +23,12 @@ authors = [
 license = "MIT"
 readme = "README.md"
 license-files = ["LICENSE"]
-requires-python = ">=3.10,<3.13"
+requires-python = ">=3.11,<3.14"
 classifiers = [
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
     "graphrag-common==2.7.0",

19 changes: 0 additions & 19 deletions packages/graphrag-chunking/graphrag_chunking/add_metadata.py

This file was deleted.

17 changes: 0 additions & 17 deletions packages/graphrag-chunking/graphrag_chunking/chunk_result.py

This file was deleted.

7 changes: 5 additions & 2 deletions packages/graphrag-chunking/graphrag_chunking/chunker.py

@@ -4,9 +4,10 @@
 """A module containing the 'Chunker' class."""
 
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from typing import Any
 
-from graphrag_chunking.chunk_result import ChunkResult
+from graphrag_chunking.text_chunk import TextChunk
 
 
 class Chunker(ABC):

@@ -17,5 +17,8 @@ def __init__(self, **kwargs: Any) -> None:
         """Create a chunker instance."""
 
     @abstractmethod
-    def chunk(self, text: str) -> list[ChunkResult]:
+    def chunk(
+        self, text: str, transform: Callable[[str], str] | None = None
+    ) -> list[TextChunk]:
         """Chunk method definition."""

@@ -30,7 +30,7 @@ class ChunkingConfig(BaseModel):
         description="The chunk overlap to use.",
         default=100,
     )
-    prepend_metadata: bool = Field(
-        description="Prepend metadata into each chunk.",
-        default=False,
+    prepend_metadata: list[str] | None = Field(
+        description="Metadata fields from the source document to prepend on each chunk.",
+        default=None,
     )

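With `prepend_metadata` now a list of source-document fields rather than a boolean, a settings sketch could look like this (assuming the surrounding chunk fields keep their existing names):

```yaml
chunks:
  overlap: 100
  prepend_metadata: [title]  # fields to prepend on each chunk
```
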
@@ -5,26 +5,28 @@
 
 from collections.abc import Callable
 
-from graphrag_chunking.chunk_result import ChunkResult
+from graphrag_chunking.text_chunk import TextChunk
 
 
 def create_chunk_results(
     chunks: list[str],
+    transform: Callable[[str], str] | None = None,
     encode: Callable[[str], list[int]] | None = None,
-) -> list[ChunkResult]:
-    """Create chunk results from a list of text chunks. The index assignments are 0-based and assume chunks we not stripped relative to the source text."""
+) -> list[TextChunk]:
+    """Create chunk results from a list of text chunks. The index assignments are 0-based and assume chunks were not stripped relative to the source text."""
     results = []
     start_char = 0
     for index, chunk in enumerate(chunks):
         end_char = start_char + len(chunk) - 1  # 0-based indices
-        chunk = ChunkResult(
-            text=chunk,
+        result = TextChunk(
+            original=chunk,
+            text=transform(chunk) if transform else chunk,
             index=index,
             start_char=start_char,
             end_char=end_char,
         )
         if encode:
-            chunk.token_count = len(encode(chunk.text))
-        results.append(chunk)
+            result.token_count = len(encode(result.text))
+        results.append(result)
         start_char = end_char + 1
     return results

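A small usage sketch of the revised helper as shown above: the transform feeds `text`, while `original` and the character offsets keep indexing the untransformed source.

```python
from graphrag_chunking.create_chunk_results import create_chunk_results

chunks = create_chunk_results(["Hello ", "world"], transform=str.upper)

assert chunks[0].original == "Hello "
assert chunks[0].text == "HELLO "
# Offsets are 0-based and always refer to the source text:
assert (chunks[1].start_char, chunks[1].end_char) == (6, 10)
assert chunks[0].token_count is None  # only set when an encoder is supplied
```
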
@@ -9,9 +9,9 @@
 import nltk
 
 from graphrag_chunking.bootstrap_nltk import bootstrap
-from graphrag_chunking.chunk_result import ChunkResult
 from graphrag_chunking.chunker import Chunker
 from graphrag_chunking.create_chunk_results import create_chunk_results
+from graphrag_chunking.text_chunk import TextChunk
 
 
 class SentenceChunker(Chunker):

@@ -24,10 +24,14 @@ def __init__(
         self._encode = encode
         bootstrap()
 
-    def chunk(self, text) -> list[ChunkResult]:
+    def chunk(
+        self, text: str, transform: Callable[[str], str] | None = None
+    ) -> list[TextChunk]:
         """Chunk the text into sentence-based chunks."""
         sentences = nltk.sent_tokenize(text.strip())
-        results = create_chunk_results(sentences, encode=self._encode)
+        results = create_chunk_results(
+            sentences, transform=transform, encode=self._encode
+        )
         # nltk sentence tokenizer may trim whitespace, so we need to adjust start/end chars
         for index, result in enumerate(results):
             txt = result.text

29 changes: 29 additions & 0 deletions packages/graphrag-chunking/graphrag_chunking/text_chunk.py

@@ -0,0 +1,29 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""The TextChunk dataclass."""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class TextChunk:
+    """Result of chunking a document."""
+
+    original: str
+    """Raw original text chunk before any transformation."""
+
+    text: str
+    """The final text content of this chunk."""
+
+    index: int
+    """Zero-based index of this chunk within the source document."""
+
+    start_char: int
+    """Character index where the raw chunk text begins in the source document."""
+
+    end_char: int
+    """Character index where the raw chunk text ends in the source document."""
+
+    token_count: int | None = None
+    """Number of tokens in the final chunk text, if computed."""

@@ -6,9 +6,9 @@
 from collections.abc import Callable
 from typing import Any
 
-from graphrag_chunking.chunk_result import ChunkResult
 from graphrag_chunking.chunker import Chunker
 from graphrag_chunking.create_chunk_results import create_chunk_results
+from graphrag_chunking.text_chunk import TextChunk
 
 
 class TokenChunker(Chunker):

@@ -28,7 +28,9 @@ def __init__(
         self._encode = encode
         self._decode = decode
 
-    def chunk(self, text: str) -> list[ChunkResult]:
+    def chunk(
+        self, text: str, transform: Callable[[str], str] | None = None
+    ) -> list[TextChunk]:
         """Chunk the text into token-based chunks."""
         chunks = split_text_on_tokens(
             text,

@@ -37,7 +39,7 @@
             encode=self._encode,
             decode=self._decode,
         )
-        return create_chunk_results(chunks, encode=self._encode)
+        return create_chunk_results(chunks, transform=transform, encode=self._encode)
 
 
 def split_text_on_tokens(

25 changes: 25 additions & 0 deletions packages/graphrag-chunking/graphrag_chunking/transformers.py

@@ -0,0 +1,25 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""A collection of useful built-in transformers you can use for chunking."""
+
+from collections.abc import Callable
+from typing import Any
+
+
+def add_metadata(
+    metadata: dict[str, Any],
+    delimiter: str = ": ",
+    line_delimiter: str = "\n",
+    append: bool = False,
+) -> Callable[[str], str]:
+    """Add metadata to the given text, prepending by default. This utility writes the dict as rows of key/value pairs."""
+
+    def transformer(text: str) -> str:
+        metadata_str = (
+            line_delimiter.join(f"{k}{delimiter}{v}" for k, v in metadata.items())
+            + line_delimiter
+        )
+        return text + metadata_str if append else metadata_str + text
+
+    return transformer

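The returned closure plugs into any `Chunker.chunk` call as its `transform`. A quick sketch with made-up metadata values:

```python
from graphrag_chunking.transformers import add_metadata

transform = add_metadata({"title": "Operation Dulce", "author": "Unknown"})
print(transform("The mission began at dawn."))
# title: Operation Dulce
# author: Unknown
# The mission began at dawn.

# And at chunking time, e.g.:
#   chunker.chunk(doc_text, transform=add_metadata({"title": doc_title}))
```
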
72 changes: 72 additions & 0 deletions packages/graphrag-input/README.md

@@ -0,0 +1,72 @@
+# GraphRAG Inputs
+
+This package provides input document loading utilities for GraphRAG, supporting multiple file formats including CSV, JSON, JSON Lines, and plain text.
+
+## Supported File Types
+
+The following four standard file formats are supported out of the box:
+
+- **CSV** - Tabular data with configurable column mappings
+- **JSON** - JSON files with configurable property paths
+- **JSON Lines** - Line-delimited JSON records
+- **Text** - Plain text files
+
+### MarkItDown Support
+
+Additionally, we support the `InputType.MarkItDown` format, which uses the [MarkItDown](https://github.com/microsoft/markitdown) library to import any supported file type. The MarkItDown converter can handle a wide variety of file formats including Office documents, PDFs, HTML, and more.
+
+**Note:** Additional optional dependencies may need to be installed depending on the file type you're processing. The choice of converter is determined by MarkItDown's processing logic, which primarily uses the file extension to select the appropriate converter. Please refer to the [MarkItDown repository](https://github.com/microsoft/markitdown) for installation instructions and detailed information about supported formats.
+
+## Examples
+
+Basic usage with the factory:
+```python
+from graphrag_input import create_input_reader, InputConfig, InputType
+from graphrag_storage import StorageConfig, create_storage
+
+config = InputConfig(
+    type=InputType.Csv,
+    text_column="content",
+    title_column="title",
+)
+storage = create_storage(StorageConfig(base_dir="./input"))
+reader = create_input_reader(config, storage)
+documents = await reader.read_files()
+```
+
+Import a PDF with MarkItDown:
+
+```bash
+pip install 'markitdown[pdf]' # required dependency for pdf processing
+```
+
+```python
+from graphrag_input import create_input_reader, InputConfig, InputType
+from graphrag_storage import StorageConfig, create_storage
+
+config = InputConfig(
+    type=InputType.MarkitDown,
+    file_pattern=".*\\.pdf$"
+)
+storage = create_storage(StorageConfig(base_dir="./input"))
+reader = create_input_reader(config, storage)
+documents = await reader.read_files()
+```
+
+YAML config example for above:
+```yaml
+input:
+  type: markitdown
+  file_pattern: ".*\\.pdf$$"
+input_storage:
+  type: file
+  base_dir: "input"
+```
+
+Note that when specifying column names for data extraction, we can handle nested objects (e.g., in JSON) with dot notation:
+```python
+from graphrag_input import get_property
+
+data = {"user": {"profile": {"name": "Alice"}}}
+name = get_property(data, "user.profile.name")  # Returns "Alice"
+```

20 changes: 20 additions & 0 deletions packages/graphrag-input/graphrag_input/__init__.py

@@ -0,0 +1,20 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""GraphRAG input document loading package."""
+
+from graphrag_input.get_property import get_property
+from graphrag_input.input_config import InputConfig
+from graphrag_input.input_reader import InputReader
+from graphrag_input.input_reader_factory import create_input_reader
+from graphrag_input.input_type import InputType
+from graphrag_input.text_document import TextDocument
+
+__all__ = [
+    "InputConfig",
+    "InputReader",
+    "InputType",
+    "TextDocument",
+    "create_input_reader",
+    "get_property",
+]

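Putting the new package together with chunking, a rough end-to-end sketch (this assumes an `InputType.Text` member and that `TextDocument` exposes its loaded content as a `text` attribute, neither of which the diff shows directly):

```python
from graphrag_input import InputConfig, InputType, create_input_reader
from graphrag_storage import StorageConfig, create_storage


async def load_and_chunk(chunker):
    config = InputConfig(type=InputType.Text)
    storage = create_storage(StorageConfig(base_dir="./input"))
    reader = create_input_reader(config, storage)
    documents = await reader.read_files()  # throws if no documents are found
    # `doc.text` is an assumed TextDocument attribute
    return [chunker.chunk(doc.text) for doc in documents]
```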