
Commit ad5b512

remove unused columns and rename document_attribute_columns (microsoft#1672)
* remove unused columns and change property document_attribute_columns to metadata
* format file
* fix 'metadata' column on output
* run check
* fix test on nltk
* remove docs changes
1 parent 907d271 commit ad5b512

File tree

11 files changed: +155 −79 lines changed
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+{
+    "type": "patch",
+    "description": "remove unused columns and change property document_attribute_columns to metadata"
+}

graphrag/config/models/input_config.py

Lines changed: 2 additions & 11 deletions
@@ -40,21 +40,12 @@ class InputConfig(BaseModel):
     file_filter: dict[str, str] | None = Field(
         description="The optional file filter for the input files.", default=None
     )
-    source_column: str | None = Field(
-        description="The input source column to use.", default=None
-    )
-    timestamp_column: str | None = Field(
-        description="The input timestamp column to use.", default=None
-    )
-    timestamp_format: str | None = Field(
-        description="The input timestamp format to use.", default=None
-    )
     text_column: str = Field(
         description="The input text column to use.", default=defs.INPUT_TEXT_COLUMN
     )
     title_column: str | None = Field(
         description="The input title column to use.", default=None
     )
-    document_attribute_columns: list[str] = Field(
-        description="The document attribute columns to use.", default=[]
+    metadata: list[str] | None = Field(
+        description="The document attribute columns to use.", default=None
     )
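
For illustration, a minimal sketch of setting the renamed field on the config model after this change. The column names are hypothetical, and the remaining InputConfig fields are assumed to keep their defaults:

from graphrag.config.models.input_config import InputConfig

# Hypothetical columns; previously these were passed as
# document_attribute_columns=["author", "year"].
config = InputConfig(metadata=["author", "year"])

assert config.metadata == ["author", "year"]
assert InputConfig().metadata is None  # the default is now None instead of []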

graphrag/index/flows/create_final_documents.py

Lines changed: 11 additions & 15 deletions
@@ -9,7 +9,7 @@
 def create_final_documents(
     documents: pd.DataFrame,
     text_units: pd.DataFrame,
-    document_attribute_columns: list[str] | None = None,
+    metadata: list[str] | None = None,
 ) -> pd.DataFrame:
     """All the steps to transform final documents."""
     exploded = (
@@ -46,22 +46,18 @@ def create_final_documents(
     rejoined["id"] = rejoined["id"].astype(str)
     rejoined["human_readable_id"] = rejoined.index + 1

-    # Convert attribute columns to strings and collapse them into a JSON object
-    if document_attribute_columns:
+    # Convert metadata columns to strings and collapse them into a JSON object
+    if metadata:
         # Convert all specified columns to string at once
-        rejoined[document_attribute_columns] = rejoined[
-            document_attribute_columns
-        ].astype(str)
+        rejoined[metadata] = rejoined[metadata].astype(str)

-        # Collapse the document_attribute_columns into a single JSON object column
-        rejoined["attributes"] = rejoined[document_attribute_columns].to_dict(
-            orient="records"
-        )
+        # Collapse the metadata columns into a single JSON object column
+        rejoined["metadata"] = rejoined[metadata].to_dict(orient="records")

-        # Drop the original attribute columns after collapsing them
-        rejoined.drop(columns=document_attribute_columns, inplace=True)
+        # Drop the original metadata columns after collapsing them
+        rejoined.drop(columns=metadata, inplace=True)

-    # set the final column order, but adjust for attributes
+    # set the final column order, but adjust for metadata
     core_columns = [
         "id",
         "human_readable_id",
@@ -70,7 +66,7 @@
         "text_unit_ids",
     ]
     final_columns = [column for column in core_columns if column in rejoined.columns]
-    if document_attribute_columns:
-        final_columns.append("attributes")
+    if metadata:
+        final_columns.append("metadata")

     return rejoined.loc[:, final_columns]
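
As a standalone sketch (not the project's code) of the collapse step above, using a toy DataFrame:

import pandas as pd

df = pd.DataFrame({
    "text": ["doc one", "doc two"],
    "author": ["alice", "bob"],
    "year": [2023, 2024],
})
metadata = ["author", "year"]

# Cast the metadata columns to strings, fold each row into one dict,
# and drop the originals, mirroring the rejoined-frame logic above.
df[metadata] = df[metadata].astype(str)
df["metadata"] = df[metadata].to_dict(orient="records")
df.drop(columns=metadata, inplace=True)

assert df["metadata"][0] == {"author": "alice", "year": "2023"}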

graphrag/index/input/csv.py

Lines changed: 0 additions & 40 deletions
@@ -41,15 +41,6 @@ async def load_file(path: str, group: dict | None) -> pd.DataFrame:
     )
     if "id" not in data.columns:
         data["id"] = data.apply(lambda x: gen_sha512_hash(x, x.keys()), axis=1)
-    if config.source_column is not None and "source" not in data.columns:
-        if config.source_column not in data.columns:
-            log.warning(
-                "source_column %s not found in csv file %s",
-                config.source_column,
-                path,
-            )
-        else:
-            data["source"] = data.apply(lambda x: x[config.source_column], axis=1)
     if config.text_column is not None and "text" not in data.columns:
         if config.text_column not in data.columns:
             log.warning(
@@ -69,37 +60,6 @@ async def load_file(path: str, group: dict | None) -> pd.DataFrame:
         else:
             data["title"] = data.apply(lambda x: x[config.title_column], axis=1)

-    if config.timestamp_column is not None:
-        fmt = config.timestamp_format
-        if fmt is None:
-            msg = "Must specify timestamp_format if timestamp_column is specified"
-            raise ValueError(msg)
-
-        if config.timestamp_column not in data.columns:
-            log.warning(
-                "timestamp_column %s not found in csv file %s",
-                config.timestamp_column,
-                path,
-            )
-        else:
-            data["timestamp"] = pd.to_datetime(
-                data[config.timestamp_column], format=fmt
-            )
-
-        # TODO: Theres probably a less gross way to do this
-        if "year" not in data.columns:
-            data["year"] = data.apply(lambda x: x["timestamp"].year, axis=1)
-        if "month" not in data.columns:
-            data["month"] = data.apply(lambda x: x["timestamp"].month, axis=1)
-        if "day" not in data.columns:
-            data["day"] = data.apply(lambda x: x["timestamp"].day, axis=1)
-        if "hour" not in data.columns:
-            data["hour"] = data.apply(lambda x: x["timestamp"].hour, axis=1)
-        if "minute" not in data.columns:
-            data["minute"] = data.apply(lambda x: x["timestamp"].minute, axis=1)
-        if "second" not in data.columns:
-            data["second"] = data.apply(lambda x: x["timestamp"].second, axis=1)
-
     return data

     file_pattern = (

graphrag/index/workflows/create_final_documents.py

Lines changed: 1 addition & 3 deletions
@@ -28,9 +28,7 @@ async def run_workflow(
     )

     input = config.input
-    output = create_final_documents(
-        documents, text_units, input.document_attribute_columns
-    )
+    output = create_final_documents(documents, text_units, input.metadata)

     await write_table_to_storage(output, workflow_name, context.storage)

tests/unit/config/utils.py

Lines changed: 2 additions & 8 deletions
@@ -94,12 +94,9 @@
         "encoding": defs.INPUT_FILE_ENCODING,
         "file_pattern": defs.INPUT_TEXT_PATTERN,
         "file_filter": None,
-        "source_column": None,
-        "timestamp_column": None,
-        "timestamp_format": None,
         "text_column": defs.INPUT_TEXT_COLUMN,
         "title_column": None,
-        "document_attribute_columns": [],
+        "metadata": None,
     },
     "embed_graph": {
         "enabled": defs.NODE2VEC_ENABLED,
@@ -344,12 +341,9 @@ def assert_input_configs(actual: InputConfig, expected: InputConfig) -> None:
     assert actual.encoding == expected.encoding
     assert actual.file_pattern == expected.file_pattern
     assert actual.file_filter == expected.file_filter
-    assert actual.source_column == expected.source_column
-    assert actual.timestamp_column == expected.timestamp_column
-    assert actual.timestamp_format == expected.timestamp_format
     assert actual.text_column == expected.text_column
     assert actual.title_column == expected.title_column
-    assert actual.document_attribute_columns == expected.document_attribute_columns
+    assert actual.metadata == expected.metadata


 def assert_embed_graph_configs(
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+from unittest.mock import Mock, patch
+
+from graphrag.config.models.chunking_config import ChunkingConfig
+from graphrag.index.bootstrap import bootstrap
+from graphrag.index.operations.chunk_text.strategies import run_sentences, run_tokens
+from graphrag.index.operations.chunk_text.typing import TextChunk
+
+
+class TestRunSentences:
+    def setup_method(self, method):
+        bootstrap()
+
+    def test_basic_functionality(self):
+        """Test basic sentence splitting without metadata"""
+        input = ["This is a test. Another sentence."]
+        tick = Mock()
+        chunks = list(run_sentences(input, ChunkingConfig(), tick))
+
+        assert len(chunks) == 2
+        assert chunks[0].text_chunk == "This is a test."
+        assert chunks[1].text_chunk == "Another sentence."
+        assert all(c.source_doc_indices == [0] for c in chunks)
+        tick.assert_called_once_with(1)
+
+    def test_multiple_documents(self):
+        """Test processing multiple input documents"""
+        input = ["First. Document.", "Second. Doc."]
+        tick = Mock()
+        chunks = list(run_sentences(input, ChunkingConfig(), tick))
+
+        assert len(chunks) == 4
+        assert chunks[0].source_doc_indices == [0]
+        assert chunks[2].source_doc_indices == [1]
+        assert tick.call_count == 2
+
+    def test_mixed_whitespace_handling(self):
+        """Test input with irregular whitespace"""
+        input = [" Sentence with spaces. Another one! "]
+        chunks = list(run_sentences(input, ChunkingConfig(), Mock()))
+        assert chunks[0].text_chunk == " Sentence with spaces."
+        assert chunks[1].text_chunk == "Another one!"
+
+
+class TestRunTokens:
+    @patch("tiktoken.get_encoding")
+    def test_basic_functionality(self, mock_get_encoding):
+        mock_encoder = Mock()
+        mock_encoder.encode.side_effect = lambda x: list(x.encode())
+        mock_encoder.decode.side_effect = lambda x: bytes(x).decode()
+        mock_get_encoding.return_value = mock_encoder
+
+        # Input and config
+        input = [
+            "Marley was dead: to begin with. There is no doubt whatever about that. The register of his burial was signed by the clergyman, the clerk, the undertaker, and the chief mourner. Scrooge signed it. And Scrooge's name was good upon 'Change, for anything he chose to put his hand to."
+        ]
+        config = ChunkingConfig(size=5, overlap=1, encoding_model="fake-encoding")
+        tick = Mock()
+
+        # Run the function
+        chunks = list(run_tokens(input, config, tick))
+
+        # Verify output
+        assert len(chunks) > 0
+        assert all(isinstance(chunk, TextChunk) for chunk in chunks)
+        tick.assert_called_once_with(1)
+
+    @patch("tiktoken.get_encoding")
+    def test_non_string_input(self, mock_get_encoding):
+        """Test handling of non-string input (e.g., numbers)."""
+        mock_encoder = Mock()
+        mock_encoder.encode.side_effect = lambda x: list(str(x).encode())
+        mock_encoder.decode.side_effect = lambda x: bytes(x).decode()
+        mock_get_encoding.return_value = mock_encoder
+
+        input = [123]  # Non-string input
+        config = ChunkingConfig(size=5, overlap=1, encoding_model="fake-encoding")
+        tick = Mock()
+
+        chunks = list(run_tokens(input, config, tick))  # type: ignore
+
+        # Verify non-string input is handled
+        assert len(chunks) > 0
+        assert "123" in chunks[0].text_chunk

tests/unit/indexing/text_splitting/test_text_splitting.py

Lines changed: 43 additions & 0 deletions
@@ -5,6 +5,7 @@
 from unittest.mock import MagicMock

 import pytest
+import tiktoken

 from graphrag.index.text_splitting.text_splitting import (
     NoopTextSplitter,
@@ -159,3 +160,45 @@ def test_split_multiple_texts_on_tokens():

     split_multiple_texts_on_tokens(texts, tokenizer, tick=mock_tick)
     mock_tick.assert_called()
+
+
+def test_split_single_text_on_tokens_no_overlap():
+    text = "This is a test text, meaning to be taken seriously by this test only."
+    enc = tiktoken.get_encoding("cl100k_base")
+
+    def encode(text: str) -> list[int]:
+        if not isinstance(text, str):
+            text = f"{text}"
+        return enc.encode(text)
+
+    def decode(tokens: list[int]) -> str:
+        return enc.decode(tokens)
+
+    tokenizer = Tokenizer(
+        chunk_overlap=1,
+        tokens_per_chunk=2,
+        decode=decode,
+        encode=lambda text: encode(text),
+    )
+
+    expected_splits = [
+        "This is",
+        " is a",
+        " a test",
+        " test text",
+        " text,",
+        ", meaning",
+        " meaning to",
+        " to be",
+        " be taken",  # cspell:disable-line
+        " taken seriously",  # cspell:disable-line
+        " seriously by",
+        " by this",  # cspell:disable-line
+        " this test",
+        " test only",
+        " only.",
+        ".",
+    ]
+
+    result = split_single_text_on_tokens(text=text, tokenizer=tokenizer)
+    assert result == expected_splits
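
The expected splits above follow from simple sliding-window arithmetic: with tokens_per_chunk=2 and chunk_overlap=1, the window advances by 2 - 1 = 1 token per chunk, so consecutive chunks share one token. A minimal sketch of that logic (an illustration, not graphrag's split_single_text_on_tokens itself):

def split_on_tokens(tokens: list[int], tokens_per_chunk: int, chunk_overlap: int) -> list[list[int]]:
    # Slide a fixed-size window over the token list, stepping by
    # tokens_per_chunk - chunk_overlap so neighboring chunks overlap.
    chunks = []
    start = 0
    while start < len(tokens):
        chunks.append(tokens[start : start + tokens_per_chunk])
        start += tokens_per_chunk - chunk_overlap
    return chunks

assert split_on_tokens([1, 2, 3, 4], 2, 1) == [[1, 2], [2, 3], [3, 4], [4]]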
