feat: add chunking to partition_tsv (#2982)

Coniferish · web-flow · commit ef47d530f6fe · 2024-05-07T23:09:27.000Z
Closes #2980
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.13.7-dev8
+## 0.13.7-dev9
 
 ### Enhancements
 
@@ -15,6 +15,7 @@
 * **`partition_docx()` handles short table rows.** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. While there are legitimate uses for this capability, using it in practice is relatively rare. However, it can happen unintentionally when adjusting cell borders with the mouse. Accommodate this case and generate accurate `.text` and `.metadata.text_as_html` for these tables.
 * **Remedy macOS test failure not triggered by CI.** Generalize temp-file detection beyond hard-coded Linux-specific prefix.
 * **Remove unnecessary warning log for using default layout model.**
+* **Add chunking to `partition_tsv()`.** Even though `partition_tsv()` produces a single `Table` element, chunking is made available because the `Table` element is often larger than the desired chunk size and must be divided into smaller chunks.
 
 ## 0.13.6
 
diff --git a/test_unstructured/partition/csv/test_tsv.py b/test_unstructured/partition/csv/test_tsv.py
@@ -10,6 +10,7 @@
     EXPECTED_TEXT_XLSX,
 )
 from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
+from unstructured.chunking.title import chunk_by_title
 from unstructured.cleaners.core import clean_extra_whitespace
 from unstructured.documents.elements import Table
 from unstructured.partition.tsv import partition_tsv
@@ -228,11 +229,27 @@ def test_partition_tsv_element_metadata_has_languages():
     assert elements[0].metadata.languages == ["eng"]
 
 
-def test_partition_csv_header():
+def test_partition_tsv_header():
     filename = "example-docs/stanley-cups.tsv"
     elements = partition_tsv(filename=filename, strategy="fast", include_header=True)
     assert (
         clean_extra_whitespace(elements[0].text)
         == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
     )
     assert "<thead>" in elements[0].metadata.text_as_html
+
+
+def test_partition_tsv_supports_chunking_strategy_while_partitioning():
+    elements = partition_tsv(filename=example_doc_path("stanley-cups.tsv"))
+    chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=0)
+
+    chunk_elements = partition_tsv(
+        example_doc_path("stanley-cups.tsv"),
+        chunking_strategy="by_title",
+        max_characters=9,
+        combine_text_under_n_chars=0,
+        include_header=False,
+    )
+
+    # Chunking while partitioning must match chunking the partitioned elements afterward.
+    assert chunk_elements == chunks
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.7-dev8"  # pragma: no cover
+__version__ = "0.13.7-dev9"  # pragma: no cover
diff --git a/unstructured/partition/tsv.py b/unstructured/partition/tsv.py
@@ -5,6 +5,7 @@
 import pandas as pd
 from lxml.html.soupparser import fromstring as soupparser_fromstring
 
+from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import (
     Element,
     ElementMetadata,
@@ -25,6 +26,7 @@
 
 @process_metadata()
 @add_metadata_with_filetype(FileType.TSV)
+@add_chunking_strategy
 def partition_tsv(
     filename: Optional[str] = None,
     file: Optional[IO[bytes]] = None,

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.13.7-dev8" # pragma: no cover`
	`1`	`+__version__ = "0.13.7-dev9" # pragma: no cover`