Skip to content

Commit ef47d53

Browse files
authored
feat: add chunking to partition_tsv (#2982)
Closes #2980
1 parent 668dd01 commit ef47d53

File tree

4 files changed

+23
-3
lines changed

4 files changed

+23
-3
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.13.7-dev8
1+
## 0.13.7-dev9
22

33
### Enhancements
44

@@ -15,6 +15,7 @@
1515
* **`partition_docx()` handles short table rows.** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. While there are legitimate uses for this capability, using it in practice is relatively rare. However, it can happen unintentionally when adjusting cell borders with the mouse. Accommodate this case and generate accurate `.text` and `.metadata.text_as_html` for these tables.
1616
* **Remedy macOS test failure not triggered by CI.** Generalize temp-file detection beyond hard-coded Linux-specific prefix.
1717
* **Remove unnecessary warning log for using default layout model.**
18+
* **Add chunking to `partition_tsv()`.** Even though `partition_tsv()` produces a single Table element, chunking is made available because the Table element is often larger than the desired chunk size and must be divided into smaller chunks.
1819

1920
## 0.13.6
2021

test_unstructured/partition/csv/test_tsv.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
EXPECTED_TEXT_XLSX,
1111
)
1212
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
13+
from unstructured.chunking.title import chunk_by_title
1314
from unstructured.cleaners.core import clean_extra_whitespace
1415
from unstructured.documents.elements import Table
1516
from unstructured.partition.tsv import partition_tsv
@@ -228,11 +229,27 @@ def test_partition_tsv_element_metadata_has_languages():
228229
assert elements[0].metadata.languages == ["eng"]
229230

230231

231-
def test_partition_csv_header():
232+
def test_partition_tsv_header():
232233
filename = "example-docs/stanley-cups.tsv"
233234
elements = partition_tsv(filename=filename, strategy="fast", include_header=True)
234235
assert (
235236
clean_extra_whitespace(elements[0].text)
236237
== "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
237238
)
238239
assert "<thead>" in elements[0].metadata.text_as_html
240+
241+
242+
def test_partition_tsv_supports_chunking_strategy_while_partitioning():
243+
elements = partition_tsv(filename=example_doc_path("stanley-cups.tsv"))
244+
chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=0)
245+
246+
chunk_elements = partition_tsv(
247+
example_doc_path("stanley-cups.tsv"),
248+
chunking_strategy="by_title",
249+
max_characters=9,
250+
combine_text_under_n_chars=0,
251+
include_header=False,
252+
)
253+
254+
# The same chunks are returned if chunking elements or chunking during partitioning.
255+
assert chunk_elements == chunks

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.13.7-dev8" # pragma: no cover
1+
__version__ = "0.13.7-dev9" # pragma: no cover

unstructured/partition/tsv.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pandas as pd
66
from lxml.html.soupparser import fromstring as soupparser_fromstring
77

8+
from unstructured.chunking import add_chunking_strategy
89
from unstructured.documents.elements import (
910
Element,
1011
ElementMetadata,
@@ -25,6 +26,7 @@
2526

2627
@process_metadata()
2728
@add_metadata_with_filetype(FileType.TSV)
29+
@add_chunking_strategy
2830
def partition_tsv(
2931
filename: Optional[str] = None,
3032
file: Optional[IO[bytes]] = None,

0 commit comments

Comments
 (0)