Skip to content

Commit 301cef2

Browse files
authored
feat: add page_name to metadata for Excel documents (#609)
* Add page_name to metadata for Excel documents * Update changelog and version number * fix lint
1 parent 34d563c commit 301cef2

File tree

5 files changed

+16
-1
lines changed

5 files changed

+16
-1
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.6.7-dev5
2+
3+
### Enhancements
4+
5+
* Add `page_name` to metadata. Currently used for the sheet name in XLSX documents.
6+
17
## 0.6.7-dev4
28

39
### Enhancements

test_unstructured/partition/test_xlsx.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232

3333
EXPECTED_FILETYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
3434

35+
EXCEPTED_PAGE_NAME = "Stanley Cups"
36+
3537

3638
def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx"):
3739
elements = partition_xlsx(filename=filename)
@@ -43,6 +45,7 @@ def test_partition_xlsx_from_filename(filename="example-docs/stanley-cups.xlsx")
4345
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
4446
assert elements[0].metadata.page_number == 1
4547
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
48+
assert elements[0].metadata.page_name == EXCEPTED_PAGE_NAME
4649

4750

4851
def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
@@ -56,6 +59,7 @@ def test_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"):
5659
assert elements[0].metadata.text_as_html == EXPECTED_TABLE
5760
assert elements[0].metadata.page_number == 1
5861
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
62+
assert elements[0].metadata.page_name == EXCEPTED_PAGE_NAME
5963

6064

6165
def test_partition_xlsx_can_exclude_metadata(filename="example-docs/stanley-cups.xlsx"):
@@ -68,3 +72,4 @@ def test_partition_xlsx_can_exclude_metadata(filename="example-docs/stanley-cups
6872
assert elements[0].metadata.text_as_html is None
6973
assert elements[0].metadata.page_number is None
7074
assert elements[0].metadata.filetype is None
75+
assert elements[0].metadata.page_name is None

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.7-dev4" # pragma: no cover
1+
__version__ = "0.6.7-dev5" # pragma: no cover

unstructured/documents/elements.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ class ElementMetadata:
2525
# Page numbers currently supported for PDF, HTML and PPT documents
2626
page_number: Optional[int] = None
2727

28+
# Page name. The sheet name in XLSX documents.
29+
page_name: Optional[str] = None
30+
2831
# Webpage specific metadata fields
2932
url: Optional[str] = None
3033

unstructured/partition/xlsx.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def partition_xlsx(
5151
if include_metadata:
5252
metadata = ElementMetadata(
5353
text_as_html=html_text,
54+
page_name=sheet_name,
5455
page_number=page_number,
5556
filename=metadata_filename,
5657
)

0 commit comments

Comments
 (0)