Skip to content

Commit 2d5dba0

Browse files
authored
feat: Implement staging brick for ISD CSV format (#36)
* Implement convert_to_isd_csv function * Add unit tests for convert_to_isd_csv function * Update docs with description and example of convert_to_isd_csv function * Update changelog and version
1 parent fb16847 commit 2d5dba0

File tree

5 files changed

+57
-3
lines changed

5 files changed

+57
-3
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
## 0.2.1-dev7
1+
## 0.2.1-dev8
22

3+
* Added staging brick for converting elements to the ISD (Initial Structured Data) format as a CSV string.
34
* Added staging brick for separating text into attention window size chunks for `transformers`.
45
* Added staging brick for LabelBox.
56
* Added ability to upload LabelStudio predictions.

docs/source/bricks.rst

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,22 @@ Examples:
338338
isd = convert_to_isd(elements)
339339
340340
341+
``convert_to_isd_csv``
342+
----------------------
343+
344+
Converts a list of document elements to the initial structured data (ISD) format and returns the result as a CSV string.
345+
346+
Examples:
347+
348+
.. code:: python
349+
350+
from unstructured.documents.elements import Title, NarrativeText
351+
from unstructured.staging.base import convert_to_isd_csv
352+
353+
elements = [Title(text="Title"), NarrativeText(text="Narrative")]
354+
isd_csv = convert_to_isd_csv(elements)
355+
356+
341357
``stage_for_transformers``
342358
--------------------------
343359

@@ -422,7 +438,6 @@ The following optional keyword arguments can be specified in
422438
results = [nlp(chunk) for chunk in chunks]
423439
424440
425-
426441
``stage_for_label_studio``
427442
--------------------------
428443

test_unstructured/staging/test_base_staging.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
1+
import os
2+
import pytest
3+
import csv
4+
15
import unstructured.staging.base as base
26

37
from unstructured.documents.elements import Title, NarrativeText
48

59

10+
@pytest.fixture
def output_csv_file(tmp_path):
    """Return the path of a scratch ``isd_data.csv`` file under ``tmp_path``.

    The file itself is not created here; the test that requests this fixture
    is expected to open and write it.
    """
    csv_name = "isd_data.csv"
    return os.path.join(tmp_path, csv_name)
13+
14+
615
def test_convert_to_isd():
716
elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
817
isd = base.convert_to_isd(elements)
@@ -12,3 +21,16 @@ def test_convert_to_isd():
1221

1322
assert isd[1]["text"] == "Narrative 1"
1423
assert isd[1]["type"] == "NarrativeText"
24+
25+
26+
def test_convert_to_isd_csv(output_csv_file):
    """Round-trip elements through ``convert_to_isd_csv`` and a file on disk.

    The CSV string is written to ``output_csv_file``, read back with
    ``csv.DictReader``, and compared against both the expected header and
    the rows produced by ``convert_to_isd`` for the same elements.
    """
    elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
    isd_csv_string = base.convert_to_isd_csv(elements)

    # The csv module documents that files it reads or writes should be opened
    # with newline="" so line terminators are not translated by the text layer.
    with open(output_csv_file, "w", newline="") as csv_file:
        csv_file.write(isd_csv_string)

    fieldnames = ["type", "text"]
    with open(output_csv_file, "r", newline="") as csv_file:
        csv_reader = csv.DictReader(csv_file)
        # Materialize the rows while the file is open; asserting over the lazy
        # reader with all(...) would pass vacuously if no rows were written.
        csv_rows = [dict(row) for row in csv_reader]
        header = csv_reader.fieldnames

    assert header == fieldnames
    assert len(csv_rows) == len(elements)
    assert csv_rows == base.convert_to_isd(elements)

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.1-dev7" # pragma: no cover
1+
__version__ = "0.2.1-dev8" # pragma: no cover

unstructured/staging/base.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import io
2+
import csv
13
from typing import Dict, List
24

35
from unstructured.documents.elements import Text
@@ -10,3 +12,17 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
1012
section = dict(text=element.text, type=element.category)
1113
isd.append(section)
1214
return isd
15+
16+
17+
def convert_to_isd_csv(elements: List[Text]) -> str:
    """
    Serializes document elements to a CSV string in the initial structured
    data (ISD) layout.

    The elements are first converted with ``convert_to_isd``; the resulting
    dictionaries are then written through ``csv.DictWriter`` as a header row
    followed by one row per dictionary, with the columns ``type`` and ``text``
    in that order.
    """
    isd_records: List[Dict[str, str]] = convert_to_isd(elements)
    columns: List[str] = ["type", "text"]
    with io.StringIO() as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        writer.writerows(isd_records)
        # Read the buffer before the context manager closes it.
        csv_text = out.getvalue()
    return csv_text

0 commit comments

Comments
 (0)