feat: Implement staging brick for ISD CSV format (#36)

asymness · web-flow · commit 2d5dba0ddc90 · 2022-10-13T11:35:46.000-04:00
* Implement convert_to_isd_csv function

* Add unit tests for convert_to_isd_csv function

* Update docs with description and example of convert_to_isd_csv function

* Update changelog and version
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,6 @@
-## 0.2.1-dev7
+## 0.2.1-dev8
 
+* Added staging brick for converting elements to ISD (Initial Structured Data) CSV format.
 * Added staging brick for separating text into attention window size chunks for `transformers`.
 * Added staging brick for LabelBox.
 * Added ability to upload LabelStudio predictions
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -338,6 +338,22 @@ Examples:
   isd = convert_to_isd(elements)
 
 
+``convert_to_isd_csv``
+----------------------
+
+Converts outputs to the initial structured data (ISD) format as a CSV string.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.documents.elements import Title, NarrativeText
+  from unstructured.staging.base import convert_to_isd_csv
+
+  elements = [Title(text="Title"), NarrativeText(text="Narrative")]
+  isd_csv = convert_to_isd_csv(elements)
+
+
 ``stage_for_transformers``
 --------------------------
 
@@ -422,7 +438,6 @@ The following optional keyword arguments can be specified in
     results = [nlp(chunk) for chunk in chunks]
 
 
-
 ``stage_for_label_studio``
 --------------------------
 
diff --git a/test_unstructured/staging/test_base_staging.py b/test_unstructured/staging/test_base_staging.py
@@ -1,8 +1,17 @@
+import os
+import pytest
+import csv
+
 import unstructured.staging.base as base
 
 from unstructured.documents.elements import Title, NarrativeText
 
 
+@pytest.fixture
+def output_csv_file(tmp_path):
+    return os.path.join(tmp_path, "isd_data.csv")
+
+
 def test_convert_to_isd():
     elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
     isd = base.convert_to_isd(elements)
@@ -12,3 +21,16 @@ def test_convert_to_isd():
 
     assert isd[1]["text"] == "Narrative 1"
     assert isd[1]["type"] == "NarrativeText"
+
+
+def test_convert_to_isd_csv(output_csv_file):
+
+    elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
+    with open(output_csv_file, "w+") as csv_file:
+        isd_csv_string = base.convert_to_isd_csv(elements)
+        csv_file.write(isd_csv_string)
+
+    fieldnames = ["type", "text"]
+    with open(output_csv_file, "r") as csv_file:
+        csv_rows = csv.DictReader(csv_file)
+        assert all(set(row.keys()) == set(fieldnames) for row in csv_rows)
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.1-dev7"  # pragma: no cover
+__version__ = "0.2.1-dev8"  # pragma: no cover
diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py
@@ -1,3 +1,5 @@
+import io
+import csv
 from typing import Dict, List
 
 from unstructured.documents.elements import Text
@@ -10,3 +12,17 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
         section = dict(text=element.text, type=element.category)
         isd.append(section)
     return isd
+
+
+def convert_to_isd_csv(elements: List[Text]) -> str:
+    """
+    Returns the representation of document elements as Initial Structured Data (ISD)
+    in CSV format.
+    """
+    csv_fieldnames: List[str] = ["type", "text"]
+    rows: List[Dict[str, str]] = convert_to_isd(elements)
+    with io.StringIO() as buffer:
+        csv_writer = csv.DictWriter(buffer, fieldnames=csv_fieldnames)
+        csv_writer.writeheader()
+        csv_writer.writerows(rows)
+        return buffer.getvalue()

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-__version__ = "0.2.1-dev7" # pragma: no cover` |
|  | `1` | `+__version__ = "0.2.1-dev8" # pragma: no cover` |