feat: partition_org for Org Mode documents (#780)

nightscape · web-flow · commit 752e78e80306 · 2023-06-23T18:45:31.000Z
* feat: partition_org for Org Mode documents

* update version
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.7.9-dev0
+
+### Enhancements
+
+### Features
+
+* Adds `partition_org` for processing Org Mode documents.
+
+### Fixes
+
 ## 0.7.8
 
 ### Enhancements
diff --git a/README.md b/README.md
@@ -95,6 +95,7 @@ about the library.
 | HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |
 | Images (`.png`/`.jpg`) | `partition_image` | `"auto"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
 | Markdown (`.md`) | `partition_md` | N/A | Yes | Include Page Breaks |
+| Org Mode (`.org`) | `partition_org` | N/A | Yes | Include Page Breaks |
 | Open Office Documents (`.odt`) | `partition_odt` | N/A | Yes | None |
 | PDFs (`.pdf`) | `partition_pdf` | `"auto"`, `"fast"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
 | Plain Text (`.txt`) | `partition_text` | N/A | No | Encoding, Paragraph Grouper |
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -590,6 +590,24 @@ Examples:
       elements = partition_pptx(file=f)
 
 
+``partition_org``
+---------------------
+
+The ``partition_org`` function processes Org Mode (``.org``) documents. The function
+first converts the document to HTML using ``pandoc`` and then calls ``partition_html``.
+You'll need `pandoc <https://pandoc.org/installing.html>`_ installed on your system
+to use ``partition_org``.
+
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.org import partition_org
+
+  elements = partition_org(filename="example-docs/README.org")
+
+
 ``partition_rst``
 ---------------------
 
@@ -607,7 +625,6 @@ Examples:
 
   elements = partition_rst(filename="example-docs/README.rst")
 
-
 ``partition_rtf``
 ---------------------
 
diff --git a/example-docs/README.org b/example-docs/README.org
@@ -0,0 +1,27 @@
+* Example Docs
+
+The sample docs directory contains the following files:
+
+-  ~example-10k.html~ - A 10-K SEC filing in HTML format
+-  ~layout-parser-paper.pdf~ - A PDF copy of the layout parser paper
+-  ~factbook.xml~ / ~factbook.xsl~ - Example XML/XSL files that you
+   can use to test stylesheets
+
+These documents can be used to test out the parsers in the library. In
+addition, here are instructions for pulling in some sample docs that are
+too big to store in the repo.
+
+** XBRL 10-K
+
+You can get an example 10-K in inline XBRL format using the following
+~curl~. Note, you need to have the user agent set in the header or the
+SEC site will reject your request.
+
+#+BEGIN_SRC bash
+
+   curl -O \
+     -A '${organization} ${email}' \
+     https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt
+#+END_SRC
+
+You can parse this document using the HTML parser.
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -45,6 +45,7 @@
         ("fake-power-point.pptx", FileType.PPTX),
         ("winter-sports.epub", FileType.EPUB),
         ("spring-weather.html.json", FileType.JSON),
+        ("README.org", FileType.ORG),
         ("README.rst", FileType.RST),
         ("README.md", FileType.MD),
         ("fake.odt", FileType.ODT),
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -784,6 +784,21 @@ def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
         assert partition(file=f) == []
 
 
+def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
+    elements = partition(filename=filename)
+
+    assert elements[0] == Title("Example Docs")
+    assert elements[0].metadata.filetype == "text/org"
+
+
+def test_auto_partition_org_from_file(filename="example-docs/README.org"):
+    with open(filename, "rb") as f:
+        elements = partition(file=f, content_type="text/org")
+
+    assert elements[0] == Title("Example Docs")
+    assert elements[0].metadata.filetype == "text/org"
+
+
 def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
     elements = partition(filename=filename)
 
diff --git a/test_unstructured/partition/test_org.py b/test_unstructured/partition/test_org.py
@@ -0,0 +1,17 @@
+from unstructured.documents.elements import Title
+from unstructured.partition.org import partition_org
+
+
+def test_partition_org_from_filename(filename="example-docs/README.org"):
+    elements = partition_org(filename=filename)
+
+    assert elements[0] == Title("Example Docs")
+    assert elements[0].metadata.filetype == "text/org"
+
+
+def test_partition_org_from_file(filename="example-docs/README.org"):
+    with open(filename, "rb") as f:
+        elements = partition_org(file=f)
+
+    assert elements[0] == Title("Example Docs")
+    assert elements[0].metadata.filetype == "text/org"
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.8"  # pragma: no cover
+__version__ = "0.7.9-dev0"  # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -89,6 +89,7 @@ class FileType(Enum):
     MD = 52
     EPUB = 53
     RST = 54
+    ORG = 55
 
     # Compressed Types
     ZIP = 60
@@ -117,6 +118,7 @@ def __lt__(self, other):
     "text/tsv": FileType.TSV,
     "text/markdown": FileType.MD,
     "text/x-markdown": FileType.MD,
+    "text/org": FileType.ORG,
     "text/x-rst": FileType.RST,
     "application/epub": FileType.EPUB,
     "application/epub+zip": FileType.EPUB,
@@ -161,6 +163,7 @@ def __lt__(self, other):
     ".htm": FileType.HTML,
     ".html": FileType.HTML,
     ".md": FileType.MD,
+    ".org": FileType.ORG,
     ".rst": FileType.RST,
     ".xlsx": FileType.XLSX,
     ".pptx": FileType.PPTX,
@@ -289,7 +292,7 @@ def detect_filetype(
         if file and _check_eml_from_buffer(file=file) is True:
             return FileType.EML
 
-        if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]:
+        if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".tsv", ".json"]:
             return EXT_TO_FILETYPE.get(extension)
 
         # Safety catch
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -23,6 +23,7 @@
 from unstructured.partition.md import partition_md
 from unstructured.partition.msg import partition_msg
 from unstructured.partition.odt import partition_odt
+from unstructured.partition.org import partition_org
 from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.ppt import partition_ppt
 from unstructured.partition.pptx import partition_pptx
@@ -154,6 +155,13 @@ def partition(
             include_page_breaks=include_page_breaks,
             **kwargs,
         )
+    elif filetype == FileType.ORG:
+        elements = partition_org(
+            filename=filename,
+            file=file,
+            include_page_breaks=include_page_breaks,
+            **kwargs,
+        )
     elif filetype == FileType.RST:
         elements = partition_rst(
             filename=filename,
diff --git a/unstructured/partition/org.py b/unstructured/partition/org.py
@@ -0,0 +1,31 @@
+from typing import IO, List, Optional
+
+from unstructured.documents.elements import Element
+from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
+from unstructured.partition.html import convert_and_partition_html
+
+
+@add_metadata_with_filetype(FileType.ORG)
+def partition_org(
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+    include_page_breaks: bool = False,
+) -> List[Element]:
+    """Partitions an org document. The document is first converted to HTML and then
+    partitioned using partition_html.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+    include_page_breaks
+        If True, the output will include page breaks if the filetype supports it
+    """
+    return convert_and_partition_html(
+        source_format="org",
+        filename=filename,
+        file=file,
+        include_page_breaks=include_page_breaks,
+    )

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.7.8" # pragma: no cover`
	`1`	`+__version__ = "0.7.9-dev0" # pragma: no cover`