Skip to content

Commit 752e78e

Browse files
authored
feat: partition_org for Org Mode documents (#780)
* feat: partition_org for Org Mode documents * update version
1 parent 5320aa6 commit 752e78e

File tree

11 files changed

+133
-3
lines changed

11 files changed

+133
-3
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.7.9-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
* Adds `partition_org` for processing Org Mode documents.
8+
9+
### Fixes
10+
111
## 0.7.8
212

313
### Enhancements

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ about the library.
9595
| HTML Pages (`.html`) | `partition_html` | N/A | No | Encoding; Include Page Breaks |
9696
| Images (`.png`/`.jpg`) | `partition_image` | `"auto"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
9797
| Markdown (`.md`) | `partition_md` | N/A | Yes | Include Page Breaks |
98+
| Org Mode (`.org`) | `partition_org` | N/A | Yes | Include Page Breaks |
9899
| Open Office Documents (`.odt`) | `partition_odt` | N/A | Yes | None |
99100
| PDFs (`.pdf`) | `partition_pdf` | `"auto"`, `"fast"`, `"hi_res"`, `"ocr_only"` | Yes | Encoding; Include Page Breaks; Infer Table Structure; OCR Languages, Strategy |
100101
| Plain Text (`.txt`) | `partition_text` | N/A | No | Encoding, Paragraph Grouper |

docs/source/bricks.rst

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -590,6 +590,24 @@ Examples:
590590
elements = partition_pptx(file=f)
591591
592592
593+
``partition_org``
594+
---------------------
595+
596+
The ``partition_org`` function processes Org Mode (``.org``) documents. The function
597+
first converts the document to HTML using ``pandoc`` and then calls ``partition_html``.
598+
You'll need `pandoc <https://pandoc.org/installing.html>`_ installed on your system
599+
to use ``partition_org``.
600+
601+
602+
Examples:
603+
604+
.. code:: python
605+
606+
from unstructured.partition.org import partition_org
607+
608+
elements = partition_org(filename="example-docs/README.org")
609+
610+
593611
``partition_rst``
594612
---------------------
595613

@@ -607,7 +625,6 @@ Examples:
607625
608626
elements = partition_rst(filename="example-docs/README.rst")
609627
610-
611628
``partition_rtf``
612629
---------------------
613630

example-docs/README.org

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
* Example Docs
2+
3+
The sample docs directory contains the following files:
4+
5+
- ~example-10k.html~ - A 10-K SEC filing in HTML format
6+
- ~layout-parser-paper.pdf~ - A PDF copy of the layout parser paper
7+
- ~factbook.xml~ / ~factbook.xsl~ - Example XML/XSL files that you
8+
can use to test stylesheets
9+
10+
These documents can be used to test out the parsers in the library. In
11+
addition, here are instructions for pulling in some sample docs that are
12+
too big to store in the repo.
13+
14+
** XBRL 10-K
15+
16+
You can get an example 10-K in inline XBRL format using the following
17+
~curl~. Note, you need to have the user agent set in the header or the
18+
SEC site will reject your request.
19+
20+
#+BEGIN_SRC bash
21+
22+
curl -O \
23+
-A '${organization} ${email}' \
24+
https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt
25+
#+END_SRC
26+
27+
You can parse this document using the HTML parser.

test_unstructured/file_utils/test_filetype.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
("fake-power-point.pptx", FileType.PPTX),
4646
("winter-sports.epub", FileType.EPUB),
4747
("spring-weather.html.json", FileType.JSON),
48+
("README.org", FileType.ORG),
4849
("README.rst", FileType.RST),
4950
("README.md", FileType.MD),
5051
("fake.odt", FileType.ODT),

test_unstructured/partition/test_auto.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -784,6 +784,21 @@ def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
784784
assert partition(file=f) == []
785785

786786

787+
def test_auto_partition_org_from_filename(filename="example-docs/README.org"):
788+
elements = partition(filename=filename)
789+
790+
assert elements[0] == Title("Example Docs")
791+
assert elements[0].metadata.filetype == "text/org"
792+
793+
794+
def test_auto_partition_org_from_file(filename="example-docs/README.org"):
795+
with open(filename, "rb") as f:
796+
elements = partition(file=f, content_type="text/org")
797+
798+
assert elements[0] == Title("Example Docs")
799+
assert elements[0].metadata.filetype == "text/org"
800+
801+
787802
def test_auto_partition_rst_from_filename(filename="example-docs/README.rst"):
788803
elements = partition(filename=filename)
789804

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from unstructured.documents.elements import Title
2+
from unstructured.partition.org import partition_org
3+
4+
5+
def test_partition_org_from_filename(filename="example-docs/README.org"):
6+
elements = partition_org(filename=filename)
7+
8+
assert elements[0] == Title("Example Docs")
9+
assert elements[0].metadata.filetype == "text/org"
10+
11+
12+
def test_partition_org_from_file(filename="example-docs/README.org"):
13+
with open(filename, "rb") as f:
14+
elements = partition_org(file=f)
15+
16+
assert elements[0] == Title("Example Docs")
17+
assert elements[0].metadata.filetype == "text/org"

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.8" # pragma: no cover
1+
__version__ = "0.7.9-dev0" # pragma: no cover

unstructured/file_utils/filetype.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ class FileType(Enum):
8989
MD = 52
9090
EPUB = 53
9191
RST = 54
92+
ORG = 55
9293

9394
# Compressed Types
9495
ZIP = 60
@@ -117,6 +118,7 @@ def __lt__(self, other):
117118
"text/tsv": FileType.TSV,
118119
"text/markdown": FileType.MD,
119120
"text/x-markdown": FileType.MD,
121+
"text/org": FileType.ORG,
120122
"text/x-rst": FileType.RST,
121123
"application/epub": FileType.EPUB,
122124
"application/epub+zip": FileType.EPUB,
@@ -161,6 +163,7 @@ def __lt__(self, other):
161163
".htm": FileType.HTML,
162164
".html": FileType.HTML,
163165
".md": FileType.MD,
166+
".org": FileType.ORG,
164167
".rst": FileType.RST,
165168
".xlsx": FileType.XLSX,
166169
".pptx": FileType.PPTX,
@@ -289,7 +292,7 @@ def detect_filetype(
289292
if file and _check_eml_from_buffer(file=file) is True:
290293
return FileType.EML
291294

292-
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".tsv", ".json"]:
295+
if extension in [".eml", ".md", ".rtf", ".html", ".rst", ".org", ".tsv", ".json"]:
293296
return EXT_TO_FILETYPE.get(extension)
294297

295298
# Safety catch

unstructured/partition/auto.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from unstructured.partition.md import partition_md
2424
from unstructured.partition.msg import partition_msg
2525
from unstructured.partition.odt import partition_odt
26+
from unstructured.partition.org import partition_org
2627
from unstructured.partition.pdf import partition_pdf
2728
from unstructured.partition.ppt import partition_ppt
2829
from unstructured.partition.pptx import partition_pptx
@@ -154,6 +155,13 @@ def partition(
154155
include_page_breaks=include_page_breaks,
155156
**kwargs,
156157
)
158+
elif filetype == FileType.ORG:
159+
elements = partition_org(
160+
filename=filename,
161+
file=file,
162+
include_page_breaks=include_page_breaks,
163+
**kwargs,
164+
)
157165
elif filetype == FileType.RST:
158166
elements = partition_rst(
159167
filename=filename,

0 commit comments

Comments
 (0)