Unstructured-IO
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 5 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎examples/layout_analysis/README.md‎
Lines changed: 20 additions & 0 deletions b/‎examples/layout_analysis/README.md‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎examples/layout_analysis/requirements.txt‎
Lines changed: 1 addition & 0 deletions b/‎examples/layout_analysis/requirements.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/layout_analysis/visualization.ipynb‎
Lines changed: 94 additions & 0 deletions b/‎examples/layout_analysis/visualization.ipynb‎
Lines changed: 94 additions & 0 deletions
diff --git a/‎examples/layout_analysis/visualization.py‎
Lines changed: 47 additions & 0 deletions b/‎examples/layout_analysis/visualization.py‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎examples/ocr_layout_supplement/README.md‎
Lines changed: 19 additions & 0 deletions b/‎examples/ocr_layout_supplement/README.md‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎examples/ocr_layout_supplement/ocr_layout_supplement.ipynb‎
Lines changed: 126 additions & 0 deletions b/‎examples/ocr_layout_supplement/ocr_layout_supplement.ipynb‎
Lines changed: 126 additions & 0 deletions
diff --git a/‎examples/ocr_layout_supplement/ocr_layout_supplement.py‎
Lines changed: 60 additions & 0 deletions b/‎examples/ocr_layout_supplement/ocr_layout_supplement.py‎
Lines changed: 60 additions & 0 deletions
@@ -143,3 +143,4 @@ dmypy.json
 .vscode/
 
 sample-docs/*_images
+examples/**/output
@@ -1,3 +1,8 @@
+## 0.5.19
+
+* Add functionality to supplement detected layout with elements from the full page OCR
+* Add functionality to annotate any layout (extracted, inferred, OCR) on a page
+
 ## 0.5.18
 
 * Fix for incorrect type assignation at ingest test
 
@@ -0,0 +1,20 @@
+# Analyzing Layout Elements
+
+This directory contains examples of how to analyze layout elements.
+
+## How to run
+
+Run `pip install -r requirements.txt` to install the Python dependencies.
+
+### Visualization
+- Python script (visualization.py)
+```
+PYTHONPATH=. python examples/layout_analysis/visualization.py <file_path>
+```
+For example,
+```
+PYTHONPATH=. python examples/layout_analysis/visualization.py sample-docs/loremipsum.pdf
+```
+- Jupyter Notebook (visualization.ipynb)
+  - Run `jupyter-notebook` to start.
+  - Run the `visualization.ipynb` notebook.
@@ -0,0 +1 @@
+unstructured-inference
@@ -0,0 +1,94 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from unstructured_inference.inference.layout import process_file_with_model\n",
+    "from unstructured_inference.visualize import show_plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "annotation_data_map = {\n",
+    "    \"final\": None,\n",
+    "    \"extracted\": {\"layout\": {\"color\": \"green\", \"width\": 2}},\n",
+    "    \"inferred\": {\"inferred_layout\": {\"color\": \"blue\", \"width\": 2}},\n",
+    "    \"ocr\": {\"ocr_layout\": {\"color\": \"yellow\", \"width\": 2}},\n",
+    "}"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "f_path = \"../../sample-docs/loremipsum.pdf\"\n",
+    "f_name = os.path.basename(f_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "doc = process_file_with_model(\n",
+    "    f_path,\n",
+    "    model_name=None,\n",
+    "    analysis=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for idx, page in enumerate(doc.pages):\n",
+    "    for action_type, action_value in annotation_data_map.items():\n",
+    "        img = page.annotate(annotation_data=action_value)\n",
+    "        if action_value is None:\n",
+    "            n_layout_elements = len(page.elements)\n",
+    "        else:\n",
+    "            attribute = list(action_value.keys())[0]\n",
+    "            n_layout_elements = len(getattr(page, attribute))\n",
+    "        print(f\"Filename: {f_name} - Page: {idx+1} - Layout: {action_type} - n_layout_elements: {n_layout_elements}\")\n",
+    "        show_plot(img, desired_width=14)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
@@ -0,0 +1,47 @@
+import os
+import pathlib
+import sys
+
+from unstructured_inference.inference.layout import process_file_with_model
+from unstructured_inference.utils import write_image
+
+CUR_DIR = pathlib.Path(__file__).parent.resolve()
+
+
+def run(f_path):
+    annotation_data_map = {
+        "final": None,
+        "extracted": {"layout": {"color": "green", "width": 2}},
+        "inferred": {"inferred_layout": {"color": "blue", "width": 2}},
+        "ocr": {"ocr_layout": {"color": "yellow", "width": 2}},
+    }
+
+    f_basename = os.path.splitext(os.path.basename(f_path))[0]
+    output_dir_path = os.path.join(output_basedir_path, f_basename)
+    os.makedirs(output_dir_path, exist_ok=True)
+
+    doc = process_file_with_model(
+        f_path,
+        model_name=None,
+        analysis=True,
+    )
+
+    for idx, page in enumerate(doc.pages):
+        for action_type, action_value in annotation_data_map.items():
+            img = page.annotate(annotation_data=action_value)
+            output_f_path = os.path.join(output_dir_path, f"{f_basename}_{idx+1}_{action_type}.jpg")
+            write_image(img, output_f_path)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print(
+            "Please provide the path to the file name as the first argument and the strategy as the "
+            "second argument.",
+        )
+        sys.exit(1)
+
+    output_basedir_path = os.path.join(CUR_DIR, "output")
+    os.makedirs(output_basedir_path, exist_ok=True)
+
+    run(f_path=sys.argv[1])
@@ -0,0 +1,19 @@
+# Supplementing detected layout with elements from the full-page OCR
+
+This directory contains examples of how to supplement the detected layout with elements from the full-page OCR.
+
+## Running the example
+
+Run `pip install -r requirements.txt` to install the Python dependencies.
+
+### Running python script
+```
+PYTHONPATH=. python examples/ocr_layout_supplement/ocr_layout_supplement.py <file_path> <file_type>
+```
+For example,
+```
+PYTHONPATH=. python examples/ocr_layout_supplement/ocr_layout_supplement.py sample-docs/patent-1p.pdf pdf
+```
+### Running jupyter notebook
+  - Run `jupyter-notebook` to start.
+  - Run the `ocr_layout_supplement.ipynb` notebook.
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "from unstructured_inference.constants import AnnotationResult\n",
+    "from unstructured_inference.inference.layout import process_file_with_model\n",
+    "from unstructured_inference.utils import annotate_layout_elements"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "output_basedir_path = \"output\""
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "def run(f_path, f_type):\n",
+    "    if f_type == \"pdf\":\n",
+    "        is_image = False\n",
+    "    elif f_type == \"image\":\n",
+    "        is_image = True\n",
+    "    else:\n",
+    "        print(\"Invalid file type.\")\n",
+    "        sys.exit(1)\n",
+    "\n",
+    "    annotation_data_map = {\n",
+    "        \"final\": None,\n",
+    "    }\n",
+    "    actions = [False, True]\n",
+    "    for action in actions:\n",
+    "        _f_basename = os.path.splitext(os.path.basename(f_path))[0]\n",
+    "        output_dir_path = os.path.join(output_basedir_path, f\"{_f_basename}_{f_type}\")\n",
+    "        os.makedirs(output_dir_path, exist_ok=True)\n",
+    "\n",
+    "        f_basename = f\"updated_{_f_basename}\" if action else f\"original_{_f_basename}\"\n",
+    "\n",
+    "        label = \"Updated Results: \" if action else \"Original Results: \"\n",
+    "        print(label)\n",
+    "\n",
+    "        doc = process_file_with_model(\n",
+    "            f_path,\n",
+    "            is_image=is_image,\n",
+    "            model_name=None,\n",
+    "            supplement_with_ocr_elements=action,\n",
+    "            analysis=True,\n",
+    "        )\n",
+    "\n",
+    "        annotate_layout_elements(doc, annotation_data_map, output_dir_path, f_basename, AnnotationResult.PLOT)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "file_path = \"../../sample-docs/patent-1p.pdf\"\n",
+    "file_type = \"pdf\"\n",
+    "f_name = os.path.basename(file_path)\n",
+    "print(f\"file_name: {f_name} - file_type: {file_type}\")\n",
+    "\n",
+    "run(file_path, file_type)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "file_path = \"../../sample-docs/layout-parser-paper-fast.jpg\"\n",
+    "file_type = \"image\"\n",
+    "f_name = os.path.basename(file_path)\n",
+    "print(f\"file_name: {f_name} - file_type: {file_type}\")\n",
+    "\n",
+    "run(file_path, file_type)"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
@@ -0,0 +1,60 @@
+import os
+import pathlib
+import sys
+
+from unstructured_inference.constants import AnnotationResult
+from unstructured_inference.inference.layout import process_file_with_model
+from unstructured_inference.utils import annotate_layout_elements
+
+CUR_DIR = pathlib.Path(__file__).parent.resolve()
+
+
+def run(f_path, file_type):
+    print(">>> Start...")
+    print(f">>> file_path: {f_path} - file_type: {file_type}")
+
+    if file_type == "pdf":
+        is_image = False
+    elif file_type == "image":
+        is_image = True
+    else:
+        print("Invalid file type.")
+        sys.exit(1)
+
+    annotation_data_map = {
+        "final": None,
+    }
+
+    actions = [False, True]
+    for action in actions:
+        _f_basename = os.path.splitext(os.path.basename(f_path))[0]
+        output_dir_path = os.path.join(output_basedir_path, f"{_f_basename}_{file_type}")
+        os.makedirs(output_dir_path, exist_ok=True)
+
+        f_basename = f"updated_{_f_basename}" if action else f"original_{_f_basename}"
+
+        doc = process_file_with_model(
+            f_path,
+            is_image=is_image,
+            model_name=None,
+            supplement_with_ocr_elements=action,
+            analysis=True,
+        )
+
+        annotate_layout_elements(doc, annotation_data_map, output_dir_path, f_basename, AnnotationResult.IMAGE)
+
+    print("<<< Finished")
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 3:
+        print(
+            "Please provide the path to the file name as the first argument and the strategy as the "
+            "second argument.",
+        )
+        sys.exit(1)
+
+    output_basedir_path = os.path.join(CUR_DIR, "output")
+    os.makedirs(output_basedir_path, exist_ok=True)
+
+    run(f_path=sys.argv[1], file_type=sys.argv[2])
Original file line number	Diff line number	Diff line change
`@@ -143,3 +143,4 @@ dmypy.json`
`143`	`143`	`.vscode/`
`144`	`144`
`145`	`145`	`sample-docs/*_images`
	`146`	`+examples/**/output`