Chore: fix bug caused by page breaks having no page number (#196)

yuming-long · web-flow · commit d4719496c8bc · 2023-08-22T17:09:31.000-04:00
* fix page break
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.0.38-dev0
+
+* Fix page break has None page number bug
+
 ## 0.0.37
 
 * Bump unstructured to 0.10.4
diff --git a/README.md b/README.md
@@ -115,7 +115,7 @@ To extract the table structure from PDF files using the `hi_res` strategy, ensur
 #### Skip Table Extraction
 
 Currently, we provide support for enabling and disabling table extraction for file types other than PDF files. Set parameter `skip_infer_table_types` to specify the document types that you want to skip table extraction with. By default, we skip table extraction
-for PDFs and Images, which are `pdf`, `jpg` and `png`. Again, please note that table extraction only works with `hi_res` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to `skip_infer_table_types`with:
+for PDFs and Images, which are `pdf`, `jpg` and `png`. Again, please note that table extraction only works with `hi_res` strategy. For example, if you don't want to skip table extraction for images, you can pass an empty value to `skip_infer_table_types` with:
 
 ```
  curl -X 'POST' \
@@ -124,7 +124,7 @@ for PDFs and Images, which are `pdf`, `jpg` and `png`. Again, please note that t
   -H 'Content-Type: multipart/form-data' \
   -F 'files=@sample-docs/layout-parser-paper-with-table.jpg' \
   -F 'strategy=hi_res' \
-  -F 'skip_infer_table_types=' \
+  -F 'skip_infer_table_types=[]' \
   | jq -C . | less -R
 ```
 
diff --git a/pipeline-notebooks/pipeline-general.ipynb b/pipeline-notebooks/pipeline-general.ipynb
@@ -673,7 +673,9 @@
     "\n",
     "    # We need to account for the original page numbers\n",
     "    for element in elements:\n",
-    "        element.metadata.page_number += page_offset\n",
+    "        if element.metadata.page_number:\n",
+    "            # Page number could be None if we include page breaks\n",
+    "            element.metadata.page_number += page_offset\n",
     "\n",
     "    return elements\n",
     "\n",
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -144,7 +144,9 @@ def partition_file_via_api(file_tuple, request, filename, content_type, **partit
 
     # We need to account for the original page numbers
     for element in elements:
-        element.metadata.page_number += page_offset
+        if element.metadata.page_number:
+            # Page number could be None if we include page breaks
+            element.metadata.page_number += page_offset
 
     return elements
 
@@ -484,7 +486,7 @@ def return_content_type(filename):
 
 
 @router.post("/general/v0/general")
-@router.post("/general/v0.0.37/general")
+@router.post("/general/v0.0.38/general")
 def pipeline_1(
     request: Request,
     gz_uncompressed_content_type: Optional[str] = Form(default=None),
diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml
@@ -1,2 +1,2 @@
 name: general
-version: 0.0.37
+version: 0.0.38

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`name: general`
`2`		`-version: 0.0.37`
	`2`	`+version: 0.0.38`