|
10 | 10 | import pytesseract |
11 | 11 | from pdfminer import psparser |
12 | 12 | from pdfminer.high_level import extract_pages |
13 | | -from PIL import Image |
| 13 | +from PIL import Image, ImageSequence |
14 | 14 | from pytesseract import Output |
15 | 15 |
|
16 | 16 | from unstructured_inference.inference.elements import ( |
@@ -143,26 +143,32 @@ def from_image_file( |
143 | 143 | try: |
144 | 144 | image = Image.open(filename) |
145 | 145 | format = image.format |
146 | | - image = image.convert("RGB") |
147 | | - image.format = format |
| 146 | + images = [] |
| 147 | + for i, im in enumerate(ImageSequence.Iterator(image)): |
| 148 | + im = im.convert("RGB") |
| 149 | + im.format = format |
| 150 | + images.append(im) |
148 | 151 | except Exception as e: |
149 | 152 | if os.path.isdir(filename) or os.path.isfile(filename): |
150 | 153 | raise e |
151 | 154 | else: |
152 | 155 | raise FileNotFoundError(f'File "{filename}" not found!') from e |
153 | | - page = PageLayout.from_image( |
154 | | - image, |
155 | | - image_path=filename, |
156 | | - detection_model=detection_model, |
157 | | - element_extraction_model=element_extraction_model, |
158 | | - layout=None, |
159 | | - ocr_strategy=ocr_strategy, |
160 | | - ocr_languages=ocr_languages, |
161 | | - ocr_mode=ocr_mode, |
162 | | - fixed_layout=fixed_layout, |
163 | | - extract_tables=extract_tables, |
164 | | - ) |
165 | | - return cls.from_pages([page]) |
| 156 | + pages = [] |
| 157 | + for i, image in enumerate(images): |
| 158 | + page = PageLayout.from_image( |
| 159 | + image, |
| 160 | + image_path=filename, |
| 161 | + number=i, |
| 162 | + detection_model=detection_model, |
| 163 | + element_extraction_model=element_extraction_model, |
| 164 | + layout=None, |
| 165 | + ocr_strategy=ocr_strategy, |
| 166 | + ocr_languages=ocr_languages, |
| 167 | + fixed_layout=fixed_layout, |
| 168 | + extract_tables=extract_tables, |
| 169 | + ) |
| 170 | + pages.append(page) |
| 171 | + return cls.from_pages(pages) |
166 | 172 |
|
167 | 173 |
|
168 | 174 | class PageLayout: |
|
0 commit comments