diff --git a/CHANGELOG.md b/CHANGELOG.md index 09ca336d..a6601ccb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.8.11 + +* feat: remove `donut` model + ## 0.8.10 * feat: unpin `numpy` and bump minimum for `onnxruntime` to be compatible with `numpy>=2` diff --git a/test_unstructured_inference/models/test_donut.py b/test_unstructured_inference/models/test_donut.py deleted file mode 100644 index 73861717..00000000 --- a/test_unstructured_inference/models/test_donut.py +++ /dev/null @@ -1,79 +0,0 @@ -import pytest -from PIL import Image -from transformers import DonutSwinModel - -from unstructured_inference.models import donut - - -@pytest.mark.parametrize( - ("model_path", "processor_path", "config_path"), - [ - ("crispy_donut_path", "crispy_proc", "crispy_config"), - ("cherry_donut_path", "cherry_proc", "cherry_config"), - ], -) -def test_load_donut_model_raises_when_not_available(model_path, processor_path, config_path): - with pytest.raises(ImportError): - donut_model = donut.UnstructuredDonutModel() - donut_model.initialize( - model=model_path, - processor=processor_path, - config=config_path, - task_prompt="", - ) - - -@pytest.mark.parametrize( - ("model_path", "processor_path", "config_path"), - [ - ( - "unstructuredio/donut-base-sroie", - "unstructuredio/donut-base-sroie", - "unstructuredio/donut-base-sroie", - ), - ], -) -def test_load_donut_model(model_path, processor_path, config_path): - donut_model = donut.UnstructuredDonutModel() - donut_model.initialize( - model=model_path, - processor=processor_path, - config=config_path, - task_prompt="", - ) - assert type(donut_model.model.encoder) is DonutSwinModel - - -@pytest.fixture() -def sample_receipt_transcript(): - return { - "total": "46.00", - "date": "20/03/2018", - "company": "UROKO JAPANESE CUISINE SDN BHD", - "address": "22A-1, JALAN 17/54, SECTION 17, 46400 PETALING JAYA, SELANGOR.", - } - - -@pytest.mark.skip() -@pytest.mark.parametrize( - ("model_path", "processor_path", "config_path"), - [ - ( - "unstructuredio/donut-base-sroie", - "unstructuredio/donut-base-sroie", - "unstructuredio/donut-base-sroie", - ), - ], -) -def test_donut_prediction(model_path, processor_path, config_path, sample_receipt_transcript): - donut_model = donut.UnstructuredDonutModel() - donut_model.initialize( - model=model_path, - processor=processor_path, - config=config_path, - task_prompt="", - ) - image_path = "./sample-docs/receipt-sample.jpg" - with Image.open(image_path) as image: - prediction = donut_model.predict(image) - assert prediction == sample_receipt_transcript diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 4ab59b42..e79b6c5a 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.8.10" # pragma: no cover +__version__ = "0.8.11" # pragma: no cover diff --git a/unstructured_inference/models/donut.py b/unstructured_inference/models/donut.py deleted file mode 100644 index bc60d2c6..00000000 --- a/unstructured_inference/models/donut.py +++ /dev/null @@ -1,79 +0,0 @@ -import logging -from pathlib import Path -from typing import Optional, Union - -import torch -from PIL import Image as PILImage -from transformers import ( - DonutProcessor, - VisionEncoderDecoderConfig, - VisionEncoderDecoderModel, -) - -from unstructured_inference.models.unstructuredmodel import UnstructuredModel - - -class UnstructuredDonutModel(UnstructuredModel): - """Unstructured model wrapper for Donut image transformer.""" - - def predict(self, x: PILImage.Image): - """Make prediction using donut model""" - super().predict(x) - return self.run_prediction(x) - - def initialize( - self, - model: Union[str, Path, VisionEncoderDecoderModel] = None, - processor: Union[str, Path, DonutProcessor] = None, - config: Optional[Union[str, Path, VisionEncoderDecoderConfig]] = None, - task_prompt: Optional[str] = "", - device: Optional[str] = "cuda" if torch.cuda.is_available() else "cpu", - ): - """Loads the donut model using the specified parameters""" - - self.task_prompt = task_prompt - self.device = device - - try: - if not isinstance(config, VisionEncoderDecoderModel): - config = VisionEncoderDecoderConfig.from_pretrained(config) - - logging.info("Loading the Donut model and processor...") - self.processor = DonutProcessor.from_pretrained(processor) - self.model = VisionEncoderDecoderModel.from_pretrained(model, config=config) - - except EnvironmentError: - logging.critical("Failed to initialize the model.") - logging.critical( - "Ensure that the Donut parameters config, model and processor are correct", - ) - raise ImportError("Review the parameters to initialize a UnstructuredDonutModel obj") - self.model.to(device) - - def run_prediction(self, x: PILImage.Image): - """Internal prediction method.""" - pixel_values = self.processor(x, return_tensors="pt").pixel_values - decoder_input_ids = self.processor.tokenizer( - self.task_prompt, - add_special_tokens=False, - return_tensors="pt", - ).input_ids - outputs = self.model.generate( - pixel_values.to(self.device), - decoder_input_ids=decoder_input_ids.to(self.device), - max_length=self.model.decoder.config.max_position_embeddings, - early_stopping=True, - pad_token_id=self.processor.tokenizer.pad_token_id, - eos_token_id=self.processor.tokenizer.eos_token_id, - use_cache=True, - num_beams=1, - bad_words_ids=[[self.processor.tokenizer.unk_token_id]], - return_dict_in_generate=True, - ) - prediction = self.processor.batch_decode(outputs.sequences)[0] - # NOTE(alan): As of right now I think this would not work if passed in as the model to - # DocumentLayout.from_file and similar functions that take a model object as input. This - # produces image-to-text inferences rather than image-to-bboxes, so we actually need to - # hook it up in a different way. - prediction = self.processor.token2json(prediction) - return prediction