From 8e71ef6a1c3dfdc409432a20a1b79b835a02b0fe Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 2 Jun 2025 18:50:55 -0500 Subject: [PATCH 1/8] feat: use thread lock to prevent race condition --- CHANGELOG.md | 4 ++++ unstructured_inference/__version__.py | 2 +- unstructured_inference/models/base.py | 8 ++++++-- unstructured_inference/models/tables.py | 6 +++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4f991bb..0f90bfd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.0.5 + +* feat: add thread lock to prevent race condition when instantiating singletons + ## 1.0.4 * feat: use singleton instead of `global` to store shared variables diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 18934c58..f7b35997 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "1.0.4" # pragma: no cover +__version__ = "1.0.5" # pragma: no cover diff --git a/unstructured_inference/models/base.py b/unstructured_inference/models/base.py index 37344100..a393df7e 100644 --- a/unstructured_inference/models/base.py +++ b/unstructured_inference/models/base.py @@ -2,6 +2,7 @@ import json import os +import threading from typing import Dict, Optional, Tuple, Type from unstructured_inference.models.detectron2onnx import ( @@ -18,12 +19,15 @@ class Models(object): _instance = None + _lock = threading.Lock() def __new__(cls): """return an instance if one already exists otherwise create an instance""" if cls._instance is None: - cls._instance = super(Models, cls).__new__(cls) - cls.models: Dict[str, UnstructuredModel] = {} + with cls._lock: + if cls._instance is None: + cls._instance = super(Models, cls).__new__(cls) + cls.models: Dict[str, UnstructuredModel] = {} return cls._instance def __contains__(self, key): diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index
9760dfc4..05ab3bf9 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -1,5 +1,6 @@ # https://github.com/microsoft/table-transformer/blob/main/src/inference.py # https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Table%20Transformer/Using_Table_Transformer_for_table_detection_and_table_structure_recognition.ipynb +import threading import xml.etree.ElementTree as ET from collections import defaultdict from pathlib import Path @@ -28,6 +29,7 @@ class UnstructuredTableTransformerModel(UnstructuredModel): """Unstructured model wrapper for table-transformer.""" _instance = None + _lock = threading.Lock() def __init__(self): pass @@ -36,7 +38,9 @@ def __init__(self): def instance(cls): """return an instance if one already exists otherwise create an instance""" if cls._instance is None: - cls._instance = cls.__new__(cls) + with cls._lock: + if cls._instance is None: + cls._instance = cls.__new__(cls) return cls._instance def predict( From f0730c43d5eabaf4280874fcf3444f9e4a2672ce Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 3 Jun 2025 08:44:00 -0500 Subject: [PATCH 2/8] put initialize at new and behind threadlock --- unstructured_inference/models/tables.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index 05ab3bf9..af3527a0 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -31,16 +31,13 @@ class UnstructuredTableTransformerModel(UnstructuredModel): _instance = None _lock = threading.Lock() - def __init__(self): - pass - - @classmethod - def instance(cls): + def __new__(cls): """return an instance if one already exists otherwise create an instance""" if cls._instance is None: with cls._lock: if cls._instance is None: - cls._instance = cls.__new__(cls) + cls._instance = super(UnstructuredTableTransformerModel, cls).__new__(cls) + 
cls._instance.initialize("microsoft/table-transformer-structure-recognition") return cls._instance def predict( @@ -149,15 +146,17 @@ def run_prediction( return prediction -tables_agent: UnstructuredTableTransformerModel = UnstructuredTableTransformerModel.instance() +tables_agent: UnstructuredTableTransformerModel = UnstructuredTableTransformerModel() def load_agent(): """Loads the Table agent.""" if not hasattr(tables_agent, "model"): - logger.info("Loading the Table agent ...") - tables_agent.initialize("microsoft/table-transformer-structure-recognition") + with tables_agent._lock: + if not hasattr(tables_agent, "model"): + logger.info("Loading the Table agent ...") + tables_agent.initialize("microsoft/table-transformer-structure-recognition") return From 3ef4c5f8bed1d93d013df1442e7df380ecfea8dd Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 3 Jun 2025 08:56:02 -0500 Subject: [PATCH 3/8] put image processor also on self.device --- unstructured_inference/models/tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index af3527a0..caf880fa 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -71,7 +71,7 @@ def initialize( ): """Loads the donut model using the specified parameters""" self.device = device - self.feature_extractor = DetrImageProcessor.from_pretrained(model) + self.feature_extractor = DetrImageProcessor.from_pretrained(model, device_map=self.device) # value not set in the configuration and needed for newer models # https://huggingface.co/microsoft/table-transformer-structure-recognition-v1.1-all/discussions/1 self.feature_extractor.size["shortest_edge"] = 800 From 90ed5e8371d013aafba15596d5ca8584dc17b4ea Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 3 Jun 2025 09:01:29 -0500 Subject: [PATCH 4/8] pass in model name to initialization --- unstructured_inference/models/tables.py | 8 +++++--- 1 file 
changed, 5 insertions(+), 3 deletions(-) diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index caf880fa..2f018a1d 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -24,6 +24,8 @@ from . import table_postprocess as postprocess +DEFAULT_MODEL = "microsoft/table-transformer-structure-recognition" + class UnstructuredTableTransformerModel(UnstructuredModel): """Unstructured model wrapper for table-transformer.""" @@ -31,13 +33,13 @@ class UnstructuredTableTransformerModel(UnstructuredModel): _instance = None _lock = threading.Lock() - def __new__(cls): + def __new__(cls, model: str = DEFAULT_MODEL): """return an instance if one already exists otherwise create an instance""" if cls._instance is None: with cls._lock: if cls._instance is None: cls._instance = super(UnstructuredTableTransformerModel, cls).__new__(cls) - cls._instance.initialize("microsoft/table-transformer-structure-recognition") + cls._instance.initialize(model) return cls._instance def predict( @@ -156,7 +158,7 @@ def load_agent(): with tables_agent._lock: if not hasattr(tables_agent, "model"): logger.info("Loading the Table agent ...") - tables_agent.initialize("microsoft/table-transformer-structure-recognition") + tables_agent.initialize(DEFAULT_MODEL) return From b1289685272147883c77b536fa8c4dbe3fd998b5 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 3 Jun 2025 09:15:50 -0500 Subject: [PATCH 5/8] add test --- .../models/test_tables.py | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py index 03c9b1fd..9d67fa02 100644 --- a/test_unstructured_inference/models/test_tables.py +++ b/test_unstructured_inference/models/test_tables.py @@ -1,4 +1,6 @@ import os +import threading +from copy import deepcopy import numpy as np import pytest @@ -7,7 +9,6 @@ from 
transformers.models.table_transformer.modeling_table_transformer import ( TableTransformerDecoder, ) -from copy import deepcopy import unstructured_inference.models.table_postprocess as postprocess from unstructured_inference.models import tables @@ -572,7 +573,7 @@ def test_load_table_model_raises_when_not_available(model_path): @pytest.mark.parametrize( - "bbox1, bbox2, expected_result", + ("bbox1", "bbox2", "expected_result"), [ ((0, 0, 5, 5), (2, 2, 7, 7), 0.36), ((0, 0, 0, 0), (6, 6, 10, 10), 0), @@ -921,7 +922,9 @@ def test_table_prediction_output_format( ) if output_format: result = table_transformer.run_prediction( - example_image, result_format=output_format, ocr_tokens=mocked_ocr_tokens + example_image, + result_format=output_format, + ocr_tokens=mocked_ocr_tokens, ) else: result = table_transformer.run_prediction(example_image, ocr_tokens=mocked_ocr_tokens) @@ -952,7 +955,9 @@ def test_table_prediction_output_format_when_wrong_type_then_value_error( ) with pytest.raises(ValueError): table_transformer.run_prediction( - example_image, result_format="Wrong format", ocr_tokens=mocked_ocr_tokens + example_image, + result_format="Wrong format", + ocr_tokens=mocked_ocr_tokens, ) @@ -991,7 +996,8 @@ def test_table_prediction_with_no_ocr_tokens(table_transformer, example_image): ], ) def test_objects_are_filtered_based_on_class_thresholds_when_correct_prediction_and_threshold( - thresholds, expected_object_number + thresholds, + expected_object_number, ): objects = [ {"label": "0", "score": 0.2}, @@ -1010,7 +1016,8 @@ def test_objects_are_filtered_based_on_class_thresholds_when_correct_prediction_ ], ) def test_objects_are_filtered_based_on_class_thresholds_when_two_classes( - thresholds, expected_object_number + thresholds, + expected_object_number, ): objects = [ {"label": "0", "score": 0.2}, @@ -1800,7 +1807,7 @@ def test_compute_confidence_score_zero_division_error_handling(): @pytest.mark.parametrize( - "column_span_score, row_span_score, 
expected_text_to_indexes", + ("column_span_score", "row_span_score", "expected_text_to_indexes"), [ ( 0.9, @@ -1827,7 +1834,9 @@ def test_compute_confidence_score_zero_division_error_handling(): ], ) def test_subcells_filtering_when_overlapping_spanning_cells( - column_span_score, row_span_score, expected_text_to_indexes + column_span_score, + row_span_score, + expected_text_to_indexes, ): """ # table @@ -1894,3 +1903,14 @@ def test_subcells_filtering_when_overlapping_spanning_cells( predicted_cells_after_reorder, _ = structure_to_cells(saved_table_structure, tokens=tokens) assert predicted_cells_after_reorder == predicted_cells + + +def test_model_init_is_thread_safe(): + threads = [] + for i in range(5): + thread = threading.Thread(target=tables.load_agent) + threads.append(thread) + thread.start() + + for thread in threads: + thread.join() From a7d57044b87fcb23bf6d4aa82f7e5c6622ab27b6 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 3 Jun 2025 09:31:44 -0500 Subject: [PATCH 6/8] back off from initialize at new to ensure compatibility --- unstructured_inference/models/tables.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index 2f018a1d..f324a158 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -33,13 +33,12 @@ class UnstructuredTableTransformerModel(UnstructuredModel): _instance = None _lock = threading.Lock() - def __new__(cls, model: str = DEFAULT_MODEL): + def __new__(cls): """return an instance if one already exists otherwise create an instance""" if cls._instance is None: with cls._lock: if cls._instance is None: cls._instance = super(UnstructuredTableTransformerModel, cls).__new__(cls) - cls._instance.initialize(model) return cls._instance def predict( From 6d901d75e4a954fcb7997d81ea947d96c66e3906 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 3 Jun 2025 10:17:00 -0500 Subject: [PATCH 7/8] update
load agent condition with new `__new__` method --- unstructured_inference/models/tables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index f324a158..bb8675ba 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -153,9 +153,9 @@ def run_prediction( def load_agent(): """Loads the Table agent.""" - if not hasattr(tables_agent, "model"): + if getattr(tables_agent, "model", None) is None: with tables_agent._lock: - if not hasattr(tables_agent, "model"): + if getattr(tables_agent, "model", None) is None: logger.info("Loading the Table agent ...") tables_agent.initialize(DEFAULT_MODEL) From d56fa003fe342fdc855dd994b1bea4516faea438 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 3 Jun 2025 10:21:39 -0500 Subject: [PATCH 8/8] update test --- test_unstructured_inference/models/test_tables.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py index 9d67fa02..65a71729 100644 --- a/test_unstructured_inference/models/test_tables.py +++ b/test_unstructured_inference/models/test_tables.py @@ -1907,6 +1907,7 @@ def test_subcells_filtering_when_overlapping_spanning_cells( def test_model_init_is_thread_safe(): threads = [] + tables.tables_agent.model = None for i in range(5): thread = threading.Thread(target=tables.load_agent) threads.append(thread) @@ -1914,3 +1915,5 @@ def test_model_init_is_thread_safe(): for thread in threads: thread.join() + + assert tables.tables_agent.model is not None