Skip to content

Commit 88051e4

Browse files
Merge branch 'main' into nli/performance_improvements
2 parents d5e088d + bcc5200 commit 88051e4

File tree

6 files changed: +148 additions, -15 deletions

docling_eval/dataset_builders/dataset_builder.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,8 @@
44
import sys
55
from abc import abstractmethod
66
from pathlib import Path
7-
from typing import Iterable, Optional, Union
7+
from typing import TYPE_CHECKING, Iterable, Optional, Union
88

9-
import ibm_boto3 # type: ignore
109
from docling.utils.utils import chunkify
1110
from docling_core.types.doc.document import ImageRefMode
1211
from huggingface_hub import snapshot_download
@@ -29,6 +28,14 @@
2928
# Get logger
3029
_log = logging.getLogger(__name__)
3130

31+
if TYPE_CHECKING:
32+
try:
33+
from ibm_boto3 import client as IbmBoto3Client # type: ignore
34+
from ibm_boto3 import resource as IbmBoto3Resource # type: ignore
35+
except ImportError:
36+
IbmBoto3Client = object
37+
IbmBoto3Resource = object
38+
3239

3340
class HFSource(BaseModel):
3441
repo_id: str
@@ -45,25 +52,48 @@ class S3Source(BaseModel):
4552
overwrite_downloads: bool = True
4653

4754
def __init__(self, **data):
55+
r""" """
56+
# Import guards
57+
try:
58+
import ibm_boto3
59+
except ImportError:
60+
raise ImportError(
61+
"ibm_boto3 package is missing. Install optional dependencies."
62+
)
63+
4864
super().__init__(**data)
4965
self._cos_resource: ibm_boto3.resource = self.initialize_s3_resource()
5066
self._cos_client: ibm_boto3.client = self.initialize_s3_client()
5167

52-
def initialize_s3_client(self) -> ibm_boto3.client:
68+
def initialize_s3_client(self) -> "IbmBoto3Client":
5369
"""Initializes boto3 client - s3 instance
5470
Returns the s3 client
5571
"""
72+
# Import guards
73+
try:
74+
import ibm_boto3
75+
except ImportError:
76+
raise ImportError(
77+
"ibm_boto3 package is missing. Install optional dependencies."
78+
)
5679
return ibm_boto3.client(
5780
"s3",
5881
endpoint_url=self.endpoint,
5982
aws_access_key_id=self.access_key,
6083
aws_secret_access_key=self.secret_key,
6184
)
6285

63-
def initialize_s3_resource(self) -> ibm_boto3.resource:
86+
def initialize_s3_resource(self) -> "IbmBoto3Resource":
6487
"""Initializes boto3 resource - s3 instance
6588
Returns the s3 instance
6689
"""
90+
# Import guards
91+
try:
92+
import ibm_boto3
93+
except ImportError:
94+
raise ImportError(
95+
"ibm_boto3 package is missing. Install optional dependencies."
96+
)
6797

6898
return ibm_boto3.resource(
6999
"s3",

docling_eval/prediction_providers/aws_prediction_provider.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import os
55
from typing import Dict, Optional, Set, Tuple
66

7-
import boto3
87
from docling.datamodel.base_models import ConversionStatus
98
from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
109
from docling_core.types.doc.document import (
@@ -55,6 +54,13 @@ def __init__(
5554
true_labels: Optional[Set[DocItemLabel]] = None,
5655
pred_labels: Optional[Set[DocItemLabel]] = None,
5756
):
57+
# Import guards
58+
try:
59+
import boto3
60+
except ImportError:
61+
raise ImportError(
62+
"boto3 package is missing. Install optional packages dependencies."
63+
)
5864
super().__init__(
5965
do_visualization=do_visualization,
6066
ignore_missing_predictions=ignore_missing_predictions,

docling_eval/prediction_providers/azure_prediction_provider.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
from io import BytesIO
66
from typing import Dict, Optional, Set, Tuple
77

8-
from azure.ai.documentintelligence import DocumentIntelligenceClient
9-
from azure.ai.documentintelligence.models import AnalyzeOutputOption
108
from docling.datamodel.base_models import ConversionStatus
119

1210
# from docling_core.types import DoclingDocument
@@ -64,6 +62,15 @@ def __init__(
6462
true_labels: Optional[Set[DocItemLabel]] = None,
6563
pred_labels: Optional[Set[DocItemLabel]] = None,
6664
): # could be the docling converter options, the remote credentials for MS/Google, etc.
65+
66+
# Import guards
67+
try:
68+
from azure.ai.documentintelligence import DocumentIntelligenceClient
69+
except ImportError:
70+
raise ImportError(
71+
"azure.ai.documentintelligence package is missing. Install optional dependencies."
72+
)
73+
6774
super().__init__(
6875
do_visualization=do_visualization,
6976
ignore_missing_predictions=ignore_missing_predictions,
@@ -404,6 +411,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
404411
Raises:
405412
RuntimeError: If ground truth doc is not available or if mime type is unsupported
406413
"""
414+
from azure.ai.documentintelligence.models import AnalyzeOutputOption
407415

408416
status = ConversionStatus.SUCCESS
409417
result_orig = None

docling_eval/prediction_providers/google_prediction_provider.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@
2323
TextCell,
2424
)
2525
from docling_core.types.io import DocumentStream
26-
from google.cloud import documentai
27-
from google.oauth2 import service_account
2826
from google.protobuf.json_format import MessageToDict
2927

3028
from docling_eval.datamodels.dataset_record import (
@@ -458,6 +456,21 @@ def __init__(
458456
true_labels: Optional[Set[DocItemLabel]] = None,
459457
pred_labels: Optional[Set[DocItemLabel]] = None,
460458
):
459+
r""" """
460+
# Import guards
461+
try:
462+
from google.cloud import documentai
463+
except ImportError:
464+
raise ImportError(
465+
"google.cloud package is missing. Install optional dependencies."
466+
)
467+
try:
468+
from google.oauth2 import service_account
469+
except ImportError:
470+
raise ImportError(
471+
"google.oauth2 package is missing. Install optional dependencies."
472+
)
473+
461474
super().__init__(
462475
do_visualization=do_visualization,
463476
ignore_missing_predictions=ignore_missing_predictions,
@@ -857,6 +870,14 @@ def prediction_format(self) -> PredictionFormats:
857870
def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
858871
"""For the given document stream (single document), run the API and create the doclingDocument."""
859872

873+
# Import guards
874+
try:
875+
from google.cloud import documentai
876+
except ImportError:
877+
raise ImportError(
878+
"google.cloud package is missing. Install optional dependencies."
879+
)
880+
860881
status = ConversionStatus.SUCCESS
861882
assert record.original is not None
862883

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ hyperscalers = [
7373
'azure-core (>=1.33.0,<2.0.0)',
7474
'boto3 (>=1.37.8,<2.0.0)',
7575
'google-cloud-documentai (>=3.2.0,<4.0.0)',
76+
'google-oauth (>=1.0.1,<2.0.0)',
7677
'ibm-cos-sdk (>=2.1.40,<3.0.0)',
7778
]
7879
cvat_tools = [

0 commit comments

Comments
 (0)