44import traceback
55from abc import ABC , abstractmethod
66from collections .abc import Iterable
7- from typing import Any , Callable , List
7+ from pathlib import Path
8+ from typing import Any , Callable , List , Optional
89
910from docling_core .types .doc import NodeItem
1011
2021 Page ,
2122)
2223from docling .datamodel .document import ConversionResult , InputDocument
23- from docling .datamodel .pipeline_options import PdfPipelineOptions , PipelineOptions
24+ from docling .datamodel .pipeline_options import (
25+ ConvertPipelineOptions ,
26+ PdfPipelineOptions ,
27+ PipelineOptions ,
28+ )
2429from docling .datamodel .settings import settings
2530from docling .models .base_model import GenericEnrichmentModel
31+ from docling .models .document_picture_classifier import (
32+ DocumentPictureClassifier ,
33+ DocumentPictureClassifierOptions ,
34+ )
35+ from docling .models .factories import get_picture_description_factory
36+ from docling .models .picture_description_base_model import PictureDescriptionBaseModel
2637from docling .utils .profiling import ProfilingScope , TimeRecorder
2738from docling .utils .utils import chunkify
2839
@@ -36,6 +47,18 @@ def __init__(self, pipeline_options: PipelineOptions):
3647 self .build_pipe : List [Callable ] = []
3748 self .enrichment_pipe : List [GenericEnrichmentModel [Any ]] = []
3849
50+ self .artifacts_path : Optional [Path ] = None
51+ if pipeline_options .artifacts_path is not None :
52+ self .artifacts_path = Path (pipeline_options .artifacts_path ).expanduser ()
53+ elif settings .artifacts_path is not None :
54+ self .artifacts_path = Path (settings .artifacts_path ).expanduser ()
55+
56+ if self .artifacts_path is not None and not self .artifacts_path .is_dir ():
57+ raise RuntimeError (
58+ f"The value of { self .artifacts_path = } is not valid. "
59+ "When defined, it must point to a folder containing all models required by the pipeline."
60+ )
61+
3962 def execute (self , in_doc : InputDocument , raises_on_error : bool ) -> ConversionResult :
4063 conv_res = ConversionResult (input = in_doc )
4164
@@ -108,15 +131,58 @@ def get_default_options(cls) -> PipelineOptions:
108131 def is_backend_supported (cls , backend : AbstractDocumentBackend ):
109132 pass
110133
111- # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
112- # for model in self.build_pipe:
113- # element_batch = model(element_batch)
114- #
115- # yield from element_batch
116134
class ConvertPipeline(BasePipeline):
    """Pipeline base that installs the enrichment models common to all backends.

    Construction populates ``enrichment_pipe`` with two models, in order:
    a ``DocumentPictureClassifier`` and the picture description model
    obtained from the picture description factory.
    """

    def __init__(self, pipeline_options: ConvertPipelineOptions):
        """Initialise the base pipeline and build the shared enrichment pipe.

        Args:
            pipeline_options: Options controlling picture classification,
                picture description, accelerator use and plugin loading.

        Raises:
            RuntimeError: If the factory yields no picture description model
                for ``pipeline_options.picture_description_options``.
        """
        super().__init__(pipeline_options)
        # Bare annotation: narrows the declared attribute type and assigns
        # nothing. NOTE(review): the attribute is presumably set by
        # BasePipeline.__init__, which is not fully visible here.
        self.pipeline_options: ConvertPipelineOptions

        # ------ Common enrichment models working on all backends

        # Resolve the picture description model first so an unsupported kind
        # fails before any other model is instantiated.
        description_model = self._get_picture_description_model(
            artifacts_path=self.artifacts_path
        )
        if description_model is None:
            raise RuntimeError(
                f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
            )

        picture_classifier = DocumentPictureClassifier(
            enabled=pipeline_options.do_picture_classification,
            artifacts_path=self.artifacts_path,
            options=DocumentPictureClassifierOptions(),
            accelerator_options=pipeline_options.accelerator_options,
        )

        # Order matters to callers iterating the pipe: classifier, then description.
        self.enrichment_pipe = [picture_classifier, description_model]

    def _get_picture_description_model(
        self, artifacts_path: Optional[Path] = None
    ) -> Optional[PictureDescriptionBaseModel]:
        """Create the picture description model via the factory.

        Args:
            artifacts_path: Optional folder forwarded unchanged to the factory.

        Returns:
            Whatever ``factory.create_instance`` returns for the configured
            picture description options; callers treat ``None`` as "kind not
            supported".
        """
        opts = self.pipeline_options
        factory = get_picture_description_factory(
            allow_external_plugins=opts.allow_external_plugins
        )
        model = factory.create_instance(
            options=opts.picture_description_options,
            enabled=opts.do_picture_description,
            enable_remote_services=opts.enable_remote_services,
            artifacts_path=artifacts_path,
            accelerator_options=opts.accelerator_options,
        )
        return model

    @classmethod
    @abstractmethod
    def get_default_options(cls) -> ConvertPipelineOptions:
        """Return the default options for this pipeline (abstract; subclasses implement)."""
182+
183+
184+ class PaginatedPipeline (ConvertPipeline ): # TODO this is a bad name.
185+ def __init__ (self , pipeline_options : ConvertPipelineOptions ):
120186 super ().__init__ (pipeline_options )
121187 self .keep_backend = False
122188
0 commit comments