epam · paknikolai · Jun 27, 2025 · Jun 30, 2025 · Jul 1, 2025 · Jul 1, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -58,7 +58,7 @@ FROM builder AS builder_download_model
 COPY download_model.py .
 
 # Model: https://huggingface.co/epam/bge-small-en
-RUN python download_model.py "epam/bge-small-en" "$BGE_EMBEDDINGS_MODEL_PATH" "openvino" "torch"
+RUN python download_model.py embeddings "epam/bge-small-en" "$BGE_EMBEDDINGS_MODEL_PATH" "openvino" "torch"
 
 
 FROM builder AS builder_repo_digest

@@ -0,0 +1,39 @@
+# Base image to extend. Override with --build-arg BASE_IMAGE_NAME=<image>:<tag>;
+# pinning a specific tag or digest instead of the default keeps builds reproducible.
+ARG BASE_IMAGE_NAME=epam/ai-dial-rag:latest
+
+# Stage 1: download the ColPali model on top of the base image
+FROM ${BASE_IMAGE_NAME} AS colpali_downloader
+
+# Base path handed to download_model.py as the download target
+ENV COLPALI_MODELS_BASE_PATH=/colpali_models
+
+# Model to download. This value is only needed at build time: an ARG is visible
+# to the RUN instructions of this stage as an environment variable, so it does
+# not have to be promoted to ENV.
+ARG COLPALI_MODEL_NAME=vidore/colSmol-256M
+
+# Switch to root user for model downloads
+USER root
+
+# Copy only the files needed by the ColPali download step
+COPY aidial_rag/__init__.py aidial_rag/
+COPY aidial_rag/retrievers/__init__.py aidial_rag/retrievers/
+COPY aidial_rag/retrievers/colpali_retriever/__init__.py aidial_rag/retrievers/colpali_retriever/
+COPY aidial_rag/retrievers/colpali_retriever/colpali_models.py aidial_rag/retrievers/colpali_retriever/
+COPY download_model.py ./
+
+# Download the specified ColPali model
+RUN python download_model.py colpali "$COLPALI_MODELS_BASE_PATH" "$COLPALI_MODEL_NAME"
+
+# Stage 2: final image = base image + downloaded model
+FROM ${BASE_IMAGE_NAME}
+
+# Keep the model location in the environment of the final image
+ENV COLPALI_MODELS_BASE_PATH=/colpali_models
+
+# Copy the downloaded ColPali model from the downloader stage, owned by appuser
+COPY --from=colpali_downloader --chown=appuser "$COLPALI_MODELS_BASE_PATH" "$COLPALI_MODELS_BASE_PATH"
+
+# Make sure the final image runs as the non-root appuser
+USER appuser
+
+# The base image already has EXPOSE 5000 and CMD, so we inherit those
@@ -373,6 +373,19 @@ The `docker_compose_local` folder contains the Docker Compose file and auxiliary
     ```
 
 
+## Building a Docker image with a pre-downloaded ColPali model
+ColPali model weights are large, so they are not included in the base image. A separate Docker image can be built that adds the weights only when they are needed.
+
+`Dockerfile.colpali` is an additional Dockerfile that downloads one of the ColPali models and stores it in the image.
+
+There are a few arguments for building the image:
+
+- `BASE_IMAGE_NAME` - argument that allows you to set the base image name for ai-dial-rag, default is `epam/ai-dial-rag:latest`
+- `COLPALI_MODEL_NAME` - name of the ColPali model to download, default is `vidore/colSmol-256M`
+
+And one environment variable:
+- `COLPALI_MODELS_BASE_PATH` - path where the models are stored inside the image, default is `/colpali_models`
+
 
 ## Lint
 

@@ -59,6 +59,9 @@
 from aidial_rag.request_context import RequestContext, create_request_context
 from aidial_rag.resources.cpu_pools import init_cpu_pools
 from aidial_rag.retrieval_chain import create_retrieval_chain
+from aidial_rag.retrievers.colpali_retriever.colpali_model_resource import (
+    ColpaliModelResource,
+)
 from aidial_rag.stages import RetrieverStage
 from aidial_rag.transform_history import transform_history
 from aidial_rag.utils import profiler_if_enabled, timed_stage
@@ -196,6 +199,10 @@ class DialRAGApplication(ChatCompletion):
 
     def __init__(self, app_config: AppConfig):
         self.app_config = app_config
+        self.colpali_model_resource = ColpaliModelResource(
+            app_config.colpali_model_resource_config,
+            app_config.request.indexing.colpali_index,
+        )
         self.enable_debug_commands = app_config.enable_debug_commands
         self.repository_digest = read_repository_digest(REPOSITORY_DIGEST_PATH)
         logger.info(
@@ -289,6 +296,7 @@ async def chat_completion(
                 indexing_tasks,
                 index_storage,
                 dial_api_client,
+                self.colpali_model_resource,
                 config=request_config,
             )
 
@@ -343,6 +351,7 @@ def _make_retrieval_stage(retriever: BaseRetriever, stage_name):
                     indexing_config=request_config.indexing,
                     document_records=document_records,
                     query_chain=query_chain,
+                    colpali_model_resource=self.colpali_model_resource,
                     make_retrieval_stage=_make_retrieval_stage,
                 )
 

@@ -11,6 +11,9 @@
 from aidial_rag.configuration_endpoint import RequestConfig
 from aidial_rag.index_storage import IndexStorageConfig
 from aidial_rag.resources.cpu_pools import CpuPoolsConfig
+from aidial_rag.retrievers.colpali_retriever.colpali_model_resource import (
+    ColpaliModelResourceConfig,
+)
 
 
 class AppConfig(BaseSettings):
@@ -38,6 +41,9 @@ class AppConfig(BaseSettings):
 
     cpu_pools: CpuPoolsConfig = Field(default=CpuPoolsConfig())
     index_storage: IndexStorageConfig = Field(default=IndexStorageConfig())
+    colpali_model_resource_config: ColpaliModelResourceConfig | None = Field(
+        default=None
+    )
     request: RequestConfig = Field(default=RequestConfig())
 
     model_config = SettingsConfigDict(

@@ -48,6 +48,7 @@ class DocumentRecord(BaseDoc):
     embeddings_index: MultiEmbeddings | None
     multimodal_embeddings_index: MultiEmbeddings | None
     description_embeddings_index: MultiEmbeddings | None
+    colpali_embeddings_index: MultiEmbeddings | None
     mime_type: str
     document_bytes: bytes  # Could be attached document or converted document
 

@@ -47,6 +47,12 @@
 from aidial_rag.request_context import RequestContext
 from aidial_rag.resources.dial_limited_resources import DialLimitedResources
 from aidial_rag.retrievers.bm25_retriever import BM25Retriever
+from aidial_rag.retrievers.colpali_retriever.colpali_model_resource import (
+    ColpaliModelResource,
+)
+from aidial_rag.retrievers.colpali_retriever.colpali_retriever import (
+    ColpaliRetriever,
+)
 from aidial_rag.retrievers.description_retriever.description_retriever import (
     DescriptionRetriever,
 )
@@ -107,6 +113,7 @@ async def load_document_impl(
     attachment_link: AttachmentLink,
     stage_stream: SupportsWriteStr,
     index_settings: IndexSettings,
+    colpali_model_resource: ColpaliModelResource,
     config: RequestConfig,
 ) -> DocumentRecord:
     logger_stream = LoggerStream()
@@ -171,6 +178,18 @@ async def load_document_impl(
                 )
             )
 
+        colpali_index_task = None
+        if index_config.colpali_index is not None:
+            colpali_index_task = tg.create_task(
+                ColpaliRetriever.build_index(
+                    model_resource=colpali_model_resource,
+                    colpali_index_config=index_config.colpali_index,
+                    stageio=StreamWithPrefix(io_stream, "ColpaliRetriever: "),
+                    mime_type=mime_type,
+                    original_document=doc_bytes,
+                )
+            )
+
         # TODO: try to move is_image check to the parse_document since another loader is not exposed here from the document_loaders.py
         if is_image(content_type):
             chunks_list = [get_default_image_chunk(attachment_link)]
@@ -203,6 +222,9 @@ async def load_document_impl(
     description_indexes = (
         description_index_task.result() if description_index_task else None
     )
+    colpali_indexes = (
+        colpali_index_task.result() if colpali_index_task else None
+    )
 
     return DocumentRecord(
         format_version=FORMAT_VERSION,
@@ -212,6 +234,7 @@ async def load_document_impl(
         embeddings_index=embeddings_index_task.result(),
         multimodal_embeddings_index=multimodal_index,
         description_embeddings_index=description_indexes,
+        colpali_embeddings_index=colpali_indexes,
         document_bytes=doc_bytes,
         mime_type=mime_type,
     )
@@ -236,6 +259,7 @@ async def load_document(
     task: IndexingTask,
     index_storage: IndexStorage,
     dial_api_client: DialApiClient,
+    colpali_model_resource: ColpaliModelResource,
     config: RequestConfig,
 ) -> DocumentRecord:
     attachment_link = task.attachment_link
@@ -275,6 +299,7 @@ async def load_document(
                         attachment_link,
                         io_stream,
                         index_settings,
+                        colpali_model_resource,
                         config,
                     )
                 except InvalidDocumentError as e:
@@ -297,10 +322,16 @@ async def load_document_task(
     index_storage: IndexStorage,
     dial_api_client: DialApiClient,
     config: RequestConfig,
+    colpali_model_resource: ColpaliModelResource,
 ) -> DocumentIndexingResult:
     try:
         doc_record = await load_document(
-            request_context, task, index_storage, dial_api_client, config
+            request_context,
+            task,
+            index_storage,
+            dial_api_client,
+            colpali_model_resource,
+            config,
         )
         return DocumentIndexingSuccess(
             task=task,
@@ -319,14 +350,20 @@ async def load_documents(
     tasks: Iterable[IndexingTask],
     index_storage: IndexStorage,
     dial_api_client: DialApiClient,
+    colpali_model_resource: ColpaliModelResource,
     config: RequestConfig,
 ) -> List[DocumentIndexingResult]:
     # TODO: Rewrite this function using TaskGroup to cancel all tasks if one of them fails
     # if ignore_document_loading_errors is not set in the config
     return await asyncio.gather(
         *[
             load_document_task(
-                request_context, task, index_storage, dial_api_client, config
+                request_context,
+                task,
+                index_storage,
+                dial_api_client,
+                config,
+                colpali_model_resource,
             )
             for task in tasks
         ],

@@ -7,6 +7,9 @@
 )
 from aidial_rag.document_loaders import ParserConfig
 from aidial_rag.document_record import IndexSettings
+from aidial_rag.retrievers.colpali_retriever.colpali_index_config import (
+    ColpaliIndexConfig,
+)
 from aidial_rag.retrievers.description_retriever.description_retriever import (
     DescriptionIndexConfig,
 )
@@ -29,6 +32,9 @@ class IndexingConfig(BaseConfig):
         description="Enables DescriptionRetriever which uses vision model to generate page images "
         "descriptions and perform search on them.",
     )
+    colpali_index: ColpaliIndexConfig | None = Field(
+        default=None, description="Enables ColpaliRetriever"
+    )
 
     def collect_fields_that_rebuild_index(self) -> IndexSettings:
         """Return the IndexingConfig fields that determine when the index needs to be rebuilt."""

@@ -33,11 +33,26 @@ class CpuPoolsConfig(BaseConfig):
         description="Embedding process for the query. Should be `1`, unless you have a lot of cores.",
     )
 
+    heavy_indexing_embeddings_pool: int = Field(
+        default=1,
+        description="Embedding process for gpu heavy tasks."
+        "Needed not to block ligher tasks on indexing_embeddings_pool. "
+        "Should be `1`",
+    )
+    heavy_query_embeddings_pool: int = Field(
+        default=1,
+        description="Embedding process for gpu heavy tasks for queries."
+        "Needed not to block ligher tasks on query_embeddings_pool."
+        "Should be `1`",
+    )
+
 
 class CpuPools:
     indexing_cpu_pool: ThreadPoolExecutor
     indexing_embeddings_pool: ThreadPoolExecutor
     query_embeddings_pool: ThreadPoolExecutor
+    heavy_indexing_embeddings_pool: ThreadPoolExecutor
+    heavyquery_embeddings_pool: ThreadPoolExecutor
 
     def __init__(self, config: CpuPoolsConfig) -> None:
         # Using ThreadPoolExecutor instead of ProcessPoolExecutor, because
@@ -58,6 +73,16 @@ def __init__(self, config: CpuPoolsConfig) -> None:
             thread_name_prefix="query_embeddings",
         )
 
+        self.heavy_indexing_embeddings_pool = ThreadPoolExecutor(
+            max_workers=config.heavy_indexing_embeddings_pool,
+            thread_name_prefix="heavy_indexing_embeddings",
+        )
+
+        self.heavy_query_embeddings_pool = ThreadPoolExecutor(
+            max_workers=config.heavy_query_embeddings_pool,
+            thread_name_prefix="heavy_query_embeddings",
+        )
+
     def _run_in_pool(self, pool, func, *args, **kwargs):
         return asyncio.get_running_loop().run_in_executor(
             pool, func, *args, **kwargs
@@ -76,6 +101,16 @@ def run_in_query_embeddings_pool(self, func, *args, **kwargs):
             self.query_embeddings_pool, func, *args, **kwargs
         )
 
+    def run_in_heavy_indexing_embeddings_pool(self, func, *args, **kwargs):
+        return self._run_in_pool(
+            self.heavy_indexing_embeddings_pool, func, *args, **kwargs
+        )
+
+    def run_in_heavy_query_embeddings_pool(self, func, *args, **kwargs):
+        return self._run_in_pool(
+            self.heavy_query_embeddings_pool, func, *args, **kwargs
+        )
+
     _instance = None
 
     @classmethod
@@ -103,6 +138,8 @@ async def init_cpu_pools(config: CpuPoolsConfig):
     await cpu_pools.run_in_indexing_cpu_pool(sum, range(10))
     await cpu_pools.run_in_indexing_embeddings_pool(sum, range(10))
     await cpu_pools.run_in_query_embeddings_pool(sum, range(10))
+    await cpu_pools.run_in_heavy_indexing_embeddings_pool(sum, range(10))
+    await cpu_pools.run_in_heavy_query_embeddings_pool(sum, range(10))
 
 
 def run_in_indexing_cpu_pool(func, *args, **kwargs):
@@ -119,3 +156,15 @@ def run_in_query_embeddings_pool(func, *args, **kwargs):
     return CpuPools.instance().run_in_query_embeddings_pool(
         func, *args, **kwargs
     )
+
+
+def run_in_heavy_indexing_embeddings_pool(func, *args, **kwargs):
+    return CpuPools.instance().run_in_heavy_indexing_embeddings_pool(
+        func, *args, **kwargs
+    )
+
+
+def run_in_heavy_query_embeddings_pool(func, *args, **kwargs):
+    return CpuPools.instance().run_in_heavy_query_embeddings_pool(
+        func, *args, **kwargs
+    )