|
16 | 16 | _log = logging.getLogger(__name__)
|
17 | 17 |
|
18 | 18 |
|
@dsl.component(
    base_image=PYTHON_BASE_IMAGE,
    packages_to_install=["llama-stack-client", "fire", "httpx"],
)
def clear_vector_db(
    service_url: str,
    vector_db_id: str,
):
    """Unregister (delete) a vector database, tolerating its absence.

    Args:
        service_url: Base URL handed to ``LlamaStackClient``.
        vector_db_id: Identifier of the vector database to unregister.

    Any exception raised by the unregister call is printed as a warning and
    swallowed, so a database that does not exist yet (e.g. on the first run)
    does not fail this step.
    """
    # Imported locally: the package is listed in this component's
    # packages_to_install rather than at module level.
    from llama_stack_client import LlamaStackClient

    client = LlamaStackClient(base_url=service_url)

    print(f"Attempting to clear vector DB '{vector_db_id}'...")
    try:
        # Keep the try body to the single call that is expected to fail.
        client.vector_dbs.unregister(vector_db_id=vector_db_id)
    except Exception as e:
        # Deliberately best-effort: report and continue instead of failing.
        # The trailing space keeps the two sentences from running together.
        print(
            f"Warning: Could not clear vector DB '{vector_db_id}'. "
            f"This is expected if it's the first run. Error: {e}"
        )
    else:
        print(f"Successfully cleared vector DB '{vector_db_id}'.")
| 42 | + |
| 43 | + |
19 | 44 | @dsl.component(
|
20 | 45 | base_image=PYTHON_BASE_IMAGE,
|
21 | 46 | packages_to_install=["llama-stack-client", "fire", "requests"],
|
@@ -139,15 +164,12 @@ def docling_convert_and_ingest_audio(
|
139 | 164 | import pathlib
|
140 | 165 | import subprocess
|
141 | 166 | import os
|
142 |
| - |
143 |
| - from docling.datamodel.pipeline_options_asr_model import ( |
144 |
| - InlineAsrNativeWhisperOptions, |
145 |
| - InferenceAsrFramework, |
146 |
| - ) |
| 167 | + import re |
147 | 168 |
|
148 | 169 | from docling.datamodel.base_models import ConversionStatus, InputFormat
|
149 | 170 | from docling.datamodel.document import ConversionResult
|
150 | 171 | from docling.datamodel.pipeline_options import AsrPipelineOptions
|
| 172 | + from docling.datamodel import asr_model_specs |
151 | 173 | from docling.document_converter import AudioFormatOption, DocumentConverter
|
152 | 174 | from docling.pipeline.asr_pipeline import AsrPipeline
|
153 | 175 | from docling_core.types.doc.document import DoclingDocument
|
@@ -303,22 +325,20 @@ def cleanup_temp_files(temp_files_to_cleanup: List[pathlib.Path]) -> None:
|
303 | 325 | temp_file.unlink(missing_ok=True)
|
304 | 326 | print(f"Cleaned up temporary file: {temp_file.name}")
|
305 | 327 |
|
def clean_timestamps(doc: DoclingDocument) -> None:
    """Strip ``[time: ...]`` markers from every text item of ``doc`` in place.

    Both ``text`` and ``orig`` of each item in ``doc.texts`` are overwritten
    with the cleaned string. A ``None`` document is ignored, because the
    caller invokes this helper before it checks whether conversion produced
    a document at all.
    """
    if doc is None:
        # Nothing to clean; let the caller's own None check report the failure.
        return

    # Non-greedy match so several markers in one string are removed separately;
    # whitespace following a marker is dropped with it.
    marker = re.compile(r"\[time: .*?\]\s*")
    for item in doc.texts:
        cleaned_text = marker.sub("", item.text)
        item.text = cleaned_text
        item.orig = cleaned_text
| 333 | + |
306 | 334 | # Return a Docling DocumentConverter configured for ASR with whisper_turbo model.
|
307 | 335 | def get_asr_converter() -> DocumentConverter:
|
308 | 336 | """Create a DocumentConverter configured for ASR with whisper_turbo model."""
|
309 |
| - whisper_turbo_asr_model = InlineAsrNativeWhisperOptions( |
310 |
| - repo_id="turbo", |
311 |
| - inference_framework=InferenceAsrFramework.WHISPER, |
312 |
| - verbose=True, |
313 |
| - timestamps=False, |
314 |
| - word_timestamps=False, |
315 |
| - temperature=0.0, |
316 |
| - max_new_tokens=256, |
317 |
| - max_time_chunk=30.0, |
318 |
| - ) |
319 |
| - |
320 | 337 | pipeline_options = AsrPipelineOptions()
|
321 |
| - pipeline_options.asr_options = whisper_turbo_asr_model |
| 338 | + pipeline_options.asr_options = asr_model_specs.WHISPER_TURBO |
| 339 | + pipeline_options.asr_options.timestamps = False |
| 340 | + pipeline_options.asr_options.word_timestamps = False |
| 341 | + pipeline_options.asr_options.verbose = False |
322 | 342 |
|
323 | 343 | converter = DocumentConverter(
|
324 | 344 | format_options={
|
@@ -414,6 +434,7 @@ def process_conversion_results(
|
414 | 434 | processed_docs += 1
|
415 | 435 | file_name = conv_res.input.file.stem
|
416 | 436 | document = conv_res.document
|
| 437 | + clean_timestamps(document) |
417 | 438 |
|
418 | 439 | if document is None:
|
419 | 440 | _log.warning(f"Document conversion failed for {file_name}")
|
@@ -481,11 +502,17 @@ def docling_convert_pipeline(
|
481 | 502 | :param use_gpu: boolean to enable/disable gpu in the docling workers
|
482 | 503 | :return:
|
483 | 504 | """
|
| 505 | + clear_task = clear_vector_db( |
| 506 | + service_url=service_url, |
| 507 | + vector_db_id=vector_db_id, |
| 508 | + ) |
| 509 | + clear_task.set_caching_options(False) |
| 510 | + |
484 | 511 | register_task = register_vector_db(
|
485 | 512 | service_url=service_url,
|
486 | 513 | vector_db_id=vector_db_id,
|
487 | 514 | embed_model_id=embed_model_id,
|
488 |
| - ) |
| 515 | + ).after(clear_task) |
489 | 516 | register_task.set_caching_options(False)
|
490 | 517 |
|
491 | 518 | import_task = import_audio_files(
|
|
0 commit comments