Skip to content

Commit bbb2a10

Browse files
committed
feat: disable timestamps in Docling Documents in conversion results
1 parent 7d2e3d6 commit bbb2a10

File tree

3 files changed

+253
-166
lines changed

3 files changed

+253
-166
lines changed

demos/kfp/docling/asr-conversion/docling_asr_convert_pipeline.py

Lines changed: 45 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,31 @@
1616
_log = logging.getLogger(__name__)
1717

1818

19+
@dsl.component(
    base_image=PYTHON_BASE_IMAGE,
    packages_to_install=["llama-stack-client", "fire", "httpx"],
)
def clear_vector_db(
    service_url: str,
    vector_db_id: str,
):
    """Unregisters (deletes) a vector database if it exists.

    This step is best-effort: any error raised while unregistering is
    printed as a warning and swallowed, so the component itself does not
    fail when the vector DB cannot be cleared.

    Args:
        service_url: Base URL passed to ``LlamaStackClient``.
        vector_db_id: Identifier of the vector database to unregister.
    """
    # Imported inside the body because the package is listed in
    # ``packages_to_install`` for this component.
    from llama_stack_client import LlamaStackClient

    client = LlamaStackClient(base_url=service_url)

    try:
        print(f"Attempting to clear vector DB '{vector_db_id}'...")
        client.vector_dbs.unregister(vector_db_id=vector_db_id)
        print(f"Successfully cleared vector DB '{vector_db_id}'.")
    except Exception as e:
        # Deliberately broad: a failure here is reported but must not
        # abort the run (the message notes it is expected on a first run).
        print(
            f"Warning: Could not clear vector DB '{vector_db_id}'. "
            f"This is expected if it's the first run. Error: {e}"
        )
42+
43+
1944
@dsl.component(
2045
base_image=PYTHON_BASE_IMAGE,
2146
packages_to_install=["llama-stack-client", "fire", "requests"],
@@ -139,15 +164,12 @@ def docling_convert_and_ingest_audio(
139164
import pathlib
140165
import subprocess
141166
import os
142-
143-
from docling.datamodel.pipeline_options_asr_model import (
144-
InlineAsrNativeWhisperOptions,
145-
InferenceAsrFramework,
146-
)
167+
import re
147168

148169
from docling.datamodel.base_models import ConversionStatus, InputFormat
149170
from docling.datamodel.document import ConversionResult
150171
from docling.datamodel.pipeline_options import AsrPipelineOptions
172+
from docling.datamodel import asr_model_specs
151173
from docling.document_converter import AudioFormatOption, DocumentConverter
152174
from docling.pipeline.asr_pipeline import AsrPipeline
153175
from docling_core.types.doc.document import DoclingDocument
@@ -303,22 +325,20 @@ def cleanup_temp_files(temp_files_to_cleanup: List[pathlib.Path]) -> None:
303325
temp_file.unlink(missing_ok=True)
304326
print(f"Cleaned up temporary file: {temp_file.name}")
305327

328+
def clean_timestamps(doc: "DoclingDocument | None") -> None:
    """Strip ``[time: ...]`` markers from every text item of *doc* in place.

    For each item in ``doc.texts`` the cleaned string is written to both
    ``item.text`` and ``item.orig``. Each marker is removed together with
    any whitespace that directly follows it.

    Args:
        doc: Document whose text items are rewritten. ``None`` is accepted
            and ignored, so the helper is safe to call before the caller
            has checked whether a conversion produced a document.
    """
    if doc is None:
        # Nothing to clean; avoids AttributeError on ``None.texts``.
        return

    # Compiled once rather than looked up for every text item.
    timestamp_marker = re.compile(r"\[time: .*?\]\s*")

    for item in doc.texts:
        cleaned_text = timestamp_marker.sub("", item.text)
        item.text = cleaned_text
        item.orig = cleaned_text
333+
306334
# Return a Docling DocumentConverter configured for ASR with whisper_turbo model.
307335
def get_asr_converter() -> DocumentConverter:
308336
"""Create a DocumentConverter configured for ASR with whisper_turbo model."""
309-
whisper_turbo_asr_model = InlineAsrNativeWhisperOptions(
310-
repo_id="turbo",
311-
inference_framework=InferenceAsrFramework.WHISPER,
312-
verbose=True,
313-
timestamps=False,
314-
word_timestamps=False,
315-
temperature=0.0,
316-
max_new_tokens=256,
317-
max_time_chunk=30.0,
318-
)
319-
320337
pipeline_options = AsrPipelineOptions()
321-
pipeline_options.asr_options = whisper_turbo_asr_model
338+
pipeline_options.asr_options = asr_model_specs.WHISPER_TURBO
339+
pipeline_options.asr_options.timestamps = False
340+
pipeline_options.asr_options.word_timestamps = False
341+
pipeline_options.asr_options.verbose = False
322342

323343
converter = DocumentConverter(
324344
format_options={
@@ -414,6 +434,7 @@ def process_conversion_results(
414434
processed_docs += 1
415435
file_name = conv_res.input.file.stem
416436
document = conv_res.document
437+
clean_timestamps(document)
417438

418439
if document is None:
419440
_log.warning(f"Document conversion failed for {file_name}")
@@ -481,11 +502,17 @@ def docling_convert_pipeline(
481502
:param use_gpu: boolean to enable/disable gpu in the docling workers
482503
:return:
483504
"""
505+
clear_task = clear_vector_db(
506+
service_url=service_url,
507+
vector_db_id=vector_db_id,
508+
)
509+
clear_task.set_caching_options(False)
510+
484511
register_task = register_vector_db(
485512
service_url=service_url,
486513
vector_db_id=vector_db_id,
487514
embed_model_id=embed_model_id,
488-
)
515+
).after(clear_task)
489516
register_task.set_caching_options(False)
490517

491518
import_task = import_audio_files(

0 commit comments

Comments
 (0)