Skip to content

Commit 9d7fd22

Browse files
committed
refactor: refactor convert_audio_to_wav() and cleanup_temp_files() functions and modified cells in notebook
1 parent ea37c29 commit 9d7fd22

File tree

3 files changed

+200
-194
lines changed

3 files changed

+200
-194
lines changed

demos/kfp/docling/asr-conversion/docling_asr_convert_pipeline.py

Lines changed: 42 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -255,53 +255,53 @@ def convert_audio_to_wav(
255255
if audio_file.suffix.lower() == ".wav":
256256
processed_audio_files.append(audio_file)
257257
print(f"Using WAV file directly: {audio_file.name}")
258-
else:
259-
# Convert non-WAV files to WAV format using ffmpeg
260-
print(f"Converting {audio_file.name} to WAV format...")
261-
import tempfile
262-
263-
with tempfile.NamedTemporaryFile(
264-
suffix=f"_{audio_file.stem}.wav", delete=False
265-
) as tmp:
266-
temp_wav = pathlib.Path(tmp.name)
267-
268-
try:
269-
# Use ffmpeg to convert to WAV format
270-
subprocess.run(
271-
[
272-
"ffmpeg",
273-
"-i",
274-
str(audio_file),
275-
"-ar",
276-
"16000", # 16kHz sample rate (good for whisper)
277-
"-ac",
278-
"1", # mono channel
279-
"-c:a",
280-
"pcm_s16le", # 16-bit PCM
281-
"-y", # overwrite output file
282-
str(temp_wav),
283-
],
284-
check=True,
285-
capture_output=True,
286-
)
287-
288-
processed_audio_files.append(temp_wav)
289-
temp_files_to_cleanup.append(temp_wav)
290-
print(f"Successfully converted {audio_file.name} to WAV format")
291-
292-
except subprocess.CalledProcessError as e:
293-
print(f"ffmpeg conversion failed for {audio_file.name}: {e}")
294-
if e.stderr:
295-
print(f"stderr: {e.stderr.decode()}")
296-
continue
258+
continue
259+
260+
# Convert non-WAV files to WAV format using ffmpeg
261+
print(f"Converting {audio_file.name} to WAV format...")
262+
import tempfile
263+
264+
with tempfile.NamedTemporaryFile(
265+
suffix=f"_{audio_file.stem}.wav", delete=False
266+
) as tmp:
267+
temp_wav = pathlib.Path(tmp.name)
268+
269+
try:
270+
# Use ffmpeg to convert to WAV format
271+
subprocess.run(
272+
[
273+
"ffmpeg",
274+
"-i",
275+
str(audio_file),
276+
"-ar",
277+
"16000", # 16kHz sample rate (good for whisper)
278+
"-ac",
279+
"1", # mono channel
280+
"-c:a",
281+
"pcm_s16le", # 16-bit PCM
282+
"-y", # overwrite output file
283+
str(temp_wav),
284+
],
285+
check=True,
286+
capture_output=True,
287+
)
288+
289+
processed_audio_files.append(temp_wav)
290+
temp_files_to_cleanup.append(temp_wav)
291+
print(f"Successfully converted {audio_file.name} to WAV format")
292+
293+
except subprocess.CalledProcessError as e:
294+
print(f"ffmpeg conversion failed for {audio_file.name}: {e}")
295+
if e.stderr:
296+
print(f"stderr: {e.stderr.decode()}")
297+
continue
297298
return (processed_audio_files, temp_files_to_cleanup)
298299

299300
# Clean up temporary files
300301
def cleanup_temp_files(temp_files_to_cleanup: List[pathlib.Path]) -> None:
301302
for temp_file in temp_files_to_cleanup:
302-
if temp_file.exists():
303-
temp_file.unlink()
304-
print(f"Cleaned up temporary file: {temp_file.name}")
303+
temp_file.unlink(missing_ok=True)
304+
print(f"Cleaned up temporary file: {temp_file.name}")
305305

306306
# Return a Docling DocumentConverter configured for ASR with whisper_turbo model.
307307
def get_asr_converter() -> DocumentConverter:

demos/kfp/docling/asr-conversion/docling_asr_convert_pipeline_compiled.yaml

Lines changed: 60 additions & 66 deletions
Original file line number | Diff line number | Diff line change
@@ -435,39 +435,36 @@ deploymentSpec:
435435
\ continue\n\n # Check if file is already WAV\n \
436436
\ if audio_file.suffix.lower() == \".wav\":\n processed_audio_files.append(audio_file)\n\
437437
\ print(f\"Using WAV file directly: {audio_file.name}\")\n\
438-
\ else:\n # Convert non-WAV files to WAV format\
439-
\ using ffmpeg\n print(f\"Converting {audio_file.name} to\
440-
\ WAV format...\")\n import tempfile\n\n with\
441-
\ tempfile.NamedTemporaryFile(\n suffix=f\"_{audio_file.stem}.wav\"\
442-
, delete=False\n ) as tmp:\n temp_wav\
443-
\ = pathlib.Path(tmp.name)\n\n try:\n \
444-
\ # Use ffmpeg to convert to WAV format\n subprocess.run(\n\
445-
\ [\n \"ffmpeg\",\n \
446-
\ \"-i\",\n str(audio_file),\n\
447-
\ \"-ar\",\n \"16000\"\
448-
, # 16kHz sample rate (good for whisper)\n \"\
449-
-ac\",\n \"1\", # mono channel\n \
450-
\ \"-c:a\",\n \"pcm_s16le\",\
451-
\ # 16-bit PCM\n \"-y\", # overwrite output\
452-
\ file\n str(temp_wav),\n \
453-
\ ],\n check=True,\n capture_output=True,\n\
454-
\ )\n\n processed_audio_files.append(temp_wav)\n\
455-
\ temp_files_to_cleanup.append(temp_wav)\n \
456-
\ print(f\"Successfully converted {audio_file.name} to WAV format\"\
457-
)\n\n except subprocess.CalledProcessError as e:\n \
458-
\ print(f\"ffmpeg conversion failed for {audio_file.name}:\
459-
\ {e}\")\n if e.stderr:\n print(f\"\
460-
stderr: {e.stderr.decode()}\")\n continue\n return\
461-
\ (processed_audio_files, temp_files_to_cleanup)\n\n # Clean up temporary\
462-
\ files\n def cleanup_temp_files(temp_files_to_cleanup: List[pathlib.Path])\
463-
\ -> None:\n for temp_file in temp_files_to_cleanup:\n \
464-
\ if temp_file.exists():\n temp_file.unlink()\n \
465-
\ print(f\"Cleaned up temporary file: {temp_file.name}\")\n\n \
466-
\ # Return a Docling DocumentConverter configured for ASR with whisper_turbo\
467-
\ model.\n def get_asr_converter() -> DocumentConverter:\n \"\"\
468-
\"Create a DocumentConverter configured for ASR with whisper_turbo model.\"\
469-
\"\"\n whisper_turbo_llm = InlineAsrNativeWhisperOptions(\n \
470-
\ repo_id=\"turbo\",\n inference_framework=InferenceAsrFramework.WHISPER,\n\
438+
\ continue\n\n # Convert non-WAV files to WAV\
439+
\ format using ffmpeg\n print(f\"Converting {audio_file.name}\
440+
\ to WAV format...\")\n import tempfile\n\n with tempfile.NamedTemporaryFile(\n\
441+
\ suffix=f\"_{audio_file.stem}.wav\", delete=False\n \
442+
\ ) as tmp:\n temp_wav = pathlib.Path(tmp.name)\n\n\
443+
\ try:\n # Use ffmpeg to convert to WAV format\n\
444+
\ subprocess.run(\n [\n \
445+
\ \"ffmpeg\",\n \"-i\",\n \
446+
\ str(audio_file),\n \"-ar\",\n \
447+
\ \"16000\", # 16kHz sample rate (good for whisper)\n \
448+
\ \"-ac\",\n \"1\", # mono\
449+
\ channel\n \"-c:a\",\n \"\
450+
pcm_s16le\", # 16-bit PCM\n \"-y\", # overwrite\
451+
\ output file\n str(temp_wav),\n \
452+
\ ],\n check=True,\n capture_output=True,\n\
453+
\ )\n\n processed_audio_files.append(temp_wav)\n\
454+
\ temp_files_to_cleanup.append(temp_wav)\n \
455+
\ print(f\"Successfully converted {audio_file.name} to WAV format\")\n\n\
456+
\ except subprocess.CalledProcessError as e:\n \
457+
\ print(f\"ffmpeg conversion failed for {audio_file.name}: {e}\")\n \
458+
\ if e.stderr:\n print(f\"stderr: {e.stderr.decode()}\"\
459+
)\n continue\n return (processed_audio_files, temp_files_to_cleanup)\n\
460+
\n # Clean up temporary files\n def cleanup_temp_files(temp_files_to_cleanup:\
461+
\ List[pathlib.Path]) -> None:\n for temp_file in temp_files_to_cleanup:\n\
462+
\ temp_file.unlink(missing_ok=True)\n print(f\"Cleaned\
463+
\ up temporary file: {temp_file.name}\")\n\n # Return a Docling DocumentConverter\
464+
\ configured for ASR with whisper_turbo model.\n def get_asr_converter()\
465+
\ -> DocumentConverter:\n \"\"\"Create a DocumentConverter configured\
466+
\ for ASR with whisper_turbo model.\"\"\"\n whisper_turbo_llm = InlineAsrNativeWhisperOptions(\n\
467+
\ repo_id=\"turbo\",\n inference_framework=InferenceAsrFramework.WHISPER,\n\
471468
\ verbose=True,\n timestamps=False,\n word_timestamps=False,\n\
472469
\ temperature=0.0,\n max_new_tokens=256,\n \
473470
\ max_time_chunk=30.0,\n )\n\n pipeline_options = AsrPipelineOptions()\n\
@@ -643,39 +640,36 @@ deploymentSpec:
643640
\ continue\n\n # Check if file is already WAV\n \
644641
\ if audio_file.suffix.lower() == \".wav\":\n processed_audio_files.append(audio_file)\n\
645642
\ print(f\"Using WAV file directly: {audio_file.name}\")\n\
646-
\ else:\n # Convert non-WAV files to WAV format\
647-
\ using ffmpeg\n print(f\"Converting {audio_file.name} to\
648-
\ WAV format...\")\n import tempfile\n\n with\
649-
\ tempfile.NamedTemporaryFile(\n suffix=f\"_{audio_file.stem}.wav\"\
650-
, delete=False\n ) as tmp:\n temp_wav\
651-
\ = pathlib.Path(tmp.name)\n\n try:\n \
652-
\ # Use ffmpeg to convert to WAV format\n subprocess.run(\n\
653-
\ [\n \"ffmpeg\",\n \
654-
\ \"-i\",\n str(audio_file),\n\
655-
\ \"-ar\",\n \"16000\"\
656-
, # 16kHz sample rate (good for whisper)\n \"\
657-
-ac\",\n \"1\", # mono channel\n \
658-
\ \"-c:a\",\n \"pcm_s16le\",\
659-
\ # 16-bit PCM\n \"-y\", # overwrite output\
660-
\ file\n str(temp_wav),\n \
661-
\ ],\n check=True,\n capture_output=True,\n\
662-
\ )\n\n processed_audio_files.append(temp_wav)\n\
663-
\ temp_files_to_cleanup.append(temp_wav)\n \
664-
\ print(f\"Successfully converted {audio_file.name} to WAV format\"\
665-
)\n\n except subprocess.CalledProcessError as e:\n \
666-
\ print(f\"ffmpeg conversion failed for {audio_file.name}:\
667-
\ {e}\")\n if e.stderr:\n print(f\"\
668-
stderr: {e.stderr.decode()}\")\n continue\n return\
669-
\ (processed_audio_files, temp_files_to_cleanup)\n\n # Clean up temporary\
670-
\ files\n def cleanup_temp_files(temp_files_to_cleanup: List[pathlib.Path])\
671-
\ -> None:\n for temp_file in temp_files_to_cleanup:\n \
672-
\ if temp_file.exists():\n temp_file.unlink()\n \
673-
\ print(f\"Cleaned up temporary file: {temp_file.name}\")\n\n \
674-
\ # Return a Docling DocumentConverter configured for ASR with whisper_turbo\
675-
\ model.\n def get_asr_converter() -> DocumentConverter:\n \"\"\
676-
\"Create a DocumentConverter configured for ASR with whisper_turbo model.\"\
677-
\"\"\n whisper_turbo_llm = InlineAsrNativeWhisperOptions(\n \
678-
\ repo_id=\"turbo\",\n inference_framework=InferenceAsrFramework.WHISPER,\n\
643+
\ continue\n\n # Convert non-WAV files to WAV\
644+
\ format using ffmpeg\n print(f\"Converting {audio_file.name}\
645+
\ to WAV format...\")\n import tempfile\n\n with tempfile.NamedTemporaryFile(\n\
646+
\ suffix=f\"_{audio_file.stem}.wav\", delete=False\n \
647+
\ ) as tmp:\n temp_wav = pathlib.Path(tmp.name)\n\n\
648+
\ try:\n # Use ffmpeg to convert to WAV format\n\
649+
\ subprocess.run(\n [\n \
650+
\ \"ffmpeg\",\n \"-i\",\n \
651+
\ str(audio_file),\n \"-ar\",\n \
652+
\ \"16000\", # 16kHz sample rate (good for whisper)\n \
653+
\ \"-ac\",\n \"1\", # mono\
654+
\ channel\n \"-c:a\",\n \"\
655+
pcm_s16le\", # 16-bit PCM\n \"-y\", # overwrite\
656+
\ output file\n str(temp_wav),\n \
657+
\ ],\n check=True,\n capture_output=True,\n\
658+
\ )\n\n processed_audio_files.append(temp_wav)\n\
659+
\ temp_files_to_cleanup.append(temp_wav)\n \
660+
\ print(f\"Successfully converted {audio_file.name} to WAV format\")\n\n\
661+
\ except subprocess.CalledProcessError as e:\n \
662+
\ print(f\"ffmpeg conversion failed for {audio_file.name}: {e}\")\n \
663+
\ if e.stderr:\n print(f\"stderr: {e.stderr.decode()}\"\
664+
)\n continue\n return (processed_audio_files, temp_files_to_cleanup)\n\
665+
\n # Clean up temporary files\n def cleanup_temp_files(temp_files_to_cleanup:\
666+
\ List[pathlib.Path]) -> None:\n for temp_file in temp_files_to_cleanup:\n\
667+
\ temp_file.unlink(missing_ok=True)\n print(f\"Cleaned\
668+
\ up temporary file: {temp_file.name}\")\n\n # Return a Docling DocumentConverter\
669+
\ configured for ASR with whisper_turbo model.\n def get_asr_converter()\
670+
\ -> DocumentConverter:\n \"\"\"Create a DocumentConverter configured\
671+
\ for ASR with whisper_turbo model.\"\"\"\n whisper_turbo_llm = InlineAsrNativeWhisperOptions(\n\
672+
\ repo_id=\"turbo\",\n inference_framework=InferenceAsrFramework.WHISPER,\n\
679673
\ verbose=True,\n timestamps=False,\n word_timestamps=False,\n\
680674
\ temperature=0.0,\n max_new_tokens=256,\n \
681675
\ max_time_chunk=30.0,\n )\n\n pipeline_options = AsrPipelineOptions()\n\

0 commit comments

Comments
 (0)