Add docs for segment_audio param (#1818)

ChrisJar · kheiss-uwzoo · greptile-apps[bot] · web-flow · commit 32a4ad149129 · 2026-04-08T15:46:49.000-07:00
Co-authored-by: Kurt Heiss <kheiss@nvidia.com>
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
diff --git a/client/src/nv_ingest_client/client/interface.py b/client/src/nv_ingest_client/client/interface.py
@@ -1070,6 +1070,9 @@ def extract(self, **kwargs: Any) -> "Ingestor":
             - extract_charts: bool, extract charts (default True)
             - extract_infographics: bool, extract infographics (default False)
             - extract_page_as_image: bool, extract full page as image (default False)
+            - extract_audio_params: dict, audio extraction options such as
+              endpoint settings and `segment_audio` for sentence-like ASR
+              segmentation when using a hosted Parakeet service
             - table_output_format: str, format for table output (default "markdown")
             - auto_dedup: bool, auto-enable bbox deduplication when extracting both
               structured elements and images (default True). Set to False to disable.
diff --git a/docs/docs/extraction/audio.md b/docs/docs/extraction/audio.md
@@ -82,10 +82,13 @@ Use the following procedure to run the NIM locally.
         .extract(
             document_type="wav",  # Ingestor should detect type automatically in most cases
             extract_method="audio",
+            extract_audio_params={
+                "segment_audio": True,
+            },
         )
     )
     ```
-
+    To generate one extracted element for each sentence-like ASR segment, include `extract_audio_params={"segment_audio": True}` when calling `.extract(...)`. This option applies when audio extraction runs with a Parakeet NIM (either locally through Docker or remotely via NVCF) but has no effect when using the local Hugging Face Parakeet model.
 
     !!! tip
 
@@ -117,6 +120,7 @@ Instead of running the pipeline locally, you can use NVCF to perform inference b
                 "auth_token": "<API key>",
                 "function_id": "<function ID>",
                 "use_ssl": True,
+                "segment_audio": True,
             },
         )
     )
diff --git a/docs/docs/extraction/nv-ingest-python-api.md b/docs/docs/extraction/nv-ingest-python-api.md
@@ -549,11 +549,13 @@ ingestor = Ingestor().files("audio_file.mp3")
 
 ingestor = ingestor.extract(
         document_type="mp3",
+        extract_method="audio",
         extract_text=True,
         extract_tables=False,
         extract_charts=False,
         extract_images=False,
         extract_infographics=False,
+        extract_audio_params={"segment_audio": True},
     ).split(
         tokenizer="meta-llama/Llama-3.2-1B",
         chunk_size=150,
@@ -563,8 +565,7 @@ ingestor = ingestor.extract(
 
 results = ingestor.ingest()
 ```
-
-
+Set `extract_audio_params={"segment_audio": True}` to output sentence-like audio segments as distinct extracted elements. This setting applies only when audio extraction runs through a hosted Parakeet endpoint—such as the Parakeet ASR NIM or NVCF—and has no effect when using the local Hugging Face Parakeet model.
 
 ## Related Topics
 
diff --git a/docs/docs/extraction/python-api-reference.md b/docs/docs/extraction/python-api-reference.md
@@ -641,11 +641,13 @@ ingestor = Ingestor().files("audio_file.mp3")
 
 ingestor = ingestor.extract(
         document_type="mp3",
+        extract_method="audio",
         extract_text=True,
         extract_tables=False,
         extract_charts=False,
         extract_images=False,
         extract_infographics=False,
+        extract_audio_params={"segment_audio": True},
     ).split(
         tokenizer="meta-llama/Llama-3.2-1B",
         chunk_size=150,
@@ -656,6 +658,12 @@ ingestor = ingestor.extract(
 results = ingestor.ingest()
 ```
 
+Set `extract_audio_params={"segment_audio": True}` to emit sentence-like
+audio segments as separate extracted elements. This option only takes effect
+when audio extraction is performed through a hosted Parakeet endpoint--such as the
+Parakeet ASR NIM or NVCF--and does not affect behavior when using the local Hugging
+Face Parakeet model.
+
 
 
 ## Related Topics

Original file line number	Diff line number	Diff line change
`@@ -82,10 +82,13 @@ Use the following procedure to run the NIM locally.`
`82`	`82`	`.extract(`
`83`	`83`	`document_type="wav", # Ingestor should detect type automatically in most cases`
`84`	`84`	`extract_method="audio",`
	`85`	`+ extract_audio_params={`
	`86`	`+ "segment_audio": True,`
	`87`	`+ },`
`85`	`88`	`)`
`86`	`89`	`)`
`87`	`90`	```
`88`		`-`
	`91`	+To generate one extracted element for each sentence-like ASR segment, include `extract_audio_params={"segment_audio": True}` when calling `.extract(...)`. This option applies when audio extraction runs with a Parakeet NIM (either locally through Docker or remotely via NVCF) but has no effect when using the local Hugging Face Parakeet model.
`89`	`92`
`90`	`93`	`!!! tip`
`91`	`94`
`@@ -117,6 +120,7 @@ Instead of running the pipeline locally, you can use NVCF to perform inference b`
`117`	`120`	`"auth_token": "<API key>",`
`118`	`121`	`"function_id": "<function ID>",`
`119`	`122`	`"use_ssl": True,`
	`123`	`+ "segment_audio": True,`
`120`	`124`	`},`
`121`	`125`	`)`
`122`	`126`	`)`