Commit 86159a7

feat: add blob.transcribe function (#1773)

* add transcribe function
* add verbose
* add some debugging messages
* transcribe function is completed; test case is done
* move the place to capture col name
* remove a few features, update test case
* change the test case, add data
* introduce user-specified instructions
* tweak prompt
* rebase conftest
* change the way to read in input audio
* update variable names
* change variable names
* change the way to pass in input
* remove additional instruction for now
* change the column name
* add a name for result

1 parent a600b23, commit 86159a7

File tree

6 files changed: +140, -1 lines changed

bigframes/operations/blob.py

Lines changed: 75 additions & 1 deletion

@@ -15,7 +15,7 @@
 from __future__ import annotations

 import os
-from typing import cast, Optional, Union
+from typing import cast, Literal, Optional, Union
 import warnings

 import IPython.display as ipy_display
@@ -736,3 +736,77 @@ def pdf_chunk(
             return struct_series
         else:
             return content_series
+
+    def audio_transcribe(
+        self,
+        *,
+        connection: Optional[str] = None,
+        model_name: Optional[
+            Literal[
+                "gemini-2.0-flash-001",
+                "gemini-2.0-flash-lite-001",
+            ]
+        ] = None,
+        verbose: bool = False,
+    ) -> bigframes.series.Series:
+        """
+        Transcribe audio content using a Gemini multimodal model.
+
+        Args:
+            connection (str or None, default None): BQ connection used for
+                the function's internet transactions. If None, uses the
+                session's default connection.
+            model_name (str): The model to use for natural language tasks.
+                Accepted values are "gemini-2.0-flash-lite-001" and
+                "gemini-2.0-flash-001". See
+                https://ai.google.dev/gemini-api/docs/models for model choices.
+            verbose (bool, default False): Controls the verbosity of the output.
+                When set to True, both error messages and the transcribed
+                content are returned; when set to False, only the transcribed
+                content is returned and error messages are suppressed.
+
+        Returns:
+            bigframes.series.Series: str or struct[str, str], depending on the
+                "verbose" parameter. Contains the transcribed text from the
+                audio file, and includes error messages if verbosity is enabled.
+        """
+        import bigframes.bigquery as bbq
+        import bigframes.ml.llm as llm
+        import bigframes.pandas as bpd
+
+        # The column name doesn't matter here; rename to avoid column name conflicts.
+        audio_series = bigframes.series.Series(self._block)
+
+        prompt_text = "**Task:** Transcribe the provided audio. **Instructions:** - Your response must contain only the verbatim transcription of the audio. - Do not include any introductory text, summaries, or conversational filler in your response. The output should begin directly with the first word of the audio."
+
+        llm_model = llm.GeminiTextGenerator(
+            model_name=model_name,
+            session=self._block.session,
+            connection_name=connection,
+        )
+
+        # Transcribe audio using ML.GENERATE_TEXT.
+        transcribed_results = llm_model.predict(
+            X=audio_series,
+            prompt=[prompt_text, audio_series],
+            temperature=0.0,
+        )
+
+        transcribed_content_series = cast(
+            bpd.Series, transcribed_results["ml_generate_text_llm_result"]
+        ).rename("transcribed_content")
+
+        if verbose:
+            transcribed_status_series = cast(
+                bpd.Series, transcribed_results["ml_generate_text_status"]
+            )
+            results_df = bpd.DataFrame(
+                {
+                    "status": transcribed_status_series,
+                    "content": transcribed_content_series,
+                }
+            )
+            results_struct = bbq.struct(results_df).rename("transcription_results")
+            return results_struct
+        else:
+            return transcribed_content_series
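For context, the new method can be exercised end to end roughly as follows. This is a minimal sketch pieced together from the fixture and test added in this commit; the session helper, bucket path, and connection name are placeholders, not values taken from this change.

import bigframes.pandas as bpd

# Start from a BigQuery DataFrames session (placeholder setup).
session = bpd.get_global_session()

# Build a multimodal DataFrame whose "audio" column holds blob references,
# mirroring the audio_mm_df fixture added in tests/system/conftest.py.
df = session.from_glob_path(
    "gs://your-bucket/audio/*", name="audio", connection="us.your-connection"
)

# Default mode: returns a string Series named "transcribed_content".
transcripts = df["audio"].blob.audio_transcribe(model_name="gemini-2.0-flash-001")

# Verbose mode: returns a struct Series named "transcription_results"
# with "status" and "content" fields.
detailed = df["audio"].blob.audio_transcribe(
    model_name="gemini-2.0-flash-001", verbose=True
)

transcripts.to_pandas()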
Binary file (380 KB): not shown.

Binary file (17.9 KB): not shown.

scripts/data/pdfs/test-protected.pdf

Binary file (12.4 KB): not shown.

tests/system/conftest.py

Lines changed: 14 additions & 0 deletions

@@ -1521,3 +1521,17 @@ def pdf_mm_df(
     pdf_gcs_path, session: bigframes.Session, bq_connection: str
 ) -> bpd.DataFrame:
     return session.from_glob_path(pdf_gcs_path, name="pdf", connection=bq_connection)
+
+
+@pytest.fixture(scope="session")
+def audio_gcs_path() -> str:
+    return "gs://bigframes_blob_test/audio/*"
+
+
+@pytest.fixture(scope="session")
+def audio_mm_df(
+    audio_gcs_path, session: bigframes.Session, bq_connection: str
+) -> bpd.DataFrame:
+    return session.from_glob_path(
+        audio_gcs_path, name="audio", connection=bq_connection
+    )

tests/system/large/blob/test_function.py

Lines changed: 51 additions & 0 deletions

@@ -385,3 +385,54 @@ def test_blob_pdf_chunk(
         check_dtype=False,
         check_index=False,
     )
+
+
+@pytest.mark.parametrize(
+    "model_name, verbose",
+    [
+        ("gemini-2.0-flash-001", True),
+        ("gemini-2.0-flash-001", False),
+        ("gemini-2.0-flash-lite-001", True),
+        ("gemini-2.0-flash-lite-001", False),
+    ],
+)
+def test_blob_transcribe(
+    audio_mm_df: bpd.DataFrame,
+    model_name: str,
+    verbose: bool,
+):
+    actual = (
+        audio_mm_df["audio"]
+        .blob.audio_transcribe(
+            model_name=model_name,
+            verbose=verbose,
+        )
+        .to_pandas()
+    )
+
+    # check relative length
+    expected_text = "Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress"
+    expected_len = len(expected_text)
+
+    actual_text = ""
+    if verbose:
+        actual_text = actual[0]["content"]
+    else:
+        actual_text = actual[0]
+    actual_len = len(actual_text)
+
+    relative_length_tolerance = 0.2
+    min_acceptable_len = expected_len * (1 - relative_length_tolerance)
+    max_acceptable_len = expected_len * (1 + relative_length_tolerance)
+    assert min_acceptable_len <= actual_len <= max_acceptable_len, (
+        f"Item (verbose={verbose}): Transcribed text length {actual_len} is outside the acceptable range "
+        f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. "
+        f"Expected reference length was {expected_len}."
+    )
+
+    # check for major keywords
+    major_keywords = ["book", "picture"]
+    for keyword in major_keywords:
+        assert (
+            keyword.lower() in actual_text.lower()
+        ), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in transcribed text."
