|
15 | 15 | from __future__ import annotations
|
16 | 16 |
|
17 | 17 | import os
|
18 |
| -from typing import cast, Optional, Union |
| 18 | +from typing import cast, Literal, Optional, Union |
19 | 19 | import warnings
|
20 | 20 |
|
21 | 21 | import IPython.display as ipy_display
|
@@ -736,3 +736,77 @@ def pdf_chunk(
|
736 | 736 | return struct_series
|
737 | 737 | else:
|
738 | 738 | return content_series
|
| 739 | + |
def audio_transcribe(
    self,
    *,
    connection: Optional[str] = None,
    model_name: Optional[
        Literal[
            "gemini-2.0-flash-001",
            "gemini-2.0-flash-lite-001",
        ]
    ] = None,
    verbose: bool = False,
) -> bigframes.series.Series:
    """
    Transcribe the audio held by this object using a Gemini text generator.

    The series built from the underlying block is sent to
    ``GeminiTextGenerator.predict`` together with a fixed instruction prompt
    asking for a verbatim transcription only.

    Args:
        connection (str or None, default None): BigQuery connection name,
            forwarded unchanged to the Gemini model as ``connection_name``.
            When None, the choice of connection is left to the model
            (presumably the session's default connection -- confirm against
            ``GeminiTextGenerator``).
        model_name (str or None, default None): Gemini model to use.
            Accepted values are "gemini-2.0-flash-001" and
            "gemini-2.0-flash-lite-001". None is forwarded as-is, leaving
            the default choice to ``GeminiTextGenerator``.
            See "https://ai.google.dev/gemini-api/docs/models" for model choices.
        verbose (bool, default False): Selects the shape of the result.
            When True, the model's status column is returned alongside the
            transcribed text. When False, only the transcribed text is
            returned.

    Returns:
        bigframes.series.Series: When ``verbose`` is False, the model's text
        result renamed to "transcribed_content". When ``verbose`` is True, a
        struct series named "transcription_results" with the fields
        "status" and "content".
    """
    import bigframes.bigquery as bbq
    import bigframes.ml.llm as llm
    import bigframes.pandas as bpd

    # Series over the underlying block. It is used twice below: as the
    # model input ``X`` and as the non-text part of the prompt.
    source_audio = bigframes.series.Series(self._block)

    instruction = "**Task:** Transcribe the provided audio. **Instructions:** - Your response must contain only the verbatim transcription of the audio. - Do not include any introductory text, summaries, or conversational filler in your response. The output should begin directly with the first word of the audio."

    generator = llm.GeminiTextGenerator(
        model_name=model_name,
        session=self._block.session,
        connection_name=connection,
    )

    # The prompt is the instruction text followed by the audio series;
    # temperature is fixed at 0.0 for every call.
    predictions = generator.predict(
        X=source_audio,
        prompt=[instruction, source_audio],
        temperature=0.0,
    )

    text_column = cast(bpd.Series, predictions["ml_generate_text_llm_result"])
    transcript = text_column.rename("transcribed_content")

    # Non-verbose callers only get the transcribed text.
    if not verbose:
        return transcript

    # Verbose callers get status and text packed into a single struct series.
    status = cast(bpd.Series, predictions["ml_generate_text_status"])
    combined = bpd.DataFrame(
        {
            "status": status,
            "content": transcript,
        }
    )
    return bbq.struct(combined).rename("transcription_results")
0 commit comments