1+ import os
2+ import json
3+ import re
4+ import subprocess
5+ from groq import Groq
6+ from src .config import WHISPER_API_KEY
7+
def seconds_to_srt_time(seconds):
    """Convert an offset in seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        seconds: Offset from the start of the media, in seconds (int or
            float). Negative values are clamped to zero.

    Returns:
        The timestamp as a string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Round to whole milliseconds once, up front. Truncating the fractional
    # part on its own loses a millisecond to float error (1.001 -> ",000").
    total_millis = max(0, round(seconds * 1000))
    total_secs, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
14+
def write_to_srt(segments, output_file):
    """Write transcription segments to ``output_file`` in SubRip (SRT) format.

    Segments whose text is blank, or contains the phrase the original filter
    treats as a transcription hallucination, are skipped entirely so the file
    never contains empty cues. Cue numbers are assigned only to segments that
    are actually written, so they stay contiguous starting from 1.

    Args:
        segments: Iterable of mappings with ``'start'`` and ``'end'`` (offsets
            in seconds) and ``'text'`` keys.
        output_file: Path of the SRT file to create; overwritten if it exists.
    """
    hallucination_marker = "请不吝"
    cue_number = 0
    with open(output_file, 'w', encoding='utf-8') as f:
        for segment in segments:
            text = segment['text'].strip()
            # Drop hallucinated or empty segments instead of emitting a
            # numbered cue with no text.
            if not text or hallucination_marker in text:
                continue
            cue_number += 1
            start_time = seconds_to_srt_time(segment['start'])
            end_time = seconds_to_srt_time(segment['end'])
            # One SRT cue: index line, time range line, text, blank separator.
            f.write(f"{cue_number}\n{start_time} --> {end_time}\n{text}\n\n")
27+
def print_segment_info(segments):
    """Print one line per segment with its start, end and text (debug helper).

    Args:
        segments: Sequence of mappings; the ``'start'``, ``'end'`` and
            ``'text'`` keys are read with ``.get`` so missing keys print as
            ``None``. ``None`` or an empty sequence prints a notice instead.
    """
    # Guard clause: nothing to report.
    if not segments:
        print("No valid segments data found.")
        return

    for entry in segments:
        begin = entry.get('start')
        finish = entry.get('end')
        caption = entry.get('text')
        print(f"Start time: {begin} seconds, End time: {finish} seconds, Text: {caption} ")
37+
38+
def check_file_format(filename):
    """Return the path of an MP3 for ``filename``, converting with ffmpeg if needed.

    Args:
        filename: Path to an audio or video file.

    Returns:
        ``filename`` unchanged when it already has an ``.mp3`` extension
        (compared case-insensitively); otherwise the path of a newly written
        MP3 with the same base name and the extension replaced by ``.mp3``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    # splitext copes with extensions of any length (".flac", ".webm", none),
    # which fixed-width slicing of the last four characters does not.
    stem, extension = os.path.splitext(filename)
    if extension.lower() == ".mp3":
        return filename

    mp3filename = stem + ".mp3"
    command = [
        'ffmpeg',
        '-y',  # overwrite without prompting; the prompt would be hidden by capture_output
        '-i', filename,
        '-vn',  # drop any video stream
        '-acodec', 'libmp3lame',
        mp3filename,
    ]
    subprocess.run(command, check=True, capture_output=True, text=True)
    return mp3filename
49+
# Groq speech-to-text SDK docs: https://console.groq.com/docs/speech-to-text
# API limits: maximum file size is 40 MB (free tier) or 100 MB (dev tier).
# Rate limits: 20 requests per minute and 2000 per day; 7200 seconds of audio per hour and 28800 per day.
# More info: https://console.groq.com/docs/rate-limits
def _extract_segments(transcription):
    """Return the response's segments as a list of dicts, or None if absent."""
    if isinstance(transcription, dict):
        raw_segments = transcription.get("segments")
    else:
        # NOTE(review): assumes the SDK response exposes the verbose_json
        # "segments" field as an attribute — confirm against the Groq SDK.
        raw_segments = getattr(transcription, "segments", None)
    if raw_segments is None:
        return None

    segments = []
    for segment in raw_segments:
        if not isinstance(segment, dict):
            # Presumably a pydantic-style object; fall back to its attributes.
            segment = segment.model_dump() if hasattr(segment, "model_dump") else vars(segment)
        segments.append(segment)
    return segments


def generate_srt(filename, output_file=None):
    """Transcribe a media file with Groq's Whisper API and write an SRT file.

    The input is first converted to MP3 when it is not one already. After the
    SRT has been written, the MP3 that was uploaded is deleted; when the input
    was already an MP3, that is the caller's own file.

    Args:
        filename: Path to the audio or video file to transcribe.
        output_file: Path of the SRT file to write. Defaults to the MP3 path
            with its extension replaced by ``.srt``.

    Returns:
        The path of the written SRT file, or ``None`` if transcription or
        writing failed (the error is printed).

    Raises:
        subprocess.CalledProcessError: If the ffmpeg conversion fails; the
            conversion runs before the error-handling block.
    """
    client = Groq(api_key=WHISPER_API_KEY)
    filename = check_file_format(filename)
    if output_file is None:
        output_file = os.path.splitext(filename)[0] + ".srt"
    try:
        with open(filename, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=file,  # Required audio file
                model="whisper-large-v3-turbo",  # Required model to use for transcription
                prompt="以下是普通话的句子",  # Optional
                response_format="verbose_json",  # Optional; needed for timestamps
                # Optional; requires response_format="verbose_json". Accepts
                # "word", "segment" (default), or both.
                timestamp_granularities=["segment"],
                temperature=0.0,  # Optional
            )

        # Read the structured field directly rather than parsing the
        # stringified response, which breaks on quotes inside the text.
        segments = _extract_segments(transcription)
        if segments is None:
            raise ValueError("transcription response contains no 'segments' field")

        write_to_srt(segments, output_file)
        # Remove the audio file only after the upload handle is closed and
        # the subtitles are safely on disk.
        os.remove(filename)
        return output_file
    except Exception as e:
        # Best-effort contract: report the failure and signal it with None.
        print(f"Error: {e}")
        return None
86+
if __name__ == "__main__":
    import sys

    # Take the media path (and optionally the SRT path) from the command line
    # instead of a hard-coded empty filename, which can never be transcribed.
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <media_file> [output_srt]")
        sys.exit(1)
    generate_srt(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else None)
0 commit comments