Commit cb689b0

feat: add whisper api (#252)
* feat: add whisper api
* docs: update docs
1 parent 153446f commit cb689b0

11 files changed (+190 −48 lines)

README.md

Lines changed: 13 additions & 2 deletions

@@ -39,7 +39,7 @@
 - `GLM-4V-PLUS`
 - `Gemini-2.0-flash`
 - `Qwen-2.5-72B-Instruct`
-- **( :tada: NEW) Persistent login / download / upload videos (multi-part submissions supported)**: [bilitool](https://github.com/timerring/bilitool)is now open source. It implements persistent login, downloading videos and danmaku (multi-part included) / uploading videos (with multi-part submission), querying submission status, querying detailed information, and more. It installs with a single pip command and can be used from the command line (CLI) or called as an API.
+- **( :tada: NEW) Persistent login / download / upload videos (multi-part submissions supported)**: [bilitool](https://github.com/timerring/bilitool) is now open source. It implements persistent login, downloading videos and danmaku (multi-part included) / uploading videos (with multi-part submission), querying submission status, querying detailed information, and more. It installs with a single pip command and can be used from the command line (CLI) or called as an API.
 - **( :tada: NEW) Automatic multi-platform loop streaming**: the open-source [looplive](https://github.com/timerring/looplive) is a fully automated 7 x 24 tool that **loops a stream to multiple platforms simultaneously**.
 
 The project architecture and workflow are as follows:

@@ -140,7 +140,18 @@ pip install -r requirements.txt
 
 #### 3. Configure the whisper model and the MLLM model
 
-##### 3.1 whisper model (subtitle recognition)
+##### 3.1 whisper speech recognition
+
+`ASR_METHOD` defaults to `none`, i.e. no speech recognition is performed for subtitles.
+
+##### 3.1.1 Using the API
+
+Set the `ASR_METHOD` parameter in `src/config.py` to `api`, then set the `WHISPER_API_KEY` parameter to your [API Key](https://console.groq.com/keys). This project uses the `whisper-large-v3-turbo` model on groq's free tier, which caps uploads at 40 MB (roughly half an hour), so if you use the API method, set the recording segment length to 30 minutes. The free tier is also rate limited: 20 requests per minute and 7200 audio seconds per hour, 2000 requests and 28800 audio seconds per day. If you need more, you can upgrade to the dev tier; see the [groq docs](https://console.groq.com/docs/rate-limits) for details.
+
+##### 3.1.2 Local deployment
+
+Set the `ASR_METHOD` parameter in `src/config.py` to `deploy`, then download the required model files and place them in the `src/subtitle/models` folder.
+
 The project defaults to the [`small`](https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt) model; download the required file from the link and place it in the `src/subtitle/models` folder.
 
 > [!TIP]
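
If existing recordings are too long for the API path, they can also be cut offline to fit the 40 MB cap. Below is a minimal sketch, not part of this commit and with hypothetical file names, using ffmpeg's segment muxer to produce 30-minute parts:

```python
# Hypothetical helper: split a long recording into ~30-minute parts so
# each upload stays under groq's 40 MB free-tier cap.
import subprocess

def split_recording(in_path: str, segment_seconds: int = 1800) -> None:
    subprocess.run([
        "ffmpeg", "-i", in_path,
        "-c", "copy",                           # stream copy, no re-encode
        "-f", "segment",                        # ffmpeg's segment muxer
        "-segment_time", str(segment_seconds),  # 1800 s = 30 minutes
        "-reset_timestamps", "1",
        in_path[:-4] + "_part%03d.mp4",
    ], check=True)
```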

src/burn/render_command.py

Lines changed: 3 additions & 2 deletions

@@ -14,8 +14,9 @@ def render_command(in_video_path, out_video_path, in_subtitle_font_size, in_subt
     in_subtitle_margin_v: str, the bottom margin of subtitles
     """
     in_ass_path = in_video_path[:-4] + '.ass'
-    if GPU_EXIST:
-        in_srt_path = in_video_path[:-4] + '.srt'
+    in_srt_path = in_video_path[:-4] + '.srt'
+
+    if GPU_EXIST and os.path.isfile(in_srt_path):
         if os.path.isfile(in_ass_path):
             scan_log.info("Current Mode: GPU with danmaku")
             command = [
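
The reworked guard computes the `.srt` path unconditionally and only takes a GPU subtitle mode when that file actually exists (with the API method, transcription can fail or be disabled, so the file is no longer guaranteed). For orientation, a hedged sketch of the kind of command the "GPU with danmaku" branch assembles; the repo's real filter chain and encoder flags are not shown in this diff and may differ:

```python
# Illustrative only: burn danmaku (.ass) and whisper subtitles (.srt) in one
# pass by chaining two subtitles filters; h264_nvenc assumes an NVIDIA GPU,
# which is what GPU_EXIST signals.
def gpu_burn_command(in_video_path, in_ass_path, in_srt_path, out_video_path):
    return [
        'ffmpeg', '-y', '-i', in_video_path,
        '-vf', f"subtitles={in_ass_path},subtitles={in_srt_path}",
        '-c:v', 'h264_nvenc',
        out_video_path,
    ]
```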

src/burn/render_queue.py

Lines changed: 2 additions & 2 deletions

@@ -2,7 +2,7 @@
 
 import queue
 import time
-from src.subtitle.generate_subtitles import generate_subtitles
+from src.subtitle.subtitle_generator import generate_subtitle
 from src.burn.render_video import render_video
 from src.log.logger import scan_log

@@ -11,7 +11,7 @@ def __init__(self):
         self.render_queue = queue.Queue()
 
     def pipeline_render(self, video_path):
-        generate_subtitles(video_path)
+        generate_subtitle(video_path)
         self.render_queue.put(video_path)
 
     def monitor_queue(self):

src/burn/render_then_merge.py

Lines changed: 2 additions & 3 deletions

@@ -5,7 +5,7 @@
 import subprocess
 from src.config import GPU_EXIST, SRC_DIR, VIDEOS_DIR
 from src.danmaku.generate_danmakus import get_resolution, process_danmakus
-from src.subtitle.generate_subtitles import generate_subtitles
+from src.subtitle.subtitle_generator import generate_subtitle
 from src.burn.render_command import render_command
 from src.upload.extract_video_info import get_video_info
 from src.log.logger import scan_log

@@ -70,8 +70,7 @@ def render_then_merge(video_path_list):
         # Process the danmakus to ass and remove emojis
         subtitle_font_size, subtitle_margin_v = process_danmakus(xml_path, video_resolution)
         # Generate the srt file via whisper model
-        if GPU_EXIST:
-            generate_subtitles(original_video_path)
+        generate_subtitle(original_video_path)
         # Burn danmaku or subtitles into the videos
         render_command(original_video_path, video_to_be_merged, subtitle_font_size, subtitle_margin_v)
         if not os.path.exists(merge_list):

src/burn/render_video.py

Lines changed: 3 additions & 4 deletions

@@ -5,7 +5,7 @@
 import subprocess
 from src.config import GPU_EXIST, SRC_DIR, MODEL_TYPE, AUTO_SLICE, SLICE_DURATION, MIN_VIDEO_SIZE, VIDEOS_DIR, SLICE_NUM, SLICE_OVERLAP, SLICE_STEP
 from src.danmaku.generate_danmakus import get_resolution, process_danmakus
-from src.subtitle.generate_subtitles import generate_subtitles
+from src.subtitle.subtitle_generator import generate_subtitle
 from src.burn.render_command import render_command
 from autoslice import slice_video_by_danmaku
 from src.autoslice.inject_metadata import inject_metadata

@@ -52,9 +52,8 @@ def render_video(video_path):
         scan_log.error(f"FileNotFoundError: {e} - Check if the file exists")
 
     # Generate the srt file via whisper model
-    if GPU_EXIST:
-        if MODEL_TYPE != "pipeline":
-            generate_subtitles(original_video_path)
+    if MODEL_TYPE != "pipeline":
+        generate_subtitle(original_video_path)
 
     # Burn danmaku or subtitles into the videos
     render_command(original_video_path, format_video_path, subtitle_font_size, subtitle_margin_v)

src/config.py

Lines changed: 7 additions & 1 deletion

@@ -10,7 +10,13 @@
 GPU_EXIST=True
 # Can be pipeline, append, merge
 MODEL_TYPE = "append"
-Inference_Model = "small"
+# =============== The auto speech recognition configuration ============================
+ASR_METHOD = "api"  # can be "deploy", "api", or "none"
+# If you choose "api", the free tier limits mean every video should stay under roughly 30 minutes
+# Apply for your own API key at https://console.groq.com/keys
+WHISPER_API_KEY = ""
+Inference_Model = "small"  # the model to be deployed
+# =============== The video configuration ============================
 TITLE = "{artist}直播回放-{date}-{title}"
 # You can change the title as you like, eg.
 # f"{artist}直播回放-{date}-{title}" - Streamer直播回放-20250328-Live title
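
Since `ASR_METHOD` now gates the whole subtitle pipeline, a startup sanity check can surface misconfiguration early. A small sketch under that assumption; the function and its placement are hypothetical, not part of this commit:

```python
# Hypothetical startup check mirroring the new settings in src/config.py.
VALID_ASR_METHODS = {"deploy", "api", "none"}

def check_asr_config(asr_method: str, whisper_api_key: str) -> None:
    if asr_method not in VALID_ASR_METHODS:
        raise ValueError(f"ASR_METHOD must be one of {sorted(VALID_ASR_METHODS)}, got {asr_method!r}")
    if asr_method == "api" and not whisper_api_key:
        raise ValueError("ASR_METHOD = 'api' requires WHISPER_API_KEY to be set")
```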

src/subtitle/api/whisper_sdk.py

Lines changed: 89 additions & 0 deletions

@@ -0,0 +1,89 @@
+import os
+import json
+import re
+import subprocess
+from groq import Groq
+from src.config import WHISPER_API_KEY
+
+def seconds_to_srt_time(seconds):
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    secs = int(seconds % 60)
+    millis = int((seconds - int(seconds)) * 1000)
+    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
+
+def write_to_srt(segments, output_file):
+    with open(output_file, 'w', encoding='utf-8') as f:
+        for i, segment in enumerate(segments, start=1):
+            start_time = seconds_to_srt_time(segment['start'])
+            end_time = seconds_to_srt_time(segment['end'])
+            text = segment['text']
+            # filter out the illusion (a common whisper hallucination phrase)
+            if "请不吝" in text:
+                text = ""
+            f.write(f"{i}\n")
+            f.write(f"{start_time} --> {end_time}\n")
+            f.write(f"{text}\n\n")
+
+def print_segment_info(segments):
+    if segments:
+        for segment in segments:
+            start_time = segment.get('start')
+            end_time = segment.get('end')
+            text = segment.get('text')
+            print(f"Start time: {start_time} seconds, End time: {end_time} seconds, Text: {text}")
+    else:
+        print("No valid segments data found.")
+
+
+def check_file_format(filename):
+    # convert anything that is not already an mp3 to audio-only mp3
+    if filename[-4:] != ".mp3":
+        mp3filename = filename[:-4] + ".mp3"
+        command = [
+            'ffmpeg', '-i', filename, '-vn', '-acodec', 'libmp3lame', mp3filename
+        ]
+        subprocess.run(command, check=True, capture_output=True, text=True)
+        return mp3filename
+    else:
+        return filename
+
+# Groq API SDK: https://console.groq.com/docs/speech-to-text
+# due to the API upload limit: 40 MB (free tier), 100 MB (dev tier)
+# Requests per minute: 20, per day: 2000. And 7200 audio seconds / hour, 28800 audio seconds / day.
+# more info: https://console.groq.com/docs/rate-limits
+def generate_srt(filename, output_file=None):
+    client = Groq(
+        api_key=WHISPER_API_KEY
+    )
+    filename = check_file_format(filename)
+    if output_file is None:
+        output_file = filename[:-4] + ".srt"
+    try:
+        with open(filename, "rb") as file:
+            transcription = client.audio.transcriptions.create(
+                file=file,  # Required audio file
+                model="whisper-large-v3-turbo",  # Required model to use for transcription
+                prompt="以下是普通话的句子",  # Optional
+                response_format="verbose_json",  # Optional
+                timestamp_granularities=["segment"],  # Optional (requires response_format "verbose_json"; can specify "word", "segment" (default), or both)
+                # language="zh",  # Optional
+                temperature=0.0  # Optional
+            )
+        # the SDK object is not JSON-serializable, so dump its repr and
+        # use index arithmetic to cut out the segments=[...] portion
+        input_str = json.dumps(transcription, indent=2, default=str)
+        start_index = input_str.find('segments=') + len('segments=')
+        end_index = input_str.rfind(']') + 1
+        segments_str = input_str[start_index:end_index]
+        segments = json.loads(segments_str.replace("'", "\""))
+        # print_segment_info(segments)
+        write_to_srt(segments, output_file)
+        # remove the intermediate audio file
+        os.remove(filename)
+        return output_file
+    except Exception as e:
+        print(f"Error: {e}")
+        return None
+
+if __name__ == "__main__":
+    filename = ""
+    generate_srt(filename)
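
A hedged usage sketch for the new wrapper: the input path is hypothetical, `WHISPER_API_KEY` must already be set in `src/config.py`, and the converted audio has to stay under the 40 MB free-tier cap. Note that `generate_srt` deletes the audio file it transcribed once it finishes:

```python
# Hypothetical usage of the new API wrapper.
from src.subtitle.api.whisper_sdk import generate_srt

srt_path = generate_srt("videos/2025-03-28-part1.mp4")  # hypothetical recording
if srt_path:
    print(f"Subtitles written to {srt_path}")
else:
    print("Transcription failed; see the printed error above")
```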

src/subtitle/generate_subtitles.py

Lines changed: 0 additions & 20 deletions
This file was deleted.

src/subtitle/subtitle_generator.py

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+# Copyright (c) 2024 bilive.
+
+import os
+import subprocess
+from config import SRC_DIR, ASR_METHOD, WHISPER_API_KEY
+from log.logger import scan_log
+from functools import wraps
+
+
+def subtitle_generator(asr_method):
+    """Decorator to select the subtitle generation function based on the ASR method
+    Args:
+        asr_method: str, the ASR method to use ("api", "deploy", or "none")
+    Returns:
+        function: wrapped subtitle generation function
+    """
+    def decorator(func):
+        @wraps(func)
+        def wrapper(video_path):
+            if asr_method == "api":
+                from .api.whisper_sdk import generate_srt
+                return generate_srt(video_path)
+            elif asr_method == "deploy":
+                try:
+                    subprocess.run(
+                        ['python', os.path.join(SRC_DIR, 'subtitle', 'generate.py'), video_path],
+                        stdout=subprocess.DEVNULL,
+                        check=True
+                    )
+                    return video_path[:-4] + ".srt"
+                except subprocess.CalledProcessError as e:
+                    scan_log.error(f"Generate subtitles failed: {e.stderr}")
+                    return None
+            elif asr_method == "none":
+                return None
+            else:
+                scan_log.error(f"Unsupported asr method: {asr_method}")
+                return None
+        return wrapper
+    return decorator
+
+# Generate the srt file via whisper model
+@subtitle_generator(ASR_METHOD)
+def generate_subtitle(in_video_path):
+    """Generate subtitles via the whisper model
+    Args:
+        in_video_path: str, the path of the video
+    """
+    pass
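
With this dispatcher, call sites (render_queue.py, render_video.py, and render_then_merge.py above) no longer branch on `GPU_EXIST` or the ASR method themselves; they call `generate_subtitle` and treat `None` as "no subtitles". A short sketch of that contract, with a hypothetical path:

```python
# Call-site contract: generate_subtitle routes to the API wrapper or the
# local deployment based on ASR_METHOD and returns the .srt path, or None
# when ASR is disabled ("none") or fails.
from src.subtitle.subtitle_generator import generate_subtitle

srt = generate_subtitle("videos/stream-part1.mp4")  # hypothetical path
if srt is None:
    print("No subtitles generated; rendering proceeds without them")
```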

src/upload/generate_upload_data.py

Lines changed: 18 additions & 14 deletions

@@ -20,20 +20,24 @@ def generate_video_data(video_path):
     return copyright, title, desc, tid, tag, source, cover, dynamic
 
 def generate_slice_data(video_path):
-    command = [
-        "ffprobe",
-        "-v", "quiet",
-        "-print_format", "json",
-        "-show_format",
-        video_path
-    ]
-    output = subprocess.check_output(command, stderr=subprocess.STDOUT).decode('utf-8')
-    parsed_output = json.loads(output)
-    title = parsed_output["format"]["tags"]["generate"]
-    copyright = 1
-    tid = 138
-    tag = "直播切片"
-    return copyright, title, tid, tag
+    try:
+        command = [
+            "ffprobe",
+            "-v", "quiet",
+            "-print_format", "json",
+            "-show_format",
+            video_path
+        ]
+        output = subprocess.check_output(command, stderr=subprocess.STDOUT).decode('utf-8')
+        parsed_output = json.loads(output)
+        title = parsed_output["format"]["tags"]["generate"]
+        copyright = 1
+        tid = 138
+        tag = "直播切片"
+        return copyright, title, tid, tag
+    except Exception as e:
+        scan_log.error(f"Error in generate_slice_data: {e}")
+        return None, None, None, None
 
 if __name__ == "__main__":
     pass
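
`generate_slice_data` reads the slice title back from a custom `generate` tag in the container's format metadata; the new `except` path logs and returns `None`s instead of crashing when the tag is missing. For context, a hedged sketch of how such a tag can be written so that the ffprobe call above finds it; the repo does this in `src/autoslice/inject_metadata.py`, which this commit does not show, so treat the details as an assumption:

```python
# Hypothetical counterpart to generate_slice_data: write a custom "generate"
# format tag. For MP4 output, -movflags use_metadata_tags is needed so that
# non-standard keys survive muxing and are visible to ffprobe -show_format.
import subprocess

def inject_generate_tag(src_path: str, dst_path: str, title: str) -> None:
    subprocess.run([
        "ffmpeg", "-i", src_path,
        "-c", "copy",                      # stream copy, no re-encode
        "-movflags", "use_metadata_tags",  # keep custom MP4 tags
        "-metadata", f"generate={title}",
        dst_path,
    ], check=True, capture_output=True)
```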
