speech2text/transcribe.py at main · peterzju/speech2text · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Run this script inside the `speech2text` conda environment in WSL.
# Input paths may be given in Windows style, but remember to quote them.
from faster_whisper import WhisperModel, download_model
import sys, os, time
from opencc import OpenCC
cc = OpenCC("tw2sp")  # Traditional -> Simplified Chinese converter (OpenCC "tw2sp" profile)

def fix_wsl_path(path, *, must_exist=True):
    r"""Translate a Windows drive path to its WSL mount path when on Linux.

    A path of the form ``X:``, ``X:\...`` or ``X:/...`` (``X`` an ASCII
    letter) is rewritten to ``/mnt/x/...`` with backslashes turned into
    forward slashes. Anything else is returned unchanged, including
    drive-relative names such as ``a:b.wav``, which are also valid Linux file
    names and have no ``/mnt`` equivalent.

    Args:
        path: Path string as typed by the user.
        must_exist: When true (the default), the translated path is returned
            only if it exists on disk; otherwise the original ``path`` is
            returned. Pass ``False`` to translate paths that do not exist
            yet, e.g. an output file.

    Returns:
        The translated ``/mnt/<drive>/...`` path, or ``path`` unchanged when
        it is not a drive path, the platform is not Linux, or (with
        ``must_exist``) the translated path does not exist.
    """
    if not sys.platform.startswith("linux"):
        return path
    if len(path) < 2 or path[1] != ":":
        return path

    drive = path[0].lower()
    remainder = path[2:]
    if drive not in "abcdefghijklmnopqrstuvwxyz":
        return path
    # Require a separator right after "X:" so "a:b.wav" is not mangled into
    # "/mnt/ab.wav".
    if remainder and remainder[0] not in "\\/":
        return path

    candidate = f"/mnt/{drive}" + remainder.replace("\\", "/")
    if not must_exist:
        return candidate

    try:
        found = os.path.exists(candidate)
    except (OSError, ValueError):
        # Best effort: an unusable candidate simply means "keep the input".
        found = False
    return candidate if found else path

# model_size = os.environ.get("MODEL", "large-v3")  # original setting, temporarily disabled
model_size = "medium"

# Command line: <audio_path> [output_path]. Exit with a usage message rather
# than an IndexError traceback when the audio path is missing.
if len(sys.argv) < 2:
    sys.exit(f"[ERROR] 用法: python {sys.argv[0]} <音频文件> [输出文件]")

# The audio path may be Windows-style; translate it when running under WSL.
audio_path = fix_wsl_path(sys.argv[1])
# Optional transcript file path; None when no second argument is given.
output_path = sys.argv[2] if len(sys.argv) > 2 else None

print(f"[INFO] 模型: {model_size}")
print(f"[INFO] 文件: {audio_path}")

print("[INFO] 优先使用本地缓存模型 (默认路径 ~/.cache/huggingface/hub/)...")
# Prefer the locally cached model. Any failure of the offline lookup falls
# back to a network download (deliberately broad, best-effort).
try:
    model_path = download_model(model_size, local_files_only=True)
except Exception:
    print("[INFO] 本地缓存未找到，尝试联网下载模型...")
    model_path = download_model(model_size, local_files_only=False)

from dotenv import load_dotenv

# Load environment variables from .env
load_dotenv()

# CPU thread count comes from CPU_THREADS and defaults to 7. An empty or
# non-numeric value (e.g. a bare "CPU_THREADS=" line in .env) falls back to
# the default instead of crashing with ValueError.
cpu_threads_setting = os.environ.get("CPU_THREADS", "").strip()
try:
    cpu_threads = int(cpu_threads_setting) if cpu_threads_setting else 7
except ValueError:
    print(f"[WARN] CPU_THREADS={cpu_threads_setting!r} 不是有效整数，改用默认值 7")
    cpu_threads = 7

print(f"[INFO] 正在加载模型：{model_path} (使用的 CPU 线程数: {cpu_threads})")
model = WhisperModel(model_path, device="cpu", compute_type="int8", cpu_threads=cpu_threads)

print("[INFO] 开始转写...\n")
# Timer starts before transcribe() is called; the elapsed time is computed
# later from this value.
start_time = time.time()

segments, info = model.transcribe(
    audio_path,
    language="zh",
    beam_size=1,
    no_speech_threshold=0.6,
    # Lecture speech is continuous; keeping this on helps with context.
    condition_on_previous_text=True,
    vad_filter=True,
    vad_parameters={
        # Pauses in lectures are short, so 300 ms fits better.
        "min_silence_duration_ms": 300,
        # Keep a little extra audio around speech to avoid clipping words.
        "speech_pad_ms": 400,
    },
)

# Summary of what was detected, printed before the segments themselves.
print(f"# 语言: {info.language}, 概率: {info.language_probability:.2f}")
print(f"# 音频时长: {info.duration:.1f}s ({info.duration/60:.1f}min)\n")

# Open the optional transcript file up front. The try/finally below ensures
# the handle is closed even if transcription fails partway through.
outfile = open(output_path, "w", encoding="utf-8") if output_path else None
try:
    if outfile:
        outfile.write(f"# 文件: {audio_path}\n")
        outfile.write(f"# 语言: {info.language}, 概率: {info.language_probability:.2f}\n")
        outfile.write(f"# 音频时长: {info.duration:.1f}s ({info.duration/60:.1f}min)\n\n")

    # NOTE: faster-whisper documents `segments` as a lazy generator, so
    # decoding errors can surface inside this loop rather than at transcribe().
    # `lines` collects the converted text in memory; nothing else in this
    # script reads it.
    lines = []
    for seg in segments:
        text_cn = cc.convert(seg.text)  # convert to Simplified Chinese
        # Console output carries timestamps; the file gets the bare text.
        print(f"[{seg.start:.1f}s -> {seg.end:.1f}s] {text_cn}")
        lines.append(text_cn)
        if outfile:
            outfile.write(text_cn + "\n")
            # Flush per segment so partial results are on disk during long runs.
            outfile.flush()

    elapsed = time.time() - start_time
    # Real-time factor: seconds of audio processed per wall-clock second.
    ratio = info.duration / elapsed if elapsed > 0 else 0
    print(f"\n[INFO] 耗时: {elapsed:.1f}s, 实时比: {ratio:.2f}x")

    if outfile:
        outfile.write(f"\n# 耗时: {elapsed:.1f}s, 实时比: {ratio:.2f}x\n")
finally:
    if outfile:
        outfile.close()  # close() also flushes any buffered data

# Only reached when everything above succeeded.
if outfile:
    print(f"[INFO] 已保存到: {output_path}")