# Video-to-captions/SubGen.py (from the HimanshuKumarSah/Video-to-captions GitHub repository, branch "main")
import os
import time
import warnings
import tkinter as tk
from tkinter import filedialog
from faster_whisper import WhisperModel
import subprocess
from tqdm import tqdm

# Silence all Python warnings raised after this point (no category filter is
# given, so every warning category is ignored) to keep the console output clean.
warnings.filterwarnings("ignore")

# Select video file
def select_video_file():
    """Ask the user to pick a video file through a native file dialog.

    A hidden Tk root window is created only to host the dialog and is
    destroyed again before returning, so no stray root is left alive.

    Returns:
        The path chosen by the user, or a falsy value (empty string on most
        platforms) when the dialog is cancelled.
    """
    root = tk.Tk()
    root.withdraw()  # hide the empty root window; only the dialog should show
    try:
        return filedialog.askopenfilename(
            title="Select Video File",
            filetypes=[("Video Files", "*.mp4 *.mov *.avi *.mkv *.flv")],
        )
    finally:
        # Release the Tk interpreter/window even if the dialog raises.
        root.destroy()

# Extract audio using ffmpeg
def extract_audio(video_path, audio_path):
    """Extract the audio track of a video into a WAV file using ffmpeg.

    The audio is written as 16-bit PCM, 16 kHz, mono, overwriting
    ``audio_path`` if it already exists. ffmpeg's own console output is
    discarded.

    Args:
        video_path: Path of the input video file.
        audio_path: Path of the WAV file to create.

    Raises:
        SystemExit: With exit status 1 when ffmpeg cannot be started (for
            example it is not installed or not on PATH) or when it exits
            with a non-zero status.
    """
    print("[*] Extracting audio...")
    command = [
        "ffmpeg",
        "-y",  # overwrite if exists
        "-i", video_path,
        "-vn",  # drop the video stream
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        "-ac", "1",
        audio_path,
    ]
    try:
        subprocess.run(
            command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.STDOUT,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as err:
        # CalledProcessError: ffmpeg ran but failed.
        # OSError (e.g. FileNotFoundError): the ffmpeg executable could not be
        # launched at all, which is exactly the case the message hints at.
        print("[!] Failed to extract audio. Is ffmpeg installed and in PATH?")
        raise SystemExit(1) from err
    print("[✓] Audio extracted")

def _format_srt_timestamp(seconds):
    """Return *seconds* formatted as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields, so floating-point noise (e.g. 1.001 stored as 1.000999...) does
    not lose a millisecond. Negative inputs are clamped to zero.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hrs, remainder = divmod(total_ms, 3_600_000)
    mins, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"


# Transcribe audio and save to .srt and .txt
def transcribe_to_srt(audio_path, srt_path="subtitles.srt", txt_path="transcript.txt", model_size="medium"):
    """Transcribe an audio file and write the result as SRT and plain text.

    Args:
        audio_path: Path of the audio file passed to the Whisper model.
        srt_path: Destination of the numbered, time-stamped subtitle file.
        txt_path: Destination of the plain transcript (one segment per line).
        model_size: Model size/name handed to ``WhisperModel``.

    Both output files are written as UTF-8 and overwritten if they exist.
    """
    print("[*] Loading Whisper model...")
    try:
        model = WhisperModel(model_size, compute_type="float16", device="auto")
    except ValueError:
        # The backend raises ValueError when the selected device has no
        # efficient float16 support (typically a CPU-only machine). Retry
        # with the model's default compute type instead of crashing.
        print("[!] float16 is not supported on this device; using default compute type")
        model = WhisperModel(model_size, compute_type="default", device="auto")

    print("[*] Transcribing...")
    segments_gen, info = model.transcribe(audio_path, beam_size=5, word_timestamps=False)
    # The segments are consumed twice below (SRT and TXT), so materialize
    # the generator once.
    segments = list(segments_gen)

    print(f"[ℹ️] Detected language: {info.language}")

    # Write SRT: index line, time range line, text, blank separator.
    with open(srt_path, "w", encoding="utf-8") as srt_file:
        for index, segment in enumerate(
            tqdm(segments, desc="Writing subtitles", unit="segment"), start=1
        ):
            start = _format_srt_timestamp(segment.start)
            end = _format_srt_timestamp(segment.end)
            srt_file.write(f"{index}\n{start} --> {end}\n{segment.text.strip()}\n\n")

    # Write TXT: one stripped segment per line.
    with open(txt_path, "w", encoding="utf-8") as txt_file:
        txt_file.writelines(segment.text.strip() + "\n" for segment in segments)

    print(f"[✓] Subtitle file saved as: {srt_path}")
    print(f"[✓] Transcript file saved as: {txt_path}")

# Main workflow
def main():
    """Run the interactive video-to-subtitles workflow.

    Steps: ask for a video file, ask where to save the ``.srt`` file, extract
    the audio into a temporary WAV in the current working directory,
    transcribe it to ``.srt`` plus a sibling ``.txt``, delete the temporary
    WAV, and print the elapsed wall-clock time. Returns early (after printing
    a notice) if either dialog is cancelled.
    """
    start_time = time.time()

    video_path = select_video_file()
    if not video_path:
        print("[!] No video selected.")
        return

    base_name = os.path.splitext(os.path.basename(video_path))[0]
    audio_path = f"{base_name}_audio.wav"

    # Ask user where to save SRT
    root = tk.Tk()
    root.withdraw()  # hide the empty root window; only the dialog should show
    try:
        srt_path = filedialog.asksaveasfilename(
            title="Save Subtitle File As",
            defaultextension=".srt",
            initialfile=f"{base_name}.srt",
            filetypes=[("SubRip Subtitle", "*.srt")],
        )
    finally:
        # Do not leave the helper root window alive for the rest of the run.
        root.destroy()
    if not srt_path:
        print("[!] No save location selected.")
        return

    # The transcript is stored next to the subtitle file, same base name.
    txt_path = os.path.splitext(srt_path)[0] + ".txt"

    try:
        extract_audio(video_path, audio_path)
        transcribe_to_srt(audio_path, srt_path, txt_path)
    finally:
        # Auto-delete audio after processing, also when a step fails; the
        # file may not exist if extraction itself failed.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    elapsed = time.time() - start_time
    minutes = int(elapsed // 60)
    seconds = int(elapsed % 60)
    print(f"\n[⏱] Total time taken: {minutes} min {seconds} sec")

# Run the workflow only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()