Added STT model for audio context.

ShivangNagta · ShivangNagta · commit a56c1f1fabaa · 2025-08-17T16:56:48.000+05:30
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,4 @@ faiss_index.bin
 input_video.mp4
 metadata.pkl
 .env
+data
diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py
@@ -4,6 +4,9 @@
 from app.core.memory import ChatMemory
 from app.services.extract_frames import extract_frames
 from app.services.faiss import faiss_process
+from app.services.extract_audio import AudioExtractor
+from app.services.transcription import get_vosk_model
+from app.services.transcribe import Transcriber
 
 router = APIRouter()
 memory = ChatMemory()
@@ -28,8 +31,17 @@ async def clip_embed_video(file: UploadFile = File(...)):
         frames = extract_frames(temp_path, "video_frames")
         (preprocess, model) = clip_model()
 
-        faiss_process(preprocess, model, frames)
-        
+        extractor = AudioExtractor()
+        audio_path = extractor.extract_audio(temp_path)
+
+        # Transcribe the extracted audio with Vosk
+        model_path = get_vosk_model()
+        transciber = Transcriber(model_path)
+        print("Using model:", model_path)
+        transcriptions = transciber.transcribe(audio_path)["transcription"]
+
+        print("Creating index.bin and metadata.pkl")
+        faiss_process(preprocess, model, frames, transcriptions)
 
         return {"ready": True}
     except Exception as e:
diff --git a/backend/app/services/.gitignore b/backend/app/services/.gitignore
@@ -0,0 +1 @@
+.env
diff --git a/backend/app/services/extract_audio.py b/backend/app/services/extract_audio.py
@@ -0,0 +1,62 @@
+import os
+import subprocess
+
+class AudioExtractor:
+    """Extract a mono 16 kHz audio track from a video file by running ffmpeg."""
+
+    def __init__(self, ffmpeg_path="ffmpeg"):
+        """Store the ffmpeg executable to invoke.
+
+        Args:
+            ffmpeg_path: Name or path of the ffmpeg binary.
+        """
+        self.ffmpeg_path = ffmpeg_path
+
+    def extract_audio(self, video_path, output_path=None, format="mp3"):
+        """Write the audio of ``video_path`` to a file and return its path.
+
+        Args:
+            video_path: Path of the input video; it must exist.
+            output_path: Destination file. Defaults to ``video_path`` with
+                its extension replaced by ``format``.
+            format: Value passed to ffmpeg's ``-f`` option; also the default
+                output extension.
+
+        Returns:
+            The path the audio was written to.
+
+        Raises:
+            FileNotFoundError: ``video_path`` does not exist.
+            RuntimeError: ffmpeg exited with a non-zero status. The message
+                contains ffmpeg's stderr and the original error is chained.
+        """
+        if not os.path.exists(video_path):
+            raise FileNotFoundError(f"Video file not found: {video_path}")
+
+        # Default output path
+        if output_path is None:
+            base, _ = os.path.splitext(video_path)
+            output_path = f"{base}.{format}"
+
+        # Build ffmpeg command
+        command = [
+            self.ffmpeg_path,
+            "-y",                # overwrite if file exists
+            "-i", video_path,    # input file
+            "-vn",               # no video
+            "-ac", "1",          # mono
+            "-ar", "16000",      # 16 kHz sample rate (good for ASR)
+            "-f", format,        # output format
+            output_path,
+        ]
+
+        try:
+            subprocess.run(command, check=True, capture_output=True)
+        except subprocess.CalledProcessError as e:
+            # ffmpeg's stderr may contain bytes that are not valid UTF-8, so
+            # decode leniently rather than hide the real failure behind a
+            # UnicodeDecodeError.
+            detail = e.stderr.decode(errors="replace")
+            raise RuntimeError(f"ffmpeg failed: {detail}") from e
+
+        return output_path
diff --git a/backend/app/services/faiss.py b/backend/app/services/faiss.py
@@ -5,13 +5,17 @@
 import faiss
 from tqdm import tqdm
 
-def faiss_process(preprocessor, model, frames, device="cpu"):
+def faiss_process(preprocessor, model, frames, transcriptions, temp_path=".", device="cpu"):
     model.to(device)
     model.eval()
 
     frame_embeddings = []
+    metadata = []
 
-    for frame_path in tqdm(frames, desc="Processing frames"):
+    # Build a list of transcriptions for easy lookup
+    transcription_segments = list(transcriptions.values())
+
+    for sec_index, frame_path in enumerate(tqdm(frames, desc="Processing frames")):
         image = Image.open(frame_path).convert("RGB")
         inputs = preprocessor(images=image, return_tensors="pt").to(device)
 
@@ -20,23 +24,40 @@ def faiss_process(preprocessor, model, frames, device="cpu"):
             image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
             frame_embeddings.append(image_features.cpu().numpy())
 
-    frame_embeddings = np.vstack(frame_embeddings)
+        # assume frame index = second in video
+        frame_time = sec_index  
+
+        # find matching transcription segment
+        matched_text = ""
+        for seg in transcription_segments:
+            if seg["start_sec"] <= frame_time < seg["end_sec"]:
+                matched_text = seg["text"]
+                break
+
+        metadata.append({
+            "frame_path": frame_path,
+            "frame_time": frame_time,
+            "transcription": matched_text
+        })
 
-    # Dummy text per frame (for now)
-    transcriptions = [f"Transcription for {fp}" for fp in frames]
+    print("Metadata done!")
+
+    # Stack embeddings
+    frame_embeddings = np.vstack(frame_embeddings).astype("float32")
 
     # Initialize FAISS index
     embedding_dim = frame_embeddings.shape[1]
     index = faiss.IndexFlatL2(embedding_dim)
     index.add(frame_embeddings)
+    print("FAISS initialized!")
 
-    metadata = [
-        {"frame_path": frames[i], "transcription": transcriptions[i]} for i in range(len(frames))
-    ]
+    # Save FAISS index
+    faiss_path = f"{temp_path}/faiss_index.bin"
+    faiss.write_index(index, faiss_path)
+    print(f"FAISS index written to {faiss_path}")
 
-    with open("metadata.pkl", "wb") as f:
+    # Save metadata
+    meta_path = f"{temp_path}/metadata.pkl"
+    with open(meta_path, "wb") as f:
         pickle.dump(metadata, f)
-        print("metadata.pkl written")
-
-    faiss.write_index(index, "faiss_index.bin")
-    print("faiss_index.bin written")
+    print(f"Metadata written to {meta_path}")
diff --git a/backend/app/services/llava_api.py b/backend/app/services/llava_api.py
@@ -8,15 +8,11 @@
 import base64
 from dotenv import load_dotenv
 
-
-
 load_dotenv()
 
 url = os.getenv("URL")
 api_key = os.getenv("API_KEY")
 
-
-
 # Use this function to convert an image file from the filesystem to base64
 def image_file_to_base64(image_path):
     with open(image_path, 'rb') as f:
@@ -28,7 +24,7 @@ def query_llava(context: dict, question: str) -> str:
     try:
         data = {
         "images": image_file_to_base64(context["frame_path"]),
-        "prompt": question
+        "prompt": f"Question:{question}, Transcription for the frame: {context['transcription']}"
         }
 
         headers = {'x-api-key': api_key}
diff --git a/backend/app/services/transcribe.py b/backend/app/services/transcribe.py
@@ -0,0 +1,112 @@
+import subprocess, sys, os, json
+from datetime import datetime
+from vosk import Model, KaldiRecognizer
+
+SAMPLE_RATE = 16000
+# ffmpeg is asked for 16-bit mono PCM (s16le, -ac 1): two bytes per sample.
+BYTES_PER_SECOND = SAMPLE_RATE * 2
+
+class Transcriber():
+    """Sliding-window speech-to-text over an audio file using a Vosk model."""
+
+    def __init__(self, model_path, window_size_sec=5, stride_sec=1):
+        """
+        model_path: path handed to vosk.Model to load the model
+        window_size_sec: context window size (e.g., 5 seconds)
+        stride_sec: step size (e.g., 1 second)
+        """
+        self.model = Model(model_path)
+        self.window_size = window_size_sec
+        self.stride = stride_sec
+
+    def _recognize(self, pcm):
+        """Return the text recognized in one chunk of raw PCM bytes."""
+        # A fresh recognizer per chunk keeps every window independent.
+        rec = KaldiRecognizer(self.model, SAMPLE_RATE)
+        rec.SetWords(True)
+        if rec.AcceptWaveform(pcm):
+            raw = rec.Result()
+        else:
+            # No endpoint was detected inside the chunk. FinalResult()
+            # flushes the decoder and returns the finished hypothesis under
+            # "text"; PartialResult() would only give a provisional one.
+            raw = rec.FinalResult()
+        return json.loads(raw).get("text", "")
+
+    def transcribe(self, filename):
+        """Transcribe ``filename`` window by window.
+
+        ffmpeg decodes the file to 16 kHz mono 16-bit PCM. A window of
+        ``window_size`` seconds is then moved across the audio in steps of
+        ``stride`` seconds. Audio that no full window reaches (the
+        remainder at the end, or a clip shorter than one window) is
+        transcribed as one extra final segment.
+
+        Returns:
+            A dict with ``start_time``, ``end_time`` and ``elapsed_time`` of
+            the run, the ``window_size`` and ``stride`` used, and
+            ``transcription``: ``{index: {start_sec, end_sec, text}}``.
+
+        Raises:
+            FileNotFoundError: ``filename`` does not exist.
+        """
+        if not os.path.exists(filename):
+            raise FileNotFoundError(filename)
+
+        ffmpeg_command = [
+            "ffmpeg",
+            "-nostdin",
+            "-loglevel", "quiet",
+            "-i", filename,
+            "-ar", str(SAMPLE_RATE),
+            "-ac", "1",
+            "-f", "s16le",
+            "-"
+        ]
+
+        transcription = {}
+        start_time = datetime.now()
+
+        with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=10**8) as process:
+            audio = process.stdout.read()
+
+        # Convert window/stride to bytes
+        window_bytes = self.window_size * BYTES_PER_SECOND
+        stride_bytes = self.stride * BYTES_PER_SECOND
+
+        total_len = len(audio)
+        covered_end = 0  # byte offset reached by the last full window
+
+        # Slide across audio
+        for start in range(0, total_len - window_bytes + 1, stride_bytes):
+            end = start + window_bytes
+            transcription[len(transcription)] = {
+                "start_sec": start // BYTES_PER_SECOND,
+                "end_sec": end // BYTES_PER_SECOND,
+                "text": self._recognize(audio[start:end]),
+            }
+            covered_end = end
+
+        # Handle tail: anything past the last full window, including a clip
+        # shorter than one window. The last ``window_bytes`` bytes are used
+        # so the segment keeps as much context as the others.
+        if total_len > covered_end:
+            tail_start = max(total_len - window_bytes, 0)
+            transcription[len(transcription)] = {
+                "start_sec": tail_start // BYTES_PER_SECOND,
+                # Round up so a trailing partial second stays inside the segment.
+                "end_sec": -(-total_len // BYTES_PER_SECOND),
+                "text": self._recognize(audio[tail_start:]),
+            }
+
+        end_time = datetime.now()
+        time_elapsed = end_time - start_time
+
+        return {
+            "start_time": start_time.isoformat(),
+            "end_time": end_time.isoformat(),
+            "elapsed_time": str(time_elapsed),
+            "window_size": self.window_size,
+            "stride": self.stride,
+            "transcription": transcription  # dict of {index: {start_sec, end_sec, text}}
+        }
diff --git a/backend/app/services/transcription.py b/backend/app/services/transcription.py
@@ -0,0 +1,54 @@
+from app.services.transcribe import Transcriber
+import os
+import shutil
+import urllib.request
+import zipfile
+
+def get_vosk_model(model_name="vosk-model-small-en-us-0.15", target_dir="models"):
+    """Return the directory of a Vosk model, downloading it on first use.
+
+    Args:
+        model_name: Archive name on alphacephei.com without ``.zip``; the
+            archive is assumed to unpack into a directory of the same name.
+        target_dir: Directory the model is stored under; created if missing.
+
+    Returns:
+        ``os.path.join(target_dir, model_name)``.
+
+    Raises:
+        Whatever the download or extraction raises. The downloaded archive
+        and any partially extracted model directory are removed first.
+    """
+    # Where the final unzipped model will live
+    model_path = os.path.join(target_dir, model_name)
+
+    os.makedirs(target_dir, exist_ok=True)
+
+    if os.path.exists(model_path):
+        print(f"Model already exists at {model_path}")
+        return model_path
+
+    # Download zip
+    url = f"https://alphacephei.com/vosk/models/{model_name}.zip"
+    zip_path = os.path.join(target_dir, f"{model_name}.zip")
+
+    try:
+        print(f"Downloading {url} ...")
+        urllib.request.urlretrieve(url, zip_path)
+
+        print(f"Extracting {zip_path} ...")
+        with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            zip_ref.extractall(target_dir)
+    except BaseException:
+        # A half-extracted directory would satisfy the exists() check above
+        # on the next call and be returned as if it were a complete model,
+        # so remove it before propagating the error.
+        shutil.rmtree(model_path, ignore_errors=True)
+        raise
+    finally:
+        # Never leave the (possibly truncated) archive behind.
+        if os.path.exists(zip_path):
+            os.remove(zip_path)
+
+    print(f"Model ready at {model_path}")
+    return model_path
diff --git a/backend/input_video.mp3 b/backend/input_video.mp3
diff --git a/backend/package-lock.json b/backend/package-lock.json
diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -11,4 +11,5 @@ requests
 faiss-cpu
 opencv-python-headless
 scikit-learn
-
+python-dotenv
+vosk