bitcointranscripts
Lines changed: 85 additions & 12 deletions
diff --git a/app/services/correction.py b/app/services/correction.py
--- a/app/services/correction.py
+++ b/app/services/correction.py
@@ -1,5 +1,6 @@
 from app.transcript import Transcript
 from app.logging import get_logger
+from app.services.global_tag_manager import GlobalTagManager
 import openai
 from app.config import settings
 
@@ -9,6 +10,7 @@ class CorrectionService:
     def __init__(self, provider='openai', model='gpt-4o'):
         self.provider = provider
         self.model = model
+        self.tag_manager = GlobalTagManager()
         if self.provider == 'openai':
             self.client = openai
             self.client.api_key = settings.OPENAI_API_KEY
@@ -20,8 +22,9 @@ def process(self, transcript: Transcript, **kwargs):
         keywords = kwargs.get('keywords', [])
 
         metadata = transcript.source.to_json()
+        global_context = self.tag_manager.get_correction_context()
 
-        prompt = self._build_prompt(transcript.outputs['raw'], keywords, metadata)
+        prompt = self._build_enhanced_prompt(transcript.outputs['raw'], keywords, metadata, global_context)
 
         # Call the LLM
         response = self.client.chat.completions.create(
@@ -34,29 +37,133 @@ def process(self, transcript: Transcript, **kwargs):
         transcript.outputs['corrected_text'] = corrected_text
         logger.info("Correction complete.")
 
-    def _build_prompt(self, text, keywords, metadata):
-        prompt = (
-            "You are a domain expert in Bitcoin and blockchain technologies.\n\n"
-            "The following transcript was generated using an automatic speech recognition (ASR) system. "
-            "Your task is to correct it based on the contextual metadata provided.\n\n"
-            "--- Contextual Metadata ---\n"
-        )
-
-        if metadata.get('title'):
-            prompt += f"Title: {metadata['title']}\n"
-        if metadata.get('speakers'):
-            prompt += f"Speakers: {', '.join(metadata['speakers'])}\n"
-        if metadata.get('tags'):
-            prompt += f"Tags: {', '.join(metadata['tags'])}\n"
-
-        prompt += "Please use this metadata to improve the accuracy of your corrections.\n"
-
-        if keywords:
-            prompt += (
-                "\nAdditionally, prioritize the following keywords. Ensure they are spelled, cased, and formatted correctly "
-                "whenever they appear in the transcript:\n- "
-            )
-            prompt += "\n- ".join(keywords)
-
-        prompt += f"\n\n--- Transcript Start ---\n\n{text.strip()}\n\n--- Transcript End ---"
-        return prompt
+    # (label, global_context key, max items shown) for each list-valued
+    # knowledge-base entry, in the order the entries appear in the prompt.
+    _GLOBAL_CONTEXT_SECTIONS = (
+        ("Most Common Topics", 'frequent_tags', 15),
+        ("Technical Terms to Recognize", 'technical_terms', 20),
+        ("Bitcoin Projects/Tools", 'project_names', 15),
+        ("Frequent Speakers", 'common_speakers', 10),
+        ("Common Content Categories", 'common_categories', 8),
+        ("Domain Expertise Areas", 'expertise_areas', 8),
+    )
+    # Descriptions longer than this are cut and suffixed with "...".
+    _MAX_DESCRIPTION_CHARS = 200
+    # At most this many base terms are listed under "Common Term Variations".
+    _MAX_TAG_VARIATIONS = 5
+
+    def _build_enhanced_prompt(self, text, keywords, metadata, global_context=None):
+        """Build the LLM prompt for minimally correcting an ASR transcript.
+
+        Args:
+            text: Raw transcript text; surrounding whitespace is stripped.
+            keywords: User-supplied terms to prioritise (may be empty).
+            metadata: Per-video metadata dict; 'title', 'speakers', 'tags',
+                'categories' and 'youtube' -> 'description' are used when set.
+            global_context: Optional dict of corpus-wide hints. When it is
+                None or has no usable entries, the knowledge-base section is
+                omitted rather than emitted as an empty "From 0 Transcripts"
+                header.
+
+        Returns:
+            The complete prompt string.
+        """
+        metadata = metadata or {}
+        global_context = global_context or {}
+
+        parts = [
+            "You are a transcript correction specialist with expertise in Bitcoin and blockchain terminology.\n\n"
+            "The following transcript was generated by automatic speech recognition (ASR). Your task is to "
+            "correct ONLY the obvious mistakes while keeping the transcript as close to the original as possible.\n\n"
+            "DO NOT:\n"
+            "- Rephrase or rewrite sentences\n"
+            "- Change the speaker's style or tone\n"
+            "- Add or remove content\n"
+            "- Make major structural changes\n\n"
+            "DO:\n"
+            "- Fix spelling errors and typos\n"
+            "- Correct misheard words using context\n"
+            "- Fix technical terminology and proper names\n"
+            "- Maintain the exact same flow and structure\n\n"
+            "--- Current Video Metadata ---\n"
+        ]
+        parts.extend(self._metadata_lines(metadata))
+        parts.extend(self._global_context_lines(global_context))
+        parts.append(
+            "\n--- Focus Areas for Correction ---\n"
+            "Using the metadata and global knowledge, focus on correcting:\n"
+            "1. Technical terms (ensure proper spelling and capitalization)\n"
+            "2. Speaker names and project names (match known variations)\n"
+            "3. Common ASR mishears (but, bit, big -> Bitcoin when context suggests it)\n"
+            "4. Homophones and similar-sounding words in Bitcoin context\n"
+            "5. Obvious typos and spelling mistakes\n\n"
+            "IMPORTANT: Make minimal changes - only fix clear errors, don't improve the text.\n"
+        )
+        parts.extend(self._tag_variation_lines(global_context))
+
+        if keywords:
+            parts.append(
+                "\n--- Additional Priority Keywords ---\n"
+                "Pay special attention to these terms and ensure correct spelling/formatting:\n- "
+            )
+            parts.append("\n- ".join(keywords))
+
+        parts.append(
+            f"\n\n--- Transcript Start ---\n\n{text.strip()}\n\n--- Transcript End ---\n\n"
+            "Return ONLY the corrected transcript. Make minimal changes - fix only obvious errors while "
+            "preserving the original wording, sentence structure, and speaker's natural expression."
+        )
+        return "".join(parts)
+
+    @staticmethod
+    def _join_terms(values, limit=None):
+        """Return up to *limit* of *values* as a comma-separated string."""
+        if isinstance(values, str):
+            # A bare string is one term; joining it would split it into characters.
+            return values
+        return ', '.join(str(value) for value in list(values)[:limit])
+
+    def _metadata_lines(self, metadata):
+        """Return the prompt lines describing the current video."""
+        lines = []
+        if metadata.get('title'):
+            lines.append(f"Video Title: {metadata['title']}\n")
+        for label, key in (("Speakers", 'speakers'), ("Video Tags", 'tags'), ("Categories", 'categories')):
+            if metadata.get(key):
+                lines.append(f"{label}: {self._join_terms(metadata[key])}\n")
+        # 'youtube' may be absent or None, so fall back to an empty dict.
+        description = (metadata.get('youtube') or {}).get('description')
+        if description:
+            if len(description) > self._MAX_DESCRIPTION_CHARS:
+                description = description[:self._MAX_DESCRIPTION_CHARS] + "..."
+            lines.append(f"Description: {description}\n")
+        return lines
+
+    def _global_context_lines(self, global_context):
+        """Return the knowledge-base section, or [] when there is nothing to show."""
+        lines = [
+            f"{label}: {self._join_terms(global_context[key], limit)}\n"
+            for label, key, limit in self._GLOBAL_CONTEXT_SECTIONS
+            if global_context.get(key)
+        ]
+        if global_context.get('domain_context'):
+            lines.append(f"Primary Domain Focus: {global_context['domain_context']}\n")
+        if not lines:
+            return []
+        video_count = global_context.get('video_count', 0)
+        header = f"\n--- Global Bitcoin Knowledge Base (From {video_count} Transcripts) ---\n"
+        return [header, *lines]
+
+    def _tag_variation_lines(self, global_context):
+        """Return the term-variation section, or [] when no variations are known."""
+        variations = global_context.get('tag_variations')
+        if not variations:
+            return []
+        lines = ["\n--- Common Term Variations ---\n"]
+        for base_term, variants in list(variations.items())[:self._MAX_TAG_VARIATIONS]:
+            lines.append(f"{base_term}: {self._join_terms(variants)}\n")
+        return lines
+
+    def _build_prompt(self, text, keywords, metadata):
+        """Legacy method for backward compatibility (no global context)."""
+        return self._build_enhanced_prompt(text, keywords, metadata)