bitcointranscripts
diff --git a/‎app/exporters.py‎
Lines changed: 20 additions & 5 deletions b/‎app/exporters.py‎
Lines changed: 20 additions & 5 deletions
diff --git a/‎app/github_api_handler.py‎
Lines changed: 25 additions & 10 deletions b/‎app/github_api_handler.py‎
Lines changed: 25 additions & 10 deletions
diff --git a/‎app/services/correction.py‎
Lines changed: 42 additions & 0 deletions b/‎app/services/correction.py‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎app/services/llm_service.py‎
Lines changed: 0 additions & 47 deletions b/‎app/services/llm_service.py‎
Lines changed: 0 additions & 47 deletions
diff --git a/‎app/services/summarizer.py‎
Lines changed: 30 additions & 0 deletions b/‎app/services/summarizer.py‎
Lines changed: 30 additions & 0 deletions
@@ -198,7 +198,7 @@ def _create_with_metadata(self, transcript: Transcript, **kwargs) -> str:
 
         Args:
             transcript: The transcript to export
-            **kwargs: Additional parameters like review_flag
+            **kwargs: Additional parameters like review_flag and content_key
 
         Returns:
             The complete Markdown content with metadata
@@ -215,6 +215,13 @@ def increase_indent(self, flow=False, indentless=False):
         # Get metadata from the source
         metadata = transcript.source.to_json()
 
+        # Determine which content to use
+        content_key = kwargs.get("content_key", "corrected_text")
+        content = transcript.outputs.get(content_key, transcript.outputs.get("raw"))
+
+        if content is None:
+            raise Exception(f"No transcript content found for key '{content_key}' or 'raw'")
+
         # Add or modify specific fields
         if self.transcript_by:
             review_flag = kwargs.get("review_flag", "")
@@ -312,32 +319,40 @@ def export(self, transcript: Transcript, **kwargs) -> str:
         Args:
             transcript: The transcript to export
             add_timestamp: Whether to add a timestamp to the filename (default: False)
+            content_key: The key in transcript.outputs to use for the content (default: "raw")
+            suffix: A suffix to add to the filename (e.g., "_raw")
             **kwargs: Additional parameters (unused)
 
         Returns:
             The path to the exported text file
         """
         self.logger.debug("Exporting transcript to plain text...")
 
-        if transcript.outputs["raw"] is None:
-            raise Exception("No transcript content found")
+        content_key = kwargs.get("content_key", "raw")
+        content = transcript.outputs.get(content_key)
+        if content is None and content_key == "summary":
+            content = transcript.summary
+
+        if content is None:
+            raise Exception(f"No content found for key: {content_key}")
 
         # Get parameters
         add_timestamp = kwargs.get("add_timestamp", False)
+        suffix = kwargs.get("suffix", "")
 
         # Get output directory
         output_dir = self.get_output_path(transcript)
 
         # Construct file path
         file_path = self.construct_file_path(
             directory=output_dir,
-            filename=transcript.title,
+            filename=f"{transcript.title}{suffix}",
             file_type="txt",
             include_timestamp=add_timestamp,
         )
 
         # Write to file
-        result_path = self.write_to_file(transcript.outputs["raw"], file_path)
+        result_path = self.write_to_file(content, file_path)
 
         self.logger.info(f"(exporter) Text file written to: {result_path}")
         return result_path
 
@@ -93,13 +93,35 @@ def create_branch(self, repo_type, branch_name, sha):
         response = self._make_request('POST', url, json=data)
         return response.json()
 
-    def create_or_update_file(self, repo_type, file_path, content, commit_message, branch):
+    def create_or_update_file(self, repo_type, file_path, content, commit_message, branch, get_sha=False):
+        """Create a file on `branch`, or update it when `get_sha` is set.
+
+        Args:
+            repo_type: Key into `self.repos` selecting the target repository.
+            file_path: Path of the file inside the repository.
+            content: Text content to commit; sent base64-encoded.
+            commit_message: Message for the resulting commit.
+            branch: Branch to commit to.
+            get_sha: When True, first GET the file on `branch` and send its
+                current SHA with the PUT request. Use this when the file
+                already exists on the branch and is being updated.
+
+        Returns:
+            The decoded JSON body of the PUT response.
+        """
         url = f"https://api.github.com/repos/{self.repos[repo_type]['owner']}/{self.repos[repo_type]['name']}/contents/{quote(file_path)}"
         data = {
             "message": commit_message,
             "content": base64.b64encode(content.encode()).decode(),
             "branch": branch
         }
+        if get_sha:
+            # Percent-encode the branch so characters such as '#' or '&' in
+            # a branch name cannot corrupt the `ref` query parameter.
+            ref = quote(branch, safe='')
+            response = self._make_request('GET', f'{url}?ref={ref}')
+            data['sha'] = response.json()['sha']
+
         response = self._make_request('PUT', url, json=data)
         return response.json()
 
@@ -114,23 +118,34 @@ def create_pull_request(self, repo_type, title, head, base, body):
         response = self._make_request('POST', url, json=data)
         return response.json()
 
-    def push_transcripts(self, transcripts: list[Transcript]) -> str | None:
+    def push_transcripts(self, transcripts: list[Transcript], markdown_exporter) -> str | None:
         try:
             default_branch = self.get_default_branch('transcripts')
             branch_sha = self.get_branch_sha('transcripts', default_branch)
-            branch_name = f"transcripts-{''.join(random.choices('0123456789', k=6))}"
+            branch_name = f"transcripts-{'' .join(random.choices('0123456789', k=6))}"
             self.create_branch('transcripts', branch_name, branch_sha)
 
             for transcript in transcripts:
-                if transcript.outputs and transcript.outputs['markdown']:
-                    with open(transcript.outputs['markdown'], 'r') as file:
-                        content = file.read()
+                # First commit: Raw transcript
+                raw_content = markdown_exporter._create_with_metadata(transcript, content_key='raw')
+                self.create_or_update_file(
+                    'transcripts',
+                    transcript.output_path_with_title + ".md",
+                    raw_content,
+                    f'ai(transcript): "{transcript.title}" (raw)',
+                    branch_name
+                )
+
+                # Second commit: Corrected transcript
+                if transcript.outputs.get('corrected_text'):
+                    corrected_content = markdown_exporter._create_with_metadata(transcript, content_key='corrected_text')
                     self.create_or_update_file(
                         'transcripts',
-                        transcript.output_path_with_title,
-                        content,
-                        f'ai(transcript): "{transcript.title}" ({transcript.source.loc})',
-                        branch_name
+                        transcript.output_path_with_title + ".md",
+                        corrected_content,
+                        f'ai(transcript): "{transcript.title}" (corrected)',
+                        branch_name,
+                        get_sha=True # We need the SHA of the file to update it
                     )
 
             pr = self.create_pull_request(
 
@@ -0,0 +1,87 @@
+from app.transcript import Transcript
+from app.logging import get_logger
+import openai
+from app.config import settings
+
+logger = get_logger()
+
+class CorrectionService:
+    """Correct a transcript's punctuation, grammar and spelling with an LLM.
+
+    Only the 'openai' provider is supported.
+    """
+
+    def __init__(self, provider='openai', model='gpt-4o'):
+        """Store the provider and model and set up the client.
+
+        Args:
+            provider: LLM provider name; only 'openai' is accepted.
+            model: Model name passed to the chat completions call.
+
+        Raises:
+            ValueError: If `provider` is not 'openai'.
+        """
+        self.provider = provider
+        self.model = model
+        if self.provider == 'openai':
+            # The client is the `openai` module itself, so the API key set
+            # below is module-level state, not private to this instance.
+            self.client = openai
+            self.client.api_key = settings.OPENAI_API_KEY
+        else:
+            raise ValueError(f"Unsupported LLM provider: {provider}")
+
+    def process(self, transcript: Transcript, **kwargs):
+        """Correct the raw transcript and store it as 'corrected_text'.
+
+        The result is written to `transcript.outputs['corrected_text']`.
+        If the model returns no text, that key is left untouched.
+
+        Args:
+            transcript: The transcript whose 'raw' output is corrected.
+            **kwargs: May contain `keywords`, a list of terms whose
+                spelling the model is asked to pay special attention to.
+
+        Raises:
+            ValueError: If the transcript has no 'raw' content.
+        """
+        logger.info(f"Correcting transcript with {self.provider}...")
+        keywords = kwargs.get('keywords') or []
+
+        raw_text = transcript.outputs.get('raw')
+        if not raw_text:
+            # Otherwise a missing transcript would be interpolated into the
+            # prompt as the literal string "None" and sent to the model.
+            raise ValueError("No raw transcript content found to correct")
+
+        prompt = self._build_prompt(raw_text, keywords)
+
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[{"role": "user", "content": prompt}]
+        )
+        corrected_text = response.choices[0].message.content
+        if not corrected_text:
+            # The message content can be None or empty; do not store that.
+            logger.warning("Correction returned no text; keeping the raw transcript.")
+            return
+
+        transcript.outputs['corrected_text'] = corrected_text
+        logger.info("Correction complete.")
+
+    def _build_prompt(self, text, keywords):
+        """Build the correction prompt.
+
+        Args:
+            text: The transcript text to append after the instructions.
+            keywords: Terms to list as a bulleted spelling hint; may be empty.
+
+        Returns:
+            The full prompt string.
+        """
+        prompt = "Please correct the following transcript for punctuation, grammar, and spelling. Do not change the content or the speaker labels."
+        if keywords:
+            prompt += "\n\nPlease pay special attention to the following keywords and ensure they are spelled correctly:\n- "
+            prompt += "\n- ".join(keywords)
+        prompt += f"\n\n---\n\n{text}"
+        return prompt
@@ -0,0 +1,66 @@
+from app.transcript import Transcript
+from app.logging import get_logger
+import openai
+from app.config import settings
+
+logger = get_logger()
+
+class SummarizerService:
+    """Summarize a transcript with an LLM.
+
+    Only the 'openai' provider is supported.
+    """
+
+    def __init__(self, provider='openai', model='gpt-4o'):
+        """Store the provider and model and set up the client.
+
+        Args:
+            provider: LLM provider name; only 'openai' is accepted.
+            model: Model name passed to the chat completions call.
+
+        Raises:
+            ValueError: If `provider` is not 'openai'.
+        """
+        self.provider = provider
+        self.model = model
+        if self.provider == 'openai':
+            # The client is the `openai` module itself, so the API key set
+            # below is module-level state, not private to this instance.
+            self.client = openai
+            self.client.api_key = settings.OPENAI_API_KEY
+        else:
+            raise ValueError(f"Unsupported LLM provider: {provider}")
+
+    def process(self, transcript: Transcript, **kwargs):
+        """Summarize the transcript and store the result on `transcript.summary`.
+
+        The corrected text is summarized when present and non-empty;
+        otherwise the raw text is used.
+
+        Args:
+            transcript: The transcript to summarize.
+            **kwargs: Unused.
+
+        Raises:
+            ValueError: If neither 'corrected_text' nor 'raw' has content.
+        """
+        logger.info(f"Summarizing transcript with {self.provider}...")
+        # Use `or` rather than a .get() default: the default expression is
+        # evaluated eagerly (KeyError when 'raw' is absent) and is ignored
+        # when 'corrected_text' is present but None.
+        text_to_summarize = (
+            transcript.outputs.get('corrected_text')
+            or transcript.outputs.get('raw')
+        )
+        if not text_to_summarize:
+            raise ValueError("No transcript content found to summarize")
+
+        prompt = f"""Please summarize the following text.\n---\n{text_to_summarize}"""
+
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[{"role": "user", "content": prompt}]
+        )
+        summary = response.choices[0].message.content
+        transcript.summary = summary
+        logger.info("Summarization complete.")