Skip to content

Commit 6434d79

Browse files
committed
WIP: summarizer and correction
1 parent a593ae0 commit 6434d79

File tree

9 files changed

+170
-180
lines changed

9 files changed

+170
-180
lines changed

app/exporters.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def _create_with_metadata(self, transcript: Transcript, **kwargs) -> str:
198198
199199
Args:
200200
transcript: The transcript to export
201-
**kwargs: Additional parameters like review_flag
201+
**kwargs: Additional parameters like review_flag and content_key
202202
203203
Returns:
204204
The complete Markdown content with metadata
@@ -215,6 +215,13 @@ def increase_indent(self, flow=False, indentless=False):
215215
# Get metadata from the source
216216
metadata = transcript.source.to_json()
217217

218+
# Determine which content to use
219+
content_key = kwargs.get("content_key", "corrected_text")
220+
content = transcript.outputs.get(content_key, transcript.outputs.get("raw"))
221+
222+
if content is None:
223+
raise Exception(f"No transcript content found for key '{content_key}' or 'raw'")
224+
218225
# Add or modify specific fields
219226
if self.transcript_by:
220227
review_flag = kwargs.get("review_flag", "")
@@ -312,32 +319,40 @@ def export(self, transcript: Transcript, **kwargs) -> str:
312319
Args:
313320
transcript: The transcript to export
314321
add_timestamp: Whether to add a timestamp to the filename (default: False)
322+
content_key: The key in transcript.outputs to use for the content (default: "raw")
323+
suffix: A suffix to add to the filename (e.g., "_raw")
315324
**kwargs: Additional parameters (unused)
316325
317326
Returns:
318327
The path to the exported text file
319328
"""
320329
self.logger.debug("Exporting transcript to plain text...")
321330

322-
if transcript.outputs["raw"] is None:
323-
raise Exception("No transcript content found")
331+
content_key = kwargs.get("content_key", "raw")
332+
content = transcript.outputs.get(content_key)
333+
if content is None and content_key == "summary":
334+
content = transcript.summary
335+
336+
if content is None:
337+
raise Exception(f"No content found for key: {content_key}")
324338

325339
# Get parameters
326340
add_timestamp = kwargs.get("add_timestamp", False)
341+
suffix = kwargs.get("suffix", "")
327342

328343
# Get output directory
329344
output_dir = self.get_output_path(transcript)
330345

331346
# Construct file path
332347
file_path = self.construct_file_path(
333348
directory=output_dir,
334-
filename=transcript.title,
349+
filename=f"{transcript.title}{suffix}",
335350
file_type="txt",
336351
include_timestamp=add_timestamp,
337352
)
338353

339354
# Write to file
340-
result_path = self.write_to_file(transcript.outputs["raw"], file_path)
355+
result_path = self.write_to_file(content, file_path)
341356

342357
self.logger.info(f"(exporter) Text file written to: {result_path}")
343358
return result_path

app/github_api_handler.py

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,17 @@ def create_branch(self, repo_type, branch_name, sha):
9393
response = self._make_request('POST', url, json=data)
9494
return response.json()
9595

96-
def create_or_update_file(self, repo_type, file_path, content, commit_message, branch):
96+
def create_or_update_file(self, repo_type, file_path, content, commit_message, branch, get_sha=False):
9797
url = f"https://api.github.com/repos/{self.repos[repo_type]['owner']}/{self.repos[repo_type]['name']}/contents/{quote(file_path)}"
9898
data = {
9999
"message": commit_message,
100100
"content": base64.b64encode(content.encode()).decode(),
101101
"branch": branch
102102
}
103+
if get_sha:
104+
response = self._make_request('GET', url + f'?ref={branch}')
105+
data['sha'] = response.json()['sha']
106+
103107
response = self._make_request('PUT', url, json=data)
104108
return response.json()
105109

@@ -114,23 +118,34 @@ def create_pull_request(self, repo_type, title, head, base, body):
114118
response = self._make_request('POST', url, json=data)
115119
return response.json()
116120

117-
def push_transcripts(self, transcripts: list[Transcript]) -> str | None:
121+
def push_transcripts(self, transcripts: list[Transcript], markdown_exporter) -> str | None:
118122
try:
119123
default_branch = self.get_default_branch('transcripts')
120124
branch_sha = self.get_branch_sha('transcripts', default_branch)
121-
branch_name = f"transcripts-{''.join(random.choices('0123456789', k=6))}"
125+
branch_name = f"transcripts-{'' .join(random.choices('0123456789', k=6))}"
122126
self.create_branch('transcripts', branch_name, branch_sha)
123127

124128
for transcript in transcripts:
125-
if transcript.outputs and transcript.outputs['markdown']:
126-
with open(transcript.outputs['markdown'], 'r') as file:
127-
content = file.read()
129+
# First commit: Raw transcript
130+
raw_content = markdown_exporter._create_with_metadata(transcript, content_key='raw')
131+
self.create_or_update_file(
132+
'transcripts',
133+
transcript.output_path_with_title + ".md",
134+
raw_content,
135+
f'ai(transcript): "{transcript.title}" (raw)',
136+
branch_name
137+
)
138+
139+
# Second commit: Corrected transcript
140+
if transcript.outputs.get('corrected_text'):
141+
corrected_content = markdown_exporter._create_with_metadata(transcript, content_key='corrected_text')
128142
self.create_or_update_file(
129143
'transcripts',
130-
transcript.output_path_with_title,
131-
content,
132-
f'ai(transcript): "{transcript.title}" ({transcript.source.loc})',
133-
branch_name
144+
transcript.output_path_with_title + ".md",
145+
corrected_content,
146+
f'ai(transcript): "{transcript.title}" (corrected)',
147+
branch_name,
148+
get_sha=True # We need the SHA of the file to update it
134149
)
135150

136151
pr = self.create_pull_request(

app/services/correction.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from app.transcript import Transcript
2+
from app.logging import get_logger
3+
import openai
4+
from app.config import settings
5+
6+
logger = get_logger()
7+
8+
class CorrectionService:
    """Fix punctuation, grammar and spelling of a transcript using an LLM.

    The corrected text is written to ``transcript.outputs['corrected_text']``;
    the raw transcript in ``transcript.outputs['raw']`` is left untouched.
    """

    def __init__(self, provider='openai', model='gpt-4o'):
        """Configure the LLM client.

        Args:
            provider: Name of the LLM provider. Only ``'openai'`` is accepted.
            model: Model identifier passed to the chat-completions call.

        Raises:
            ValueError: If ``provider`` is anything other than ``'openai'``.
        """
        self.provider = provider
        self.model = model
        if self.provider == 'openai':
            # The ``openai`` module object itself acts as the client, so the
            # API key is assigned on the module.
            self.client = openai
            self.client.api_key = settings.OPENAI_API_KEY
        else:
            raise ValueError(f"Unsupported LLM provider: {provider}")

    def process(self, transcript: "Transcript", **kwargs):
        """Correct ``transcript.outputs['raw']`` and store the result.

        Args:
            transcript: Transcript whose ``outputs`` mapping holds the raw text.
            **kwargs: Optional ``keywords`` (iterable of terms whose spelling
                the model should pay special attention to).

        Raises:
            ValueError: If the transcript has no raw text to correct.
        """
        # Validate before calling the model: a missing/None raw transcript
        # would otherwise be interpolated into the prompt as the text "None".
        raw_text = transcript.outputs.get('raw')
        if not raw_text:
            raise ValueError("No raw transcript content found to correct")

        logger.info(f"Correcting transcript with {self.provider}...")
        keywords = kwargs.get('keywords', [])

        # Build the prompt
        prompt = self._build_prompt(raw_text, keywords)

        # Call the LLM
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
        )
        corrected_text = response.choices[0].message.content

        if not corrected_text:
            # Do not store an empty value: consumers that look up
            # 'corrected_text' with a fallback to 'raw' only fall back when
            # the key is absent.
            logger.warning("Correction returned no content; keeping raw transcript only.")
            return

        # Store the corrected text in a new field
        transcript.outputs['corrected_text'] = corrected_text
        logger.info("Correction complete.")

    def _build_prompt(self, text, keywords):
        """Return the correction prompt for ``text``.

        Args:
            text: The transcript text to be corrected.
            keywords: Optional iterable of terms to list in the prompt; an
                empty or ``None`` value omits the keyword section.

        Returns:
            The full prompt: instructions, optional bulleted keyword list,
            a ``---`` separator, then the transcript text.
        """
        prompt = "Please correct the following transcript for punctuation, grammar, and spelling. Do not change the content or the speaker labels."
        if keywords:
            prompt += "\n\nPlease pay special attention to the following keywords and ensure they are spelled correctly:\n- "
            # str() keeps non-string keywords (e.g. numbers) from breaking join.
            prompt += "\n- ".join(str(keyword) for keyword in keywords)
        prompt += f"\n\n---\n\n{text}"
        return prompt

app/services/llm_service.py

Lines changed: 0 additions & 47 deletions
This file was deleted.

app/services/summarizer.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from app.transcript import Transcript
2+
from app.logging import get_logger
3+
import openai
4+
from app.config import settings
5+
6+
logger = get_logger()
7+
8+
class SummarizerService:
    """Produce a summary of a transcript using an LLM.

    The summary is stored on ``transcript.summary``.
    """

    def __init__(self, provider='openai', model='gpt-4o'):
        """Configure the LLM client.

        Args:
            provider: Name of the LLM provider. Only ``'openai'`` is accepted.
            model: Model identifier passed to the chat-completions call.

        Raises:
            ValueError: If ``provider`` is anything other than ``'openai'``.
        """
        self.provider = provider
        self.model = model
        if self.provider == 'openai':
            # The ``openai`` module object itself acts as the client, so the
            # API key is assigned on the module.
            self.client = openai
            self.client.api_key = settings.OPENAI_API_KEY
        else:
            raise ValueError(f"Unsupported LLM provider: {provider}")

    def process(self, transcript: "Transcript", **kwargs):
        """Summarize the transcript and store the result on ``transcript.summary``.

        The corrected text is preferred; the raw text is used when no
        corrected text is available.

        Args:
            transcript: Transcript whose ``outputs`` mapping holds the text.
            **kwargs: Accepted for interface compatibility; not read here.

        Raises:
            ValueError: If neither corrected nor raw text is available.
        """
        text_to_summarize = self._select_text(transcript.outputs)

        logger.info(f"Summarizing transcript with {self.provider}...")
        prompt = f"""Please summarize the following text.\n---\n{text_to_summarize}"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
        )
        summary = response.choices[0].message.content
        transcript.summary = summary
        logger.info("Summarization complete.")

    @staticmethod
    def _select_text(outputs):
        """Pick the text to summarize from a transcript ``outputs`` mapping.

        ``dict.get(key, default)`` evaluates ``default`` eagerly, so looking up
        ``outputs['raw']`` as the default would raise ``KeyError`` whenever
        'raw' is absent, even if corrected text exists. Each key is therefore
        read lazily, and empty/None values are treated as missing.

        Args:
            outputs: Mapping that may contain 'corrected_text' and/or 'raw'.

        Returns:
            The corrected text if non-empty, otherwise the raw text.

        Raises:
            ValueError: If neither key holds non-empty text.
        """
        text = outputs.get('corrected_text') or outputs.get('raw')
        if not text:
            raise ValueError("No transcript content found for 'corrected_text' or 'raw'")
        return text

0 commit comments

Comments
 (0)