Skip to content

Commit 78ba38f

Browse files
committed
Hardens regexes for AI script analysis and DOCX generation.
1 parent 5ff9318 commit 78ba38f

File tree

1 file changed

+204
-0
lines changed

1 file changed

+204
-0
lines changed

transcript_analyzer.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
"""
2+
Module for analyzing transcripts using Gemini API and generating DOCX documents.
3+
"""
4+
5+
import os
6+
import re
7+
from docx import Document
8+
from docx.shared import Pt, RGBColor
9+
from docx.enum.text import WD_ALIGN_PARAGRAPH
10+
import google.genai as genai
11+
from google.genai import types
12+
from generate_podcast import setup_logging
13+
from utils import get_asset_path
14+
15+
# Module-level logger obtained from generate_podcast.setup_logging(); the
# functions in this module use it to report errors and the saved DOCX path.
logger = setup_logging()
16+
17+
18+
def get_analysis_prompt_path():
    """
    Return the path to the analysis prompt configuration file, or None.

    Candidate locations are probed in priority order and the first one that
    is an existing regular file wins:
    1. ./config/analysis_prompt.txt (for Docker and local config directory)
    2. app data directory (user-editable location)
    3. asset path (for bundled apps)

    Returns:
        The first candidate path that is a file, or None when none qualifies.
    """
    # Imported at call time rather than at module import.
    from utils import get_app_data_dir

    # os.path.isfile is used instead of os.path.exists on purpose: a directory
    # that happens to be named analysis_prompt.txt must not be selected, since
    # opening it would fail and the lower-priority fallbacks would be skipped.

    # First check in config directory (Docker and local development)
    config_dir_prompt = os.path.join(os.getcwd(), "config", "analysis_prompt.txt")
    if os.path.isfile(config_dir_prompt):
        return config_dir_prompt

    # Then check in app data directory (user-editable location)
    app_data_prompt = os.path.join(get_app_data_dir(), "analysis_prompt.txt")
    if os.path.isfile(app_data_prompt):
        return app_data_prompt

    # Finally check in asset path (bundled with app); get_asset_path may
    # return a falsy value, which is treated as "no asset available".
    asset_prompt = get_asset_path("analysis_prompt.txt")
    if asset_prompt and os.path.isfile(asset_prompt):
        return asset_prompt

    return None
44+
45+
46+
def load_analysis_prompt():
    """
    Read the analysis prompt from its configuration file.

    Returns:
        The file contents with surrounding whitespace stripped, or None when
        no prompt file is located or reading it fails (the failure is logged).
    """
    location = get_analysis_prompt_path()
    if location:
        try:
            with open(location, 'r', encoding='utf-8') as handle:
                return handle.read().strip()
        except Exception as e:
            # Best effort: report the problem and fall through to None.
            logger.error(f"Error reading analysis prompt file: {e}")
    return None
61+
62+
63+
def generate_prompt(text: str) -> str:
    """
    Build the prompt sent to the Gemini API.

    The prompt template is loaded from the configuration file and the
    transcript is appended after a "Transcript:" header.

    Args:
        text: The transcript to append to the template

    Returns:
        The template followed by the transcript

    Raises:
        ValueError: If no prompt template could be loaded (or it is empty).
    """
    template = load_analysis_prompt()
    if template:
        return f"{template}\n\nTranscript:\n{text}"

    raise ValueError("Analysis prompt configuration file not found. Please create 'analysis_prompt.txt' in the application directory.")
76+
77+
78+
def analyze_transcript_with_gemini(transcript: str, api_key: str = None) -> str:
    """
    Send a transcript to the Gemini API and return the generated analysis.

    Args:
        transcript: The transcript text to analyze
        api_key: Gemini API key; when falsy, the GEMINI_API_KEY environment
            variable is used instead

    Returns:
        The text of the Gemini response

    Raises:
        ValueError: If no API key is supplied or found in the environment
        Exception: If building the prompt or calling the API fails, or the
            response text is empty (logged, then re-raised)
    """
    resolved_key = api_key or os.environ.get("GEMINI_API_KEY")
    if not resolved_key:
        raise ValueError("GEMINI_API_KEY not found in environment variables")

    # The model can be overridden through the environment.
    model_name = os.environ.get("GEMINI_ANALYSIS_MODEL", "gemini-2.5-flash")

    try:
        client = genai.Client(api_key=resolved_key)
        prompt = generate_prompt(transcript)
        generation_config = types.GenerateContentConfig(
            temperature=0.7,
            max_output_tokens=2048,
        )

        response = client.models.generate_content(
            model=model_name,
            contents=prompt,
            config=generation_config,
        )

        if not response.text:
            raise Exception("Empty response from Gemini API")
        return response.text

    except Exception as e:
        # Log with traceback, then let the caller decide how to recover.
        logger.error(f"Error analyzing transcript with Gemini: {e}", exc_info=True)
        raise
123+
124+
125+
def create_docx_from_analysis(analysis_text: str, output_path: str):
    """
    Creates a formatted DOCX document from the Gemini analysis.

    Each non-blank line of the text becomes one paragraph. Within a line,
    spans wrapped in ``**...**`` are rendered as bold 12pt runs and the
    remaining text as regular 11pt runs. Blank lines are skipped.

    Args:
        analysis_text: The analysis text from Gemini (with markdown-style formatting)
        output_path: Path where to save the DOCX file
    """
    doc = Document()

    # Set document styles
    style = doc.styles['Normal']
    style.font.name = 'Calibri'
    style.font.size = Pt(11)

    # Compiled once for all lines. The capturing group makes split() return
    # the bold text at odd indices and the surrounding text at even indices.
    bold_pattern = re.compile(r'\*\*(.+?)\*\*')

    # Process the text line by line
    for raw_line in analysis_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if '**' not in line:
            # Regular paragraph
            p = doc.add_paragraph(line)
            p.style = 'Normal'
            continue

        # Line with bold markers: build the paragraph run by run
        p = doc.add_paragraph()
        for i, part in enumerate(bold_pattern.split(line)):
            if not part:
                # Empty pieces appear when a bold span starts or ends the line
                continue

            run = p.add_run(part)
            if i % 2 == 1:  # Odd indices are the captured bold text
                run.bold = True
                run.font.size = Pt(12)
            else:  # Even indices are normal text
                run.font.size = Pt(11)

    # Save the document
    doc.save(output_path)
    logger.info(f"DOCX document saved to: {output_path}")
179+
180+
181+
def generate_analysis_docx(transcript: str, output_path: str = None, api_key: str = None) -> str:
    """
    Run the full pipeline: Gemini analysis of a transcript, then DOCX export.

    Args:
        transcript: The transcript text to analyze
        output_path: Destination of the DOCX file; when falsy, a randomly
            named file in the system temporary directory is used
        api_key: Gemini API key (will use environment variable if not provided)

    Returns:
        Path to the generated DOCX file
    """
    report_text = analyze_transcript_with_gemini(transcript, api_key)

    if output_path:
        destination = output_path
    else:
        # No destination given: place the file in the temp directory under a
        # random 8-hex-digit name.
        import tempfile
        temp_dir = tempfile.gettempdir()
        destination = os.path.join(temp_dir, f"script_analysis_{os.urandom(4).hex()}.docx")

    create_docx_from_analysis(report_text, destination)
    return destination

0 commit comments

Comments
 (0)