Skip to content

Commit 963b1c0

Browse files
committed
Hardens regexes for AI script analysis and DOCX generation.
1 parent 78ba38f commit 963b1c0

File tree

1 file changed

+19
-12
lines changed

1 file changed

+19
-12
lines changed

app.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def save_settings(settings):
6363
def extract_filename_from_script(script_text, extension, max_length=50):
6464
"""
6565
Extracts a safe filename from the beginning of the first sentence in the script.
66+
Skips instruction lines (lines without speaker format) and only uses actual dialogue.
6667
6768
Args:
6869
script_text: The script content
@@ -72,7 +73,7 @@ def extract_filename_from_script(script_text, extension, max_length=50):
7273
Returns:
7374
A sanitized filename with the given extension
7475
"""
75-
# Remove speaker labels and get the first sentence
76+
# Find the first line with speaker format (skip instructions)
7677
lines = script_text.strip().split('\n')
7778
first_dialogue = ""
7879

@@ -81,23 +82,29 @@ def extract_filename_from_script(script_text, extension, max_length=50):
8182
if not line.strip():
8283
continue
8384
# Check if line has speaker format (Speaker: text)
84-
# Use possessive quantifiers and atomic grouping concept to prevent ReDoS
85-
match = re.match(r'^\s*([^:\s]+(?:\s+[^:\s]+)*)\s*:\s*(.+)$', line)
86-
if match:
87-
first_dialogue = match.group(2).strip()
88-
break
89-
else:
90-
# If no speaker format, use the line as-is
91-
first_dialogue = line.strip()
92-
break
85+
# Simple check for colon to avoid regex complexity
86+
if ':' in line:
87+
parts = line.split(':', 1)
88+
if len(parts) == 2 and parts[0].strip() and parts[1].strip():
89+
# Found a speaker line - use the dialogue part
90+
first_dialogue = parts[1].strip()
91+
break
92+
# If no speaker format, skip this line (it's likely an instruction)
9393

9494
if not first_dialogue:
9595
# Fallback to a random 8-hex-digit name (os.urandom, not a UUID) if no dialogue was found
9696
return f"podcast_{os.urandom(4).hex()}.{extension}"
9797

9898
# Remove any bracketed annotations like [playful], [laughing], etc.
99-
# Use a character class that excludes brackets to prevent ReDoS
100-
first_dialogue = re.sub(r'\[[^\]]*\]', '', first_dialogue).strip()
99+
# Use simple string operations instead of regex to avoid ReDoS concerns
100+
while '[' in first_dialogue and ']' in first_dialogue:
101+
start = first_dialogue.find('[')
102+
end = first_dialogue.find(']', start)
103+
if end > start:
104+
first_dialogue = first_dialogue[:start] + first_dialogue[end+1:]
105+
else:
106+
break # Malformed brackets, stop processing
107+
first_dialogue = first_dialogue.strip()
101108

102109
# Extract the beginning (up to first sentence or max_length)
103110
# Split by sentence-ending punctuation

0 commit comments

Comments
 (0)