razumau · razumau · Mar 2, 2026 · Feb 20, 2026 · Mar 1, 2026 · Mar 1, 2026
diff --git a/.env.example b/.env.example
@@ -24,3 +24,6 @@ ALLOWED_TELEGRAM_USERNAMES=username,another_username
 ALLOWED_TELEGRAM_IDS=123456,1235678
 
 ELEVENLABS_API_KEY=
+
+# Required only when the preprocessing mode is set to "llm" (/setpreprocess llm)
+ANTHROPIC_API_KEY=
diff --git a/bot.py b/bot.py
@@ -14,11 +14,20 @@
 from dotenv import load_dotenv
 
 from extract_article import extract_webpage_content
+from preprocess import preprocess_for_tts
+from llm_preprocess import rewrite_for_audio
 from podcast import add_episode
 from tts import text_to_mp3, MODELS
 
 load_dotenv()
 
+PREPROCESS_MODES = {
+    "none": "No preprocessing (raw text)",
+    "regex": "Regex-based cleaning (remove URLs, code, citations, expand numbers)",
+    "llm": "LLM rewrite for natural audio narration",
+}
+DEFAULT_PREPROCESS = "regex"
+
 
 async def start(update: Update, _context: ContextTypes.DEFAULT_TYPE):
     await update.message.reply_text("Hello!")
@@ -56,6 +65,32 @@ async def set_model(update: Update, context: ContextTypes.DEFAULT_TYPE):
     await update.message.reply_text(f"Model set to {model}")
 
 
+async def set_preprocess(update: Update, context: ContextTypes.DEFAULT_TYPE):
+    user = update.message.from_user
+    if not is_allowed(user):
+        print(f"User {user} is not allowed")
+        return
+
+    if len(context.args) != 1 or context.args[0] not in PREPROCESS_MODES:
+        modes_list = "\n".join(f"  {k}: {v}" for k, v in PREPROCESS_MODES.items())
+        await update.message.reply_text(f"Usage: /setpreprocess <mode>\nAvailable modes:\n{modes_list}")
+        return
+
+    mode = context.args[0]
+    context.user_data["preprocess"] = mode
+    await update.message.reply_text(f"Preprocessing set to: {mode} — {PREPROCESS_MODES[mode]}")
+
+
+async def apply_preprocessing(content: str, mode: str) -> str:
+    if mode == "none":
+        return content
+    elif mode == "regex":
+        return preprocess_for_tts(content)
+    elif mode == "llm":
+        return await rewrite_for_audio(preprocess_for_tts(content))
+    return content
+
+
 async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
     user = update.message.from_user
     if not is_allowed(user):
@@ -64,6 +99,7 @@ async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
 
     default_model_name = next(iter(MODELS.keys()))
     model_name = context.user_data.get("model", default_model_name)
+    preprocess_mode = context.user_data.get("preprocess", DEFAULT_PREPROCESS)
 
     url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
     urls = re.findall(url_pattern, update.message.text)
@@ -77,15 +113,28 @@ async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
 
     for url in urls:
         start_time = time.time()
-        title, content = extract_webpage_content(url)
+        result = extract_webpage_content(url)
+        if result is None:
+            await update.message.reply_text(f"Failed to extract content from {url}")
+            continue
+
+        title, content = result
         mp3_filename = title.replace(" ", "_").lower() + ".mp3"
-        await update.message.reply_text("Extracted content, producing audio")
+
+        await update.message.reply_text(f"Extracted content, preprocessing ({preprocess_mode})...")
+        content = await apply_preprocessing(content, preprocess_mode)
+
+        await update.message.reply_text("Producing audio...")
         metadata = text_to_mp3(text=content, output_mp3=mp3_filename, model_name=model_name, speed=1.0)
         await update.message.reply_text("Produced audio, updating feed")
-        description = f"Model: {metadata.model}. Voice: {metadata.voice}. {content[:150]}"
+        description = (
+            f"Model: {metadata.model}. Voice: {metadata.voice}. Preprocess: {preprocess_mode}. {content[:150]}"
+        )
         add_episode(mp3_filename, title, description=description)
         end_time = time.time()
-        await update.message.reply_text(f"Added “{title}” to the feed. This took {end_time - start_time:.2f} seconds")
+        await update.message.reply_text(
+            f"Added \u201c{title}\u201d to the feed. This took {end_time - start_time:.2f} seconds"
+        )
 
     if len(urls) > 1:
         await update.message.reply_text(f"Processed {len(urls)} URLs")
@@ -96,6 +145,7 @@ def main():
 
     application.add_handler(CommandHandler("start", start))
     application.add_handler(CommandHandler("setmodel", set_model))
+    application.add_handler(CommandHandler("setpreprocess", set_preprocess))
     application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))
 
     print("Bot is running...")

diff --git a/extract_article.py b/extract_article.py
@@ -1,28 +1,40 @@
 import subprocess
 import os
+import tempfile
 
 BUN_SCRIPT = "extract_article.ts"
-ARTICLE_TITLE_FILE = "extracted_article_title.txt"
-ARTICLE_FILE = "extracted_article.txt"
 
 
-def extract_webpage_content(url: str) -> tuple[str, str] or None:
+def extract_webpage_content(url: str) -> tuple[str, str] | None:
+    # Use unique temp file paths for the bun script's output so concurrent extractions never share files
+    with (
+        tempfile.NamedTemporaryFile(mode="w", suffix="_article.txt", delete=False) as article_f,
+        tempfile.NamedTemporaryFile(mode="w", suffix="_title.txt", delete=False) as title_f,
+    ):
+        article_path = article_f.name
+        title_path = title_f.name
+
     try:
-        subprocess.run(["bun", BUN_SCRIPT, url], check=True)
+        subprocess.run(
+            ["bun", BUN_SCRIPT, url, article_path, title_path],
+            check=True,
+        )
 
-        if not (os.path.exists(ARTICLE_FILE) and os.path.exists(ARTICLE_TITLE_FILE)):
-            print(f"We expect input files at {ARTICLE_FILE} and {ARTICLE_TITLE_FILE}")
+        if not (os.path.exists(article_path) and os.path.exists(title_path)):
+            print(f"Expected output files at {article_path} and {title_path}")
             return None
 
-        with open(ARTICLE_TITLE_FILE, "r", encoding="utf-8") as f:
+        with open(title_path, "r", encoding="utf-8") as f:
             title = f.read()
-        os.remove(ARTICLE_TITLE_FILE)
 
-        with open(ARTICLE_FILE, "r", encoding="utf-8") as f:
+        with open(article_path, "r", encoding="utf-8") as f:
             contents = f.read()
-        os.remove(ARTICLE_FILE)
 
         return title, contents
     except subprocess.CalledProcessError as e:
         print(f"Error running script: {e}")
         return None
+    finally:
+        for path in (article_path, title_path):
+            if os.path.exists(path):
+                os.remove(path)
diff --git a/extract_article.ts b/extract_article.ts
@@ -3,8 +3,8 @@ const { JSDOM } = require('jsdom');
 const fs = require('fs');
 
 const url = process.argv[2];
-const ARTICLE_FILE = 'extracted_article.txt';
-const ARTICLE_TITLE_FILE = 'extracted_article_title.txt';
+const ARTICLE_FILE = process.argv[3] || 'extracted_article.txt';
+const ARTICLE_TITLE_FILE = process.argv[4] || 'extracted_article_title.txt';
 
 if (!url) {
     console.error('Provide a URL or local file path as an argument');

diff --git a/html_fetcher.py b/html_fetcher.py
diff --git a/llm_preprocess.py b/llm_preprocess.py
@@ -0,0 +1,46 @@
+import os
+
+import anthropic
+
+SYSTEM_PROMPT = """You are a professional audio producer who adapts written articles for podcast narration.
+Your job is to slightly adjust the given article text so it sounds better when read aloud by a text-to-speech system."""
+
+REWRITE_PROMPT = """Update the following article for audio narration. Follow these rules strictly:
+
+1. Remove all URLs, email addresses, and hyperlinks entirely.
+2. Remove code blocks. If a code block is central to the article’s point, briefly describe what it does in one sentence.
+3. Convert tables to short prose descriptions.
+4. Remove all citation markers like [1], [2], etc.
+5. Remove references to figures, images, charts, or any visual elements (e.g. "see Figure 3", "as shown below").
+6. Expand abbreviations: "e.g." → "for example", "i.e." → "that is", "etc." → "et cetera".
+7. Write out numbers as words when appropriate. This includes years.
+8. Remove all markdown formatting (headers, bold, italic, links).
+9. Keep the content faithful to the original — do not add or rewrite anything that isn’t covered by the rules above.
+10. Output ONLY the rewritten text, nothing else.
+
+Article text:
+
+{text}"""
+
+
+async def rewrite_for_audio(text: str) -> str:
+    """Use Claude to rewrite article text for audio narration."""
+    api_key = os.getenv("ANTHROPIC_API_KEY")
+    if not api_key:
+        raise ValueError("ANTHROPIC_API_KEY environment variable is required for LLM preprocessing")
+
+    client = anthropic.AsyncAnthropic(api_key=api_key)
+
+    message = await client.messages.create(
+        model="claude-haiku-4-5-20251001",
+        max_tokens=65536,
+        system=SYSTEM_PROMPT,
+        messages=[
+            {"role": "user", "content": REWRITE_PROMPT.format(text=text)},
+        ],
+    )
+
+    if message.stop_reason == "max_tokens":
+        print("Warning: LLM preprocessing output was truncated due to max_tokens limit")
+
+    return message.content[0].text
diff --git a/models/base.py b/models/base.py
@@ -7,6 +7,7 @@ class TTSMetadata:
     model: str
     voice: str
 
+
 class BaseTTS(ABC):
     @abstractmethod
     def __init__(

diff --git a/models/eleven.py b/models/eleven.py
@@ -8,6 +8,11 @@
 
 GOOD_VOICES = ["Xb7hH8MSUJpSbSDYk0k2", "XB0fDUnXU5powFXDhCwa", "onwK4e9ZLuTAKqWW03F9", "ThT5KcBeYPX3keUQqHPh"]
 
+AVAILABLE_MODELS = {
+    "eleven": "eleven_flash_v2_5",
+    "eleven_v3": "eleven_v3",
+}
+
 
 class ElevenLabsTTS(BaseTTS):
     def __init__(
@@ -17,20 +22,21 @@ def __init__(
         pick_random_voice: bool = False,
         voice: str = GOOD_VOICES[0],
         speed: float = 1.0,
+        model_id: str = "eleven_flash_v2_5",
     ):
         self.text = text
         self.output_filename = output_filename
         if pick_random_voice:
             self.voice = random.choice(GOOD_VOICES)
         else:
             self.voice = voice
-        self.model_id = "eleven_flash_v2_5"
+        self.model_id = model_id
         self.client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
 
     def text_to_mp3(self) -> TTSMetadata:
         response = self.client.text_to_speech.convert(
             voice_id=self.voice,
-            output_format="mp3_22050_32",
+            output_format="mp3_44100_128",
             text=self.text,
             model_id=self.model_id,
             voice_settings=VoiceSettings(
@@ -46,4 +52,4 @@ def text_to_mp3(self) -> TTSMetadata:
                 if chunk:
                     f.write(chunk)
 
-        return TTSMetadata(model="eleven", voice=self.voice)
+        return TTSMetadata(model=f"eleven ({self.model_id})", voice=self.voice)