1414from dotenv import load_dotenv
1515
1616from extract_article import extract_webpage_content
17+ from preprocess import preprocess_for_tts
18+ from llm_preprocess import rewrite_for_audio
1719from podcast import add_episode
1820from tts import text_to_mp3 , MODELS
1921
2022load_dotenv ()
2123
# Text-preprocessing modes a user can select with /setpreprocess.
# Keys are the mode names accepted on the command line; values are the
# human-readable descriptions echoed back in the usage/confirmation replies.
PREPROCESS_MODES = dict(
    none="No preprocessing (raw text)",
    regex="Regex-based cleaning (remove URLs, code, citations, expand numbers)",
    llm="LLM rewrite for natural audio narration",
)

# Mode applied when a user has not stored an explicit choice.
DEFAULT_PREPROCESS = "regex"
30+
2231
2332async def start (update : Update , _context : ContextTypes .DEFAULT_TYPE ):
2433 await update .message .reply_text ("Hello!" )
@@ -56,6 +65,32 @@ async def set_model(update: Update, context: ContextTypes.DEFAULT_TYPE):
5665 await update .message .reply_text (f"Model set to { model } " )
5766
5867
async def set_preprocess(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handle ``/setpreprocess <mode>`` by storing the sender's chosen mode.

    Senders rejected by ``is_allowed`` are logged to stdout and get no reply.
    When the command does not carry exactly one argument naming a key of
    ``PREPROCESS_MODES``, a usage message listing every mode is sent instead.
    Otherwise the mode is saved under ``context.user_data["preprocess"]`` and
    a confirmation containing the mode's description is sent back.
    """
    sender = update.message.from_user
    if not is_allowed(sender):
        print(f"User {sender} is not allowed")
        return

    args = context.args
    is_valid_request = len(args) == 1 and args[0] in PREPROCESS_MODES
    if not is_valid_request:
        # Usage header followed by one "name: description" line per mode.
        usage_lines = ["Usage: /setpreprocess <mode>", "Available modes:"]
        for name, description in PREPROCESS_MODES.items():
            usage_lines.append(f" {name}: {description}")
        await update.message.reply_text("\n".join(usage_lines))
        return

    chosen = args[0]
    context.user_data["preprocess"] = chosen
    summary = PREPROCESS_MODES[chosen]
    await update.message.reply_text(f"Preprocessing set to: {chosen} — {summary}")
82+
83+
async def apply_preprocessing(content: str, mode: str) -> str:
    """Return ``content`` prepared for speech synthesis according to ``mode``.

    ``"regex"`` passes the text through ``preprocess_for_tts``. ``"llm"`` does
    the same cleaning first and then awaits ``rewrite_for_audio`` on the
    cleaned text. ``"none"`` and any unrecognised mode return ``content``
    unchanged.
    """
    # Everything except the two active modes is a pass-through.
    if mode not in ("regex", "llm"):
        return content

    # Both active modes start from the regex-cleaned text.
    cleaned = preprocess_for_tts(content)
    if mode == "llm":
        return await rewrite_for_audio(cleaned)
    return cleaned
92+
93+
5994async def handle_message (update : Update , context : ContextTypes .DEFAULT_TYPE ):
6095 user = update .message .from_user
6196 if not is_allowed (user ):
@@ -64,6 +99,7 @@ async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
6499
65100 default_model_name = next (iter (MODELS .keys ()))
66101 model_name = context .user_data .get ("model" , default_model_name )
102+ preprocess_mode = context .user_data .get ("preprocess" , DEFAULT_PREPROCESS )
67103
68104 url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
69105 urls = re .findall (url_pattern , update .message .text )
@@ -77,15 +113,28 @@ async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
77113
78114 for url in urls :
79115 start_time = time .time ()
80- title , content = extract_webpage_content (url )
116+ result = extract_webpage_content (url )
117+ if result is None :
118+ await update .message .reply_text (f"Failed to extract content from { url } " )
119+ continue
120+
121+ title , content = result
81122 mp3_filename = title .replace (" " , "_" ).lower () + ".mp3"
82- await update .message .reply_text ("Extracted content, producing audio" )
123+
124+ await update .message .reply_text (f"Extracted content, preprocessing ({ preprocess_mode } )..." )
125+ content = await apply_preprocessing (content , preprocess_mode )
126+
127+ await update .message .reply_text ("Producing audio..." )
83128 metadata = text_to_mp3 (text = content , output_mp3 = mp3_filename , model_name = model_name , speed = 1.0 )
84129 await update .message .reply_text ("Produced audio, updating feed" )
85- description = f"Model: { metadata .model } . Voice: { metadata .voice } . { content [:150 ]} "
130+ description = (
131+ f"Model: { metadata .model } . Voice: { metadata .voice } . Preprocess: { preprocess_mode } . { content [:150 ]} "
132+ )
86133 add_episode (mp3_filename , title , description = description )
87134 end_time = time .time ()
88- await update .message .reply_text (f"Added “{ title } ” to the feed. This took { end_time - start_time :.2f} seconds" )
135+ await update .message .reply_text (
136+ f"Added \u201c { title } \u201d to the feed. This took { end_time - start_time :.2f} seconds"
137+ )
89138
90139 if len (urls ) > 1 :
91140 await update .message .reply_text (f"Processed { len (urls )} URLs" )
@@ -96,6 +145,7 @@ def main():
96145
97146 application .add_handler (CommandHandler ("start" , start ))
98147 application .add_handler (CommandHandler ("setmodel" , set_model ))
148+ application .add_handler (CommandHandler ("setpreprocess" , set_preprocess ))
99149 application .add_handler (MessageHandler (filters .TEXT & ~ filters .COMMAND , handle_message ))
100150
101151 print ("Bot is running..." )
0 commit comments