Skip to content

Commit 8109f18

Browse files
Merge pull request #7 from Goekdeniz-Guelmez/adding-webui-and-personas
Adding webui
2 parents d3f018a + 0718eac commit 8109f18

File tree

13 files changed

+289
-26
lines changed

13 files changed

+289
-26
lines changed

.DS_Store

6 KB
Binary file not shown.

.gitignore

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,10 @@ test.text
1414
venv
1515

1616
.conda
17-
conda
17+
conda
18+
19+
local_notebooklm/web_ui/output
20+
.local_notebooklm/web_ui/output
21+
22+
.gradio
23+
gradio

README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,47 @@ else:
310310
print(f"Failed to generate podcast: {result}")
311311
```
312312

313+
### Gradio Web UI
314+
315+
Local-NotebookLM now includes a user-friendly Gradio web interface that makes it easy to use the tool without command line knowledge:
316+
317+
```bash
318+
python -m local_notebooklm.web_ui
319+
```
320+
321+
By default, the web UI runs locally on http://localhost:7860. You can access it from your browser.
322+
323+
#### Web UI Screenshots
324+
325+
![Web UI Main Screen](examples/Gradio-WebUI.png)
326+
*The main interface of the Local-NotebookLM web UI*
327+
328+
#### Web UI Options
329+
330+
| Option | Description | Default |
331+
|--------|-------------|---------|
332+
| `--share` | Make the UI accessible over the network | False |
333+
| `--port` | Specify a custom port | 7860 |
334+
335+
#### Example Commands
336+
337+
Basic local usage:
338+
```bash
339+
python -m local_notebooklm.web_ui
340+
```
341+
342+
Share with others on your network:
343+
```bash
344+
python -m local_notebooklm.web_ui --share
345+
```
346+
347+
Use a custom port:
348+
```bash
349+
python -m local_notebooklm.web_ui --port 8080
350+
```
351+
352+
The web interface provides all the same options as the command line tool in an intuitive UI, making it easier for non-technical users to generate audio content from PDFs.
353+
313354
### FastAPI Server
314355

315356
Start the FastAPI server to access the functionality via a web API:

example_config.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
{
2-
"Co-Host-Speaker-Voice": "af_sky+af_bella",
2+
"Co-Host-Speaker-1-Voice": "af_sky+af_bella",
3+
"Co-Host-Speaker-2-Voice": "af_echo",
4+
"Co-Host-Speaker-3-Voice": "af_nova",
5+
"Co-Host-Speaker-4-Voice": "af_shimmer",
36
"Host-Speaker-Voice": "af_alloy",
47

58
"Small-Text-Model": {

examples/Gradio-WebUI.png

316 KB
Loading

local_notebooklm/steps/helpers.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,14 @@
1010
FormatType = Literal[
1111
"podcast", "interview", "panel-discussion", "debate",
1212
"summary", "narration", "storytelling", "explainer",
13-
"lecture", "tutorial", "q-and-a",
14-
"news-report", "executive-brief", "meeting", "analysis"
13+
"lecture", "tutorial", "q-and-a","news-report",
14+
"executive-brief", "meeting", "analysis",
15+
"three-people-podcast", "three-people-panel-discussion",
16+
"three-people-debate", "four-people-podcast",
17+
"four-people-panel-discussion", "four-people-debate",
18+
"five-people-podcast", "five-people-panel-discussion", "five-people-debate"
1519
]
1620

17-
1821
SingleSpeakerFormats = Literal[
1922
"summary", "narration", "storytelling", "explainer",
2023
"lecture", "tutorial", "news-report", "executive-brief", "analysis"
@@ -60,6 +63,7 @@
6063

6164
StyleType = Literal["normal", "friendly", "professional", "academic", "casual", "technical", "gen-z", "funny"]
6265

66+
SkipToOptions = [None, 1, 2, 3, 4]
6367

6468
def wait_for_next_step(seconds: float = 2):
6569
time.sleep(seconds)
@@ -223,13 +227,14 @@ def generate_speech(
223227
output_path: str = "output"
224228
):
225229
if isinstance(client, ElevenLabs):
230+
file_extension = response_format.split('_')[0].split('-')[0]
226231
audio = client.text_to_speech.convert(
227232
text=text,
228233
voice_id=voice,
229234
model_id=model_name,
230235
output_format=response_format,
231236
)
232-
save(audio=audio, filename=str(f"{output_path}.{response_format}"))
237+
save(audio=audio, filename=str(f"{output_path}.{file_extension}"))
233238
else:
234239
with client.audio.speech.with_streaming_response.create(
235240
model=model_name,

local_notebooklm/steps/prompts.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -329,15 +329,15 @@
329329
Make sure to structure the response exactly like this:
330330
331331
[
332-
("Speaker n", "translated text 1"),
333-
("Speaker n", "translated text 2"),
334-
("Speaker n", "translated text 3")
332+
("Speaker n", "translated text 1"),
333+
("Speaker n", "translated text 2"),
334+
("Speaker n", "translated text 3")
335335
]
336336
337337
Do not change or rewrite the text in any way other than reformatting it into a list of tuples and translating it. Your response must only change the format and language, not the content.
338338
339339
Ensure your output is in the correct tuple format, the speaker’s dialogue remains faithful to the original text, and the translation accurately reflects the meaning in {language}.
340-
DO NOT include episode titles, named speakers, intros, or section headersONLY provide raw dialogue labeled as ‘Speaker 1,’ ‘Speaker 2,’ etc. ONLY ONE SPEAKER CAN TALK AT A TIME."""
340+
DO NOT include episode titles, named speakers, intros, section headers, or ``` — ONLY provide raw dialogue labeled as ‘Speaker 1,’ ‘Speaker 2,’ etc. ONLY ONE SPEAKER CAN TALK AT A TIME."""
341341

342342

343343
gen_z_mapping_prompt = """Infuse humor, pop culture references, and a very laid-back conversational tone. Keep it **engaging, slightly chaotic, and fun, but still clear and informative.** Use modern wording naturally, like:

local_notebooklm/steps/step3.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,14 @@ def generate_rewritten_transcript(
4848
{"role": "system", "content": system_prompt},
4949
{"role": "user", "content": input_text},
5050
]
51-
return generate_text(
51+
out = generate_text(
5252
client=client,
5353
model=model_name,
5454
messages=conversation,
5555
max_tokens=max_tokens,
5656
temperature=temperature
5757
)
58+
return out
5859

5960
except Exception as e:
6061
raise TranscriptGenerationError(f"Failed to generate transcript: {str(e)}")
@@ -198,7 +199,7 @@ def generate_rewritten_transcript_with_overlap(
198199
model=model_name,
199200
messages=fix_prompt,
200201
max_tokens=max_tokens,
201-
temperature=0.2, # Lower temperature for more deterministic output
202+
temperature=0.3,
202203
)
203204

204205
# Try to parse the fixed transcript
@@ -294,9 +295,36 @@ def step3(
294295
language=language
295296
)
296297

297-
# Validate transcript format
298298
if not validate_transcript_format(transcript):
299-
raise TranscriptGenerationError("Generated transcript is not in the correct format")
299+
logger.warning("Generated transcript is not in the correct format. Attempting to fix...")
300+
301+
fix_prompt = [
302+
{"role": "system", "content": "Convert the following text into valid Python syntax as a list of tuples with format: [('Speaker1', 'Text1'), ('Speaker2', 'Text2'), ...]. Return ONLY the Python list, nothing else, no other text."},
303+
{"role": "user", "content": transcript}
304+
]
305+
306+
try:
307+
fixed_transcript = generate_text(
308+
client=client,
309+
model=config["Big-Text-Model"]["model"],
310+
messages=fix_prompt,
311+
max_tokens=config["Step3"]["max_tokens"],
312+
temperature=0.3,
313+
)
314+
315+
# Try to validate the fixed transcript
316+
if validate_transcript_format(fixed_transcript.strip()):
317+
logger.info("Successfully fixed transcript format")
318+
transcript = fixed_transcript.strip()
319+
else:
320+
raise TranscriptGenerationError("Generated transcript is not in the correct format after correction attempt")
321+
322+
except Exception as retry_e:
323+
# If the fix attempt fails, raise a detailed error
324+
error_msg = f"Failed to fix transcript format: {str(retry_e)}"
325+
logger.error(error_msg)
326+
logger.error(f"Raw output (first 300 chars): {transcript[:300]}...")
327+
raise TranscriptGenerationError(error_msg)
300328

301329
# Save transcript
302330
output_file = output_dir / 'podcast_ready_data'

local_notebooklm/steps/step4.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ def load_podcast_data(data_path: Path) -> List[Tuple[str, str]]:
2222
except (ValueError, SyntaxError) as e:
2323
raise ValueError(f"Invalid podcast data format: {str(e)}")
2424

25-
def concatenate_audio_files(segment_dir: Path) -> Tuple[np.ndarray, int]:
25+
def concatenate_audio_files(segment_dir: Path, format: str = "wav") -> Tuple[np.ndarray, int]:
2626
audio_files = sorted(
27-
segment_dir.glob("*podcast_segment_*.wav"),
27+
segment_dir.glob(f"*podcast_segment_*.{format}"),
2828
key=lambda x: int(re.search(r'segment_(\d+)\.wav', str(x)).group(1))
2929
)
3030

@@ -60,6 +60,23 @@ def generate_speaker_audio(
6060
except Exception as e:
6161
raise AudioGenerationError(f"Failed to generate audio: {str(e)}")
6262

63+
def parse_audio_format(format_string):
64+
parts = format_string.split('_')
65+
66+
# Default values
67+
audio_format = parts[0]
68+
sample_rate = None
69+
bit_depth = None
70+
71+
# Extract sample rate and bit depth if provided
72+
if len(parts) > 1 and parts[1].isdigit():
73+
sample_rate = int(parts[1])
74+
75+
if len(parts) > 2 and parts[2].isdigit():
76+
bit_depth = int(parts[2])
77+
78+
return audio_format, sample_rate, bit_depth
79+
6380
def step4(
6481
client: Any = None,
6582
config: Optional[Dict[str, Any]] = None,
@@ -75,9 +92,13 @@ def step4(
7592
response_format = config["Text-To-Speech-Model"].get("audio_format", "wav")
7693

7794
try:
78-
# Convert output_dir to a Path object
7995
input_dir = Path(input_dir)
8096
output_dir = Path(output_dir)
97+
98+
audio_format, sample_rate, bit_depth = parse_audio_format(response_format)
99+
100+
if audio_format not in ["wav", "mp3", "ogg", "flac", "aac"]:
101+
raise ValueError(f"Unsupported audio format: {audio_format}")
81102

82103
# Create output directories
83104
segments_dir = output_dir / "segments"
@@ -88,7 +109,7 @@ def step4(
88109

89110
# Generate audio segments
90111
for i, (speaker, text) in enumerate(tqdm(podcast_data, desc="Generating podcast segments"), 1):
91-
output_path = segments_dir / f"podcast_segment_{i}.wav"
112+
output_path = segments_dir / f"podcast_segment_{i}"
92113

93114
if speaker == "Speaker 1":
94115
current_voice = host
@@ -112,10 +133,10 @@ def step4(
112133

113134
# Concatenate all segments
114135
logger.info("Concatenating audio segments...")
115-
final_audio, detected_sample_rate = concatenate_audio_files(segments_dir)
136+
final_audio, detected_sample_rate = concatenate_audio_files(segments_dir, audio_format)
116137

117138
# Save final podcast with the detected sample rate
118-
final_path = output_dir / "podcast.wav"
139+
final_path = f"{output_dir}/podcast.{audio_format}"
119140
sf.write(str(final_path), final_audio, detected_sample_rate)
120141
logger.info(f"Podcast generated successfully at {final_path}")
121142

local_notebooklm/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.1.2"
1+
__version__ = "1.2.0"

0 commit comments

Comments
 (0)