Skip to content

Commit 8109f18

Browse files
Merge pull request #7 from Goekdeniz-Guelmez/adding-webui-and-personas
Adding webui
2 parents d3f018a + 0718eac commit 8109f18

File tree

13 files changed

+289
-26
lines changed

13 files changed

+289
-26
lines changed

.DS_Store

6 KB
Binary file not shown.

.gitignore

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,10 @@ test.text
1414
venv
1515

1616
.conda
17-
conda
17+
conda
18+
19+
local_notebooklm/web_ui/output
20+
.local_notebooklm/web_ui/output
21+
22+
.gradio
23+
gradio

README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,47 @@ else:
310310
print(f"Failed to generate podcast: {result}")
311311
```
312312

313+
### Gradio Web UI
314+
315+
Local-NotebookLM now includes a user-friendly Gradio web interface that makes it easy to use the tool without command line knowledge:
316+
317+
```bash
318+
python -m local_notebooklm.web_ui
319+
```
320+
321+
By default, the web UI runs locally on http://localhost:7860. You can access it from your browser.
322+
323+
#### Web UI Screenshots
324+
325+
![Web UI Main Screen](examples/Gradio-WebUI.png)
326+
*The main interface of the Local-NotebookLM web UI*
327+
328+
#### Web UI Options
329+
330+
| Option | Description | Default |
331+
|--------|-------------|---------|
332+
| `--share` | Make the UI accessible over the network | False |
333+
| `--port` | Specify a custom port | 7860 |
334+
335+
#### Example Commands
336+
337+
Basic local usage:
338+
```bash
339+
python -m local_notebooklm.web_ui
340+
```
341+
342+
Share with others on your network:
343+
```bash
344+
python -m local_notebooklm.web_ui --share
345+
```
346+
347+
Use a custom port:
348+
```bash
349+
python -m local_notebooklm.web_ui --port 8080
350+
```
351+
352+
The web interface provides all the same options as the command line tool in an intuitive UI, making it easier for non-technical users to generate audio content from PDFs.
353+
313354
### FastAPI Server
314355

315356
Start the FastAPI server to access the functionality via a web API:

example_config.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
{
2-
"Co-Host-Speaker-Voice": "af_sky+af_bella",
2+
"Co-Host-Speaker-1-Voice": "af_sky+af_bella",
3+
"Co-Host-Speaker-2-Voice": "af_echo",
4+
"Co-Host-Speaker-3-Voice": "af_nova",
5+
"Co-Host-Speaker-4-Voice": "af_shimmer",
36
"Host-Speaker-Voice": "af_alloy",
47

58
"Small-Text-Model": {

examples/Gradio-WebUI.png

316 KB
Loading

local_notebooklm/steps/helpers.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,14 @@
1010
FormatType = Literal[
1111
"podcast", "interview", "panel-discussion", "debate",
1212
"summary", "narration", "storytelling", "explainer",
13-
"lecture", "tutorial", "q-and-a",
14-
"news-report", "executive-brief", "meeting", "analysis"
13+
"lecture", "tutorial", "q-and-a","news-report",
14+
"executive-brief", "meeting", "analysis",
15+
"three-people-podcast", "three-people-panel-discussion",
16+
"three-people-debate", "four-people-podcast",
17+
"four-people-panel-discussion", "four-people-debate",
18+
"five-people-podcast", "five-people-panel-discussion", "five-people-debate"
1519
]
1620

17-
1821
SingleSpeakerFormats = Literal[
1922
"summary", "narration", "storytelling", "explainer",
2023
"lecture", "tutorial", "news-report", "executive-brief", "analysis"
@@ -60,6 +63,7 @@
6063

6164
StyleType = Literal["normal", "friendly", "professional", "academic", "casual", "technical", "gen-z", "funny"]
6265

66+
SkipToOptions = [None, 1, 2, 3, 4]
6367

6468
def wait_for_next_step(seconds: float = 2):
6569
time.sleep(seconds)
@@ -223,13 +227,14 @@ def generate_speech(
223227
output_path: str = "output"
224228
):
225229
if isinstance(client, ElevenLabs):
230+
file_extension = response_format.split('_')[0].split('-')[0]
226231
audio = client.text_to_speech.convert(
227232
text=text,
228233
voice_id=voice,
229234
model_id=model_name,
230235
output_format=response_format,
231236
)
232-
save(audio=audio, filename=str(f"{output_path}.{response_format}"))
237+
save(audio=audio, filename=str(f"{output_path}.{file_extension}"))
233238
else:
234239
with client.audio.speech.with_streaming_response.create(
235240
model=model_name,

local_notebooklm/steps/prompts.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -329,15 +329,15 @@
329329
Make sure to structure the response exactly like this:
330330
331331
[
332-
("Speaker n", "translated text 1"),
333-
("Speaker n", "translated text 2"),
334-
("Speaker n", "translated text 3")
332+
("Speaker n", "translated text 1"),
333+
("Speaker n", "translated text 2"),
334+
("Speaker n", "translated text 3")
335335
]
336336
337337
Do not change or rewrite the text in any way other than reformatting it into a list of tuples and translating it. Your response must only change the format and language, not the content.
338338
339339
Ensure your output is in the correct tuple format, the speaker’s dialogue remains faithful to the original text, and the translation accurately reflects the meaning in {language}.
340-
DO NOT include episode titles, named speakers, intros, or section headersONLY provide raw dialogue labeled as ‘Speaker 1,’ ‘Speaker 2,’ etc. ONLY ONE SPEAKER CAN TALK AT A TIME."""
340+
DO NOT include episode titles, named speakers, intros, section headers, or ``` — ONLY provide raw dialogue labeled as ‘Speaker 1,’ ‘Speaker 2,’ etc. ONLY ONE SPEAKER CAN TALK AT A TIME."""
341341

342342

343343
gen_z_mapping_prompt = """Infuse humor, pop culture references, and a very laid-back conversational tone. Keep it **engaging, slightly chaotic, and fun, but still clear and informative.** Use modern wording naturally, like:

local_notebooklm/steps/step3.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,14 @@ def generate_rewritten_transcript(
4848
{"role": "system", "content": system_prompt},
4949
{"role": "user", "content": input_text},
5050
]
51-
return generate_text(
51+
out = generate_text(
5252
client=client,
5353
model=model_name,
5454
messages=conversation,
5555
max_tokens=max_tokens,
5656
temperature=temperature
5757
)
58+
return out
5859

5960
except Exception as e:
6061
raise TranscriptGenerationError(f"Failed to generate transcript: {str(e)}")
@@ -198,7 +199,7 @@ def generate_rewritten_transcript_with_overlap(
198199
model=model_name,
199200
messages=fix_prompt,
200201
max_tokens=max_tokens,
201-
temperature=0.2, # Lower temperature for more deterministic output
202+
temperature=0.3,
202203
)
203204

204205
# Try to parse the fixed transcript
@@ -294,9 +295,36 @@ def step3(
294295
language=language
295296
)
296297

297-
# Validate transcript format
298298
if not validate_transcript_format(transcript):
299-
raise TranscriptGenerationError("Generated transcript is not in the correct format")
299+
logger.warning("Generated transcript is not in the correct format. Attempting to fix...")
300+
301+
fix_prompt = [
302+
{"role": "system", "content": "Convert the following text into valid Python syntax as a list of tuples with format: [('Speaker1', 'Text1'), ('Speaker2', 'Text2'), ...]. Return ONLY the Python list, nothing else, no other text."},
303+
{"role": "user", "content": transcript}
304+
]
305+
306+
try:
307+
fixed_transcript = generate_text(
308+
client=client,
309+
model=config["Big-Text-Model"]["model"],
310+
messages=fix_prompt,
311+
max_tokens=config["Step3"]["max_tokens"],
312+
temperature=0.3,
313+
)
314+
315+
# Try to validate the fixed transcript
316+
if validate_transcript_format(fixed_transcript.strip()):
317+
logger.info("Successfully fixed transcript format")
318+
transcript = fixed_transcript.strip()
319+
else:
320+
raise TranscriptGenerationError("Generated transcript is not in the correct format after correction attempt")
321+
322+
except Exception as retry_e:
323+
# If the fix attempt fails, raise a detailed error
324+
error_msg = f"Failed to fix transcript format: {str(retry_e)}"
325+
logger.error(error_msg)
326+
logger.error(f"Raw output (first 300 chars): {transcript[:300]}...")
327+
raise TranscriptGenerationError(error_msg)
300328

301329
# Save transcript
302330
output_file = output_dir / 'podcast_ready_data'

local_notebooklm/steps/step4.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ def load_podcast_data(data_path: Path) -> List[Tuple[str, str]]:
2222
except (ValueError, SyntaxError) as e:
2323
raise ValueError(f"Invalid podcast data format: {str(e)}")
2424

25-
def concatenate_audio_files(segment_dir: Path) -> Tuple[np.ndarray, int]:
25+
def concatenate_audio_files(segment_dir: Path, format: str = "wav") -> Tuple[np.ndarray, int]:
2626
audio_files = sorted(
27-
segment_dir.glob("*podcast_segment_*.wav"),
27+
segment_dir.glob(f"*podcast_segment_*.{format}"),
2828
key=lambda x: int(re.search(r'segment_(\d+)\.wav', str(x)).group(1))
2929
)
3030

@@ -60,6 +60,23 @@ def generate_speaker_audio(
6060
except Exception as e:
6161
raise AudioGenerationError(f"Failed to generate audio: {str(e)}")
6262

63+
def parse_audio_format(format_string):
64+
parts = format_string.split('_')
65+
66+
# Default values
67+
audio_format = parts[0]
68+
sample_rate = None
69+
bit_depth = None
70+
71+
# Extract sample rate and bit depth if provided
72+
if len(parts) > 1 and parts[1].isdigit():
73+
sample_rate = int(parts[1])
74+
75+
if len(parts) > 2 and parts[2].isdigit():
76+
bit_depth = int(parts[2])
77+
78+
return audio_format, sample_rate, bit_depth
79+
6380
def step4(
6481
client: Any = None,
6582
config: Optional[Dict[str, Any]] = None,
@@ -75,9 +92,13 @@ def step4(
7592
response_format = config["Text-To-Speech-Model"].get("audio_format", "wav")
7693

7794
try:
78-
# Convert output_dir to a Path object
7995
input_dir = Path(input_dir)
8096
output_dir = Path(output_dir)
97+
98+
audio_format, sample_rate, bit_depth = parse_audio_format(response_format)
99+
100+
if audio_format not in ["wav", "mp3", "ogg", "flac", "aac"]:
101+
raise ValueError(f"Unsupported audio format: {audio_format}")
81102

82103
# Create output directories
83104
segments_dir = output_dir / "segments"
@@ -88,7 +109,7 @@ def step4(
88109

89110
# Generate audio segments
90111
for i, (speaker, text) in enumerate(tqdm(podcast_data, desc="Generating podcast segments"), 1):
91-
output_path = segments_dir / f"podcast_segment_{i}.wav"
112+
output_path = segments_dir / f"podcast_segment_{i}"
92113

93114
if speaker == "Speaker 1":
94115
current_voice = host
@@ -112,10 +133,10 @@ def step4(
112133

113134
# Concatenate all segments
114135
logger.info("Concatenating audio segments...")
115-
final_audio, detected_sample_rate = concatenate_audio_files(segments_dir)
136+
final_audio, detected_sample_rate = concatenate_audio_files(segments_dir, audio_format)
116137

117138
# Save final podcast with the detected sample rate
118-
final_path = output_dir / "podcast.wav"
139+
final_path = f"{output_dir}/podcast.{audio_format}"
119140
sf.write(str(final_path), final_audio, detected_sample_rate)
120141
logger.info(f"Podcast generated successfully at {final_path}")
121142

local_notebooklm/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.1.2"
1+
__version__ = "1.2.0"

0 commit comments

Comments
 (0)