-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathxlsx2html.py
More file actions
744 lines (628 loc) · 30.4 KB
/
xlsx2html.py
File metadata and controls
744 lines (628 loc) · 30.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
"""
xlsx2html.py - Convert Excel transcript files to HTML with links and summaries
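Usage: python xlsx2html.py input.xlsx video_id [output.html] [--summary-file FILE] [--speaker-summary-file FILE] [--meeting-summary-md-file FILE] [--batch-size MINUTES] [--enhanced-summaries]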
This script takes an Excel file generated by txt2xlsx.py and:
1. Creates direct links to video timestamps for all speakers
2. Generates AI-powered summaries for transcript sections
3. Places the summaries near their corresponding speaker links
4. Generates separate HTML and Markdown files with:
- Speaker summaries with direct timestamp links
- Meeting summaries organized by batch with links to both batch start times and
individual topic timestamps within each batch
- Topic-level navigation within meeting summaries
Examples:
python xlsx2html.py meeting.xlsx 757a2c7c-eb52-47d1-9b4a-b2a1014b530b
python xlsx2html.py meeting.xlsx 757a2c7c-eb52-47d1-9b4a-b2a1014b530b meeting_links.html
"""
import sys
import os
import pandas as pd
import argparse
import re
import time
import json
import openai
from dotenv import load_dotenv
import numpy as np
import importlib.util
# Import utility functions from utils.py
from utils import (
seconds_to_time_str,
time_str_to_seconds,
format_corrected_timestamp,
verify_timestamp_format,
get_column_letter,
extract_transcript_data,
extract_unique_speakers,
create_time_batches,
extract_text_for_batch,
find_best_timestamp_match,
update_speaker_timestamps_for_topics,
extract_topics_from_summary,
get_api_key,
)
from speaker_summary_utils import (
enhance_speaker_tracking,
summarize_speaker_topic,
generate_enhanced_speaker_summary_markdown,
generate_enhanced_speaker_summary_html,
generate_speaker_summaries_data,
)
# -------------------------------------------------------------
# Constants and Configuration
# -------------------------------------------------------------
# Load settings from a .env file into the environment (python-dotenv) so the
# os.getenv() lookups below can see them.
load_dotenv()
# API key read from the "API_KEY" environment variable (None when unset).
# NOTE(review): the functions visible in this file obtain their key through
# get_api_key() instead of this constant - confirm whether it is still needed.
OPENAI_API_KEY = os.getenv("API_KEY")
# Chat model name passed to the OpenAI API by summarize_batch();
# override with the GPT_MODEL environment variable.
MODEL = os.getenv("GPT_MODEL", "gpt-4o")
# Default batch size for meeting summaries (in minutes)
DEFAULT_BATCH_SIZE_MINUTES = 40
# Hard-coded to True; process_xlsx() and main() check this flag before using
# the enhanced speaker summaries.
ENHANCED_SUMMARIES_AVAILABLE = True
# -------------------------------------------------------------
# HTML and Markdown Generation Functions
# -------------------------------------------------------------
def generate_meeting_summaries_html(
    batches, batch_summaries, video_id, html_file, transcript_data=None
):
    """
    Generate an HTML file that lists every meeting topic as a numbered entry.

    Topics are extracted from each batch summary, pooled across all batches and
    sorted chronologically by ``timestamp_seconds`` (topics without one sort
    last). When ``video_id`` is given, each heading links to the Panopto viewer
    at the topic's timestamp. Without a ``video_id`` the timestamp is rendered
    as plain text, so no link with a missing video ID is ever produced.

    Args:
        batches (list): List of batch entries
        batch_summaries (list): List of batch summaries
        video_id (str): Panopto video ID (may be None for text-only timestamps)
        html_file (str): Output HTML file path
        transcript_data (list, optional): Full transcript data for better timestamp matching

    Returns:
        str: Path to the generated HTML file
    """
    viewer_url = "https://mit.hosted.panopto.com/Panopto/Pages/Viewer.aspx"

    # Collect fragments in a list and join once at the end.
    parts = [
        "<!DOCTYPE html>\n<html>\n<head>\n<title>Meeting Summaries</title>\n",
        "<style>\n",
        "body { font-family: Arial, sans-serif; margin: 20px; font-size: 11pt; }\n",
        "ol { list-style-position: outside; padding-left: 12px; margin-top: 2px; }\n",
        "ol li { margin-bottom: 1px; }\n",
        ".topic-content { margin-bottom: 0px; font-family: Arial, sans-serif; font-size: 11pt; margin-top: 0px; display: inline; }\n",
        # Title styling - Cambria, 11pt, #c0504d, underlined
        "h1 { font-family: Cambria, serif; font-size: 11pt; color: #c0504d; text-decoration: underline; margin-bottom: 0px; margin-top: 0px; display: inline-block; }\n",
        ".url-line { color: #1155cc; text-decoration: none; font-size: 11pt; margin-top: 2px; margin-bottom: 2px; display: block; }\n",
        # Topic styling - Arial, 11pt, #7030a0, underlined
        "h3.topic-heading { font-family: Arial, sans-serif; font-size: 11pt; color: #7030a0; text-decoration: underline; margin-top: 0px; margin-bottom: 1px; display: inline; }\n",
        "a { color: inherit; }\n",
        ".topic-link { text-decoration: underline; color: #7030a0; }\n",
        ".topic-link span { text-decoration: underline; }\n",
        "b { font-weight: bold; }\n",
        "</style>\n</head>\n<body>\n",
    ]

    # The page title is the name of the folder containing the output file.
    # This is best-effort: any failure falls back to a generic title.
    try:
        folder_name = os.path.basename(os.path.dirname(html_file))
        parts.append(f"<h1>{_format_meeting_name(folder_name)}</h1>\n")
    except Exception:
        parts.append("<h1>Meeting Summary</h1>\n")

    if video_id:
        video_link = f"{viewer_url}?id={video_id}"
        parts.append(f'<a href="{video_link}" class="url-line">{video_link}</a>\n')

    # Extract all topics from all batches
    all_topics = []
    for i, (batch, summary) in enumerate(zip(batches, batch_summaries), 1):
        # Extract topics from the summary with their timestamps
        topics = extract_topics_from_summary(summary, video_id, transcript_data)
        # If we have transcript data, update the topic timestamps to better match content
        if transcript_data:
            topics = update_speaker_timestamps_for_topics(topics, transcript_data)
        # Remember which batch each topic came from for the fallback lookup below
        for topic in topics:
            topic["batch_index"] = i
            topic["batch"] = batch
        all_topics.extend(topics)

    # Sort all topics by timestamp_seconds; topics without a timestamp go last
    all_topics.sort(
        key=lambda x: (
            x["timestamp_seconds"]
            if x["timestamp_seconds"] is not None
            else float("inf")
        )
    )

    parts.append("<ol>\n")
    for topic_info in all_topics:
        speaker = topic_info["speaker"]
        label = f'{topic_info["topic"]} - {speaker}'
        seconds = topic_info["timestamp_seconds"]

        link = None   # URL for the heading, only ever set when video_id is given
        stamp = None  # display timestamp, None when nothing could be resolved

        if seconds is not None and (topic_info["video_link"] or not video_id):
            # The summary carried its own timestamp: verify it against the
            # seconds value and link to it when a video is available.
            stamp = verify_timestamp_format(topic_info["timestamp"], seconds)
            if video_id:
                link = topic_info["video_link"]
        else:
            # Fallback: find a batch entry for this speaker. The label may
            # name several people ("A & B", "A, B", "A and B").
            names = re.split(r'\s*&\s*|,\s*| and ', speaker)
            speaker_entry = next(
                (
                    entry
                    for entry in topic_info["batch"]
                    if entry["name"] == speaker or entry["name"] in names
                ),
                None,
            )
            if speaker_entry:
                speaker_seconds = speaker_entry["seconds"]
                stamp = verify_timestamp_format(
                    speaker_entry.get("time_str", ""), speaker_seconds
                )
                if video_id:
                    link = f"{viewer_url}?id={video_id}&start={speaker_seconds}"

        parts.append('<li><h3 class="topic-heading">')
        if stamp is None:
            # No timestamp could be resolved: show the topic without a link
            parts.append(label)
        elif link:
            parts.append(f'<a href="{link}" class="topic-link">')
            parts.append(
                f'{label} <span style="color: #1155cc;">({stamp})</span></a>'
            )
        else:
            # Text-only timestamp (no video to link to)
            parts.append(f'{label} <span style="color: #1155cc;">({stamp})</span>')
        parts.append('</h3>: ')  # Colon and space before content
        parts.append(
            f'<div class="topic-content">{topic_info["content"]}</div></li>\n'
        )

    parts.append("</ol>\n</body>\n</html>")

    # Write the file
    with open(html_file, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    print(f"Generated meeting summaries HTML with verified timestamps: {html_file}")
    return html_file
def _format_meeting_name(raw_name: str) -> str:
    """Turn a raw folder name into a display title for HTML/Markdown output.

    Underscores become spaces, and a dot between an hour digit and a two-digit
    minute followed by ``am``/``pm`` becomes a colon ("4.00pm" -> "4:00pm").
    """
    import re

    # Splitting on "_" and re-joining with spaces is the same as replacing
    # every underscore with a space.
    spaced = " ".join(raw_name.split("_"))
    return re.sub(r'(?<=\d)\.(\d{2})(am|pm)', r':\1\2', spaced)
def _find_batch_entry_for_speaker(batch, speaker):
    """Return the first entry in ``batch`` spoken by ``speaker``, or None.

    ``speaker`` may name several people ("A & B", "A, B", "A and B"). An entry
    matches when its name equals the full label or any one of the listed names.
    """
    names = re.split(r'\s*&\s*|,\s*| and ', speaker)
    for entry in batch:
        if entry["name"] == speaker or entry["name"] in names:
            return entry
    return None


def generate_meeting_summaries_markdown(
    batches, batch_summaries, video_id, md_file, transcript_data=None
):
    """
    Generate a Markdown file with one entry per meeting topic.

    Topics are extracted from each batch summary, pooled across all batches and
    sorted chronologically by ``timestamp_seconds`` (topics without one sort
    last). Timestamps are rendered as clickable Panopto links when ``video_id``
    is provided and as plain text otherwise. Every displayed timestamp is passed
    through ``verify_timestamp_format`` together with its seconds value.

    Args:
        batches (list): List of batch entries
        batch_summaries (list): List of batch summaries
        video_id (str): Panopto video ID (can be None for text-only timestamps)
        md_file (str): Output Markdown file path
        transcript_data (list, optional): Full transcript data for better timestamp matching

    Returns:
        str: Path to the generated Markdown file
    """
    viewer_url = "https://mit.hosted.panopto.com/Panopto/Pages/Viewer.aspx"
    md_lines = []

    # The title is the name of the folder containing the output file. The name
    # is formatted only after it has been split off the path: substituting ":"
    # into the full path first can make a relative path look drive-prefixed on
    # Windows. Best-effort: any failure falls back to a generic title.
    try:
        folder_name = os.path.basename(os.path.dirname(md_file))
        formatted_name = re.sub(
            r'(?<=\d)\.(\d{2})(am|pm)', r':\1\2', folder_name.replace("_", " ")
        )
        if video_id:
            video_link = f"{viewer_url}?id={video_id}"
            md_lines.append(f"# [{formatted_name}]({video_link})\n")
        else:
            md_lines.append(f"# {formatted_name}\n")
    except Exception:
        md_lines.append("# Meeting Summary\n")

    # Extract all topics from all batches
    all_topics = []
    for i, (batch, summary) in enumerate(zip(batches, batch_summaries), 1):
        # Extract topics from the summary with their timestamps
        topics = extract_topics_from_summary(summary, video_id, transcript_data)
        # If we have transcript data, update the topic timestamps to better match content
        if transcript_data:
            topics = update_speaker_timestamps_for_topics(topics, transcript_data)
        # Remember which batch each topic came from for the fallback lookup below
        for topic in topics:
            topic["batch_index"] = i
            topic["batch"] = batch
        all_topics.extend(topics)

    # Sort all topics by timestamp_seconds; topics without a timestamp go last
    all_topics.sort(
        key=lambda x: (
            x["timestamp_seconds"]
            if x["timestamp_seconds"] is not None
            else float("inf")
        )
    )

    for topic_info in all_topics:
        speaker = topic_info["speaker"]
        heading = f'**{topic_info["topic"]} - {speaker}**'
        seconds = topic_info["timestamp_seconds"]

        link = None   # URL for the timestamp, only ever set when video_id is given
        stamp = None  # display timestamp, None when nothing could be resolved

        if seconds is not None and (topic_info["video_link"] or not video_id):
            # The summary carried its own timestamp: verify it against the
            # seconds value and link to it when a video is available.
            stamp = verify_timestamp_format(topic_info["timestamp"], seconds)
            if video_id:
                link = topic_info["video_link"]
        else:
            # Fallback: take the timestamp from a batch entry of this speaker
            speaker_entry = _find_batch_entry_for_speaker(
                topic_info["batch"], speaker
            )
            if speaker_entry:
                speaker_seconds = speaker_entry["seconds"]
                stamp = verify_timestamp_format(
                    speaker_entry.get("time_str", ""), speaker_seconds
                )
                if video_id:
                    link = f"{viewer_url}?id={video_id}&start={speaker_seconds}"

        if stamp is None:
            # No timestamp could be resolved: show the topic on its own
            md_lines.append(heading)
        elif link:
            md_lines.append(f"{heading} [({stamp})]({link})")
        else:
            md_lines.append(f"{heading} ({stamp})")

        # Add the content for this topic
        md_lines.append(f'{topic_info["content"]}\n')

    # Write the file
    with open(md_file, "w", encoding="utf-8") as f:
        f.write("\n".join(md_lines))
    timestamp_type = "clickable links" if video_id else "text-only timestamps"
    print(f"Generated meeting summaries markdown with {timestamp_type}: {md_file}")
    return md_file
# Topic heading as requested from the model: **Title - Speaker(s)** (H:MM:SS)
_TOPIC_HEADING_RE = re.compile(
    r"\*\*([^*]+)\*\* \(([0-9]{1,2}:[0-9]{2}:[0-9]{2})\)"
)
# Separators accepted between speaker names ("A & B", "A, B", "A and B")
_SPEAKER_SEPARATOR_RE = re.compile(r"\s*&\s*|,\s*| and ")


def _speakers_named_in_heading(heading, speaker_timestamps):
    """Return the known speakers that a ``Title - Speaker(s)`` heading names."""
    if " - " not in heading:
        return []
    # A heading that ends in exactly one known speaker name wins outright.
    exact = [
        name
        for name in speaker_timestamps
        if len(heading) > len(name) + 3 and heading.endswith(f" - {name}")
    ]
    if exact:
        return exact
    # Otherwise split the speaker part, since the prompt asks for two names.
    label = heading.rpartition(" - ")[2]
    candidates = (part.strip() for part in _SPEAKER_SEPARATOR_RE.split(label))
    return [name for name in candidates if name in speaker_timestamps]


def _enforce_known_timestamps(summary, speaker_timestamps):
    """Replace heading timestamps that were not supplied for the named speaker(s)."""

    def _fix(match):
        heading, found = match.group(1), match.group(2)
        speakers = _speakers_named_in_heading(heading, speaker_timestamps)
        if not speakers:
            # Nobody we know is named; nothing to validate against.
            return match.group(0)
        valid = {
            ts["time_str"] for name in speakers for ts in speaker_timestamps[name]
        }
        if found in valid:
            return match.group(0)
        # Use the first recorded timestamp of the first named speaker as fallback
        fallback = speaker_timestamps[speakers[0]][0]["time_str"]
        print(
            f"Warning: Replaced invalid timestamp {found} with {fallback} for {', '.join(speakers)}"
        )
        return f"**{heading}** ({fallback})"

    return _TOPIC_HEADING_RE.sub(_fix, summary)


def summarize_batch(batch_entries, batch_number, api_key):
    """
    Summarize a batch of transcript entries using the OpenAI API.

    The model is given the batch text plus every timestamp recorded for each
    speaker and is asked to emit one ``**Topic - Speaker** (H:MM:SS): ...``
    line per topic. Afterwards any heading whose timestamp was not supplied for
    the speaker(s) it names is rewritten to a known timestamp.

    Args:
        batch_entries (list): List of transcript entries for this batch
        batch_number (int): Batch number for identification (1 = meeting start)
        api_key (str): OpenAI API key

    Returns:
        str: Batch summary with topics and timestamps. When no key is given,
            the batch has no text, or the API call fails, an explanatory
            message is returned instead of raising.
    """
    if not api_key:
        return "API key not provided. Summaries not generated."

    # Extract batch text
    batch_text = extract_text_for_batch(batch_entries)
    if not batch_text.strip():
        return "No text available for summarization."

    # Time span of the batch. An entry without end_seconds contributes its
    # start time, which also covers the case where no entry has end_seconds.
    start_seconds = min(entry["seconds"] for entry in batch_entries)
    end_seconds = max(
        entry.get("end_seconds", entry["seconds"]) for entry in batch_entries
    )
    start_time = seconds_to_time_str(start_seconds)
    end_time = seconds_to_time_str(end_seconds)

    # Map each speaker to ALL of their timestamps in this batch
    speaker_timestamps = {}
    for entry in batch_entries:
        speaker_timestamps.setdefault(entry["name"], []).append(
            {
                "seconds": entry["seconds"],
                "time_str": entry["time_str"],
                "text": entry["text"][:100],  # Snippet of text for context
            }
        )

    # Prepare the timestamp reference for the model
    reference_parts = ["SPEAKER TIMESTAMPS (DO NOT MODIFY THESE):\n"]
    for speaker, timestamps in speaker_timestamps.items():
        reference_parts.append(f"\n{speaker}:\n")
        # List the speaker's timestamps chronologically with context snippets
        ordered = sorted(timestamps, key=lambda x: x["seconds"])
        for i, ts in enumerate(ordered, 1):
            reference_parts.append(f" {i}. {ts['time_str']} - '{ts['text']}...'\n")
    timestamp_reference = "".join(reference_parts)

    try:
        openai.api_key = api_key

        # Guardrails differ for the first batch (meeting start) and later ones
        if batch_number == 1:
            batch_context = (
                "NON-NEGOTIABLE GUARDRAILS FOR FIRST BATCH:\n"
                "- This is the beginning of the meeting. Start from the earliest timestamp.\n"
                '- If the earliest content is simple (greetings, technical setup), title it: "Introductions & Setup".\n'
                '- If the earliest content is substantive, title it based on the content (e.g., "Project Kickoff", "Budget Discussion").\n'
                "- NEVER claim the meeting began late or at a later timestamp."
            )
        else:
            batch_context = (
                "NON-NEGOTIABLE GUARDRAILS FOR CONTINUATION BATCH:\n"
                f"- This is batch #{batch_number} of an ongoing meeting (timespan: {start_time} - {end_time}).\n"
                f"- Start summarizing from the earliest timestamp in THIS batch ({start_time}).\n"
                '- Do NOT create "Introductions & Setup" topics - the meeting has already started.\n'
                "- Do NOT write phrases suggesting the meeting is beginning.\n"
                "- Begin directly with the substantive topics being discussed in this time segment."
            )

        # The prompt is assembled from plain string pieces so that no Python
        # syntax (quote characters, f-prefixes, parentheses) leaks into the
        # text that is sent to the model.
        prompt = (
            "You are producing a structured summary of a meeting transcript batch.\n"
            f"{batch_context}\n"
            "UNIVERSAL RULES (ALL BATCHES):\n"
            "- Never invent or modify timestamps. Use only those in SPEAKER TIMESTAMPS.\n"
            "- Obey the exact output format and paragraph-only content rule.\n"
            "- Use third person voice, never first or second person.\n"
            '- Do NOT write phrases like "the meeting began at..." or "set up in the middle of the meeting".\n'
            "\n"
            "OUTPUT FORMAT REQUIREMENTS (CRITICAL):\n"
            "1. Each topic must follow this EXACT format:\n"
            "   **Topic Title - Speaker Name** (H:MM:SS): Content...\n"
            "2. The format must be followed precisely with NO exceptions\n"
            "3. Use only exact timestamps from the provided SPEAKER TIMESTAMPS section\n"
            "4. BOLD important terms within the content: <b>terms</b>\n"
            "5. Content should be in paragraph form (no bullet points or line breaks)\n"
            "\n"
            "TIMESTAMP SELECTION RULES:\n"
            "1. Choose the MOST RELEVANT timestamp from the provided options for each speaker\n"
            "2. Match the timestamp to where the specific topic is actually discussed\n"
            "3. NEVER create or modify timestamps - use only those provided\n"
            "\n"
            "CONTENT REQUIREMENTS:\n"
            "1. Thoroughly explain each topic with technical precision\n"
            "2. Include interactions between different speakers\n"
            "3. Be detailed and comprehensive\n"
            "4. Do not hallucinate information\n"
            "5. Do not include a concluding summary paragraph\n"
            "6. Each paragraph must be of 5 minute conversation\n"
            "7. For Speaker names write two speaker names that was most involved in the topic\n"
            "\n"
            f"{timestamp_reference}\n\n"
            f"MEETING TRANSCRIPT BATCH #{batch_number} ({start_time} - {end_time}):\n\n{batch_text}\n"
            "\n"
            "INTERNAL SELF-CHECK (DO NOT PRINT): Verify privately that\n"
            "- every line matches the required pattern\n"
            "- all timestamps appear in SPEAKER TIMESTAMPS\n"
            "- no bullets or extra line breaks\n"
            "- everything is third person\n"
            "If any answer is NO, fix the output and re-check before returning.\n"
            "AFTER YOU SELF-CHECK, RETURN ONLY THE TOPIC LINES—NO EXPLANATIONS, NO CHECKLIST, NO EXTRA TEXT."
        )

        # Using chat completions API
        response = openai.chat.completions.create(
            model=MODEL,
            messages=[
                {
                    "role": "system",
                    "content": "You are a technical meeting summarizer. NEVER modify the timestamps provided to you.",
                },
                {"role": "user", "content": prompt},
            ],
            max_tokens=10000,  # More tokens for batch summaries
        )
        summary = response.choices[0].message.content.strip()

        # Post-process to verify timestamps are from the provided list
        return _enforce_known_timestamps(summary, speaker_timestamps)
    except Exception as e:
        # Callers treat the return value as the summary text, so report the
        # failure in-band rather than raising.
        return f"Error generating batch summary: {str(e)}"
# -------------------------------------------------------------
# Main Processing Function
# -------------------------------------------------------------
"""
Optimized speaker summary generation integration for xlsx2html.py
"""
# process_xlsx generates the speaker summaries data once and reuses it for both the Markdown and HTML outputs.
def process_xlsx(
    xlsx_file,
    video_id,  # Can now be None
    html_file=None,
    speaker_summary_file=None,
    meeting_summary_md_file=None,
    batch_size_minutes=DEFAULT_BATCH_SIZE_MINUTES,
    use_enhanced_summaries=False,
    summary_file=None,
):
    """
    Process an Excel transcript into speaker summaries and meeting summaries.

    Output paths that are not supplied are derived from ``xlsx_file`` by
    replacing its extension with a fixed suffix. The speaker summary files are
    only written when ``use_enhanced_summaries`` is set (and enhanced summaries
    are available); the meeting summary HTML and Markdown are always written.

    Args:
        xlsx_file (str): Path to input Excel file
        video_id (str): Panopto video ID (can be None for text-only timestamps)
        html_file (str, optional): Path to output HTML file for speaker links
        speaker_summary_file (str, optional): Path to output Markdown file for speaker summaries
        meeting_summary_md_file (str, optional): Path to output Markdown file for meeting summaries
        batch_size_minutes (int, optional): Batch size in minutes (default: DEFAULT_BATCH_SIZE_MINUTES)
        use_enhanced_summaries (bool, optional): Whether to use enhanced speaker summaries
        summary_file (str, optional): Path to output HTML file for meeting summaries

    Returns:
        tuple: Paths (html_file, summary_file, speaker_summary_file,
            meeting_summary_md_file). The speaker paths are returned even when
            those files were not written. All four values are None when no API
            key is available.

    Raises:
        Exception: Any error raised while reading or processing the file is
            reported on stderr and re-raised.
    """
    base_path = os.path.splitext(xlsx_file)[0]
    if html_file is None:
        html_file = base_path + "_speaker_summaries.html"
    if speaker_summary_file is None:
        speaker_summary_file = base_path + "_speaker_summaries.md"
    if summary_file is None:
        summary_file = base_path + "_meeting_summaries.html"
    if meeting_summary_md_file is None:
        meeting_summary_md_file = base_path + "_meeting_summaries.md"

    # Get OpenAI API key
    api_key = get_api_key()
    if not api_key:
        print("Warning: OpenAI API key not provided. Summaries will not be generated.")
        return None, None, None, None

    try:
        # Read the Excel file
        df = pd.read_excel(xlsx_file)
        # NOTE(review): the result is not used below; the call is kept in case
        # it validates or prepares the frame - confirm and remove if not.
        speaker_links = extract_unique_speakers(df)
        # Extract full transcript data
        transcript_data = extract_transcript_data(df)

        # Use enhanced speaker summaries if requested and available
        if use_enhanced_summaries and ENHANCED_SUMMARIES_AVAILABLE:
            link_type = "clickable links" if video_id else "text-only timestamps"
            print(f"Using enhanced speaker summaries with multiple topic support and {link_type}...")
            # Generate summaries data once - this avoids duplicate API calls
            print("Generating speaker topic summaries...")
            summaries_data = generate_speaker_summaries_data(transcript_data, api_key)

            # Generate enhanced speaker summary markdown
            if speaker_summary_file:
                generate_enhanced_speaker_summary_markdown(
                    transcript_data,
                    video_id,  # Can be None
                    speaker_summary_file,
                    api_key,
                    summaries_data,  # Pass the pre-generated summaries
                )
                print(
                    f"Generated enhanced speaker summary markdown: {speaker_summary_file}"
                )

            # Generate enhanced speaker summary HTML
            if html_file:
                generate_enhanced_speaker_summary_html(
                    transcript_data,
                    video_id,  # Can be None
                    html_file,
                    api_key,
                    summaries_data,  # Pass the pre-generated summaries
                )
                print(f"Generated enhanced speaker summary HTML: {html_file}")

        # Create time-based batches directly from transcript data
        print("Creating time-based batches for meeting summaries...")
        batches = create_time_batches(transcript_data, batch_size_minutes)
        print(f"Created {len(batches)} batches")

        # Generate batch summaries
        batch_summaries = []
        for i, batch in enumerate(batches, 1):
            # Time span of the batch, for progress output only. An entry
            # without end_seconds contributes its start time.
            start_seconds = min(entry["seconds"] for entry in batch)
            end_seconds = max(
                entry.get("end_seconds", entry["seconds"]) for entry in batch
            )
            start_time = seconds_to_time_str(start_seconds)
            end_time = seconds_to_time_str(end_seconds)
            print(f"Processing batch {i}/{len(batches)}: {start_time} - {end_time}")
            batch_summaries.append(summarize_batch(batch, i, api_key))

        timestamp_type = "clickable links" if video_id else "text-only timestamps"

        # Generate meeting summaries HTML with topic-level timestamps.
        # transcript_data is passed along for improved timestamp matching.
        generate_meeting_summaries_html(
            batches, batch_summaries, video_id, summary_file, transcript_data
        )
        print(f"Generated meeting summaries HTML with {timestamp_type}: {summary_file}")

        # Generate meeting summaries Markdown the same way
        generate_meeting_summaries_markdown(
            batches, batch_summaries, video_id, meeting_summary_md_file, transcript_data
        )
        print(f"Generated meeting summaries Markdown with {timestamp_type}: {meeting_summary_md_file}")

        return html_file, summary_file, speaker_summary_file, meeting_summary_md_file
    except Exception as e:
        print(f"Error processing Excel file: {e}", file=sys.stderr)
        raise


def main():
    """
    Parse the command-line arguments and process the Excel file.

    Exits with status 1 when processing raises an error.
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(
        description="Convert Excel transcript to HTML links with summaries"
    )
    parser.add_argument("input_file", help="Input Excel file")
    parser.add_argument("video_id", help="Panopto video ID (required)")
    parser.add_argument("output_file", nargs="?", help="Output HTML file (optional)")
    parser.add_argument(
        "--summary-file", help="Output file for meeting summaries HTML (optional)"
    )
    parser.add_argument(
        "--speaker-summary-file",
        help="Output file for speaker summaries markdown (optional)",
    )
    parser.add_argument(
        "--meeting-summary-md-file",
        help="Output file for meeting summaries markdown (optional)",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=DEFAULT_BATCH_SIZE_MINUTES,
        help=f"Batch size in minutes (default: {DEFAULT_BATCH_SIZE_MINUTES})",
    )
    parser.add_argument(
        "--enhanced-summaries",
        action="store_true",
        help="Generate enhanced speaker summaries with multiple topics (requires speaker_summary_utils.py)",
    )
    args = parser.parse_args()

    # Check if enhanced summaries are requested but not available
    if args.enhanced_summaries and not ENHANCED_SUMMARIES_AVAILABLE:
        print(
            "Warning: Enhanced speaker summaries requested but speaker_summary_utils.py is not available."
        )
        print("Falling back to traditional speaker summaries.")
        args.enhanced_summaries = False

    try:
        html_file, summary_html_file, speaker_summary_file, meeting_summary_md_file = (
            process_xlsx(
                args.input_file,
                args.video_id,
                args.output_file,
                args.speaker_summary_file,
                args.meeting_summary_md_file,
                args.batch_size,
                args.enhanced_summaries,
                # Forward --summary-file so the option is actually honoured
                summary_file=args.summary_file,
            )
        )
        # process_xlsx returns None paths when no API key was available
        if html_file and summary_html_file:
            print("Processing complete!")
            print(f"Speaker links HTML: {html_file}")
            print(f"Speaker summary Markdown: {speaker_summary_file}")
            print(f"Meeting summaries HTML: {summary_html_file}")
            print(f"Meeting summaries Markdown: {meeting_summary_md_file}")
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()