-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathparse_transcripts_docx.py
More file actions
142 lines (119 loc) · 5.65 KB
/
parse_transcripts_docx.py
File metadata and controls
142 lines (119 loc) · 5.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
Parse Transcripts.docx and extract transcripts matched to video titles.
"""
import os
import re
import logging
from typing import Dict, Optional
try:
from docx import Document
HAS_DOCX = True
except ImportError:
HAS_DOCX = False
logging.warning("python-docx not installed. Cannot parse .docx files.")
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
TRANSCRIPTS_DOCX_PATH = "transcripts/Transcripts.docx"
def normalize_title(title: str) -> str:
"""Normalize title for matching (lowercase, remove special chars)."""
normalized = re.sub(r'[^\w\s]', '', title.lower())
normalized = re.sub(r'\s+', '_', normalized.strip())
return normalized[:50] # Take first 50 chars
def extract_youtube_id_from_text(text: str) -> Optional[str]:
"""Extract YouTube ID from text (could be in URL or standalone)."""
import re
# Try URL patterns
url_patterns = [
r'youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})',
r'youtu\.be/([a-zA-Z0-9_-]{11})',
r'youtube\.com/embed/([a-zA-Z0-9_-]{11})',
r'youtube\.com/v/([a-zA-Z0-9_-]{11})',
]
for pattern in url_patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
return match.group(1)
# Try standalone 11-char ID
standalone_match = re.search(r'\b([a-zA-Z0-9_-]{11})\b', text)
if standalone_match:
return standalone_match.group(1)
return None
def parse_transcripts_docx() -> Dict[str, str]:
"""
Parse Transcripts.docx and return a dictionary mapping normalized titles/IDs to transcripts.
Returns:
Dict mapping (normalized_title or youtube_id) -> full_transcript_text
Multiple keys per transcript (by title AND by YouTube ID if found)
"""
if not HAS_DOCX:
logging.warning("python-docx not available. Skipping .docx parsing.")
return {}
if not os.path.exists(TRANSCRIPTS_DOCX_PATH):
logging.warning(f"Transcripts.docx not found at {TRANSCRIPTS_DOCX_PATH}")
return {}
logging.info(f"Parsing {TRANSCRIPTS_DOCX_PATH}...")
try:
doc = Document(TRANSCRIPTS_DOCX_PATH)
transcripts = {}
current_title = None
current_transcript = []
for para in doc.paragraphs:
text = para.text.strip()
if not text:
continue
# Check if this looks like a title (short, might be hyperlink, or formatted differently)
# Titles are often followed by transcript content
# Look for patterns: hyperlinks, bold text, or short lines that could be titles
# If line is short and looks like a title, start new entry
if len(text) < 100 and (text.count(' ') < 10):
# Save previous transcript if exists
if current_title and current_transcript:
transcript_text = ' '.join(current_transcript)
if len(transcript_text) > 50: # Only save if substantial content
# Map by normalized title
normalized = normalize_title(current_title)
transcripts[normalized] = transcript_text
# Also try to extract YouTube ID from title and map by that too
youtube_id = extract_youtube_id_from_text(current_title)
if youtube_id:
# Map by YouTube ID (as-is and cleaned)
transcripts[youtube_id] = transcript_text
if youtube_id.startswith('-') or youtube_id.startswith('_'):
transcripts[youtube_id[1:]] = transcript_text
logging.debug(f" Extracted transcript for: {current_title[:50]}")
# Start new entry
current_title = text
current_transcript = []
else:
# This is transcript content
if current_title:
current_transcript.append(text)
else:
# No title yet, might be at start of doc
# Try to extract title from first substantial line
if len(text) < 100:
current_title = text
current_transcript = []
# Save last transcript
if current_title and current_transcript:
transcript_text = ' '.join(current_transcript)
if len(transcript_text) > 50:
normalized = normalize_title(current_title)
transcripts[normalized] = transcript_text
# Also map by YouTube ID if found
youtube_id = extract_youtube_id_from_text(current_title)
if youtube_id:
transcripts[youtube_id] = transcript_text
if youtube_id.startswith('-') or youtube_id.startswith('_'):
transcripts[youtube_id[1:]] = transcript_text
logging.info(f" Extracted {len(set(transcripts.values()))} unique transcripts from .docx")
logging.info(f" Created {len(transcripts)} mapping keys (by title and YouTube ID)")
return transcripts
except Exception as e:
logging.error(f"Error parsing Transcripts.docx: {e}")
return {}
if __name__ == "__main__":
transcripts = parse_transcripts_docx()
print(f"\nExtracted {len(transcripts)} transcripts")
print("\nSample titles:")
for i, title in enumerate(list(transcripts.keys())[:5]):
print(f" {i+1}. {title}")