-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathinspect_html.py
More file actions
62 lines (53 loc) · 2.63 KB
/
inspect_html.py
File metadata and controls
62 lines (53 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python3
"""
Simple HTML inspector to see what's actually in the file
"""
from pathlib import Path
from bs4 import BeautifulSoup
html_path = Path("/home/ty/Repositories/ai_workspace/gemini_picturebook_generator/generated_stories/story_20250528_165909/The_story_of_the_first_embodie_story.html")
print("📖 Reading HTML file...")
with open(html_path, 'r', encoding='utf-8') as f:
html_content = f.read()
print(f"📊 HTML file size: {len(html_content)} characters")
# Look for the first scene to see its structure
first_scene_start = html_content.find('<div class="scene">')
if first_scene_start != -1:
# Get the first 1000 characters of the first scene
first_scene_end = html_content.find('</div>', first_scene_start + 1000)
if first_scene_end == -1:
first_scene_end = first_scene_start + 1000
scene_html = html_content[first_scene_start:first_scene_end + 6]
print("\n🎬 First scene HTML structure:")
print(scene_html)
# Now parse with BeautifulSoup and see what we get
print("\n🔍 BeautifulSoup parsing of first scene:")
soup = BeautifulSoup(scene_html, 'html.parser')
scene_div = soup.find(class_='scene')
from bs4.element import Tag, NavigableString
if scene_div and isinstance(scene_div, Tag):
print(f"Scene div found: {scene_div.name}")
all_children = [child for child in scene_div.children if isinstance(child, (Tag, NavigableString))]
print(f"Children count: {len(all_children)}")
for i, child in enumerate(all_children):
if isinstance(child, Tag):
child_name = child.name
classes = child.get('class', None)
print(f" Child {i}: {child_name} with classes {classes}")
if child_name == 'div' and classes and 'scene-text' in classes:
print(f" TEXT: {child.get_text()[:100]}...")
elif isinstance(child, NavigableString) and str(child).strip():
print(f" Child {i}: TEXT NODE: '{str(child).strip()[:50]}...'")
elif scene_div:
print(f"Scene div found but it is not a Tag (type: {type(scene_div)}).")
else:
print("❌ No scene divs found in HTML")
# Also check for scene-text specifically
scene_text_count = html_content.count('class="scene-text"')
print(f"\n📝 Found {scene_text_count} occurrences of 'class=\"scene-text\"' in raw HTML")
if scene_text_count > 0:
# Find the first one
first_scene_text = html_content.find('class="scene-text"')
start = max(0, first_scene_text - 100)
end = min(len(html_content), first_scene_text + 200)
print("First scene-text context:")
print(html_content[start:end])