Skip to content

Commit 12bc29a

Browse files
fix: resolve 15 bugs and gaps in video scraper pipeline
fix: resolve 15 bugs and gaps in video scraper pipeline

- Fix extract_visual_data returning 2-tuple instead of 3 (ValueError crash)
- Move pytesseract from core deps to [video-full] optional group
- Add 30-min timeout + user feedback to video enhancement subprocess
- Add scrape_video_impl to MCP server fallback import block
- Detect auto-generated YouTube captions via is_generated property
- Forward --vision-ocr and --video-playlist through create command
- Fix filename collision for non-ASCII video titles (fallback to video_id)
- Make _vision_used a proper dataclass field on FrameSubSection
- Expose 6 visual params in MCP scrape_video tool
- Add install instructions on missing video deps in unified scraper
- Update MCP docstring tool counts (25→33, 7 categories)
- Add video and word commands to main.py docstring
- Document video-full exclusion from [all] deps in pyproject.toml
- Update parser registry test count (22→23 for video parser)

All 2437 tests passing, 0 failures.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 066e196 commit 12bc29a

File tree

13 files changed

+171
-33
lines changed

13 files changed

+171
-33
lines changed

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ dependencies = [
5252
"anthropic>=0.76.0", # Required for AI enhancement (core feature)
5353
"PyMuPDF>=1.24.14",
5454
"Pillow>=11.0.0",
55-
"pytesseract>=0.3.13",
5655
"pydantic>=2.12.3",
5756
"pydantic-settings>=2.11.0",
5857
"python-dotenv>=1.1.1",
@@ -129,6 +128,7 @@ video-full = [
129128
"scenedetect[opencv]>=0.6.4",
130129
"easyocr>=1.7.0",
131130
"opencv-python-headless>=4.9.0",
131+
"pytesseract>=0.3.13",
132132
]
133133

134134
# RAG vector database upload support
@@ -172,6 +172,8 @@ embedding = [
172172
]
173173

174174
# All optional dependencies combined (dev dependencies now in [dependency-groups])
175+
# Note: video-full deps (opencv, easyocr, faster-whisper) excluded due to heavy
176+
# native dependencies. Install separately: pip install skill-seekers[video-full]
175177
all = [
176178
"mammoth>=1.6.0",
177179
"python-docx>=1.1.0",

src/skill_seekers/cli/arguments/create.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,13 @@
488488
"metavar": "THRESH",
489489
},
490490
},
491+
"vision_ocr": {
492+
"flags": ("--vision-ocr",),
493+
"kwargs": {
494+
"action": "store_true",
495+
"help": "Use Claude Vision API as fallback for low-confidence code frames (requires ANTHROPIC_API_KEY, ~$0.004/frame)",
496+
},
497+
},
491498
}
492499

493500
# Multi-source config specific (from unified_scraper.py)

src/skill_seekers/cli/create_command.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,8 +360,12 @@ def _route_video(self) -> int:
360360

361361
# Add video source (URL or file)
362362
parsed = self.source_info.parsed
363+
video_playlist = getattr(self.args, "video_playlist", None)
363364
if parsed.get("source_kind") == "file":
364365
argv.extend(["--video-file", parsed["file_path"]])
366+
elif video_playlist:
367+
# Explicit --video-playlist flag takes precedence
368+
argv.extend(["--playlist", video_playlist])
365369
elif parsed.get("url"):
366370
url = parsed["url"]
367371
# Detect playlist vs single video
@@ -374,11 +378,15 @@ def _route_video(self) -> int:
374378
self._add_common_args(argv)
375379

376380
# Add video-specific arguments
377-
video_langs = getattr(self.args, "video_languages", None) or getattr(self.args, "languages", None)
381+
video_langs = getattr(self.args, "video_languages", None) or getattr(
382+
self.args, "languages", None
383+
)
378384
if video_langs:
379385
argv.extend(["--languages", video_langs])
380386
if getattr(self.args, "visual", False):
381387
argv.append("--visual")
388+
if getattr(self.args, "vision_ocr", False):
389+
argv.append("--vision-ocr")
382390
if getattr(self.args, "whisper_model", None) and self.args.whisper_model != "base":
383391
argv.extend(["--whisper-model", self.args.whisper_model])
384392
vi = getattr(self.args, "visual_interval", None)

src/skill_seekers/cli/main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
scrape Scrape documentation website
1313
github Scrape GitHub repository
1414
pdf Extract from PDF file
15+
word Extract from Word (.docx) file
16+
video Extract from video (YouTube or local)
1517
unified Multi-source scraping (docs + GitHub + PDF)
1618
analyze Analyze local codebase and extract code knowledge
1719
enhance AI-powered enhancement (auto: API or LOCAL mode)

src/skill_seekers/cli/unified_scraper.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,14 @@ def __init__(self, config_path: str, merge_mode: str | None = None):
7979
}
8080

8181
# Track source index for unique naming (multi-source support)
82-
self._source_counters = {"documentation": 0, "github": 0, "pdf": 0, "word": 0, "video": 0, "local": 0}
82+
self._source_counters = {
83+
"documentation": 0,
84+
"github": 0,
85+
"pdf": 0,
86+
"word": 0,
87+
"video": 0,
88+
"local": 0,
89+
}
8390

8491
# Output paths - cleaner organization
8592
self.name = self.config["name"]
@@ -583,8 +590,12 @@ def _scrape_video(self, source: dict[str, Any]):
583590
"""Scrape video source (YouTube, local file, etc.)."""
584591
try:
585592
from skill_seekers.cli.video_scraper import VideoToSkillConverter
586-
except ImportError:
587-
logger.error("video_scraper.py not found")
593+
except ImportError as e:
594+
logger.error(
595+
f"Video scraper dependencies not installed: {e}\n"
596+
" Install with: pip install skill-seekers[video]\n"
597+
" For visual extraction (frame analysis, OCR): pip install skill-seekers[video-full]"
598+
)
588599
return
589600

590601
# Multi-source support: Get unique index for this video source
@@ -630,8 +641,7 @@ def _scrape_video(self, source: dict[str, Any]):
630641
logger.info("✅ Video: Standalone SKILL.md created")
631642

632643
logger.info(
633-
f"✅ Video: {len(result.videos)} videos, "
634-
f"{result.total_segments} segments extracted"
644+
f"✅ Video: {len(result.videos)} videos, {result.total_segments} segments extracted"
635645
)
636646
except Exception as e:
637647
logger.error(f"Failed to process video source: {e}")

src/skill_seekers/cli/video_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ class FrameSubSection:
222222
ocr_regions: list[OCRRegion] = field(default_factory=list)
223223
ocr_confidence: float = 0.0
224224
panel_id: str = "" # e.g. "panel_0_0" (row_col)
225+
_vision_used: bool = False # Whether Vision API was used for OCR
225226

226227
def to_dict(self) -> dict:
227228
return {

src/skill_seekers/cli/video_scraper.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,12 @@ def build_skill(self) -> str:
469469

470470
# Generate reference files for each video
471471
for video in self.result.videos:
472-
ref_filename = f"video_{_sanitize_filename(video.title)}.md"
472+
sanitized = (
473+
_sanitize_filename(video.title)
474+
or video.video_id
475+
or f"video_{hash(video.title) % 10000:04d}"
476+
)
477+
ref_filename = f"video_{sanitized}.md"
473478
ref_path = os.path.join(refs_dir, ref_filename)
474479
ref_content = self._generate_reference_md(video)
475480
with open(ref_path, "w", encoding="utf-8") as f:
@@ -750,7 +755,12 @@ def _generate_skill_md(self) -> str:
750755
preview += "..."
751756
lines.append(f"{preview}\n")
752757

753-
ref_filename = f"video_{_sanitize_filename(video.title)}.md"
758+
sanitized = (
759+
_sanitize_filename(video.title)
760+
or video.video_id
761+
or f"video_{hash(video.title) % 10000:04d}"
762+
)
763+
ref_filename = f"video_{sanitized}.md"
754764
lines.append(
755765
f"> Full transcript: [references/{ref_filename}](references/{ref_filename})\n"
756766
)
@@ -766,7 +776,12 @@ def _generate_skill_md(self) -> str:
766776
# References
767777
lines.append("## References\n")
768778
for video in self.result.videos:
769-
ref_filename = f"video_{_sanitize_filename(video.title)}.md"
779+
sanitized = (
780+
_sanitize_filename(video.title)
781+
or video.video_id
782+
or f"video_{hash(video.title) % 10000:04d}"
783+
)
784+
ref_filename = f"video_{sanitized}.md"
770785
lines.append(f"- [{video.title}](references/{ref_filename})")
771786

772787
return "\n".join(lines)
@@ -940,11 +955,25 @@ def _run_video_enhancement(skill_dir: str, enhance_level: int, args) -> None:
940955
if api_key:
941956
enhance_cmd.extend(["--api-key", api_key])
942957

943-
result = subprocess.run(enhance_cmd, check=True)
944-
if result.returncode == 0:
945-
logger.info("✅ Video skill enhancement complete!")
946-
except subprocess.CalledProcessError:
947-
logger.warning("⚠ Enhancement failed, but skill was still built")
958+
logger.info(
959+
"Starting video skill enhancement (this may take 10+ minutes "
960+
"for large videos with AI enhancement)..."
961+
)
962+
subprocess.run(enhance_cmd, check=True, timeout=1800)
963+
logger.info("Video skill enhancement complete!")
964+
except subprocess.TimeoutExpired:
965+
logger.warning(
966+
"⚠ Enhancement timed out after 30 minutes. "
967+
"The skill was still built without enhancement. "
968+
"You can retry manually with:\n"
969+
f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}"
970+
)
971+
except subprocess.CalledProcessError as exc:
972+
logger.warning(
973+
f"⚠ Enhancement failed (exit code {exc.returncode}), "
974+
"but skill was still built. You can retry manually with:\n"
975+
f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}"
976+
)
948977
except FileNotFoundError:
949978
logger.warning("⚠ skill-seekers-enhance not found. Run manually:")
950979
logger.info(f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}")

src/skill_seekers/cli/video_transcript.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,36 @@ def extract_youtube_transcript(
7070

7171
try:
7272
ytt_api = YouTubeTranscriptApi()
73-
transcript = ytt_api.fetch(video_id, languages=languages)
7473

75-
segments = []
74+
# Use list_transcripts to detect whether the transcript is auto-generated
7675
source = TranscriptSource.YOUTUBE_MANUAL
76+
try:
77+
transcript_list = ytt_api.list(video_id)
78+
# Prefer manually created transcripts; fall back to auto-generated
79+
try:
80+
transcript_entry = transcript_list.find_manually_created_transcript(languages)
81+
source = TranscriptSource.YOUTUBE_MANUAL
82+
except Exception:
83+
try:
84+
transcript_entry = transcript_list.find_generated_transcript(languages)
85+
source = TranscriptSource.YOUTUBE_AUTO
86+
except Exception:
87+
# Fall back to any available transcript
88+
transcript_entry = transcript_list.find_transcript(languages)
89+
source = (
90+
TranscriptSource.YOUTUBE_AUTO
91+
if transcript_entry.is_generated
92+
else TranscriptSource.YOUTUBE_MANUAL
93+
)
94+
transcript = transcript_entry.fetch()
95+
except Exception:
96+
# Fall back to direct fetch if list fails (older API versions)
97+
transcript = ytt_api.fetch(video_id, languages=languages)
98+
# Check is_generated on the FetchedTranscript if available
99+
if getattr(transcript, "is_generated", False):
100+
source = TranscriptSource.YOUTUBE_AUTO
101+
102+
segments = []
77103
for snippet in transcript.snippets:
78104
text = snippet.text.strip()
79105
if not text:

src/skill_seekers/cli/video_visual.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1864,7 +1864,7 @@ def _ocr_single_panel(
18641864
panel_id=f"panel_{row}_{col}",
18651865
)
18661866
# Stash vision_used flag for the caller to count
1867-
ss._vision_used = vision_used # type: ignore[attr-defined]
1867+
ss._vision_used = vision_used
18681868
return ss
18691869

18701870

@@ -1918,7 +1918,7 @@ def extract_visual_data(
19181918
cap = cv2.VideoCapture(video_path)
19191919
if not cap.isOpened():
19201920
logger.error(f"Cannot open video: {video_path}")
1921-
return [], []
1921+
return [], [], None
19221922

19231923
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
19241924
total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
@@ -2003,7 +2003,7 @@ def extract_visual_data(
20032003
for fut in concurrent.futures.as_completed(futures):
20042004
ss = fut.result()
20052005
if ss is not None:
2006-
if getattr(ss, "_vision_used", False):
2006+
if ss._vision_used:
20072007
vision_api_frames += 1
20082008
sub_sections.append(ss)
20092009
else:
@@ -2018,7 +2018,7 @@ def extract_visual_data(
20182018
use_vision_api,
20192019
)
20202020
if ss is not None:
2021-
if getattr(ss, "_vision_used", False):
2021+
if ss._vision_used:
20222022
vision_api_frames += 1
20232023
sub_sections.append(ss)
20242024

src/skill_seekers/mcp/server_fastmcp.py

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,21 @@
33
Skill Seeker MCP Server (FastMCP Implementation)
44
55
Modern, decorator-based MCP server using FastMCP for simplified tool registration.
6-
Provides 25 tools for generating Claude AI skills from documentation.
6+
Provides 33 tools for generating Claude AI skills from documentation.
77
88
This is a streamlined alternative to server.py (2200 lines → 708 lines, 68% reduction).
99
All tool implementations are delegated to modular tool files in tools/ directory.
1010
1111
**Architecture:**
1212
- FastMCP server with decorator-based tool registration
13-
- 25 tools organized into 6 categories:
13+
- 33 tools organized into 7 categories:
1414
* Config tools (3): generate_config, list_configs, validate_config
15-
* Scraping tools (8): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
15+
* Scraping tools (10): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
1616
* Packaging tools (4): package_skill, upload_skill, enhance_skill, install_skill
1717
* Splitting tools (2): split_config, generate_router
18-
* Source tools (4): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
18+
* Source tools (5): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
1919
* Vector Database tools (4): export_to_weaviate, export_to_chroma, export_to_faiss, export_to_qdrant
20+
* Workflow tools (5): list_workflows, get_workflow, create_workflow, update_workflow, delete_workflow
2021
2122
**Usage:**
2223
# Stdio transport (default, backward compatible)
@@ -140,6 +141,7 @@
140141
scrape_docs_impl,
141142
scrape_github_impl,
142143
scrape_pdf_impl,
144+
scrape_video_impl,
143145
split_config_impl,
144146
submit_config_impl,
145147
upload_skill_impl,
@@ -250,7 +252,7 @@ async def validate_config(config_path: str) -> str:
250252

251253

252254
# ============================================================================
253-
# SCRAPING TOOLS (4 tools)
255+
# SCRAPING TOOLS (10 tools)
254256
# ============================================================================
255257

256258

@@ -432,6 +434,12 @@ async def scrape_video(
432434
description: str | None = None,
433435
languages: str | None = None,
434436
from_json: str | None = None,
437+
visual: bool = False,
438+
whisper_model: str | None = None,
439+
visual_interval: float | None = None,
440+
visual_min_gap: float | None = None,
441+
visual_similarity: float | None = None,
442+
vision_ocr: bool = False,
435443
) -> str:
436444
"""
437445
Scrape video content and build Claude skill.
@@ -444,6 +452,12 @@ async def scrape_video(
444452
description: Skill description
445453
languages: Transcript language preferences (comma-separated)
446454
from_json: Build from extracted JSON file
455+
visual: Enable visual frame extraction (requires video-full extras)
456+
whisper_model: Whisper model size for local transcription (e.g., base, small, medium, large)
457+
visual_interval: Seconds between frame captures (default: 5.0)
458+
visual_min_gap: Minimum seconds between kept frames (default: 2.0)
459+
visual_similarity: Similarity threshold to skip duplicate frames 0.0-1.0 (default: 0.95)
460+
vision_ocr: Use vision model for OCR on extracted frames
447461
448462
Returns:
449463
Video scraping results with file paths.
@@ -463,6 +477,18 @@ async def scrape_video(
463477
args["languages"] = languages
464478
if from_json:
465479
args["from_json"] = from_json
480+
if visual:
481+
args["visual"] = visual
482+
if whisper_model:
483+
args["whisper_model"] = whisper_model
484+
if visual_interval is not None:
485+
args["visual_interval"] = visual_interval
486+
if visual_min_gap is not None:
487+
args["visual_min_gap"] = visual_min_gap
488+
if visual_similarity is not None:
489+
args["visual_similarity"] = visual_similarity
490+
if vision_ocr:
491+
args["vision_ocr"] = vision_ocr
466492

467493
result = await scrape_video_impl(args)
468494
if isinstance(result, list) and result:

0 commit comments

Comments (0)