diff --git a/.github/workflows/genai-video-slide-deck-annotator.yml b/.github/workflows/genai-video-slide-deck-annotator.yml new file mode 100644 index 0000000..b29d1a9 --- /dev/null +++ b/.github/workflows/genai-video-slide-deck-annotator.yml @@ -0,0 +1,36 @@ +name: genai video slide deck annotator +on: + issues: + types: [opened, edited] +permissions: + contents: read + issues: write + models: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true +jobs: + genai-video-slide-deck-analyze: + runs-on: ubuntu-latest + services: + whisper: + image: onerahmet/openai-whisper-asr-webservice:latest + env: + ASR_MODEL: base + ASR_ENGINE: openai_whisper + ports: + - 9000:9000 + options: >- + --health-cmd "curl -f http://localhost:9000/docs || exit 1" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + --health-start-period 20s + steps: + - uses: actions/checkout@v4 + - uses: pelikhan/action-genai-video-issue-analyzer@main + with: + script: action-video-slide-deck-annotator + github_issue: ${{ github.event.issue.number }} + github_token: ${{ secrets.GITHUB_TOKEN }} + instructions: "Analyze the video frames to detect slide transitions in a presentation. Focus on identifying significant visual changes that indicate when slides change, ignore minor changes like cursor movement or highlighting. Generate timestamps with confidence scores for each detected transition." \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 85cc44b..d06cc46 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,8 +15,11 @@ WORKDIR /genaiscript/action # Copy source code COPY . . 
+# Make entrypoint script executable +RUN chmod +x entrypoint.sh + # Install dependencies RUN npm ci # GitHub Action forces the WORKDIR to GITHUB_WORKSPACE -ENTRYPOINT ["npm", "--prefix", "/genaiscript/action", "start"] \ No newline at end of file +ENTRYPOINT ["./entrypoint.sh"] \ No newline at end of file diff --git a/README.md b/README.md index 534fda6..678195a 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,32 @@ This GitHub Action runs all video assets in an issue body through a LLM model to analyze the content. The default behavior is to summarize and extract task items but this can be customized through the `prompt` input. +**New**: The action now supports slide deck analysis with the `action-video-slide-deck-annotator` script that detects slide transitions and generates timestamps for presentations. + +## Scripts Available + +- **action-video-issue-analyzer** (default): Analyzes videos for general content summary and task extraction +- **action-video-slide-deck-annotator**: Detects slide transitions in presentation videos and generates structured timestamps + +## Supported Video Sources + +The action supports videos from multiple sources: + +### GitHub User Attachments (Default) +Standard GitHub issue attachments uploaded via the web interface: +- `https://github.com/user-attachments/assets/...` + +### Git Large File Storage (LFS) +For large video files stored in Git LFS, the action supports: +- **Raw files**: `https://github.com/user/repo/raw/branch/video.mp4` +- **Release downloads**: `https://github.com/user/repo/releases/download/tag/video.mp4` +- **Blob files**: `https://github.com/user/repo/blob/branch/path/video.mp4` + +The action automatically detects the source type and uses appropriate download methods. For large files (>500MB), progress warnings are displayed during processing. + ## Inputs +- `script`: The script to run (action-video-issue-analyzer or action-video-slide-deck-annotator). 
**(optional, defaults to action-video-issue-analyzer)** - `github_token`: GitHub token with `models: read` permission at least. **(required)** - `instructions`: Custom prompt to use for the LLM model. If not provided, a default prompt will be used. - `github_issue`: The issue number to analyze. Typically this variable is inferred from the event context. @@ -38,6 +62,47 @@ It will launch a whisper service in a container that can be used by genaiscript. github_token: ${{ secrets.GITHUB_TOKEN }} ``` +## Slide Deck Annotator Usage + +To use the slide deck annotator for detecting slide transitions in presentation videos: + +```yaml + steps: + - uses: actions/checkout@v4 + - uses: pelikhan/action-genai-video-issue-analyzer@v0 + with: + script: action-video-slide-deck-annotator + github_token: ${{ secrets.GITHUB_TOKEN }} +``` + +The slide deck annotator will output structured JSON with: +- Video duration +- Slide transition timestamps with confidence scores +- Recommended 2-minute viewing segments for each slide + +Example output: +```json +{ + "video_duration": "01:23:45", + "slide_transitions": [ + { + "timestamp": "00:02:15", + "confidence": 0.95, + "slide_number": 1, + "description": "Title slide to agenda" + } + ], + "recommended_segments": [ + { + "start": "00:00:00", + "end": "00:02:00", + "slide": 1, + "description": "First 2 minutes of title slide" + } + ] +} +``` + ## Example Save the following in `.github/workflows/genai-video-issue-analyzer.yml` file: diff --git a/action.yml b/action.yml index 856a583..1dde282 100644 --- a/action.yml +++ b/action.yml @@ -1,12 +1,19 @@ name: action-genai-video-issue-analyzer description: Analyzes videos upload as assets inputs: + script: + description: The script to run (action-video-issue-analyzer or action-video-slide-deck-annotator). + required: false + default: action-video-issue-analyzer instructions: description: Custom prompting instructions for each video. 
required: false default: Analyze the video and provide a summary of its content. Extract list of followup subissues if any. The transcript is your primary source of text information, ignore text in images. + video_file_path: + description: Path to the video file in the repository (for slide deck annotator). + required: false github_token: description: "GitHub token with `models: read` permission at least (https://microsoft.github.io/genaiscript/reference/github-actions/#github\ diff --git a/entrypoint.sh b/entrypoint.sh new file mode 100755 index 0000000..73a8b95 --- /dev/null +++ b/entrypoint.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +# Set the script name from the input parameter, defaulting to action-video-issue-analyzer +export SCRIPT_NAME="${INPUT_SCRIPT:-action-video-issue-analyzer}" + +# Set the whisper API base +export WHISPERASR_API_BASE=http://whisper:9000 + +# Build the command arguments (positional parameters keep values with spaces intact) +set -- --github-workspace --pull-request-comment --no-run-trace --no-output-trace + +# Add video file path parameter if provided and using slide deck annotator +if [ "$SCRIPT_NAME" = "action-video-slide-deck-annotator" ] && [ -n "$INPUT_VIDEO_FILE_PATH" ]; then + set -- "$@" --args "video_file_path=$INPUT_VIDEO_FILE_PATH" +fi + +# Run genaiscript directly with the selected script +cd /genaiscript/action +npx genaiscript run "$SCRIPT_NAME" "$@" \ No newline at end of file diff --git a/genaisrc/action-video-issue-analyzer.genai.mts b/genaisrc/action-video-issue-analyzer.genai.mts index 938bc9a..338fb7e 100644 --- a/genaisrc/action-video-issue-analyzer.genai.mts +++ b/genaisrc/action-video-issue-analyzer.genai.mts @@ -23,10 +23,20 @@ if (!instructions) "No instructions provided. 
Please provide instructions to process the video.", ); -const RX = /^https:\/\/github.com\/user-attachments\/assets\/.+$/gim; -const assetLinks = Array.from( - new Set(Array.from(issue.body.matchAll(RX), (m) => m[0])), +// Pattern for GitHub user attachments +const USER_ATTACHMENTS_RX = + /^https:\/\/github.com\/user-attachments\/assets\/.+$/gim; +// Pattern for Git LFS files (raw GitHub URLs, releases, etc.) +const GIT_LFS_RX = + /^https:\/\/github.com\/[^\/]+\/[^\/]+\/(?:raw\/[^\/]+\/|releases\/download\/[^\/]+\/|blob\/[^\/]+\/).+\.(mp4|mov|avi|mkv|webm|flv|m4v)$/gim; + +const userAttachmentLinks = Array.from( + new Set(Array.from(issue.body.matchAll(USER_ATTACHMENTS_RX), (m) => m[0])), +); +const gitLfsLinks = Array.from( + new Set(Array.from(issue.body.matchAll(GIT_LFS_RX), (m) => m[0])), ); +const assetLinks = [...userAttachmentLinks, ...gitLfsLinks]; if (assetLinks.length === 0) cancel("No video assets found in the issue body, nothing to do."); @@ -37,18 +47,95 @@ for (const assetLink of assetLinks) await processAssetLink(assetLink); async function processAssetLink(assetLink: string) { output.heading(3, assetLink); dbg(assetLink); - const downloadUrl = await github.resolveAssetUrl(assetLink); - const res = await fetch(downloadUrl, { method: "GET" }); + + let downloadUrl: string; + let isGitLfs = false; + + // Determine if this is a Git LFS URL or user attachment + if (assetLink.match(GIT_LFS_RX)) { + isGitLfs = true; + downloadUrl = assetLink; // Use the URL directly for Git LFS files + dbg(`Detected Git LFS URL: %s`, assetLink); + } else { + downloadUrl = await github.resolveAssetUrl(assetLink); + dbg(`Resolved user attachment URL: %s`, downloadUrl); + } + + // Add appropriate headers for Git LFS if needed + const headers: Record<string, string> = {}; + if (isGitLfs) { + headers["Accept"] = "application/vnd.git-lfs+json"; + // GitHub token will be handled by the environment if needed + } + + const res = await fetch(downloadUrl, { + method: "GET", + headers, + }); + + const 
contentType = res.headers.get("content-type") || ""; + const contentLength = res.headers.get("content-length"); + dbg(`download url: %s`, downloadUrl); dbg(`headers: %O`, res.headers); - if (!res.ok) + dbg(`content-type: %s`, contentType); + dbg(`content-length: %s`, contentLength); + + if (!res.ok) { + if (res.status === 404 && isGitLfs) { + throw new Error( + `Git LFS file not found: ${assetLink}. The file may be too large or not available via LFS.`, + ); + } throw new Error( `Failed to download asset from ${downloadUrl}: ${res.status} ${res.statusText}`, ); + } + + // Check file size before downloading large files + if (contentLength) { + const sizeInMb = parseInt(contentLength) / 1e6; + dbg(`File size: ${sizeInMb.toFixed(1)}MB`); + + // Warn for very large files (>500MB) but still process them + if (sizeInMb > 500) { + output.p( + `⚠️ Large file detected (${sizeInMb.toFixed(1)}MB). Processing may take longer.`, + ); + } + } + + // For Git LFS, we might get a JSON response with download info instead of the actual file + if (isGitLfs && contentType.includes("application/json")) { + try { + const lfsInfo = await res.json(); + if (lfsInfo.download_url) { + dbg(`Git LFS redirect to: %s`, lfsInfo.download_url); + return processAssetLink(lfsInfo.download_url); // Recursively process the actual download URL + } + } catch (e) { + // If JSON parsing fails, treat as regular download + dbg( + `Failed to parse LFS JSON response, treating as direct download: %s`, + e.message, + ); + } + } + + // Check if content type indicates a video file if (!/^video\//.test(contentType)) { - output.p(`Asset is not a video file, skipping`); - return; + // For Git LFS URLs, also check file extension since content-type might not be set correctly + if (isGitLfs && /\.(mp4|mov|avi|mkv|webm|flv|m4v)$/i.test(assetLink)) { + dbg( + `Git LFS file extension indicates video, proceeding despite content-type: %s`, + contentType, + ); + } else { + output.p( + `Asset is not a video file (content-type: 
${contentType}), skipping`, + ); + return; + } } // save and cache diff --git a/genaisrc/action-video-slide-deck-annotator.genai.mts b/genaisrc/action-video-slide-deck-annotator.genai.mts new file mode 100644 index 0000000..a6b904e --- /dev/null +++ b/genaisrc/action-video-slide-deck-annotator.genai.mts @@ -0,0 +1,168 @@ +script({ + title: "Analyzes videos to detect slide transitions and generate timestamps", + accept: "none", + parameters: { + instructions: { + type: "string", + description: + "Custom prompting instructions for slide transition detection.", + default: + "Analyze the video frames to detect slide transitions in a presentation. Focus on identifying significant visual changes that indicate when slides change, ignore minor changes like cursor movement or highlighting. Generate timestamps with confidence scores for each detected transition.", + }, + video_file_path: { + type: "string", + description: "Path to the video file in the repository.", + }, + }, +}); + +const { dbg, output, vars } = env; +const { instructions, video_file_path } = vars as { + instructions: string; + video_file_path?: string; +}; + +if (!instructions) { + throw new Error( + "No instructions provided. Please provide instructions to process the video.", + ); +} + +if (!video_file_path) { + throw new Error( + "No video file path provided. 
Please provide the path to the video file in the repository.", ); } dbg(`Processing video file: %s`, video_file_path); // Process the video file directly await processVideo(video_file_path); async function processVideo(filename: string) { const transcript = await transcribe(filename, { model: "whisperasr:default", cache: true, }); if (!transcript) { output.error(`no transcript found for video ${filename}.`); } // Extract frames for slide transition detection const frames = await ffmpeg.extractFrames(filename, { transcript, }); const { text, error } = await runPrompt( (ctx) => { ctx.def("TRANSCRIPT", transcript?.srt, { ignoreEmpty: true }); // ignore silent videos ctx.defImages(frames, { detail: "high", sliceSample: 80 }); // higher detail for slide detection ctx.$`${instructions} ## Analysis Instructions You are analyzing a video of a slide deck presentation. Your task is to: 1. **Detect Slide Transitions**: Identify when the content significantly changes between frames, indicating a new slide 2. **Filter Noise**: Ignore minor changes like cursor movement, highlighting, or small animations 3. **Generate Timestamps**: Provide accurate timestamps for each transition 4. **Assess Confidence**: Rate your confidence in each detection (0.0 to 1.0) 5. 
**Create Viewing Segments**: Generate recommended 2-minute viewing segments for each slide ## Output Format Respond with a valid JSON object in the following format: \`\`\`json { "video_duration": "HH:MM:SS", "slide_transitions": [ { "timestamp": "HH:MM:SS", "confidence": 0.95, "slide_number": 1, "description": "Brief description of the transition" } ], "recommended_segments": [ { "start": "HH:MM:SS", "end": "HH:MM:SS", "slide": 1, "description": "First 2 minutes of slide content" } ] } \`\`\` ## Key Guidelines - Focus on major visual changes that clearly indicate slide transitions - Confidence scores should reflect how certain you are about the transition - Slide numbers should increment sequentially starting from 1 - Recommended segments should be exactly 2 minutes or until the next slide transition - Use the transcript to help understand content changes when visual changes are ambiguous - If frames show the same slide content, do not mark as a transition - Look for changes in slide titles, bullet points, images, charts, or overall layout Analyze the provided frames and transcript to detect slide transitions.`.role( "system", ); }, { systemSafety: true, model: "vision", responseType: "json", label: `analyze slide transitions ${filename}`, }, ); if (error) { output.error(error?.message); } else { // Parse and validate JSON response try { const analysisResult = JSON.parse(text); // Display results in a formatted way output.heading(4, "Slide Transition Analysis Results"); output.code(JSON.stringify(analysisResult, null, 2), "json"); // Also provide a summary if ( analysisResult.slide_transitions && analysisResult.slide_transitions.length > 0 ) { output.heading(5, "Summary"); output.p( `Found ${analysisResult.slide_transitions.length} slide transitions in video duration: ${analysisResult.video_duration}`, ); output.heading(5, "Detected 
Transitions"); + for (const transition of analysisResult.slide_transitions) { + output.p( + `**Slide ${transition.slide_number}** at [${transition.timestamp}] (confidence: ${transition.confidence}) - ${transition.description}`, + ); + } + + if ( + analysisResult.recommended_segments && + analysisResult.recommended_segments.length > 0 + ) { + output.heading(5, "Recommended Viewing Segments"); + for (const segment of analysisResult.recommended_segments) { + output.p( + `**Slide ${segment.slide}**: [${segment.start}] - [${segment.end}] - ${segment.description}`, + ); + } + } + } else { + output.p("No slide transitions detected in this video."); + } + } catch (parseError) { + output.error(`Failed to parse JSON response: ${parseError.message}`); + output.heading(4, "Raw Response"); + output.appendContent(text); + } + } +}