1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
src/inspect_scout/_view/dist/** filter=lfs diff=lfs merge=lfs -text
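
The attribute above routes everything under dist/ through Git LFS, so a plain clone checks out small pointer stubs instead of the built assets. For reference, a Git LFS pointer file is a short text stub of this form (OID and size shown here are illustrative):

version https://git-lfs.github.com/spec/v1
oid sha256:&lt;64 hex characters&gt;
size &lt;object size in bytes&gt;

The modules added below detect these stubs and fetch the real content on demand.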
1 change: 1 addition & 0 deletions AGENTS.md
@@ -18,6 +18,7 @@

### Testing
- Test observable behavior, not internal implementation details
- Do not test things that are enforced by the type system
- Be efficient; avoid duplicate coverage
- Prefer data/table driven tests for maintainability
- Tests must be isolated; no shared mutable state or order dependencies
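
To make the table-driven guideline above concrete, a minimal pytest sketch might look like this (toy function and values, not taken from this PR):

import pytest

def is_even(n: int) -> bool:
    return n % 2 == 0

@pytest.mark.parametrize(
    "n, expected",
    [(0, True), (1, False), (2, True), (-3, False)],
)
def test_is_even(n: int, expected: bool) -> None:
    assert is_even(n) == expected

Each new case is one more row in the table rather than a new test function, which keeps coverage easy to audit and extend.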
42 changes: 42 additions & 0 deletions scripts/fetch-dist.sh
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
# Fetch LFS dist files using the GitHub LFS batch API.
#
# This script is an escape hatch for users who want to populate the
# dist cache without running `scout view`. It uses the same Python
# resolver that the server uses at startup.
#
# Usage:
# ./scripts/fetch-dist.sh [--force-cache]

set -euo pipefail

FORCE_CACHE=False
for arg in "$@"; do
    case "$arg" in
        --force-cache) FORCE_CACHE=True ;;
        *) echo "Unknown argument: $arg"; exit 1 ;;
    esac
done

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
DIST_DIR="$REPO_DIR/src/inspect_scout/_view/dist"

DIST_DIR="$DIST_DIR" FORCE_CACHE="$FORCE_CACHE" "$REPO_DIR/.venv/bin/python" -c "
import os
from pathlib import Path
from inspect_scout._util.appdirs import scout_cache_dir
from inspect_scout._lfs import resolve_lfs_directory

source = Path(os.environ['DIST_DIR'])
result = resolve_lfs_directory(
    source,
    cache_dir=scout_cache_dir('dist'),
    repo_url='https://github.com/meridianlabs-ai/inspect_scout.git',
    force_cache=os.environ['FORCE_CACHE'] == 'True',
)
if result == source:
    print('dist/ already contains real files. Nothing to fetch.')
else:
    print(f'Done. LFS files cached at {result}')
"
6 changes: 6 additions & 0 deletions src/inspect_scout/_lfs/__init__.py
@@ -0,0 +1,6 @@
"""LFS transparent fallback for directories containing pointer files."""

from .exceptions import LFSError
from .resolver import resolve_lfs_directory

__all__ = ["LFSError", "resolve_lfs_directory"]
207 changes: 207 additions & 0 deletions src/inspect_scout/_lfs/cache.py
@@ -0,0 +1,207 @@
"""LFS dist cache management.

Mirrors the repository's dist/ directory structure in a local cache,
downloading real file content from GitHub LFS when the repo contains
pointer files.
"""

import logging
import os
import time
from pathlib import Path

from .client import (
    LFSDownloadInfo,
    download_lfs_object,
    fetch_download_urls,
)
from .exceptions import LFSDownloadError
from .pointer import LFSPointer, is_lfs_pointer, parse_lfs_pointer

logger = logging.getLogger(__name__)


def ensure_cached(
    repo_dist_dir: Path,
    cache_dist_dir: Path,
    repo_url: str,
) -> None:
    """Populate the cache with real files for all LFS pointers in dist/.

    Walks repo_dist_dir, identifies LFS pointer files, checks the cache
    for each one, and downloads any missing files via the LFS batch API.

    The cache mirrors the dist/ directory structure so that StaticFiles
    can serve it directly.

    Args:
        repo_dist_dir: Path to repository's dist/ directory (contains pointers).
        cache_dist_dir: Path to cache directory (will contain real files).
        repo_url: HTTPS URL of the git repository.

    Raises:
        LFSDownloadError: If any critical file fails to download.
    """
    # Collect pointers and check cache status.
    needs_download: list[tuple[Path, LFSPointer]] = []
    source_rel_paths: set[Path] = set()

    for repo_file in _walk_files(repo_dist_dir):
        rel = repo_file.relative_to(repo_dist_dir)
        source_rel_paths.add(rel)

        if not is_lfs_pointer(repo_file):
            # Real file — copy to cache if not already there.
            cache_file = cache_dist_dir / rel
            if not cache_file.exists():
                cache_file.parent.mkdir(parents=True, exist_ok=True)
                _copy_file(repo_file, cache_file)
            continue

        pointer = parse_lfs_pointer(repo_file)
        if pointer is None:
            logger.warning("Could not parse LFS pointer: %s", repo_file)
            continue

        cache_file = cache_dist_dir / rel
        oid_file = cache_file.with_suffix(cache_file.suffix + ".oid")

        # Cache hit: file exists and OID matches.
        if cache_file.exists() and oid_file.exists():
            cached_oid = oid_file.read_text(encoding="utf-8").strip()
            if cached_oid == pointer.oid:
                continue
            # OID mismatch — remove stale cache files before re-download.
            cache_file.unlink(missing_ok=True)
            oid_file.unlink(missing_ok=True)
        elif cache_file.exists() or oid_file.exists():
            # Incomplete cache entry — clean up orphaned files.
            cache_file.unlink(missing_ok=True)
            oid_file.unlink(missing_ok=True)

        needs_download.append((repo_file, pointer))

    if not needs_download:
        logger.debug("LFS cache is up to date")
        _prune_cache(cache_dist_dir, source_rel_paths)
        return

    logger.info("Downloading %d LFS objects...", len(needs_download))

    # Batch request for download URLs.
    batch_objects = [(p.oid, p.size) for _, p in needs_download]
    download_infos = fetch_download_urls(batch_objects, repo_url=repo_url)

    # Index by OID for lookup.
    info_by_oid: dict[str, LFSDownloadInfo] = {d.oid: d for d in download_infos}

    # Download each file.
    failed: list[str] = []
    for repo_file, pointer in needs_download:
        rel = repo_file.relative_to(repo_dist_dir)
        cache_file = cache_dist_dir / rel
        oid_file = cache_file.with_suffix(cache_file.suffix + ".oid")
        marker_file = cache_file.with_suffix(cache_file.suffix + ".downloading")

        info = info_by_oid.get(pointer.oid)
        if info is None:
            logger.warning("No download URL for %s (%s)", rel, pointer.oid[:12])
            failed.append(str(rel))
            continue

        # Multiprocess coordination: skip if another process is downloading.
        if not _try_create_marker(marker_file):
            logger.debug("Another process is downloading %s, waiting...", rel)
            _wait_for_marker(marker_file)
            if cache_file.exists():
                continue
            # Other process may have failed; try to acquire marker ourselves.
            if not _try_create_marker(marker_file):
                _wait_for_marker(marker_file)
                if cache_file.exists():
                    continue
                logger.warning("Could not acquire download marker for %s", rel)
                failed.append(str(rel))
                continue

        try:
            download_lfs_object(info, marker_file)
            marker_file.rename(cache_file)
            oid_file.write_text(pointer.oid, encoding="utf-8")
            logger.info("Cached %s", rel)
        except Exception as e:
            # Clean up all partial state.
            marker_file.unlink(missing_ok=True)
            cache_file.unlink(missing_ok=True)
            oid_file.unlink(missing_ok=True)
            failed.append(str(rel))
            logger.warning("Failed to cache %s: %s", rel, e, exc_info=True)

    if failed:
        raise LFSDownloadError(
            f"Failed to download {len(failed)} LFS object(s): {', '.join(failed)}"
        )

    _prune_cache(cache_dist_dir, source_rel_paths)


def _prune_cache(cache_dir: Path, source_rel_paths: set[Path]) -> None:
    """Remove cached files that no longer exist in the source directory."""
    if not cache_dir.is_dir():
        return

    # Metadata suffixes managed by this module.
    metadata_suffixes = {".oid", ".downloading"}

    for cached_file in _walk_files(cache_dir):
        rel = cached_file.relative_to(cache_dir)

        # Skip metadata files — they'll be orphaned when their parent is removed.
        if rel.suffix in metadata_suffixes:
            continue

        if rel not in source_rel_paths:
            cached_file.unlink(missing_ok=True)
            # Clean up associated metadata.
            for suffix in metadata_suffixes:
                cached_file.with_suffix(cached_file.suffix + suffix).unlink(
                    missing_ok=True
                )
            logger.debug("Pruned orphaned cache entry: %s", rel)


def _walk_files(directory: Path) -> list[Path]:
    """Recursively list all files in a directory."""
    return [e for e in sorted(directory.rglob("*")) if e.is_file()]


def _copy_file(src: Path, dest: Path) -> None:
    """Copy a file, preserving content only."""
    dest.write_bytes(src.read_bytes())


def _try_create_marker(marker_file: Path) -> bool:
    """Atomically create a marker file. Returns True if we created it."""
    marker_file.parent.mkdir(parents=True, exist_ok=True)
    try:
        fd = os.open(marker_file, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        os.close(fd)
        return True
    except FileExistsError:
        return False


def _wait_for_marker(marker_file: Path, timeout_seconds: int = 180) -> None:
    """Wait for a marker file to be removed (another process finished).

    If the marker still exists after timeout, removes it as likely orphaned.
    """
    deadline = time.monotonic() + timeout_seconds
    while marker_file.exists() and time.monotonic() < deadline:
        time.sleep(0.5)

    # If marker still exists after timeout, it's likely orphaned by a crashed
    # process. Remove it so subsequent attempts can proceed.
    if marker_file.exists():
        marker_file.unlink(missing_ok=True)
        logger.warning("Removed orphaned marker file: %s", marker_file)