Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ jobs:
with:
enable-cache: true
- run: uv python install # Version from pyproject.toml project.requires-python
# - run: uvx --with=beautifulsoup4,dateparser,httpx,pydantic,pynntp pytest
# - run: uv run scripts/nntp_io.py || true # TODO(@cclauss): Remove `|| true` when that script is fixed
# TODO(@cclauss): Remove `|| true` when tests are fixed
- run: uvx --with=beautifulsoup4,dateparser,httpx,pondpond,pydantic,pynntp,pytest-run-parallel pytest --iterations=8 --parallel-threads=auto --ignore-gil-enabled
- run: uv run scripts/nntp_io.py
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ repos:
- tomli

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.15.7
rev: v0.15.8
hooks:
- id: ruff-check
- id: ruff-format
Expand Down
76 changes: 76 additions & 0 deletions server/plugins/collage_photos/get_photos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "beautifulsoup4",
# "httpx",
# ]
# ///

import asyncio
from collections.abc import AsyncGenerator
from pathlib import Path

import httpx
from bs4 import BeautifulSoup

# NOTE(review): the plugin directory is "collage_photos" but these constants
# spell "college_photos", and the scraped site is
# "completecollegephotolibrary.org" — confirm which spelling is intended.
PLUGIN_NAME = "college_photos"
# Destination directory for downloaded images, alongside this script.
IMAGE_DIR = Path(__file__).parent / "college_photos_images"
# Search-results page whose <img> tags are scraped for image URLs.
SEARCH_URL = "https://completecollegephotolibrary.org/?s=laptop"
# Maximum number of image URLs yielded (and therefore downloaded) per run.
NUM_IMAGES = 10


async def fetch_image_urls(
    query_url: str = SEARCH_URL, max_images: int = NUM_IMAGES
) -> AsyncGenerator[str, None]:
    """Yield up to *max_images* image URLs scraped from the results page.

    Only ``<img src>`` values that mention ``collegephotolibrary.org`` and
    end in a known image extension are yielded.
    """
    async with httpx.AsyncClient() as client:
        response = (await client.get(query_url)).raise_for_status()
        page = BeautifulSoup(response.text, "html.parser")
        yielded = 0
        for tag in page.find_all("img"):
            url = tag.get("src")
            # Guard clauses replace the original compound condition.
            if not url:
                continue
            if "collegephotolibrary.org" not in url:
                continue
            if not url.lower().endswith((".jpg", ".jpeg", ".png")):
                continue
            yield url
            yielded += 1
            if yielded >= max_images:
                return


async def download_images(
    image_urls: list[str], dest_dir: Path = IMAGE_DIR
) -> list[str]:
    """Download images to the destination directory asynchronously.

    Args:
        image_urls: Image URLs to fetch concurrently.
        dest_dir: Directory to write files into; created if missing.

    Returns:
        The string paths of every image that downloaded successfully.
        Failed downloads are reported on stdout and omitted.
    """
    dest_dir.mkdir(parents=True, exist_ok=True)

    async def download_one(i: int, url: str) -> str | None:
        """Fetch one URL and save it; return the path, or None on failure."""
        try:
            (resp := await client.get(url, timeout=10)).raise_for_status()
            ext = Path(url).suffix or ".jpg"
            # :07_ zero-pads to width 7 with underscore grouping,
            # e.g. index 0 -> "college_photo_000_001.jpg".
            filename = f"college_photo_{i + 1:07_}{ext}"
            path = dest_dir / filename
            path.write_bytes(resp.content)
            return str(path)
        except Exception as ex:  # noqa: BLE001
            # Best-effort script: report and skip the failed URL.
            print(f"Failed to download {url}: {ex}")
            return None

    async with httpx.AsyncClient() as client:
        tasks = [download_one(i, url) for i, url in enumerate(image_urls)]
        results = await asyncio.gather(*tasks)
        return [result for result in results if result]
        # Fix: removed an unreachable trailing `return None` that followed the
        # return above and contradicted the declared `list[str]` return type.


async def create_college_photos_from_laptop_query() -> list[str]:
    """Main entry point: fetch and save 10 images from the laptop query."""
    # Collect the scraped URLs first, then hand them to the downloader.
    urls = [url async for url in fetch_image_urls()]
    return await download_images(urls)


if __name__ == "__main__":
    # Run the async entry point and report what was saved.
    saved_paths = asyncio.run(create_college_photos_from_laptop_query())
    print("Downloaded images:", saved_paths)
Loading