Merge pull request #26 from LemurPwned/feat/usability-improvements

LemurPwned · web-flow · commit b23dd7b94563 · 2024-09-27T11:57:17.000+02:00
Feat/usability improvements
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,11 @@
 
 Changelog for the `video-sampler`.
 
+### 0.11.0
+
+- added multiprocessing over multiple videos in the folder
+- new cli command & support reading from `.yaml` config files
+
 ### 0.10.0
 
 - added summary creation from sampled frames
diff --git a/README.md b/README.md
@@ -67,14 +67,16 @@ Documentation is available at [https://lemurpwned.github.io/video-sampler/](http
 
 ## Installation and Usage
 
+If you intend to use all the integrations, you need all the dependencies:
+
 ```bash
-python3 -m pip install -U video_sampler
+python3 -m pip install -U video_sampler[all]
 ```
 
-If you intend to use all the integrations, you need can with all dependencies:
+for minimalist no-cli usage install:
 
 ```bash
-python3 -m pip install -U video_sampler[all]
+python3 -m pip install -U video_sampler
 ```
 
 Available extras are:
@@ -91,18 +93,22 @@ To see all available options, run:
 python3 -m video_sampler --help
 ```
 
-or simply
+### Basic usage
+
+Plain:
 
 ```bash
-video_sampler --help
+python3 -m video_sampler hash FatCat.mp4 ./dataset-frames/ --hash-size 3 --buffer-size 20
 ```
 
-### Basic usage
+From the config file:
 
 ```bash
-python3 -m video_sampler hash FatCat.mp4 ./dataset-frames/ --hash-size 3 --buffer-size 20
+python3 -m video_sampler config ./configs/hash_base.yaml /my-video-folder/ ./my-output-folder
 ```
 
+You can set the number of workers to use with the `n_workers` parameter. The default is 1.
+
 #### YT-DLP integration plugin
 
 Before using please consult the ToS of the website you are scraping from -- use responsibly and for research purposes.
diff --git a/configs/hash_base.yaml b/configs/hash_base.yaml
@@ -0,0 +1,26 @@
+# Minimum time interval between processed frames (in seconds)
+min_frame_interval_sec: 3.0
+# Whether to process only keyframes (it's way faster than processing all frames)
+keyframes_only: true
+# Read interval while processing video (in seconds) (when there's no frame yielded, when to check again)
+queue_wait: 0.1
+debug: false
+# Whether to print stats
+print_stats: false
+# Buffer configuration
+buffer_config:
+  type: hash
+  # the smaller the hash size, the greater chance of collision
+  # smaller hashsets are faster to process & reduce frames more aggressively
+  hash_size: 8
+  # size of the collision buffer. The larger the buffer, the more in time back the
+  # hashes are stored.
+  size: 15
+  debug: true
+# Gating configuration
+gate_config:
+  type: pass
+extractor_config: {}
+summary_config: {}
+# Number of workers (separate processes) to process the frames. Determines level of parallelism
+n_workers: 3
diff --git a/pyproject.toml b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "video_sampler"
 description = "Video Sampler -- sample frames from a video file"
 url = "https://github.com/LemurPwned/video-sampler"
-version = "0.10.2"
+version = "0.11.0"
 authors = [
     { name = "LemurPwned", email = "lemurpwned@gmail.com" }
 ]
@@ -33,7 +33,8 @@ dependencies = [
   "rich >= 13.5.3",
   "typer[all] >= 0.9.0",
   "tqdm >= 4.66.1",
-  "opencv-python-headless >= 4.9.0.80"
+  "opencv-python-headless >= 4.9.0.80",
+  "pydantic >= 2.6.1",
 ]
 
 [project.urls]
diff --git a/tests/test_ytplugin.py b/tests/test_ytplugin.py
@@ -20,6 +20,8 @@ def test_keyword_extractor(subtitles):
     assert c == 4
 
 
+# skip
+@pytest.mark.skip
 def test_segment_sampler(random_video):
     ytdlp = YTDLPPlugin()
     title, url, subs = next(ytdlp.generate_urls(random_video, get_subs=True))
@@ -41,6 +43,7 @@ def test_segment_sampler(random_video):
         assert len(os.listdir(tempdir)) > 0
 
 
+@pytest.mark.skip
 def test_single_url_gen(random_video):
     ytdlp = YTDLPPlugin()
     title, url, subs = next(ytdlp.generate_urls(random_video, get_subs=True))
@@ -49,6 +52,7 @@ def test_single_url_gen(random_video):
     assert subs and len(subs) > 0, f"Expected subtitles, got {subs}"
 
 
+@pytest.mark.skip
 def test_search_url_gen():
     ytdlp = YTDLPPlugin()
     expected_results, results = 5, 0
diff --git a/video_sampler/__main__.py b/video_sampler/__main__.py
@@ -123,6 +123,9 @@ def main(
     summary_interval: int = typer.Option(
         -1, help="Interval in seconds to summarise the video."
     ),
+    n_workers: int = typer.Option(
+        1, help="Number of workers to use. Default is 1. Use -1 to use all CPUs."
+    ),
 ) -> None:
     """Default buffer is the perceptual hash buffer"""
     extractor_cfg = {}
@@ -169,6 +172,7 @@ def main(
             }
         ),
         extractor_config=extractor_cfg,
+        n_workers=n_workers,
     )
     if ytdlp:
         video_path = _ytdlp_plugin(yt_extra_args, video_path, get_subs=subs_enable)
@@ -210,6 +214,9 @@ def buffer(
     yt_extra_args: str = typer.Option(
         None, help="Extra arguments for YouTube-DLP extraction in classic format."
     ),
+    n_workers: int = typer.Option(
+        1, help="Number of workers to use. Default is 1. Use -1 to use all CPUs."
+    ),
 ):
     """Buffer type can be one of entropy, gzip, hash, passthrough"""
     cfg = SamplerConfig(
@@ -239,6 +246,7 @@ def buffer(
                 "type": "pass",
             }
         ),
+        n_workers=n_workers,
     )
     if ytdlp:
         video_path = _ytdlp_plugin(yt_extra_args, video_path)
@@ -278,6 +286,9 @@ def clip(
     yt_extra_args: str = typer.Option(
         None, help="Extra arguments for YouTube-DLP extraction in classic format."
     ),
+    n_workers: int = typer.Option(
+        1, help="Number of workers to use. Default is 1. Use -1 to use all CPUs."
+    ),
 ):
     """Buffer type can be only of type hash when using CLIP gating."""
     if pos_samples is not None:
@@ -309,12 +320,37 @@ def clip(
             "model_name": model_name,
             "batch_size": batch_size,
         },
+        n_workers=n_workers,
     )
     if ytdlp:
         video_path = _ytdlp_plugin(yt_extra_args, video_path)
     _create_from_config(cfg=cfg, video_path=video_path, output_path=output_path)
 
 
+@app.command(name="config")
+def from_config(
+    config_path: str = typer.Argument(..., help="Path to the configuration file."),
+    video_path: str = typer.Argument(
+        ..., help="Path to the video file or a glob pattern."
+    ),
+    output_path: str = typer.Argument(..., help="Path to the output folder."),
+    ytdlp: bool = typer.Option(
+        False,
+        help="Use yt-dlp to download videos from urls. Default is False."
+        " Enabling this will treat video_path as an input to ytdlp command.",
+    ),
+    yt_extra_args: str = typer.Option(
+        None, help="Extra arguments for YouTube-DLP extraction in classic format."
+    ),
+):
+    """Create a sampler from a configuration file."""
+
+    cfg = SamplerConfig.from_yaml(config_path)
+    if ytdlp:
+        video_path = _ytdlp_plugin(yt_extra_args, video_path)
+    _create_from_config(cfg=cfg, video_path=video_path, output_path=output_path)
+
+
 def main_loop():
     app()
 
diff --git a/video_sampler/buffer.py b/video_sampler/buffer.py
@@ -3,17 +3,18 @@
 from abc import ABC, abstractmethod
 from collections import OrderedDict
 from collections.abc import Iterable
-from dataclasses import asdict, dataclass, field
+from dataclasses import asdict, field
 from typing import Any
 
+import yaml
 from imagehash import average_hash, phash
 from PIL import Image
+from pydantic import BaseModel, Field
 
 from .logging import Color, console
 
 
-@dataclass
-class SamplerConfig:
+class SamplerConfig(BaseModel):
     """
     Configuration options for the video sampler.
 
@@ -43,9 +44,9 @@ class SamplerConfig:
 
     """
 
-    min_frame_interval_sec: float = 1
+    min_frame_interval_sec: float = Field(default=1, ge=0)
     keyframes_only: bool = True
-    queue_wait: float = 0.1
+    queue_wait: float = Field(default=0.1, ge=1e-3)
     debug: bool = False
     print_stats: bool = False
     buffer_config: dict[str, Any] = field(
@@ -63,10 +64,17 @@ class SamplerConfig:
     )
     extractor_config: dict[str, Any] = field(default_factory=dict)
     summary_config: dict[str, Any] = field(default_factory=dict)
+    n_workers: int = 1
 
     def __str__(self) -> str:
         return str(asdict(self))
 
+    @classmethod
+    def from_yaml(cls, file_path: str) -> "SamplerConfig":
+        with open(file_path) as file:
+            data = yaml.safe_load(file)
+        return cls(**data)
+
 
 class FrameBuffer(ABC):
     @abstractmethod
diff --git a/video_sampler/iterators.py b/video_sampler/iterators.py