def get_videos(train: bool, split: int) -> Videos:
    """
    Fetch the videos of one HMDB51 split from ApertureDB.

    HMDB51 stores video clips belonging to 51 categories. The videos are
    divided into a train and a test set (70% : 30%), and that division is
    published in 3 different ways (the splits).

    The query first locates the ``Split`` entity whose ``id`` equals
    ``split`` and then finds every video connected to it through a
    connection whose ``type`` property is 1 (train) or 2 (test).

    Args:
        train: select the training videos when True, the test videos otherwise.
        split: id of the split to read.

    Returns:
        A ``Videos`` collection with ``blobs`` enabled.

    Raises:
        RuntimeError: if ApertureDB reports that the query failed.
    """

    client = create_connector()

    query = [{
        "FindEntity": {
            "_ref": 1,
            "with_class": "Split",
            "constraints": {
                "id": ["==", split]
            },
            "results": {
                "all_properties": True
            }
        }
    }, {
        "FindVideo": {
            "is_connected_to": {
                "ref": 1,
                "constraints": {
                    "type": ["==", 1 if train else 2]
                }
            },
            "results": {
                "all_properties": True,
                "count": True
            }
        }
    }]
    status, response, _ = execute_query(client, query, [])
    # execute_query reports 0 on success; without this check a failed query
    # only shows up as an obscure indexing error on the response below.
    if status != 0:
        raise RuntimeError(
            f"Query for split {split} (train={train}) failed: {response}")

    videos = Videos(client=client, response=response[1]["FindVideo"]["entities"])
    videos.blobs = True
    print(f"Retrieved {len(videos)} videos")
    return videos
def __init__(self,
             frames_per_clip: int = 5,
             step_between_clips: int = 1,
             frame_rate: Optional[int] = None,
             fold: int = 1, train: bool = True,
             transform: Optional[Callable] = None,
             _precomputed_metadata: Optional[Dict[str, Any]] = None,
             num_workers: int = 1,
             _video_width: int = 0,
             _video_height: int = 0,
             _video_min_dimension: int = 0,
             _audio_samples: int = 0,
             output_format: str = "THWC") -> None:
    """
    Build the dataset from videos stored in ApertureDB.

    The parent constructor is not invoked; the attributes it would have
    derived from local files are populated here from the query result.

    Args:
        frames_per_clip: number of frames in each clip.
        step_between_clips: number of frames between consecutive clips.
        frame_rate: if given, clips are resampled to this frame rate.
        fold: id of the split to load.
        train: load the training part of the split when True, else the test part.
        transform: stored as ``self.transform``.
        _precomputed_metadata, num_workers, _video_width, _video_height,
        _video_min_dimension, _audio_samples, output_format: forwarded
            unchanged to ``AVideoClips``.
    """
    self.video_pts = []
    self.video_fps = []
    self.transform = transform

    videos = get_videos(train=train, split=fold)

    # Only the "category" property is read here, so blobs are switched off
    # while the label table is built.
    videos.blobs = False
    categories = [v["category"] for v in videos]
    videos.blobs = True

    # Index the categories in sorted order rather than in the order they
    # happen to be returned: the train and the test dataset are built from
    # separate queries and must agree on the integer assigned to each class.
    self.ci = {name: index for index, name in enumerate(sorted(set(categories)))}
    self.samples = [(i, self.ci[name]) for i, name in enumerate(categories)]

    video_clips = AVideoClips(
        videos,
        frames_per_clip,
        step_between_clips,
        frame_rate,
        _precomputed_metadata,
        num_workers=num_workers,
        _video_width=_video_width,
        _video_height=_video_height,
        _video_min_dimension=_video_min_dimension,
        _audio_samples=_audio_samples,
        output_format=output_format,
    )

    self.video_clips = video_clips
    self.indices = list(range(len(videos)))
    # NOTE(review): kept verbatim; this walks every video with blobs enabled,
    # which presumably also makes sure each preview has been fetched before
    # the collection is flagged as loaded - confirm against aperturedb.Videos.
    assert len(videos) == len(list(filter(lambda e: 'preview' in e, videos)))
    videos.loaded = True
def __getitem__(self, idx: int) -> Tuple[List[int], Optional[float]]:
    """
    Return the frame timestamps and the fps of the video at position ``idx``.

    The video's ``preview`` bytes are written to a temporary ``.mp4`` file
    inside the scratch directory, because ``read_video_timestamps`` takes
    a file name. The file is removed when the ``with`` block exits.
    """
    video = self._videos[idx]

    with tempfile.NamedTemporaryFile(dir=self._tmp_path, suffix=".mp4") as ostream:
        ostream.write(video["preview"])
        # The reader opens the file by name through a separate handle, so the
        # buffered bytes have to reach the file first; otherwise it may see a
        # truncated video.
        ostream.flush()
        return read_video_timestamps(ostream.name)
def get_clip(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any], int]:
    """
    Gets a subclip from a list of videos.

    Args:
        idx (int): index of the subclip. Must be between 0 and num_clips().

    Returns:
        video (Tensor)
        audio (Tensor)
        info (Dict)
        video_idx (int): index of the video in the ``Videos`` collection

    Raises:
        IndexError: if ``idx`` is not smaller than ``num_clips()``.
        NotImplementedError: if the active torchvision video backend is not
            ``pyav``; decoding is only implemented for that backend here.
        ValueError: if a resize/audio option that pyav cannot honour is set.
    """
    if idx >= self.num_clips():
        raise IndexError(f"Index {idx} out of range ({self.num_clips()} number of clips)")
    video_idx, clip_idx = self.get_clip_location(idx)
    clip_pts = self.clips[video_idx][clip_idx]

    from torchvision import get_video_backend

    backend = get_video_backend()

    if backend != "pyav":
        # Previously this fell through and failed later on an unbound `video`.
        raise NotImplementedError(
            f"AVideoClips only supports the pyav video backend, got {backend!r}")

    # check for invalid options
    if self._video_width != 0:
        raise ValueError("pyav backend doesn't support _video_width != 0")
    if self._video_height != 0:
        raise ValueError("pyav backend doesn't support _video_height != 0")
    if self._video_min_dimension != 0:
        raise ValueError("pyav backend doesn't support _video_min_dimension != 0")
    if self._video_max_dimension != 0:
        raise ValueError("pyav backend doesn't support _video_max_dimension != 0")
    if self._audio_samples != 0:
        raise ValueError("pyav backend doesn't support _audio_samples != 0")

    start_pts = clip_pts[0].item()
    end_pts = clip_pts[-1].item()
    with tempfile.NamedTemporaryFile(dir="scratch", suffix=".mp4") as ostream:
        ostream.write(self._videos[video_idx]["preview"])
        # read_video opens the file by name, so push the buffered bytes out
        # first; otherwise the decoder may see a truncated file.
        ostream.flush()
        video, audio, info = read_video(ostream.name, start_pts, end_pts)

    if self.frame_rate is not None:
        resampling_idx = self.resampling_idxs[video_idx][clip_idx]
        if isinstance(resampling_idx, torch.Tensor):
            # make the indices relative to the first frame of the decoded clip
            resampling_idx = resampling_idx - resampling_idx[0]
        video = video[resampling_idx]
        info["video_fps"] = self.frame_rate
    assert len(video) == self.num_frames, f"{video.shape} x {self.num_frames}"

    if self.output_format == "TCHW":
        # [T,H,W,C] --> [T,C,H,W]
        video = video.permute(0, 3, 1, 2)

    return video, audio, info, video_idx
16\n", + "\n", + " transform = T.Compose([\n", + " T.Lambda(lambda x: x.permute(3, 0, 1, 2) / 255.),\n", + " T.Resize((200, 200)),\n", + " T.RandomHorizontalFlip(),\n", + " # T.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]),\n", + " T.RandomCrop((172, 172))])\n", + " transform_test = T.Compose([\n", + " T.Lambda(lambda x: x.permute(3, 0, 1, 2) / 255.),\n", + " # T.ToTensor()/255.0,\n", + " # T.ToTensor(),\n", + " T.Resize((200, 200)),\n", + " # T.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]),\n", + " T.CenterCrop((172, 172))])\n", + " return num_frames, clip_steps, Bs_Train, Bs_Test, transform, transform_test\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0488febf-57e3-44a4-bc83-253be7ce9f9f", + "metadata": {}, + "outputs": [], + "source": [ + "from AHMDB51 import AHMDB51\n", + "\n", + "def get_data_sets():\n", + " \"\"\"\n", + " Get the datasets from aperturedb.\n", + " The data has been ingested previously.\n", + " \"\"\"\n", + " num_frames, clip_steps, Bs_Train, Bs_Test, transform, transform_test = get_common()\n", + "\n", + " hmdb51_test = AHMDB51(\n", + " num_workers=1,\n", + " frame_rate=5,\n", + " frames_per_clip=num_frames,\n", + " step_between_clips=clip_steps,\n", + " train=False,\n", + " transform=transform_test\n", + " )\n", + "\n", + "\n", + " return None, hmdb51_test\n" + ] + }, + { + "cell_type": "markdown", + "id": "59fd3014-1831-4312-b135-05a59b02f5f3", + "metadata": {}, + "source": [ + "### Utility function to show a tensor.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6861e546-7dc7-408c-b13d-d42c9a0b6d55", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Video, display\n", + "import torchvision\n", + "\n", + "def show_tensor(tensor):\n", + " with open(\"tmp_video.mp4\", \"wb\") as f:\n", + " torchvision.io.write_video(f.name, tensor, fps=5, video_codec=\"h264\")\n", + " f.seek(0)\n", + " 
display(Video(\"tmp_video.mp4\"))\n" + ] + }, + { + "cell_type": "markdown", + "id": "cee562d4-3d97-44d6-8c47-191005ae60b4", + "metadata": {}, + "source": [ + "## Instantiate a off the shelf model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b01a490-4cf8-4328-8112-ed172650300e", + "metadata": {}, + "outputs": [], + "source": [ + "from movinets import MoViNet\n", + "from movinets.config import _C\n", + "\n", + "# Use the original movinet based on Kinetics400 dataset when we get pretrained.\n", + "model_vanilla = MoViNet(_C.MODEL.MoViNetA0, causal = False, pretrained = True )\n" + ] + }, + { + "cell_type": "markdown", + "id": "4f7045eb-228a-4019-93a1-7fe69ee1068a", + "metadata": {}, + "source": [ + "## Make a model from trained movinet with hmdb51" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af1e4da4-e7b8-42e0-a5db-a2dfe6a5169d", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# Load the model trained on HMDB51. It has been trained for 1 epoch.\n", + "model_trained = torch.load(\"movinet_hmdb51_1.pth\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb0acc9c-6d9f-4330-82ed-b3affebd4c52", + "metadata": {}, + "outputs": [], + "source": [ + "train, test = get_data_sets()\n", + "test.classes = {v: k for k, v in test.ci.items()}" + ] + }, + { + "cell_type": "markdown", + "id": "a7c4bda9", + "metadata": {}, + "source": [ + "## See the shape of the tensor passsed through model.\n", + "\n", + "This point is good to have, and troubleshoot any problems with the input going into the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45065c28-1aa8-458c-9c87-b9435fd7f8de", + "metadata": {}, + "outputs": [], + "source": [ + "data = test[333]\n", + "video, audio, class_index = data\n", + "print(video.shape)\n", + "x = video.permute(1, 2, 3, 0)\n", + "x=(x*255).type(torch.uint8)\n", + "show_tensor(x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f919086f-3058-4887-b559-12ee21285e80", + "metadata": {}, + "outputs": [], + "source": [ + "ground_truth = class_index\n", + "all_classes = test.classes\n", + "print(f\"{len(all_classes)=} \\r\\n {all_classes=}\\r\\n {all_classes[ground_truth]=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8dcb120c-32db-4210-9914-af0cd2f3dc0e", + "metadata": {}, + "outputs": [], + "source": [ + "# Add an extra dim to video tensor to make it compatible with model.\n", + "p = video[None, :]\n", + "y = model_trained(p)\n", + "\n", + "# Get predictions from the trained movinet\n", + "preds = torch.topk(y, 5, largest=True)\n", + "\n", + "#show the top k predictions.\n", + "for i in preds.indices[0]:\n", + " print(test.classes[int(i)])\n" + ] + }, + { + "cell_type": "markdown", + "id": "242f4ac2", + "metadata": {}, + "source": [ + "### " + ] + }, + { + "cell_type": "markdown", + "id": "ff22c594-fa99-4981-9070-3aa658a18319", + "metadata": {}, + "source": [ + "### Predict with vanilla\n", + "\n", + "Take a random clip from the test Dataset (specified as an index between 0 and len test)\n", + "\n", + "Some indices will be out of 51 range, as the model had 600 classifications." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8ec98a8-0a26-4341-8406-094af799317c", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "video, _, ground_truth=test[10]\n", + "show_tensor((video.permute(1, 2, 3, 0)*255).type(torch.uint8))\n", + "\n", + "display(f\"{test.classes[ground_truth]=}\")\n", + "\n", + "y = model_vanilla(video[None, :])\n", + "op = torch.nn.Softmax(dim=1)\n", + "preds = torch.topk(op(y), 5, largest=True)\n", + "print(\"Predictions:\")\n", + "for i, prob in zip(preds.indices[0], preds.values[0]):\n", + " try:\n", + " prediction = test.classes[int(i)]\n", + " probability = float(prob)\n", + " print(f\"{prediction=}, {probability=}\")\n", + " except IndexError:\n", + " print(f\"Cannot find class for index={i}\")\n", + " except KeyError:\n", + " print(f\"Cannot find class for index={i}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2b9c3c9a-8f28-4188-b601-9f9f573c5ec9", + "metadata": {}, + "source": [ + "### Predict with trained.\n", + "\n", + "Way better predictions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc30fea4-96ec-4354-a0d5-544ae592a5e9", + "metadata": {}, + "outputs": [], + "source": [ + "video, _, ground_truth=test[120]\n", + "show_tensor((video.permute(1, 2, 3, 0)*255).type(torch.uint8))\n", + "\n", + "\n", + "y = model_trained(video[None, :])\n", + "op = torch.nn.Softmax(dim=1)\n", + "preds = torch.topk(op(y), 5, largest=True)\n", + "print(\"Predictions:\")\n", + "for i, prob in zip(preds.indices[0], preds.values[0]):\n", + " try:\n", + " prediction = test.classes[int(i)]\n", + " probability = float(prob)\n", + " print(f\"{prediction=}, {probability=}\")\n", + " except AttributeError as e:\n", + " print(f\"Cannot find class for index={i}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "package", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/training/movinet_with_hmdb51/README.md b/training/movinet_with_hmdb51/README.md new file mode 100644 index 0000000..53a9379 --- /dev/null +++ b/training/movinet_with_hmdb51/README.md @@ -0,0 +1,41 @@ +# Using HMDB51 to train movinet. + +[Download it here](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/#Downloads) + +[Pytorch implementation](https://pytorch.org/vision/main/generated/torchvision.datasets.HMDB51.html) + +### Needed packages and binaries. + +#### Mac + brew install rar ffmpeg +#### Ubuntu + apt install rar + +### Download data + mkdir input splits + DOWNLOAD=true bash prepare_data.sh + +### Download the helper scripts and classes. 
+ wget https://github.com/aperture-data/aperturedb-applications/raw/train_movinet/training/movinet_with_hmdb51/AHMDB51.py + wget https://github.com/aperture-data/aperturedb-applications/raw/train_movinet/training/movinet_with_hmdb51/AVideoClips.py + wget https://github.com/aperture-data/aperturedb-applications/raw/train_movinet/training/movinet_with_hmdb51/ingest_transcode.py + wget https://github.com/aperture-data/aperturedb-applications/raw/train_movinet/training/movinet_with_hmdb51/train_movinet.py + wget https://github.com/aperture-data/aperturedb-applications/raw/train_movinet/training/movinet_with_hmdb51/prepare_data.sh + + +### Ingest into aperturedb + python ingest_transcode.py + +### Train model + python train_movinet.py training true + +### Explore classification abilities with the "off the shelf MoViNet" vs the fine-tuned version of the same. +[Classify Vanilla trained](https://github.com/aperture-data/aperturedb-applications/blob/train_movinet/training/movinet_with_hmdb51/Classify-Vanilla-trained.ipynb) + +## Glossary of the files and resources. +### AHMDB51.py +It is a subclass of HMDB51 (from pytorch) and it incorporates the fact that the videos are stored in ApertureDB rather than as local files. + +### AVideoClips.py +Video Clips resample the Videos into clips that are 16 frames long, sampled at a specified fps. +Since Movinet expects inputs of 172x172, or 200x200 pixels, there's also a transformation that is applied to a batch of videos. 
class TreeIngest(QueryGenerator):
    """
    Query generator that ingests the HMDB51 videos together with their
    split membership.

    Every ``*avi`` file found below ``root_path`` is matched, by file name,
    against the lines of the split files found below ``annotation_path``.
    Each match becomes one item ``(split, code, file_path)``; ``getitem``
    turns an item into an ApertureDB query plus the transcoded video blob.
    """

    def __init__(self, root_path: str, annotation_path: str) -> None:
        """
        Index the video files and read the split annotations.

        Args:
            root_path: directory that is walked recursively for ``*avi`` files.
            annotation_path: directory that is walked recursively for split
                files. The split id is the last character of the file name
                before its first ``.``; every line holds a video file name
                and an integer code separated by whitespace.
        """
        self._files = [
            os.path.join(dirpath, f)
            for dirpath, _, filenames in os.walk(root_path)
            for f in filenames
            if f.endswith("avi")
        ]
        # Split files refer to videos by bare file name.
        self._fc = {os.path.basename(file_path): file_path for file_path in self._files}
        print(f"{len(self._files)=}, {len(self._fc)=}")

        self._items = []
        for dirpath, _, filenames in os.walk(annotation_path):
            for filename in filenames:
                split = int(filename.split(".")[0][-1])
                # Join with the directory being walked so that split files in
                # sub-directories are found as well.
                with open(os.path.join(dirpath, filename), "r") as ins:
                    for line in ins:
                        fields = line.split()
                        if len(fields) != 2:
                            # blank or malformed line
                            continue
                        path, code = fields
                        if path in self._fc:
                            if os.path.exists(self._fc[path]):
                                self._items.append((split, int(code), self._fc[path]))
                        else:
                            print(f"{path} from splits not in files")
        print("processed")

    def __repr__(self) -> str:
        return f"A collection of {len(self)} smaples"

    def __len__(self):
        return len(self._items)

    def getitem(self, subscript):
        """
        Build the ingestion query and blob for one item.

        The source ``.avi`` is transcoded to an ``.mp4`` next to it (H.264
        video, AAC audio) unless that file already exists.

        Returns:
            ``(query, [blob])`` where ``query`` adds the ``Split`` entity, the
            video and the ``IsInSplit`` connection between them (each guarded
            by ``if_not_found``), or ``None`` when transcoding failed.
        """
        split, code, file_path = self._items[subscript]
        # Swap only the extension; str.replace would also rewrite ".avi"
        # occurring elsewhere in the path.
        dest_path = os.path.splitext(file_path)[0] + ".mp4"
        if not os.path.exists(dest_path):
            # Argument list instead of a shell string: file names may contain
            # quotes and other shell metacharacters.
            result = subprocess.run(
                ["ffmpeg", "-i", file_path, "-vcodec", "libx264",
                 "-acodec", "aac", dest_path],
                stdin=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
                text=True,
                errors="replace",
            )
            if result.returncode != 0:
                print(f"res: {result.stderr}")
                print(f"Error transcoding {file_path}")
                # Do not leave a partial output behind: it would be picked up
                # as a finished transcode on the next run.
                if os.path.exists(dest_path):
                    os.remove(dest_path)
                return None
        # The category is the name of the directory holding the video.
        category = os.path.basename(os.path.dirname(file_path))
        video_uid = os.path.basename(dest_path)
        connection_uid = f"{video_uid}_{split}_{code}"
        query = [
            {
                "AddEntity": {
                    "_ref": 1,
                    "class": "Split",
                    "properties": {
                        "id": split
                    },
                    "if_not_found": {
                        "id": ["==", split]
                    }
                }
            },
            {
                "AddVideo": {
                    "_ref": 2,
                    "properties": {
                        "name": video_uid,
                        "category": category
                    },
                    "if_not_found": {
                        "name": ["==", video_uid]
                    }
                }
            },
            {
                "AddConnection": {
                    "class": "IsInSplit",
                    "src": 2,
                    "dst": 1,
                    "properties": {
                        "type": code,
                        "id": connection_uid
                    },
                    "if_not_found": {
                        "id": ["==", connection_uid]
                    }
                }
            }
        ]
        with open(dest_path, "rb") as instream:
            buffer = instream.read()
        return query, [buffer]
def get_common():
    """
    Shared settings for data loading and training.

    Seeds torch's random number generator and returns, in this order:
    frames per clip, step between clips, training batch size, test batch
    size, the training transform and the test transform.
    """
    torch.manual_seed(97)

    frames_per_clip = 16
    stride_between_clips = 2
    train_batch_size = 16
    test_batch_size = 16

    def scale_and_reorder(clip):
        # Move the last axis to the front and scale to [0, 1]
        # (assumes a uint8 (T, H, W, C) clip - TODO confirm against the dataset).
        return clip.permute(3, 0, 1, 2) / 255.

    # Training: random flip and random crop as augmentation.
    train_transform = T.Compose([
        T.Lambda(scale_and_reorder),
        T.Resize((200, 200)),
        T.RandomHorizontalFlip(),
        T.RandomCrop((172, 172)),
    ])
    # Evaluation: deterministic centre crop.
    test_transform = T.Compose([
        T.Lambda(scale_and_reorder),
        T.Resize((200, 200)),
        T.CenterCrop((172, 172)),
    ])

    return (
        frames_per_clip,
        stride_between_clips,
        train_batch_size,
        test_batch_size,
        train_transform,
        test_transform,
    )
def get_data_loaders(use_aperturedb: bool=False):
    """
    Create the train and test data loaders.

    Args:
        use_aperturedb: read the datasets from ApertureDB when True,
            from local files otherwise.

    Returns:
        ``(train_loader, test_loader)``; only the training loader shuffles.
    """
    # Only the batch sizes are needed from the shared settings.
    _, _, train_batch_size, test_batch_size, _, _ = get_common()

    build_datasets = get_data_sets if use_aperturedb else get_local_data_sets
    train_set, test_set = build_datasets()

    loader_for_training = DataLoader(train_set, batch_size=train_batch_size, shuffle=True)
    loader_for_testing = DataLoader(test_set, batch_size=test_batch_size, shuffle=False)
    return loader_for_training, loader_for_testing
def evaluate(model, data_load, loss_val):
    """
    Evaluate ``model`` on every batch of ``data_load`` (on the CPU).

    Args:
        model: network exposing ``clean_activation_buffers()``; called with
            the data of each batch and expected to return per-class scores.
        data_load: iterable of ``(data, _, target)`` batches that also has a
            ``dataset`` attribute supporting ``len()``.
        loss_val: list that receives the average loss per sample.

    The summed negative log-likelihood is divided by the dataset size, and
    the number of correct top-1 predictions is printed next to it. For an
    empty dataset the average loss and the accuracy are reported as NaN
    instead of raising ``ZeroDivisionError``.
    """
    model.eval()

    samples = len(data_load.dataset)
    csamp = 0
    tloss = 0.0
    model.clean_activation_buffers()
    with torch.no_grad():
        for data, _, target in data_load:
            target = target.cpu()
            output = F.log_softmax(model(data.cpu()), dim=1)
            loss = F.nll_loss(output, target, reduction='sum')
            _, pred = torch.max(output, dim=1)

            tloss += loss.item()
            # .item() keeps the counter a plain int instead of a tensor
            csamp += pred.eq(target).sum().item()

            # batches are independent: drop any state kept by the model
            model.clean_activation_buffers()
    aloss = tloss / samples if samples else float("nan")
    accuracy = 100.0 * csamp / samples if samples else float("nan")
    loss_val.append(aloss)
    print('\nAverage test loss: ' + '{:.4f}'.format(aloss) +
          ' Accuracy:' + '{:5}'.format(csamp) + '/' +
          '{:5}'.format(samples) + ' (' +
          '{:4.2f}'.format(accuracy) + '%)\n')
def evaluate_stream(model, data_load, loss_val, n_clips = 2, n_clip_frames=8):
    """
    Evaluate ``model`` by streaming every video through it in sub-clips.

    Each batch is cut along its third axis into ``n_clips`` consecutive
    slices of ``n_clip_frames`` frames. The slices are fed to the model one
    after another; the activation buffers are reset once per batch, before
    its first slice. The prediction and the loss obtained after the last
    slice are the ones scored, so each sample is counted once.

    Args:
        model: network exposing ``clean_activation_buffers()``.
        data_load: iterable of ``(data, _, target)`` batches supporting
            ``len()`` and having a ``dataset`` attribute supporting ``len()``.
        loss_val: list that receives the loss averaged over the batches.
        n_clips: number of slices per video; must be at least 1.
        n_clip_frames: number of frames in each slice.

    Raises:
        ValueError: if ``n_clips`` is smaller than 1.
    """
    if n_clips < 1:
        raise ValueError(f"n_clips must be at least 1, got {n_clips}")

    model.eval()
    # model.cuda()
    model.cpu()
    samples = len(data_load.dataset)
    csamp = 0
    tloss = 0.0
    with torch.no_grad():
        for data, _, target in data_load:
            data = data.cpu()
            target = target.cpu()
            # a new batch of videos starts: forget the previous stream
            model.clean_activation_buffers()
            for j in range(n_clips):
                first = n_clip_frames * j
                last = n_clip_frames * (j + 1)
                output = F.log_softmax(model(data[:, :, first:last]), dim=1)
            # score the state reached after the whole video has been seen
            loss = F.nll_loss(output, target)
            _, pred = torch.max(output, dim=1)
            tloss += loss.item()
            csamp += pred.eq(target).sum().item()

    n_batches = len(data_load)
    # the per-batch loss is already a mean, so average over the batches
    aloss = tloss / n_batches if n_batches else float("nan")
    accuracy = 100.0 * csamp / samples if samples else float("nan")
    loss_val.append(aloss)
    print('\nAverage test loss: ' + '{:.4f}'.format(aloss) +
          ' Accuracy:' + '{:5}'.format(csamp) + '/' +
          '{:5}'.format(samples) + ' (' +
          '{:4.2f}'.format(accuracy) + '%)\n')
@app.command()
def training(use_aperturedb:bool):
    """
    Train the model and store the index -> class name mapping it was trained with.

    Args:
        use_aperturedb: read the datasets from ApertureDB when True,
            from local files otherwise.

    Writes ``classes.json`` to the current directory.
    """
    train_loader, test_loader = get_data_loaders(use_aperturedb=use_aperturedb)
    train(train_loader=train_loader, test_loader=test_loader)

    dataset = test_loader.dataset
    classes = getattr(dataset, "ci", None)
    if classes is None:
        # Only the ApertureDB dataset defines `ci`. torchvision's HMDB51
        # exposes the ordered list `classes` instead, whose positions are the
        # class indices; without this fallback the local-files run crashed
        # here, after training had already finished.
        classes = {name: index for index, name in enumerate(dataset.classes)}
    rc = {v:k for k,v in classes.items()}

    # Preserve the classes to index mapping for this model.
    import json
    with open("classes.json", "w") as out:
        json.dump(rc, out, indent=2)