
Commit 997bee8

Data license (#48)
* added data license
* scripts to release weights
1 parent ce4386d commit 997bee8

File tree

4 files changed (+81, -8 lines)


LICENSE_DATA

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+The nuscenes_vavam.tar.gz file is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
+
+The data files in this repository contain information derived from the nuScenes dataset (https://www.nuscenes.org), which is licensed under CC BY-NC-SA 4.0.
+
+To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.

README.md

Lines changed: 12 additions & 6 deletions
@@ -351,6 +351,12 @@ We are releasing the code in this repository under the [MIT License](LICENSE).
 
 We are releasing the pre-trained models / weights under the **research-only** [VideoActionModel License](LICENSE_MODEL). The weights were trained with datasets that are subject to their own licenses and restrictions. Please see below.
 
+This project releases data derived from the [nuScenes dataset](https://www.nuscenes.org), licensed under [CC BY-NC-SA 4.0](LICENSE_DATA). Use of these data comes with the following terms:
+
+- These data may be used for non-commercial purposes only
+- Any derivatives must be distributed under the same license
+- Attribution must be provided to both this project and nuScenes
+
 ## Citation
 
 If you use this code, please cite our technical report:
@@ -404,10 +410,10 @@ A0141014181).
 
 ## Credits
 
-**Project Lead (Research direction, technical roadmap and project coordination)** <br>
+**Project Lead (Research direction, technical roadmap, project coordination)** <br>
 Florent BARTOCCIONI
 
-**Core contributors (Contributed to all aspects of the codebase, ran experiments and evaluations)** <br>
+**Core contributors (All aspects of the codebase, experiments, evaluations)** <br>
 Florent BARTOCCIONI, Elias RAMZI
 
 **Contributors**<br>
@@ -418,14 +424,14 @@ Shashanka VENKATARAMANAN -- Depth anything pseudo-GT generation <br>
 Tuan-Hung VU -- GPT adaptation from nanoGPT <br>
 Yihong XU -- nuPlan preprocessing and initial dataloader development <br>
 
-**Paper (manuscript preparation, designing paper visualization and figures)** <br>
+**Technical report (Manuscript preparation, design, visualization, figures)** <br>
 Florent BARTOCCIONI, Elias RAMZI, Victor BESNIER, Shashanka VENKATARAMANAN, Eloi ZABLOCKI, Yihong XU, Tuan-Hung VU
 
-**Public Computing Grant Acquisition (project proposal writing for Adastra, EuroHPC and Jean-Zay grand challenges)** <br>
+**Grant Acquisitions (Grant proposals for Adastra, EuroHPC, and Jean Zay Grand Challenges)** <br>
 Florent BARTOCCIONI, Alexandre BOULCH, Eduardo VALLE, Spyros GIDARIS, Eloi ZABLOCKI, Matthieu CORD, Serkan ODABAS, David HURYCH
 
-**Advisory (research and organization guidance)** <br>
+**Advisory (Research and organization guidance)** <br>
 Eloi ZABLOCKI, Alexandre BOULCH, Mickael CHEN
 
-**Senior Advisory (research and organization guidance)** <br>
+**Senior Advisory (Research and organization guidance)** <br>
 Eduardo VALLE, Andrei BURSUC, Renaud MARLET, Matthieu CORD

scripts/handle_checkpoints.py

Lines changed: 16 additions & 2 deletions
@@ -21,6 +21,7 @@
 import os
 import subprocess
 from glob import glob
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from tqdm import tqdm
 
@@ -68,6 +69,7 @@ def extract_tar_file(tar_dir: str, outdir: str) -> None:
 parser.add_argument("--outdir", type=expand_path, required=True)
 parser.add_argument("--maxsize", type=str, default="2G")
 parser.add_argument("--extension", type=str, default="pt")
+parser.add_argument("--num_threads", type=int, default=1)
 args = parser.parse_args()
 
 os.makedirs(args.outdir, exist_ok=True)
@@ -80,8 +82,20 @@ def extract_tar_file(tar_dir: str, outdir: str) -> None:
     assert os.path.isfile(args.checkpoint_dir), "Invalid checkpoint path"
     checkpoint_paths = [args.checkpoint_dir]
 
-    for checkpoint_path in tqdm(checkpoint_paths, desc="Creating tar files"):
-        create_tar_file(checkpoint_path, args.outdir, args.maxsize)
+    if args.num_threads <= 1:
+        for checkpoint_path in tqdm(checkpoint_paths, desc="Creating tar files"):
+            create_tar_file(checkpoint_path, args.outdir, args.maxsize)
+    else:
+        with ThreadPoolExecutor(max_workers=args.num_threads) as tar_executor:
+            all_futures = []
+            for checkpoint_path in checkpoint_paths:
+                future = tar_executor.submit(
+                    create_tar_file, checkpoint_path, args.outdir, args.maxsize
+                )
+                all_futures.append(future)
+
+            for future in tqdm(as_completed(all_futures), total=len(all_futures), desc="Creating tar files"):
+                future.result()
 
 elif args.mode == "extract":
     extract_tar_file(args.checkpoint_dir, args.outdir)
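The body of `create_tar_file` is not shown in this diff, but the `--maxsize` flag and the `*.tar.gz*` glob used by the release script below suggest each checkpoint is packed into a multi-part gzipped tar. A minimal bash sketch of that step, assuming a `tar | split` pipeline; the `CKPT_DIR`, `OUTDIR` paths and the `.part-` suffix are illustrative placeholders, not the repository's actual naming scheme:

```bash
# Sketch only: assumes create_tar_file shells out to tar and split.
CKPT_DIR=/path/to/checkpoint_dir   # placeholder
OUTDIR=/path/to/release_weights_tar  # placeholder
NAME=$(basename "$CKPT_DIR")

mkdir -p "$OUTDIR"
# Stream a gzipped tar of the checkpoint into split so that no part exceeds 2G,
# mirroring the script's default --maxsize of 2G.
tar -czf - -C "$(dirname "$CKPT_DIR")" "$NAME" \
    | split -b 2G -d - "$OUTDIR/${NAME}.tar.gz.part-"
```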
scripts/prepare_checkpoint_release.sh

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+: '
+srun -A cya@h100 -C h100 --pty \
+    --nodes=1 --ntasks-per-node=1 --cpus-per-task=16 --gres=gpu:0 --hint=nomultithread \
+    --qos=qos_gpu_h100-dev --time=00:45:00 \
+    bash scripts/prepare_checkpoint_release.sh
+'
+
+module purge
+module load arch/h100
+module load pytorch-gpu/py3/2.4.0
+export PYTHONUSERBASE=$WORK/python_envs/video_action_model
+
+# Data folder containing pickles and jsons
+DATA_FOLDER="${ycy_ALL_CCFRSCRATCH}/release_weights_v1/datafiles"
+# Source folder containing all checkpoints
+SRC_FOLDER="${ycy_ALL_CCFRSCRATCH}/release_weights_v1"
+file_count=$(find "$SRC_FOLDER" -type f -name "*.pt" | wc -l)
+echo "Found $file_count files in $SRC_FOLDER"
+
+# Destination folder for the checkpoint tar archives
+DEST_FOLDER="${ycy_ALL_CCFRSCRATCH}/release_weights_tar"
+# Create destination folder if it doesn't exist
+mkdir -p "$DEST_FOLDER"
+
+python scripts/handle_checkpoints.py \
+    --mode create \
+    --checkpoint_dir "$SRC_FOLDER" \
+    --outdir "$DEST_FOLDER" \
+    --num_threads 16 \
+    --maxsize 2G
+
+# Upload the weights with the GitHub CLI
+# https://cli.github.com/manual/gh_release_uploads
+find "$DEST_FOLDER" -type f -name "*.tar.gz*" | while read -r filepath; do
+    # Upload each checkpoint archive to the v1.0.0 release
+    echo "[ ] Uploading: $filepath"
+    gh release upload v1.0.0 "$filepath" --clobber
+    echo "[x] Uploaded: $filepath"
+done
+
+find "$DATA_FOLDER" -type f -name "*.tar.gz" | while read -r filepath; do
+    # Upload each data archive to the v1.0.0 release
+    echo "[ ] Uploading: $filepath"
+    gh release upload v1.0.0 "$filepath" --clobber
+    echo "[x] Uploaded: $filepath"
+done
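On the consumer side, the matching flow would be to download the uploaded archives and run the script's `extract` mode. The sketch below is an assumption about usage, not part of this commit: the release tag matches the upload commands above, `--mode extract` comes from `scripts/handle_checkpoints.py`, and the local directory names are illustrative.

```bash
# Hypothetical download-and-extract flow for the released archives.
# Fetch every tar part attached to the v1.0.0 release into a local folder.
gh release download v1.0.0 --pattern "*.tar.gz*" --dir release_tar

# Reassemble and unpack the checkpoints using the script's extract mode.
python scripts/handle_checkpoints.py \
    --mode extract \
    --checkpoint_dir release_tar \
    --outdir checkpoints
```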
