AdaptiveMotorControlLab
diff --git a/‎.vscode/launch.json‎
Lines changed: 170 additions & 177 deletions b/‎.vscode/launch.json‎
Lines changed: 170 additions & 177 deletions
diff --git a/‎docs/download_data.py‎
Lines changed: 32 additions & 0 deletions b/‎docs/download_data.py‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎llava/action/check_handvideo.py‎
Lines changed: 163 additions & 0 deletions b/‎llava/action/check_handvideo.py‎
Lines changed: 163 additions & 0 deletions
diff --git a/‎llava/action/crop_resize_video.sh‎
Lines changed: 57 additions & 0 deletions b/‎llava/action/crop_resize_video.sh‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎llava/action/crop_resize_video_parall.sh‎
Lines changed: 90 additions & 0 deletions b/‎llava/action/crop_resize_video_parall.sh‎
Lines changed: 90 additions & 0 deletions
@@ -1,9 +1,41 @@
 import os
+os.environ["HF_HOME"] = "/mnt/SV_storage/VFM/huggingface"
 from datasets import load_dataset
+from datasets import get_dataset_config_names, get_dataset_split_names
 from tqdm import tqdm
 import json
 import yaml
 
+# Dump every sample of the LLaVA-Video-178K dataset to one JSON file per sample.
+dataset_name = "lmms-lab/LLaVA-Video-178K"
+
+save_root = "/mnt/SV_storage/VFM/onevision/llava_video_178k"
+# Create the output directory up front so the first open() below cannot fail on a missing path.
+os.makedirs(save_root, exist_ok=True)
+
+# Each config name of the dataset is treated as one subset and processed in turn.
+subsets = get_dataset_config_names(dataset_name)
+for subset in subsets:
+    # download the dataset
+    # Without a `split` argument load_dataset returns a DatasetDict keyed by split name,
+    # so the samples have to be read from each split rather than from `data` directly
+    # (iterating the DatasetDict itself only yields the split names).
+    data = load_dataset(dataset_name, subset)
+    for split_name, split_data in data.items():
+        for da in tqdm(split_data, desc="{}/{}".format(subset, split_name)):
+            json_data = {
+                "id": da["id"],
+                "video": da["video"],
+                "conversations": da["conversations"],
+            }
+            # NOTE(review): files are named by id only; confirm ids are unique across
+            # subsets and splits, otherwise later samples overwrite earlier ones.
+            # utf-8 is explicit because ensure_ascii=False writes non-ASCII text verbatim.
+            with open(os.path.join(save_root, '{}.json'.format(da["id"])), "w", encoding="utf-8") as f:
+                json.dump(json_data, f, indent=4, ensure_ascii=False)
+
+    # splits = get_dataset_split_names(dataset_name, subset)
+    
+        
+#     aa = 1
+
+
+# data = load_dataset("lmms-lab/LLaVA-Video-178K", '0_30_s_academic_v0_1', split="caption")
+
+# for da in tqdm(data):
+#     json_data = {}
+#     json_data["id"] = da["id"]
+#     aa= 2
+
 avaliable_datasets = ['CLEVR-Math(MathV360K)', 'FigureQA(MathV360K)', 'GEOS(MathV360K)', 'GeoQA+(MathV360K)', 
                       'Geometry3K(MathV360K)', 'IconQA(MathV360K)', 'MapQA(MathV360K)', 'PMC-VQA(MathV360K)', 
                       'Super-CLEVR(MathV360K)', 'TabMWP(MathV360K)', 'UniGeo(MathV360K)', 'VisualWebInstruct(filtered)', 
 
@@ -0,0 +1,163 @@
+import numpy as np
+import os
+import pandas as pd
+import decord
+import ast
+import cv2
+
+from joblib import Parallel, delayed
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+
+def _crop_hand(frame, hand_dets, hand_label, image_size, handobj_size, expand_ratio, minimum_size):
+    """Crop one hand out of ``frame`` and resize it to ``image_size``.
+
+    Args:
+        frame: the decoded frame as an array indexed (row, column, channel).
+        hand_dets: 2-D array of detections; columns 0-3 are used as the box
+            corners (x0, y0, x1, y1), column 4 as the score and the last
+            column as the hand label.
+        hand_label: value of the last column to select (0 or 1).
+        image_size: (width, height) of the returned crop.
+        handobj_size: (width, height) the detection boxes are expressed in.
+        expand_ratio: factor applied to the half side of the square crop.
+        minimum_size: boxes whose shorter half side is not larger than this
+            (in frame pixels) are ignored.
+
+    Returns:
+        A uint8 image of shape ``image_size + (3,)``; all zeros when there is
+        no detection with ``hand_label`` or the best one is too small.
+    """
+    hand_image = np.zeros(image_size + (3,), dtype=np.uint8)
+    video_size = (frame.shape[1], frame.shape[0])
+
+    candidates = hand_dets[hand_dets[:, -1] == hand_label]
+    if len(candidates) == 0:
+        return hand_image
+
+    # keep the detection with the highest score
+    best = candidates[np.argmax(candidates[:, 4])]
+    # rescale the box from the detector resolution to the frame resolution
+    bbox = [best[0] * video_size[0] / handobj_size[0], best[1] * video_size[1] / handobj_size[1],
+            best[2] * video_size[0] / handobj_size[0], best[3] * video_size[1] / handobj_size[1]]
+    if not (min(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 2 > minimum_size):
+        return hand_image
+
+    # expand the bbox based on the expand_ratio and the longer side, and make the bbox square
+    half_side = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 2
+    center = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
+    bbox = [center[0] - half_side * expand_ratio, center[1] - half_side * expand_ratio,
+            center[0] + half_side * expand_ratio, center[1] + half_side * expand_ratio]
+    x0, y0, x1, y1 = [int(np.round(x)) for x in bbox]
+
+    # crop the image with the bbox; the part of the box outside the frame stays zero
+    cropped_image = np.zeros((y1 - y0, x1 - x0, 3), dtype=np.uint8)
+    cropped_image[max(0, -y0):min(y1 - y0, video_size[1] - y0),
+                  max(0, -x0):min(x1 - x0, video_size[0] - x0), :] = \
+        frame[max(y0, 0):min(y1, video_size[1]), max(x0, 0):min(x1, video_size[0])]
+
+    # resize the cropped image to the image_size
+    return cv2.resize(cropped_image, image_size)
+
+
+def crop_hands(video_reader, sample_frames, df, frame_id):
+    """Build a side-by-side image of the left and right hand for one frame.
+
+    Args:
+        video_reader: indexable reader; ``video_reader[i].asnumpy()`` must
+            return the i-th frame as an array.
+        sample_frames: maps ``frame_id`` to the frame index in the video.
+        df: DataFrame with a ``hand_dets`` column holding the detections of
+            each row as a string (``'[]'`` when there are none).
+        frame_id: row of ``df`` / position in ``sample_frames`` to process.
+
+    Returns:
+        A uint8 image of shape (384, 768, 3): left hand crop in the left half,
+        right hand crop in the right half, channel order reversed relative to
+        the decoded frame. An all-zero image is returned when the frame cannot
+        be read.
+    """
+    image_size = (384, 384)
+    # video_size = (1920, 1080)
+    handobj_size = (568, 320)
+    expand_ratio = 1.5
+    minimum_size = 20
+
+    # get the frame; an unreadable frame yields a black placeholder instead of aborting the clip.
+    # Only Exception is caught so Ctrl-C and interpreter exit still propagate.
+    try:
+        frame = video_reader[sample_frames[frame_id]].asnumpy()
+    except Exception:
+        return np.zeros((image_size[0], image_size[1]*2, 3), dtype=np.uint8)
+
+    # get the hand detection results
+    hand_dets = df.iloc[frame_id]['hand_dets']
+
+    # change the string to list
+    hand_dets = np.array(ast.literal_eval(hand_dets)) if hand_dets != '[]' else None
+
+    if hand_dets is None:
+        left_image = np.zeros(image_size + (3,), dtype=np.uint8)
+        right_image = np.zeros(image_size + (3,), dtype=np.uint8)
+    else:
+        # label 0 in the last column marks a left hand, label 1 a right hand
+        left_image = _crop_hand(frame, hand_dets, 0, image_size, handobj_size, expand_ratio, minimum_size)
+        right_image = _crop_hand(frame, hand_dets, 1, image_size, handobj_size, expand_ratio, minimum_size)
+
+    # concatenate the left and right hand images
+    hand_image = np.concatenate((left_image, right_image), axis=1)
+
+    # reverse the channel axis (presumably RGB -> BGR for the OpenCV writer — TODO confirm)
+    return hand_image[:, :, ::-1]
+
+def process_clip(clips, video_path, handobj_path, save_video_path, clip_i):
+    """Render the hand crops of one clip into a video file.
+
+    Args:
+        clips: list of detection CSV file names; each name must start with the
+            clip's start second followed by a dot.
+        video_path: path of the source video opened with decord.
+        handobj_path: directory that contains the CSV files listed in ``clips``.
+        save_video_path: directory the output video is written to.
+        clip_i: index into ``clips`` of the clip to process.
+    """
+    seconds = 15
+    handobj_fps = 30
+    image_size = (384, 384)
+    video_reader = decord.VideoReader(video_path)
+    video_fps = video_reader.get_avg_fps()
+
+    clip = clips[clip_i]
+    clip_path = os.path.join(handobj_path, clip)
+    # drops the last four characters of the file name (presumably the ".csv" suffix — TODO confirm).
+    # NOTE(review): confirm the resulting path still carries a video extension the writer accepts.
+    save_clip_path = os.path.join(save_video_path, clip[:-4])
+
+    # read the csv file before opening the writer, so a bad CSV leaves no empty output behind
+    df = pd.read_csv(clip_path)
+
+    start_second = int(clip.split('.')[0])
+    end_second = start_second + seconds
+    start_frame = int(start_second * video_fps)
+    end_frame = min(int(end_second * video_fps), len(video_reader))
+
+    # one source frame index per CSV row, evenly spread over the clip's frame range
+    sample_frames = np.linspace(start_frame, end_frame, num=len(df), endpoint=False, dtype=int)
+
+    # make sure the output directory exists before the writer tries to create the file
+    if save_video_path:
+        os.makedirs(save_video_path, exist_ok=True)
+
+    # initialize the video writer; the frame is two hand crops side by side
+    video_writer = cv2.VideoWriter(save_clip_path, cv2.VideoWriter_fourcc(*'mp4v'), handobj_fps, (image_size[0]*2, image_size[1]))
+    try:
+        for frame_id in range(len(df)):
+            hand_image = crop_hands(video_reader, sample_frames, df, frame_id)
+
+            # write the frame to the video
+            video_writer.write(hand_image)
+    finally:
+        # always release the writer so the file is finalized even if a frame fails
+        video_writer.release()
+
+    print(f"Save {save_clip_path}")
+
+if __name__ == "__main__":
+    # Sanity check: every clip under check_path must have a counterpart under
+    # hand_video_path with (almost) the same number of frames.
+    hand_video_path = "/mnt/SV_storage/VFM/EK100/EK100_512resolution"
+    check_path = "/mnt/SV_storage/VFM/onevision/llava_video/EK100"
+
+    # the first 11 entries of the sorted listing are skipped
+    # (presumably already checked in an earlier run — TODO confirm)
+    subjects = sorted(os.listdir(check_path))[11:]
+    for subject in subjects:
+        subject_path = os.path.join(check_path, subject)
+        hand_video_subject_path = os.path.join(hand_video_path, subject)
+
+        videos = sorted(os.listdir(subject_path))
+        for video in videos:
+            # here video_path is a directory holding the clips of one video
+            video_path = os.path.join(subject_path, video)
+            hand_video_video_path = os.path.join(hand_video_subject_path, video)
+            clips = sorted(os.listdir(video_path))
+
+            for clip in clips:
+                clip_path = os.path.join(video_path, clip)
+                hand_video_clip_path = os.path.join(hand_video_video_path, clip.replace(".MP4", ".mp4"))
+
+                # explicit raise instead of assert so the check survives `python -O`
+                if not os.path.exists(hand_video_clip_path):
+                    raise FileNotFoundError(f"{hand_video_clip_path} does not exist")
+
+                # load both the hand video and the original video
+                hand_video_reader = decord.VideoReader(hand_video_clip_path)
+                video_reader = decord.VideoReader(clip_path)
+
+                # accepted window: the hand video may be up to 5 frames shorter
+                # or up to 4 frames longer than the original clip
+                frame_diff = len(hand_video_reader) - len(video_reader)
+                if not -5 <= frame_diff < 5:
+                    raise ValueError(f"{hand_video_clip_path} has different length with {clip_path}")
+
+                print(f"Checked {hand_video_clip_path}")
+
+
+            
+                
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Cut every video of every subject folder into clips of ${cliplen_sec} seconds,
+# re-encoded at ${fps} fps. Videos whose shorter side is larger than
+# ${small_side} px are scaled down so that the shorter side equals ${small_side}.
+# Clips are written to ${save_dir}/<subject>/<video>/<start_second>.mp4
+
+small_side=512
+cliplen_sec=15
+max_tries=5
+fps=30  # Set the desired frame rate here
+
+data_dir="/mnt/SV_storage/VFM/EK100/EPIC-KITCHENS/"
+save_dir="/mnt/SV_storage/VFM/EK100/EK100_512resolution"
+# find all the subject folders that start with P
+subjects=$(find "$data_dir" -mindepth 1 -maxdepth 1 -type d -name "P*")
+# NOTE: the conversion relies on word splitting, so paths must not contain whitespace
+subjects=( $subjects )  # to array
+for subject_dir in "${subjects[@]}"; do
+    # set the video dir as the subject folder with the videos folder
+    indir="${subject_dir}/videos"
+    outdir="${save_dir}/$(basename "$subject_dir")"
+    mkdir -p "$outdir"
+
+    # list the videos relative to indir (names come out as ./<file>)
+    cd "$indir" || exit
+    all_videos=$(find . -iname "*.MP4")
+    all_videos=( $all_videos )  # to array
+    cd - > /dev/null  # silence the directory name that `cd -` prints
+
+    for video in "${all_videos[@]}"; do
+        W=$( ffprobe -v quiet -show_format -show_streams -show_entries stream=width "${indir}/${video}" | grep width )
+        W=${W#width=}
+        H=$( ffprobe -v quiet -show_format -show_streams -show_entries stream=height "${indir}/${video}" | grep height )
+        H=${H#height=}
+        # Set the smaller side to small_side
+        # from https://superuser.com/a/624564
+        if [ "$W" -gt "$H" ] && [ "$H" -gt "${small_side}" ]; then
+            scale_str="-filter:v scale=-1:${small_side}"
+        elif [ "$H" -gt "$W" ] && [ "$W" -gt "${small_side}" ]; then
+            scale_str="-filter:v scale=${small_side}:-1"
+        else
+            # The small side is smaller than required size, so don't resize/distort the video
+            scale_str=""
+        fi
+        vidlen_sec=$( ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "${indir}/${video}" )
+        mkdir -p "${outdir}/${video}"
+        for st_sec in $(seq 0 ${cliplen_sec} ${vidlen_sec}); do
+            outfpath="${outdir}/${video}/${st_sec}.mp4"
+            try=0
+            written=0
+            # at most max_tries attempts per clip
+            while [ "$try" -lt "$max_tries" ]; do
+                # scale_str is left unquoted on purpose: it expands to two arguments or to nothing
+                ffmpeg -y -ss "${st_sec}" -i "${indir}/${video}" ${scale_str} -t "${cliplen_sec}" -r "${fps}" "${outfpath}"
+                try=$(( try + 1 ))
+                # ffprobe reports problems on stderr, so it has to be redirected to be captured
+                write_errors=$( ffprobe -v error -i "${outfpath}" 2>&1 )
+                # If no errors detected by ffprobe, we are done
+                if [ -z "$write_errors" ]; then
+                    echo "$outfpath written successfully in $try tries!"
+                    written=1
+                    break
+                fi
+            done
+            if [ "$written" -ne 1 ]; then
+                echo "Failed to write $outfpath after $max_tries tries" >&2
+            fi
+        done
+        echo "Converted ${video}"
+    done
+done
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+
+# Parallel clip extraction: cut every video of every subject folder into clips
+# of ${cliplen_sec} seconds at ${fps} fps, scaling the shorter side down to
+# ${small_side} px when it is larger. Up to ${MAX_JOBS} clips are encoded at once.
+# Clips are written to ${save_dir}/<subject>/<video>/<start_second>.mp4
+
+small_side=512
+cliplen_sec=15
+max_tries=5
+fps=30  # Set the desired frame rate
+MAX_JOBS=15  # <-- Adjust this to control how many processes run in parallel
+
+data_dir="/mnt/SV_storage/VFM/EK100/EPIC-KITCHENS/"
+save_dir="/mnt/SV_storage/VFM/EK100/EK100_512resolution"
+
+# Find all the subject folders that start with P
+subjects=$(find "$data_dir" -mindepth 1 -maxdepth 1 -type d -name "P*")
+
+# sort the subjects and start from subject P05
+# (grep -A 1000 keeps each line matching P05 and up to 1000 lines after it)
+subjects=$(echo "$subjects" | sort -V | grep -A 1000 P05)
+
+# NOTE: the loops below rely on word splitting, so paths must not contain whitespace
+for subject_dir in $subjects; do
+
+    indir="${subject_dir}/videos"
+    outdir="${save_dir}/$(basename "$subject_dir")"
+    mkdir -p "$outdir"
+
+    # Gather all videos in this subject's "videos" folder (names come out as ./<file>)
+    cd "$indir" || exit
+    all_videos=$(find . -iname "*.MP4")
+    cd - > /dev/null
+
+    for video in $all_videos; do
+
+        # Extract width/height
+        W=$(ffprobe -v quiet -show_format -show_streams -show_entries stream=width  "$indir/$video" \
+            | grep width= | cut -d= -f2)
+        H=$(ffprobe -v quiet -show_format -show_streams -show_entries stream=height "$indir/$video" \
+            | grep height= | cut -d= -f2)
+
+        # Decide scaling filter: only shrink, and only along the shorter side
+        if [ "$W" -gt "$H" ] && [ "$H" -gt "$small_side" ]; then
+            scale_str="-filter:v scale=-1:${small_side}"
+        elif [ "$H" -gt "$W" ] && [ "$W" -gt "$small_side" ]; then
+            scale_str="-filter:v scale=${small_side}:-1"
+        else
+            scale_str=""
+        fi
+
+        vidlen_sec=$(ffprobe -v error -show_entries format=duration -of \
+                     default=noprint_wrappers=1:nokey=1 "$indir/$video")
+
+        mkdir -p "${outdir}/${video}"
+
+        # Generate clips; ${vidlen_sec%.*} drops the fractional part of the duration
+        for st_sec in $(seq 0 $cliplen_sec "${vidlen_sec%.*}"); do
+            outfpath="${outdir}/${video}/${st_sec}.mp4"
+
+            # Run the encode-and-verify loop for this clip in the background (&)
+            {
+                try=0
+                written=0
+                # at most max_tries attempts per clip
+                while [ "$try" -lt "$max_tries" ]; do
+                    # -nostdin: background jobs must not read from the terminal.
+                    # scale_str is left unquoted on purpose: it expands to two arguments or to nothing.
+                    ffmpeg -nostdin -y -ss "${st_sec}" -i "$indir/$video" \
+                        $scale_str -t "$cliplen_sec" -r "$fps" \
+                        "${outfpath}"
+
+                    # Check if written successfully; ffprobe reports problems on stderr,
+                    # so it has to be redirected to be captured
+                    write_errors=$(ffprobe -v error -i "$outfpath" 2>&1)
+                    if [ -z "$write_errors" ]; then
+                        echo "OK: ${outfpath} written successfully in $((try+1)) tries"
+                        written=1
+                        break
+                    else
+                        echo "ERROR writing ${outfpath}, retrying..."
+                    fi
+                    ((try++))
+                done
+                if [ "$written" -ne 1 ]; then
+                    echo "FAILED: ${outfpath} could not be written after ${max_tries} tries" >&2
+                fi
+            } &  # run in background
+
+            # Limit concurrency
+            while [ "$(jobs -p | wc -l)" -ge "$MAX_JOBS" ]; do
+                # `wait -n` waits until one background job finishes.
+                # (Available in Bash 4.3+; for older Bash, use `wait` without -n.)
+                wait -n
+            done
+
+        done  # end of st_sec loop
+
+    done  # end of videos loop
+
+done  # end of subjects loop
+
+# Wait for any remaining jobs still in flight
+wait
+echo "All conversions done."