Skip to content

Commit a369e23

Browse files
committed
add multiple implementations, all training, hand video generation, llmseval, resizevideo
1 parent 0d63f10 commit a369e23

17 files changed

+1185
-210
lines changed

.vscode/launch.json

Lines changed: 170 additions & 177 deletions
Large diffs are not rendered by default.

docs/download_data.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,41 @@
11
import os
2+
os.environ["HF_HOME"] = "/mnt/SV_storage/VFM/huggingface"
23
from datasets import load_dataset
4+
from datasets import get_dataset_config_names, get_dataset_split_names
35
from tqdm import tqdm
46
import json
57
import yaml
68

9+
# Export every sample of every subset/split of LLaVA-Video-178K as one JSON
# file per sample (id, video path and conversations) under ``save_root``.
dataset_name = "lmms-lab/LLaVA-Video-178K"

save_root = "/mnt/SV_storage/VFM/onevision/llava_video_178k"
os.makedirs(save_root, exist_ok=True)

subsets = get_dataset_config_names(dataset_name)
for subset in subsets:
    # Download the subset. Without ``split=...`` load_dataset returns a
    # DatasetDict keyed by split name, so iterating it directly would yield
    # split-name strings rather than rows; walk the splits explicitly.
    dataset_dict = load_dataset(dataset_name, subset)
    for split_name, split_data in dataset_dict.items():
        for da in tqdm(split_data, desc="{}/{}".format(subset, split_name)):
            json_data = {
                "id": da["id"],
                "video": da["video"],
                "conversations": da["conversations"],
            }
            # NOTE(review): files are keyed by id only, so a sample id that
            # appears in several splits/subsets overwrites the earlier file --
            # confirm whether ids are unique across the whole dataset.
            out_path = os.path.join(save_root, '{}.json'.format(da["id"]))
            # ensure_ascii=False emits raw non-ASCII characters, so pin the
            # file encoding instead of relying on the locale default.
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(json_data, f, indent=4, ensure_ascii=False)
25+
26+
# splits = get_dataset_split_names(dataset_name, subset)
27+
28+
29+
# aa = 1
30+
31+
32+
# data = load_dataset("lmms-lab/LLaVA-Video-178K", '0_30_s_academic_v0_1', split="caption")
33+
34+
# for da in tqdm(data):
35+
# json_data = {}
36+
# json_data["id"] = da["id"]
37+
# aa= 2
38+
739
avaliable_datasets = ['CLEVR-Math(MathV360K)', 'FigureQA(MathV360K)', 'GEOS(MathV360K)', 'GeoQA+(MathV360K)',
840
'Geometry3K(MathV360K)', 'IconQA(MathV360K)', 'MapQA(MathV360K)', 'PMC-VQA(MathV360K)',
941
'Super-CLEVR(MathV360K)', 'TabMWP(MathV360K)', 'UniGeo(MathV360K)', 'VisualWebInstruct(filtered)',

llava/action/check_handvideo.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
import numpy as np
2+
import os
3+
import pandas as pd
4+
import decord
5+
import ast
6+
import cv2
7+
8+
from joblib import Parallel, delayed
9+
import matplotlib.pyplot as plt
10+
from tqdm import tqdm
11+
12+
def _crop_hand_patch(frame, hand_dets, hand_label, image_size, handobj_size,
                     expand_ratio, minimum_size):
    """Crop a square patch around the best detection of one hand.

    Args:
        frame: Decoded video frame as an (H, W, 3) uint8 array.
        hand_dets: 2-D array of detections; columns 0-3 are the box corners
            (x0, y0, x1, y1) in detector resolution, column 4 is the score and
            the last column is the hand label.
        hand_label: Label value to select (0 = left hand, 1 = right hand).
        image_size: (width, height) of the returned patch.
        handobj_size: (width, height) of the resolution the detector ran at.
        expand_ratio: Factor by which the square box is enlarged.
        minimum_size: Boxes whose shorter half-side is not larger than this
            (in frame pixels) are ignored.

    Returns:
        The resized patch, or an all-zero patch when there is no usable
        detection for the requested hand.
    """
    patch = np.zeros(image_size + (3,), dtype=np.uint8)

    candidates = hand_dets[hand_dets[:, -1] == hand_label]
    if len(candidates) == 0:
        return patch
    # Keep only the detection with the highest score.
    best = candidates[np.argmax(candidates[:, 4])]

    frame_h, frame_w = frame.shape[0], frame.shape[1]
    # Rescale the box from detector resolution to the actual frame resolution.
    x0 = best[0] * frame_w / handobj_size[0]
    y0 = best[1] * frame_h / handobj_size[1]
    x1 = best[2] * frame_w / handobj_size[0]
    y1 = best[3] * frame_h / handobj_size[1]
    if min(x1 - x0, y1 - y0) / 2 <= minimum_size:
        return patch

    # Make the box square (longer side) and expand it around its centre.
    half_side = max(x1 - x0, y1 - y0) / 2
    cx, cy = (x0 + x1) / 2, (y0 + y1) / 2
    left, top, right, bottom = (
        int(np.round(v))
        for v in (cx - half_side * expand_ratio, cy - half_side * expand_ratio,
                  cx + half_side * expand_ratio, cy + half_side * expand_ratio)
    )

    # Copy the part of the box that lies inside the frame; the rest stays
    # zero-padded. The guard avoids mismatched slices when the box lies
    # entirely outside the frame.
    cropped = np.zeros((bottom - top, right - left, 3), dtype=np.uint8)
    src_x0, src_y0 = max(left, 0), max(top, 0)
    src_x1, src_y1 = min(right, frame_w), min(bottom, frame_h)
    if src_x1 > src_x0 and src_y1 > src_y0:
        cropped[src_y0 - top:src_y1 - top, src_x0 - left:src_x1 - left, :] = \
            frame[src_y0:src_y1, src_x0:src_x1]

    # Resize the cropped image to the target patch size.
    return cv2.resize(cropped, image_size)


def crop_hands(video_reader, sample_frames, df, frame_id):
    """Build a side-by-side (left | right) hand crop for one sampled frame.

    Args:
        video_reader: Indexable frame source whose items expose ``asnumpy()``
            (a ``decord.VideoReader`` in this file).
        sample_frames: Sequence mapping ``frame_id`` to a frame index in
            ``video_reader``.
        df: DataFrame with a ``hand_dets`` column holding the detections of
            each sampled frame as a string-encoded list (``'[]'`` when empty).
        frame_id: Row of ``df`` / position in ``sample_frames`` to process.

    Returns:
        A (384, 768, 3) uint8 image with the left-hand patch on the left and
        the right-hand patch on the right, channels reversed relative to the
        decoded frame. An all-zero image is returned when the frame cannot be
        read.
    """
    image_size = (384, 384)
    handobj_size = (568, 320)
    expand_ratio = 1.5
    minimum_size = 20

    # Get the frame; an unreadable frame yields a blank image (best effort).
    # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt and
    # SystemExit still propagate.
    try:
        frame = video_reader[sample_frames[frame_id]].asnumpy()
    except Exception:
        return np.zeros((image_size[0], image_size[1] * 2, 3), dtype=np.uint8)

    # Get the hand detection results and turn the string into an array.
    # A missing cell (NaN) or an empty list both mean "no detections".
    raw_dets = df.iloc[frame_id]['hand_dets']
    hand_dets = None
    if isinstance(raw_dets, str) and raw_dets.strip() not in ('', '[]'):
        hand_dets = np.array(ast.literal_eval(raw_dets))

    left_image = np.zeros(image_size + (3,), dtype=np.uint8)
    right_image = np.zeros(image_size + (3,), dtype=np.uint8)
    if hand_dets is not None:
        crop_args = (image_size, handobj_size, expand_ratio, minimum_size)
        left_image = _crop_hand_patch(frame, hand_dets, 0, *crop_args)
        right_image = _crop_hand_patch(frame, hand_dets, 1, *crop_args)

    # Concatenate the left and right hand images.
    hand_image = np.concatenate((left_image, right_image), axis=1)

    # Reverse the channel order (presumably RGB -> BGR for OpenCV -- confirm).
    # The reversed slice is a negative-stride view, so hand back a contiguous
    # copy that is safe to pass to cv2 writers.
    return np.ascontiguousarray(hand_image[:, :, ::-1])
86+
87+
def process_clip(clips, video_path, handobj_path, save_video_path, clip_i):
    """Render the hand-crop video for one clip of hand-object detections.

    Args:
        clips: List of detection CSV file names; each name starts with the
            clip's start second followed by a dot.
        video_path: Path of the source video the detections belong to.
        handobj_path: Directory containing the detection CSV files.
        save_video_path: Directory the rendered video is written to.
        clip_i: Index into ``clips`` of the clip to process.

    The output is written to ``save_video_path`` under the CSV name with its
    last four characters removed, one frame per CSV row, at 30 fps.
    """
    seconds = 15
    handobj_fps = 30
    image_size = (384, 384)
    video_reader = decord.VideoReader(video_path)
    video_fps = video_reader.get_avg_fps()

    clip = clips[clip_i]
    clip_path = os.path.join(handobj_path, clip)
    save_clip_path = os.path.join(save_video_path, clip[:-4])

    # Read the csv file first so that a bad CSV does not leave an opened,
    # empty output video behind.
    df = pd.read_csv(clip_path)

    start_second = int(clip.split('.')[0])
    end_second = start_second + seconds
    start_frame = int(start_second * video_fps)
    end_frame = min(int(end_second * video_fps), len(video_reader))

    # Spread one source-frame index per detection row over the clip's span.
    sample_frames = np.linspace(start_frame, end_frame, num=len(df), endpoint=False, dtype=int)

    # Initialize the video writer: output frames are two patches side by side.
    video_writer = cv2.VideoWriter(
        save_clip_path,
        cv2.VideoWriter_fourcc(*'mp4v'),
        handobj_fps,
        (image_size[0] * 2, image_size[1]),
    )
    try:
        for frame_id in range(len(df)):
            hand_image = crop_hands(video_reader, sample_frames, df, frame_id)
            # Write the frame to the video.
            video_writer.write(hand_image)
    finally:
        # Always finalize the file, even if a frame fails midway.
        video_writer.release()
    print(f"Save {save_clip_path}")
131+
132+
if __name__ == "__main__":
    # Sanity check: every clip under ``check_path`` must have a counterpart
    # under ``hand_video_path`` with (almost) the same number of frames.
    hand_video_path = "/mnt/SV_storage/VFM/EK100/EK100_512resolution"
    check_path = "/mnt/SV_storage/VFM/onevision/llava_video/EK100"

    # Number of leading subjects (in sorted order) to skip, e.g. because they
    # were verified in an earlier run.
    first_subject_index = 11

    subjects = sorted(os.listdir(check_path))[first_subject_index:]
    for subject in subjects:
        subject_path = os.path.join(check_path, subject)
        hand_video_subject_path = os.path.join(hand_video_path, subject)

        videos = sorted(os.listdir(subject_path))
        for video in videos:
            video_path = os.path.join(subject_path, video)
            hand_video_video_path = os.path.join(hand_video_subject_path, video)
            clips = sorted(os.listdir(video_path))

            for clip in clips:
                clip_path = os.path.join(video_path, clip)
                hand_video_clip_path = os.path.join(hand_video_video_path, clip.replace(".MP4", ".mp4"))

                # Explicit raises instead of ``assert`` so the checks still
                # run under ``python -O``.
                if not os.path.exists(hand_video_clip_path):
                    raise FileNotFoundError(f"{hand_video_clip_path} does not exist")

                # Load both the hand video and the original video.
                hand_video_reader = decord.VideoReader(hand_video_clip_path)
                video_reader = decord.VideoReader(clip_path)

                # Accepted window is [n - 5, n + 5), exactly as before.
                n_frames = len(video_reader)
                if not (n_frames - 5 <= len(hand_video_reader) < n_frames + 5):
                    raise ValueError(f"{hand_video_clip_path} has different length with {clip_path}")

                print(f"Checked {hand_video_clip_path}")
160+
161+
162+
163+

llava/action/crop_resize_video.sh

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/bin/bash
# Cut every EPIC-KITCHENS video into ${cliplen_sec}-second clips, downscaling
# so that the shorter side is ${small_side} pixels (never upscaling) and
# re-encoding at ${fps} fps.
# Output layout: ${save_dir}/<subject>/<video file>/<start second>.mp4

small_side=512
cliplen_sec=15
max_tries=5
fps=30 # Set the desired frame rate here

data_dir="/mnt/SV_storage/VFM/EK100/EPIC-KITCHENS/"
save_dir="/mnt/SV_storage/VFM/EK100/EK100_512resolution"

# Print a single ffprobe entry (e.g. stream=width, format=duration) of a file.
# Restricting to the first video stream and taking the first line guarantees a
# single value even for files with several streams.
probe_entry() {
    ffprobe -v error -select_streams v:0 -show_entries "$1" \
        -of default=noprint_wrappers=1:nokey=1 "$2" | head -n 1
}

# find all the subject folders that start with P
mapfile -t subjects < <(find "$data_dir" -mindepth 1 -maxdepth 1 -type d -name "P*")
for subject_dir in "${subjects[@]}"; do
    # set the video dir as the subject folder with the videos folder
    indir="${subject_dir}/videos"
    outdir="${save_dir}/$(basename "$subject_dir")"
    mkdir -p "$outdir"

    # As before, a subject without a videos folder aborts the whole run.
    [ -d "$indir" ] || { echo "Missing videos folder: ${indir}" >&2; exit 1; }
    # Paths are collected relative to $indir (./name.MP4) so they can be
    # re-rooted under $outdir.
    mapfile -t all_videos < <(cd "$indir" && find . -iname "*.MP4")

    for video in "${all_videos[@]}"; do
        W=$(probe_entry stream=width "${indir}/${video}")
        H=$(probe_entry stream=height "${indir}/${video}")

        # Set the smaller side to small_side
        # from https://superuser.com/a/624564
        # (-2 instead of -1 keeps the aspect ratio but rounds the computed
        # side to an even number, which H.264 encoders require.)
        scale_args=()
        if [[ $W =~ ^[0-9]+$ && $H =~ ^[0-9]+$ ]]; then
            if [ "$W" -gt "$H" ] && [ "$H" -gt "$small_side" ]; then
                scale_args=(-filter:v "scale=-2:${small_side}")
            elif [ "$H" -gt "$W" ] && [ "$W" -gt "$small_side" ]; then
                scale_args=(-filter:v "scale=${small_side}:-2")
            fi
            # Otherwise the small side is already small enough (or the video
            # is square), so don't resize/distort the video.
        else
            echo "WARNING: could not read size of ${indir}/${video}; not resizing" >&2
        fi

        vidlen_sec=$(probe_entry format=duration "${indir}/${video}")
        mkdir -p "${outdir}/${video}"
        # Only the integer part is needed to enumerate clip start times.
        vidlen_int=${vidlen_sec%.*}
        if ! [[ $vidlen_int =~ ^[0-9]+$ ]]; then
            echo "WARNING: could not read duration of ${indir}/${video}; skipping" >&2
            continue
        fi

        for st_sec in $(seq 0 "$cliplen_sec" "$vidlen_int"); do
            outfpath="${outdir}/${video}/${st_sec}.mp4"
            try=0
            written=0
            # At most max_tries attempts per clip.
            while [ "$try" -lt "$max_tries" ]; do
                ffmpeg -nostdin -y -ss "$st_sec" -i "${indir}/${video}" \
                    "${scale_args[@]}" -t "$cliplen_sec" -r "$fps" "$outfpath"
                try=$(( try + 1 ))
                # ffprobe reports problems on stderr, so it must be captured
                # for the emptiness check below to mean anything.
                write_errors=$(ffprobe -v error -i "$outfpath" 2>&1)
                # If no errors detected by ffprobe, we are done
                if [ -z "$write_errors" ]; then
                    echo "$outfpath written successfully in $try tries!"
                    written=1
                    break
                fi
            done
            if [ "$written" -ne 1 ]; then
                echo "FAILED: ${outfpath} still invalid after ${max_tries} tries" >&2
            fi
        done
        echo "Converted ${video}"
    done
done
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/env bash
# Parallel variant: cut every EPIC-KITCHENS video into ${cliplen_sec}-second
# clips, downscaling so that the shorter side is ${small_side} pixels (never
# upscaling) and re-encoding at ${fps} fps, running up to ${MAX_JOBS} ffmpeg
# jobs at once.
# Output layout: ${save_dir}/<subject>/<video file>/<start second>.mp4
# Requires Bash 4.3+ (mapfile, wait -n).

small_side=512
cliplen_sec=15
max_tries=5
fps=30          # Set the desired frame rate
MAX_JOBS=15     # <-- Adjust this to control how many processes run in parallel
start_subject="P05"  # Subjects sorting before the first match are skipped

data_dir="/mnt/SV_storage/VFM/EK100/EPIC-KITCHENS/"
save_dir="/mnt/SV_storage/VFM/EK100/EK100_512resolution"

# Print a single ffprobe entry (e.g. stream=width, format=duration) of a file.
# Restricting to the first video stream and taking the first line guarantees a
# single value even for files with several streams.
probe_entry() {
    ffprobe -v error -select_streams v:0 -show_entries "$1" \
        -of default=noprint_wrappers=1:nokey=1 "$2" | head -n 1
}

# Find all the subject folders that start with P, in version-sorted order
mapfile -t subjects < <(find "$data_dir" -mindepth 1 -maxdepth 1 -type d -name "P*" | sort -V)

started=0
for subject_dir in "${subjects[@]}"; do

    # Skip everything before the first subject whose path contains
    # ${start_subject}; process that one and all that follow.
    if [ "$started" -eq 0 ]; then
        [[ $subject_dir == *"$start_subject"* ]] || continue
        started=1
    fi

    indir="${subject_dir}/videos"
    outdir="${save_dir}/$(basename "$subject_dir")"
    mkdir -p "$outdir"

    # As before, a subject without a videos folder aborts the whole run.
    [ -d "$indir" ] || { echo "Missing videos folder: ${indir}" >&2; exit 1; }
    # Gather all videos in this subject's "videos" folder, relative to it
    # (./name.MP4) so they can be re-rooted under $outdir.
    mapfile -t all_videos < <(cd "$indir" && find . -iname "*.MP4")

    for video in "${all_videos[@]}"; do

        # Extract width/height
        W=$(probe_entry stream=width "$indir/$video")
        H=$(probe_entry stream=height "$indir/$video")

        # Decide scaling filter (-2 keeps the aspect ratio but rounds the
        # computed side to an even number, which H.264 encoders require)
        scale_args=()
        if [[ $W =~ ^[0-9]+$ && $H =~ ^[0-9]+$ ]]; then
            if [ "$W" -gt "$H" ] && [ "$H" -gt "$small_side" ]; then
                scale_args=(-filter:v "scale=-2:${small_side}")
            elif [ "$H" -gt "$W" ] && [ "$W" -gt "$small_side" ]; then
                scale_args=(-filter:v "scale=${small_side}:-2")
            fi
        else
            echo "WARNING: could not read size of $indir/$video; not resizing" >&2
        fi

        vidlen_sec=$(probe_entry format=duration "$indir/$video")

        mkdir -p "${outdir}/${video}"

        # Only the integer part is needed to enumerate clip start times.
        vidlen_int=${vidlen_sec%.*}
        if ! [[ $vidlen_int =~ ^[0-9]+$ ]]; then
            echo "WARNING: could not read duration of $indir/$video; skipping" >&2
            continue
        fi

        # Generate clips
        for st_sec in $(seq 0 "$cliplen_sec" "$vidlen_int"); do
            outfpath="${outdir}/${video}/${st_sec}.mp4"

            # Run the encode-and-verify loop for this clip in the background (&)
            {
                try=0
                written=0
                # At most max_tries attempts per clip.
                while [ "$try" -lt "$max_tries" ]; do
                    # -nostdin: a background ffmpeg must not read the
                    # terminal, or it may be stopped or eat pending input.
                    ffmpeg -nostdin -y -ss "${st_sec}" -i "$indir/$video" \
                        "${scale_args[@]}" -t "$cliplen_sec" -r "$fps" \
                        "${outfpath}"

                    # Check if written successfully. ffprobe reports problems
                    # on stderr, so it must be captured as well.
                    write_errors=$(ffprobe -v error -i "$outfpath" 2>&1)
                    if [ -z "$write_errors" ]; then
                        echo "OK: ${outfpath} written successfully in $((try+1)) tries"
                        written=1
                        break
                    else
                        echo "ERROR writing ${outfpath}, retrying..."
                    fi
                    try=$((try + 1))
                done
                if [ "$written" -ne 1 ]; then
                    echo "FAILED: ${outfpath} still invalid after ${max_tries} tries" >&2
                fi
            } &   # run in background

            # Limit concurrency: count only jobs that are still running.
            while [ "$(jobs -rp | wc -l)" -ge "$MAX_JOBS" ]; do
                # `wait -n` waits until one background job finishes.
                # (Available in Bash 4.3+; for older Bash, use `wait` without -n.)
                wait -n
            done

        done  # end of st_sec loop

    done  # end of videos loop

done  # end of subjects loop

# Wait for any remaining jobs still in flight
wait
echo "All conversions done."

0 commit comments

Comments
 (0)