1+ import numpy as np
2+ import os
3+ import pandas as pd
4+ import decord
5+ import ast
6+ import cv2
7+
8+ from joblib import Parallel , delayed
9+ import matplotlib .pyplot as plt
10+ from tqdm import tqdm
11+
def _parse_hand_dets(raw_dets):
    """Turn one ``hand_dets`` cell into a 2-D detection array.

    Returns ``None`` when the cell holds no detections: the literal string
    ``'[]'``, a missing value (``None`` / NaN, which is what pandas yields for
    a blank CSV cell), or anything that parses to an empty array.
    """
    if raw_dets is None or (isinstance(raw_dets, float) and np.isnan(raw_dets)):
        return None
    if isinstance(raw_dets, str):
        if raw_dets == '[]':
            return None
        # The detections are stored as the string form of a nested list.
        raw_dets = ast.literal_eval(raw_dets)
    dets = np.asarray(raw_dets)
    if dets.size == 0:
        return None
    # A single flat row is promoted so the column indexing below stays valid.
    return np.atleast_2d(dets)


def _crop_single_hand(frame, hand_dets, hand_side, video_size, handobj_size,
                      image_size, expand_ratio, minimum_size):
    """Crop and resize the highest-scoring detection of one hand.

    ``hand_side`` is matched against the last detection column (0 is used for
    the left hand and 1 for the right hand by the caller); column 4 is used as
    the score. Returns the resized crop, or ``None`` when there is no usable
    detection for that side.
    """
    candidates = hand_dets[hand_dets[:, -1] == hand_side]
    if len(candidates) == 0:
        return None
    best = candidates[np.argmax(candidates[:, 4])]

    # Rescale the box from detector resolution to the actual frame resolution.
    bbox = [best[0] * video_size[0] / handobj_size[0],
            best[1] * video_size[1] / handobj_size[1],
            best[2] * video_size[0] / handobj_size[0],
            best[3] * video_size[1] / handobj_size[1]]
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    # Skip detections whose shorter half-side is not larger than minimum_size.
    if min(width, height) / 2 <= minimum_size:
        return None

    # Square box centred on the detection, sized by the longer side and
    # enlarged by expand_ratio.
    half_side = max(width, height) / 2 * expand_ratio
    center_x = (bbox[0] + bbox[2]) / 2
    center_y = (bbox[1] + bbox[3]) / 2
    x1, y1, x2, y2 = (int(np.round(v)) for v in (center_x - half_side,
                                                 center_y - half_side,
                                                 center_x + half_side,
                                                 center_y + half_side))

    # Part of the box that actually lies inside the frame.
    src_x1, src_y1 = max(x1, 0), max(y1, 0)
    src_x2, src_y2 = min(x2, video_size[0]), min(y2, video_size[1])
    if src_x2 <= src_x1 or src_y2 <= src_y1:
        # Box is entirely outside the frame: nothing to copy, and the slice
        # arithmetic below would otherwise produce mismatched shapes.
        return None

    # Zero padding fills whatever part of the box falls outside the frame.
    cropped_image = np.zeros((y2 - y1, x2 - x1, 3), dtype=np.uint8)
    cropped_image[src_y1 - y1:src_y2 - y1, src_x1 - x1:src_x2 - x1, :] = \
        frame[src_y1:src_y2, src_x1:src_x2]

    return cv2.resize(cropped_image, image_size)


def crop_hands(video_reader, sample_frames, df, frame_id):
    """Build a side-by-side (left | right) hand crop for one sampled frame.

    Args:
        video_reader: indexable frame source; ``video_reader[i].asnumpy()``
            must return an ``H x W x 3`` array.
        sample_frames: sequence mapping ``frame_id`` to a frame index in
            ``video_reader``.
        df: table whose ``hand_dets`` column holds, per row, the string form
            of a list of detections. Columns 0-3 are used as the box corners
            (x1, y1, x2, y2) in a 568x320 coordinate system, column 4 as the
            score and the last column as the hand side (0 left, 1 right).
        frame_id: row of ``df`` / position in ``sample_frames`` to process.

    Returns:
        ``uint8`` array of shape ``(384, 768, 3)``: the left-hand crop next to
        the right-hand crop, each 384x384, with the channel order of the
        decoded frame reversed (presumably RGB -> BGR for OpenCV writing;
        confirm against the reader's output format). A hand without a usable
        detection is an all-zero panel; if the frame cannot be read the whole
        image is zeros.
    """
    image_size = (384, 384)
    handobj_size = (568, 320)
    expand_ratio = 1.5
    minimum_size = 20

    # Reading is best-effort: an unreadable frame yields a black image.
    # Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
    try:
        frame = video_reader[sample_frames[frame_id]].asnumpy()
    except Exception:
        return np.zeros((image_size[0], image_size[1] * 2, 3), dtype=np.uint8)

    video_size = (frame.shape[1], frame.shape[0])  # (width, height)
    hand_dets = _parse_hand_dets(df.iloc[frame_id]['hand_dets'])

    panels = []
    for hand_side in (0, 1):  # 0: left hand, 1: right hand
        crop = None
        if hand_dets is not None:
            crop = _crop_single_hand(frame, hand_dets, hand_side, video_size,
                                     handobj_size, image_size, expand_ratio,
                                     minimum_size)
        if crop is None:
            crop = np.zeros(image_size + (3,), dtype=np.uint8)
        panels.append(crop)

    # Concatenate the left and right hand images horizontally.
    hand_image = np.concatenate(panels, axis=1)

    # Reverse the channel axis; copy so callers get a contiguous array
    # instead of a negative-stride view.
    return np.ascontiguousarray(hand_image[:, :, ::-1])
86+
def process_clip(clips, video_path, handobj_path, save_video_path, clip_i):
    """Render the hand-crop video for one clip of a source video.

    The clip's detection CSV is read from ``handobj_path``; one output frame
    is produced per CSV row by ``crop_hands`` and written to a video file
    under ``save_video_path``.

    Args:
        clips: list of detection CSV file names; the part of each name before
            the first ``.`` is parsed as the clip's start second.
        video_path: path of the source video the clips were cut from.
        handobj_path: directory containing the detection CSV files.
        save_video_path: directory the output video is written to. The output
            name is the CSV name with its last four characters removed.
        clip_i: index into ``clips`` of the clip to process.
    """
    seconds = 15            # clip duration in seconds
    handobj_fps = 30        # frame rate passed to the output writer
    image_size = (384, 384)

    clip = clips[clip_i]
    clip_path = os.path.join(handobj_path, clip)
    save_clip_path = os.path.join(save_video_path, clip[:-4])

    # Load all inputs before opening the writer so a bad input does not leave
    # a half-initialised output file behind.
    df = pd.read_csv(clip_path)
    video_reader = decord.VideoReader(video_path)
    video_fps = video_reader.get_avg_fps()

    start_second = int(clip.split('.')[0])
    end_second = start_second + seconds
    start_frame = int(start_second * video_fps)
    end_frame = min(int(end_second * video_fps), len(video_reader))

    # Pick one source frame per detection row, evenly spread over the clip.
    sample_frames = np.linspace(start_frame, end_frame, num=len(df),
                                endpoint=False, dtype=int)

    # The writer cannot create missing directories itself.
    save_dir = os.path.dirname(save_clip_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    video_writer = cv2.VideoWriter(save_clip_path,
                                   cv2.VideoWriter_fourcc(*'mp4v'),
                                   handobj_fps,
                                   (image_size[0] * 2, image_size[1]))
    try:
        for frame_id in range(len(df)):
            hand_image = crop_hands(video_reader, sample_frames, df, frame_id)
            video_writer.write(hand_image)
    finally:
        # Always finalise the file, even if a frame fails part-way through.
        video_writer.release()

    print(f"Save {save_clip_path} ")
131+
if __name__ == "__main__":
    # Sanity check: every original clip under check_path must have a matching
    # hand video under hand_video_path with (almost) the same frame count.
    hand_video_path = "/mnt/SV_storage/VFM/EK100/EK100_512resolution"
    check_path = "/mnt/SV_storage/VFM/onevision/llava_video/EK100"

    # NOTE(review): the first 11 subjects are skipped, presumably because they
    # were verified in an earlier run - confirm before reusing this script.
    subjects = sorted(os.listdir(check_path))[11:]
    for subject in subjects:
        subject_path = os.path.join(check_path, subject)
        hand_video_subject_path = os.path.join(hand_video_path, subject)

        videos = sorted(os.listdir(subject_path))
        for video in videos:
            video_path = os.path.join(subject_path, video)
            hand_video_video_path = os.path.join(hand_video_subject_path, video)
            clips = sorted(os.listdir(video_path))

            for clip in clips:
                clip_path = os.path.join(video_path, clip)
                hand_video_clip_path = os.path.join(
                    hand_video_video_path, clip.replace(".MP4", ".mp4"))

                # Raise explicitly instead of using `assert`, which is
                # stripped when Python runs with -O.
                if not os.path.exists(hand_video_clip_path):
                    raise AssertionError(f"{hand_video_clip_path} does not exist")

                # Load both the hand video and the original video.
                hand_video_reader = decord.VideoReader(hand_video_clip_path)
                video_reader = decord.VideoReader(clip_path)

                # Accepted range: 5 frames shorter up to 4 frames longer than
                # the original (same interval as range(n - 5, n + 5)).
                length_diff = len(hand_video_reader) - len(video_reader)
                if not -5 <= length_diff < 5:
                    raise AssertionError(
                        f"{hand_video_clip_path} has different length with {clip_path} ")

                print(f"Checked {hand_video_clip_path} ")
160+
161+
162+
163+
0 commit comments