Video-Summarization/summary_old.py at main · spesavento/Video-Summarization · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
#!/usr/bin/env python

from skimage import data, img_as_float
from skimage.metrics import structural_similarity as ssim
from skimage import io
from sklearn import preprocessing
import numpy as np
import cv2
import numpy as np
import pickle as pl
import struct
import os
from os.path import isfile, join
from matplotlib import pyplot as plt
import moviepy.editor as mpe
import natsort
import wave
from PIL import Image
from imutils.object_detection import non_max_suppression
from imutils import paths
import imutils
from array import *

def FrameSimilarity(frames_jpg_path):
    """Compute the structural similarity (SSIM) between adjacent video frames.

    Frames are read as ``frame<i>.jpg`` from ``frames_jpg_path``. The path is
    used as a plain string prefix, so it must end with a path separator.
    Each frame is converted to grayscale and compared with the following
    frame using scikit-image's ``structural_similarity``. The result is used
    for both shot-change detection and the per-shot action weight.

    Args:
        frames_jpg_path: directory prefix holding frame0.jpg, frame1.jpg, ...

    Returns:
        A list in which element ``i`` is the SSIM of frame ``i`` and frame
        ``i + 1``, rounded to 3 decimals. Empty when the directory holds
        too few files to compare.

    Raises:
        IOError: if a frame image that is expected to exist cannot be read.
    """
    def load_gray(index):
        # cv2.imread() returns None instead of raising when a file is missing
        # or undecodable, so fail here with the offending path.
        frame_file = frames_jpg_path + 'frame' + str(index) + '.jpg'
        frame = cv2.imread(frame_file)
        if frame is None:
            raise IOError('could not read frame image: ' + frame_file)
        return cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # only the number of files matters; frames are addressed by index below
    files = [f for f in os.listdir(frames_jpg_path) if isfile(join(frames_jpg_path, f))]
    # number of adjacent pairs that get compared
    # NOTE(review): n frames have n - 1 adjacent pairs, so "- 2" leaves the
    # last pair out -- confirm whether that is intentional before changing it
    numadj = len(files) - 2
    ssi_array = []
    if numadj <= 0:
        return ssi_array
    # carry the previous grayscale frame forward so every image is decoded
    # once instead of twice
    previous_bw = load_gray(0)
    for i in range(numadj):
        current_bw = load_gray(i + 1)
        ssi_array.append(round(ssim(previous_bw, current_bw), 3))
        previous_bw = current_bw
    return ssi_array

def FrameChange(ssi_array, threshold=0.6, min_gap=20):
    """Find the frames at which a new shot starts.

    Three consecutive similarity values are inspected at a time. A shot
    boundary is reported when the middle value is less than ``threshold``
    times each of its two neighbours, i.e. similarity dips sharply for one
    frame pair and then recovers.

    Args:
        ssi_array: similarity of adjacent frames; element ``i`` compares
            frame ``i`` with frame ``i + 1``.
        threshold: ratio below which the middle similarity counts as a dip.
        min_gap: a dip at index ``i`` is only accepted when ``i`` is more
            than this many frames past the previously recorded boundary.

    Returns:
        A list of frame numbers that begin a shot. It always starts with 0.
    """
    framechange_array = [0]
    last_hit = 0
    for i in range(len(ssi_array) - 3):
        ssim_ab = ssi_array[i]
        ssim_bc = ssi_array[i + 1]
        ssim_cd = ssi_array[i + 2]
        # a neighbour with zero similarity gives no usable ratio (and would
        # divide by zero), so such a window cannot be classified as a dip
        if ssim_ab == 0 or ssim_cd == 0:
            continue
        is_dip = ssim_bc / ssim_ab < threshold and ssim_bc / ssim_cd < threshold
        if is_dip and i - last_hit > min_gap:
            # ssi_array[i + 1] compares frames i+1 and i+2, so the new shot
            # begins at frame i+2
            framechange_array.append(i + 2)
            last_hit = i + 2
    return framechange_array

def ShotArray(framechange_array):
    """Turn the list of shot-start frames into [first_frame, last_frame] pairs.

    Each shot runs from one boundary up to the frame just before the next
    boundary. The final boundary has no successor, so it only closes the
    preceding shot and does not start one of its own.
    """
    # walk the boundaries pairwise: (b0, b1), (b1, b2), ...
    return [
        [first_frame, next_first_frame - 1]
        for first_frame, next_first_frame in zip(framechange_array, framechange_array[1:])
    ]

def FindAction(framechange_array, ssi_array):
    """Compute a normalized "action" weight for every shot.

    For each shot the SSIM values of its frames are averaged and inverted
    (``1 - average``), so a shot whose frames differ a lot from each other
    gets a high value. The per-shot values are then min-max scaled to 0..1
    and rounded to 3 decimals.

    Args:
        framechange_array: frame numbers at which shots begin.
        ssi_array: similarity of adjacent frames, indexed by frame number.

    Returns:
        A list with one weight per shot (higher means more action). Empty
        when ``framechange_array`` describes no complete shot.
    """
    # fewer than two boundaries means there is no shot to score; return early
    # because minmax_scale() rejects an empty input
    if len(framechange_array) < 2:
        return []
    action_array = []
    for shot_begin, next_begin in zip(framechange_array, framechange_array[1:]):
        frames_in_shot = next_begin - shot_begin - 1
        ssi_total = sum(ssi_array[y] for y in range(shot_begin, next_begin - 1))
        # invert so that low similarity (lots of change) becomes a high weight
        action_array.append(1 - ssi_total / frames_in_shot)
    # normalize to the 0..1 range so the weight is comparable across features
    action_array_normalized = preprocessing.minmax_scale(action_array, feature_range=(0, 1))
    return [round(num, 3) for num in action_array_normalized]

def FindFaces(framechange_array, frames_jpg_path):
    """Compute a normalized face-count weight for every shot.

    Every frame of a shot is run through an OpenCV Haar cascade classifier
    loaded from the relative path 'haarcascade_face_classifier.xml'. The
    detections are summed per shot, then min-max scaled to 0..1 and rounded
    to 3 decimals.

    Args:
        framechange_array: frame numbers at which shots begin.
        frames_jpg_path: directory prefix holding frame<i>.jpg (must end
            with a path separator).

    Returns:
        A list with one weight per shot. Empty when ``framechange_array``
        describes no complete shot.

    Raises:
        IOError: if the cascade file or a frame image cannot be loaded.
    """
    # no complete shot: return early because minmax_scale() rejects an
    # empty input
    if len(framechange_array) < 2:
        return []
    # Load face classifier, using "Haar" classifier, basic but works fine
    face_classifier = cv2.CascadeClassifier('haarcascade_face_classifier.xml')
    if face_classifier.empty():
        # the constructor does not raise on a missing/invalid file
        raise IOError('could not load haarcascade_face_classifier.xml')
    face_array = []
    for shot_begin, next_begin in zip(framechange_array, framechange_array[1:]):
        face_total = 0
        for frame_no in range(shot_begin, next_begin - 1):
            filename = frames_jpg_path + 'frame' + str(frame_no) + '.jpg'
            img = cv2.imread(filename)
            if img is None:
                raise IOError('could not read frame image: ' + filename)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # scaleFactor - how much the image size is reduced at each image scale
            # minNeighbors = 4 gives few false positives, but misses a few faces
            faces = face_classifier.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4)
            # one entry per detected face rectangle
            face_total += len(faces)
        face_array.append(face_total)
    # return a normalized weighted array, value 0 to 1
    face_array_normalized = preprocessing.minmax_scale(face_array, feature_range=(0, 1))
    return [round(num, 3) for num in face_array_normalized]

def FindPeople(framechange_array, frames_jpg_path):
    """Compute a normalized people-count weight for every shot.

    Every frame of a shot is scanned with OpenCV's HOG descriptor using its
    default people detector. The detections are summed per shot, then
    min-max scaled to 0..1 and rounded to 3 decimals.

    Args:
        framechange_array: frame numbers at which shots begin.
        frames_jpg_path: directory prefix holding frame<i>.jpg (must end
            with a path separator).

    Returns:
        A list with one weight per shot. Empty when ``framechange_array``
        describes no complete shot.

    Raises:
        IOError: if a frame image cannot be read.
    """
    # no complete shot: return early because minmax_scale() rejects an
    # empty input
    if len(framechange_array) < 2:
        return []
    # OpenCV has a pre-trained person model using Histogram Oriented Gradients (HOG)
    # and Linear SVM
    hog = cv2.HOGDescriptor()
    hog.setSVMDetector(cv2.HOGDescriptor_getDefaultPeopleDetector())
    people_array = []
    for shot_begin, next_begin in zip(framechange_array, framechange_array[1:]):
        people_total = 0
        for frame_no in range(shot_begin, next_begin - 1):
            filename = frames_jpg_path + 'frame' + str(frame_no) + '.jpg'
            image = cv2.imread(filename)
            if image is None:
                raise IOError('could not read frame image: ' + filename)
            # cap the width at 400px to speed up detection
            image = imutils.resize(image, width=min(400, image.shape[1]))
            # only the rectangles are needed; the detection weights are ignored
            rects, _ = hog.detectMultiScale(image, winStride=(4, 4), padding=(8, 8), scale=1.05)
            people_total += len(rects)
        people_array.append(people_total)
    # return a normalized weighted array, value 0 to 1
    people_array_normalized = preprocessing.minmax_scale(people_array, feature_range=(0, 1))
    return [round(num, 3) for num in people_array_normalized]

def TotalWeights(shot_array, action_array, face_array, people_array):
    """Combine the per-shot feature weights into one total weight per shot.

    The total is ``action + 0.5 * face + 0.5 * people``, rounded to 3
    decimals.

    Args:
        shot_array: list of [shot start, shot end] pairs.
        action_array: action weight per shot.
        face_array: face weight per shot.
        people_array: people weight per shot.

    Returns:
        A new list of [shot start, shot end, total weight] rows, with the
        weight as a plain Python float. ``shot_array`` itself is left
        unmodified, so the function can safely be called more than once.
    """
    # faces and people each count half as much as action
    face_array_scaled = [element * 0.5 for element in face_array]
    people_array_scaled = [element * 0.5 for element in people_array]
    # stack the three weight rows and add them column-wise (one column per shot)
    np_weight = np.array([action_array, face_array_scaled, people_array_scaled]).sum(axis=0)
    # tolist() converts numpy scalars to plain floats
    total_weight = np.around(np_weight, 3).tolist()
    # build fresh rows instead of appending to the caller's lists
    return [list(shot) + [total_weight[x]] for x, shot in enumerate(shot_array)]

def SaveSummaryFrames(totalweight_array, summary_frame_path, frames_jpg_path, max_frames=2700):
    """Copy the frames of the best-weighted shots into the summary folder.

    Shots are taken in order of descending weight until their accumulated
    frame count reaches ``max_frames``. The chosen shots are then put back
    into chronological order and their frames are written as 0000.jpg,
    0001.jpg, ... into ``summary_frame_path``.

    Args:
        totalweight_array: list of [shot start, shot end, total weight].
        summary_frame_path: output directory prefix (must end with a path
            separator); created if it does not exist.
        frames_jpg_path: directory prefix holding the source frame<i>.jpg.
        max_frames: frame budget of the summary; the default 2700 is
            90 seconds at 30 fps.

    Raises:
        IOError: if a source frame cannot be read.
    """
    # sort the array by weight descending, best shots first
    sorted_array = sorted(totalweight_array, key=lambda shot: shot[2], reverse=True)
    print('\nsorted_array')
    print('shots ordered by highest weight first')
    print(str(sorted_array))
    ordered_array = _select_summary_shots(sorted_array, max_frames)
    print('\nordered_array')
    print('shots trimmed down to < ' + str(max_frames) + ' frames, ordered by scene number')
    print(str(ordered_array))
    # cv2.imwrite() only returns False when the target directory is missing,
    # so make sure it exists before writing
    if summary_frame_path:
        os.makedirs(summary_frame_path, exist_ok=True)
    count = 0
    for shot in ordered_array:
        # NOTE(review): range() stops before shot[1], so the shot's last
        # frame is not copied; this matches the frame count used for the
        # budget -- confirm whether the end frame should be included
        for z in range(shot[0], shot[1]):
            shot_image = frames_jpg_path + 'frame' + str(z) + '.jpg'
            img = cv2.imread(shot_image)
            if img is None:
                raise IOError('could not read frame image: ' + shot_image)
            # zero-padded names keep the files in order under a plain sort
            summary_image = summary_frame_path + '%04d' % count + '.jpg'
            cv2.imwrite(summary_image, img)
            count = count + 1


def _select_summary_shots(sorted_array, max_frames):
    """Return the leading shots that fit the frame budget, in start-frame order."""
    summary_array = []
    frame_count = 0
    # iterate over every shot, including the lowest-weighted one
    for shot in sorted_array:
        frame_count = frame_count + (shot[1] - shot[0])
        # stop at the first shot that would reach the budget
        if frame_count >= max_frames:
            break
        summary_array.append(shot)
    return sorted(summary_array, key=lambda shot: shot[0])

# Convert frames folder to video using OpenCV
def FramesToVideo(summary_frame_path,pathOut,fps,frame_width,frame_height):
    """Encode every image file in a folder into an MP4V video.

    The files in ``summary_frame_path`` are processed in sorted filename
    order. Frames are streamed to the writer one at a time rather than being
    collected in memory first.

    Args:
        summary_frame_path: directory prefix holding the frames (must end
            with a path separator).
        pathOut: path of the video file to create.
        fps: frames per second of the output video.
        frame_width: width passed to the video writer.
        frame_height: height passed to the video writer.
    """
    # sort the files
    # see python reference https://docs.python.org/3/howto/sorting.html
    files = sorted(f for f in os.listdir(summary_frame_path) if isfile(join(summary_frame_path, f)))
    # define the parameters for creating the video
    # .mp4 is a good choice for playing videos, works on OSX and Windows
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')
    # NOTE(review): frames are written as read, without resizing; presumably
    # they must already be frame_width x frame_height -- confirm
    out = cv2.VideoWriter(pathOut, fourcc, fps, (frame_width, frame_height))
    try:
        for name in files:
            img = cv2.imread(summary_frame_path + name)
            if img is None:
                # not a decodable image (e.g. a hidden system file); there is
                # nothing to encode for it
                continue
            out.write(img)
    finally:
        # always finalize the container, even if a frame fails midway
        out.release()

def MakeCollage(framechange_array, frames_jpg_path, collage_path):
    """Build a contact-sheet image of the video's shots and save it.

    The collage starts from the image 'top.jpg' and grows downwards by one
    row per group of five shots. Each tile is the frame 30 frames after the
    start of its shot, labelled with the shot's index in
    ``framechange_array``. The result is written to ``collage_path``.
    """
    offset = 30
    tiles_per_row = 5
    # header strip; per the file's own note it is a blank image as wide as
    # five frames (1600px)
    collage = cv2.imread('top.jpg')
    for row_start in range(0, len(framechange_array) - 5, 5):
        row_tiles = []
        for shot_no in range(row_start, row_start + tiles_per_row):
            # sample a frame a little way into the shot
            tile = cv2.imread(frames_jpg_path + 'frame' + str(framechange_array[shot_no] + offset) + '.jpg')
            # stamp the shot index in the top-left corner
            cv2.putText(tile, str(shot_no), (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 0), 4)
            row_tiles.append(tile)
        # glue the five tiles side by side, then stack the row under the collage
        collage = cv2.vconcat([collage, cv2.hconcat(row_tiles)])
    cv2.imwrite(collage_path, collage)


def main():
    """Run the full summarization pipeline for one hard-coded video.

    Steps: frame similarity -> shot boundaries -> shots -> action, face and
    people weights -> total weights -> summary frames -> summary video ->
    collage. Progress and intermediate arrays are printed along the way.
    """
    # which video to process
    video_name = 'concert'

    # input frames, named frame0.jpg, frame1.jpg, ...
    frames_jpg_path = 'project_dataset/frames/'+video_name+'/'

    # output locations for the summary frames, the summary video and the collage
    summary_frame_path = 'summary/'+video_name+'/frames/'
    summary_video_path = 'summary/'+video_name+'/summary.mp4'
    collage_path = 'summary/'+video_name+'/collage.jpg'

    # 1. structured similarity between adjacent frames
    print('\nssi_array')
    print('the similarity between adjacent frames ... takes a long minute')
    similarities = FrameSimilarity(frames_jpg_path)
    print(str(similarities[0 : 50])+' ... more')

    # 2. frames at which the shot changes
    print('\nframechange_array')
    print('these are the frames where the shot changed')
    boundaries = FrameChange(similarities)
    print(boundaries)

    # 3. [start, end] of each shot
    print('\nshot_array')
    shots = ShotArray(boundaries)
    print(str(len(shots))+' shots in the video')
    print(shots)

    # 4. average action weight of each shot
    print('\naction_array')
    action_weights = FindAction(boundaries, similarities)
    print(str(len(action_weights))+' action weights')
    print(action_weights)

    # 5. face weight of each shot
    print('\nface_array')
    face_weights = FindFaces(boundaries, frames_jpg_path)
    print(str(len(face_weights))+' face weights')
    print(face_weights)

    # 6. people weight of each shot
    print('\npeople_array')
    people_weights = FindPeople(boundaries, frames_jpg_path)
    print('there are '+str(len(people_weights))+' people weights')
    print(people_weights)

    # 7. combine the three weights per shot
    print('\ntotalweight_array')
    print('[shot start, shot end, total weight]')
    weighted_shots = TotalWeights(shots, action_weights, face_weights, people_weights)
    print(weighted_shots)

    # 8. write the frames of the selected shots into the summary folder
    SaveSummaryFrames(weighted_shots, summary_frame_path, frames_jpg_path)

    # 9. encode the summary frames as a 30 fps, 320x180 video
    print('\nfrom the summary frames, creating a summary video')
    FramesToVideo(summary_frame_path, summary_video_path, 30, 320, 180)
    print('the summary video is stored as '+summary_video_path)

    # 10. optional photo collage of the shots
    print('\nbonus: photo collage of scenes saved as collage.jpg in the root folder')
    MakeCollage(boundaries, frames_jpg_path, collage_path)

    # Add audio

    # Play with video player
    # vp.PlayVideo(summary_video_path)

# run the pipeline only when this file is executed as a script
if __name__ == "__main__":
    main()