|
| 1 | +import time |
| 2 | +import requests |
| 3 | +import os |
| 4 | +from os import path |
| 5 | + |
| 6 | +import cv2 |
| 7 | +import numpy as np |
| 8 | + |
| 9 | +from aperturedb import ParallelLoader |
| 10 | +from aperturedb import CSVParser |
| 11 | +from aperturedb import ProgressBar |
| 12 | + |
| 13 | +HEADER_PATH = "filename" |
| 14 | +HEADER_URL = "url" |
| 15 | + |
| 16 | +class VideoDownloaderCSV(CSVParser.CSVParser): |
| 17 | + |
| 18 | + ''' |
| 19 | + ApertureDB Video Downloader. |
| 20 | + Expects a csv file with AT LEAST a "url" column, and |
| 21 | + optionally a "filename" field. |
| 22 | + If "filename" is not present, it is taken from the url. |
| 23 | + ''' |
| 24 | + |
| 25 | + def __init__(self, filename, check_video=True): |
| 26 | + |
| 27 | + self.has_filename = False |
| 28 | + self.check_video = check_video |
| 29 | + |
| 30 | + super().__init__(filename) |
| 31 | + |
| 32 | + def __getitem__(self, idx): |
| 33 | + |
| 34 | + url = self.df.loc[idx, HEADER_URL] |
| 35 | + |
| 36 | + if self.has_filename: |
| 37 | + filename = self.df.loc[idx, HEADER_PATH] |
| 38 | + else: |
| 39 | + filename = self.url_to_filename(url) |
| 40 | + |
| 41 | + return url, filename |
| 42 | + |
| 43 | + def url_to_filename(self, url): |
| 44 | + |
| 45 | + filename = url.split("/")[-1] |
| 46 | + folder = "/tmp/videos/" |
| 47 | + |
| 48 | + return folder + filename |
| 49 | + |
| 50 | + def validate(self): |
| 51 | + |
| 52 | + self.header = list(self.df.columns.values) |
| 53 | + |
| 54 | + if HEADER_URL not in self.header: |
| 55 | + raise Exception("Error with CSV file field: url. Must be a field") |
| 56 | + |
| 57 | + if HEADER_PATH in self.header: |
| 58 | + self.has_filename = True |
| 59 | + |
| 60 | +class VideoDownloader(ParallelLoader.ParallelLoader): |
| 61 | + |
| 62 | + def __init__(self, db, dry_run=False): |
| 63 | + |
| 64 | + super().__init__(db, dry_run=dry_run) |
| 65 | + |
| 66 | + self.type = "video" |
| 67 | + |
| 68 | + self.check_video = False |
| 69 | + |
| 70 | + def check_if_video_is_ok(self, filename, url): |
| 71 | + |
| 72 | + if not os.path.exists(filename): |
| 73 | + return False |
| 74 | + |
| 75 | + try: |
| 76 | + a = cv2.VideoCapture(filename) |
| 77 | + if a.isOpened() == False: |
| 78 | + print("Video present but error reading it:", url) |
| 79 | + return False |
| 80 | + except: |
| 81 | + print("Video present but error decoding:", url) |
| 82 | + return False |
| 83 | + |
| 84 | + return True |
| 85 | + |
| 86 | + def download_video(self, url, filename): |
| 87 | + |
| 88 | + start = time.time() |
| 89 | + |
| 90 | + if self.check_video and self.check_if_video_is_ok(filename, url): |
| 91 | + return |
| 92 | + |
| 93 | + folder = os.path.dirname(filename) |
| 94 | + if not os.path.exists(folder): |
| 95 | + os.makedirs(folder, exist_ok=True) |
| 96 | + |
| 97 | + videodata = requests.get(url) |
| 98 | + if videodata.ok: |
| 99 | + fd = open(filename, "wb") |
| 100 | + fd.write(videodata.content) |
| 101 | + fd.close() |
| 102 | + |
| 103 | + try: |
| 104 | + a = cv2.VideoCapture(filename) |
| 105 | + if a.isOpened() == False: |
| 106 | + print("Downloaded Video size error:", url) |
| 107 | + os.remove(filename) |
| 108 | + self.error_counter += 1 |
| 109 | + except: |
| 110 | + print("Downloaded Video cannot be decoded:", url) |
| 111 | + os.remove(filename) |
| 112 | + self.error_counter += 1 |
| 113 | + else: |
| 114 | + print("URL not found:", url) |
| 115 | + self.error_counter += 1 |
| 116 | + |
| 117 | + self.times_arr.append(time.time() - start) |
| 118 | + |
| 119 | + def worker(self, thid, generator, start, end): |
| 120 | + |
| 121 | + if thid == 0 and self.stats: |
| 122 | + pb = ProgressBar.ProgressBar("download_progress.txt") |
| 123 | + |
| 124 | + for i in range(start, end): |
| 125 | + |
| 126 | + url, filename = generator[i] |
| 127 | + |
| 128 | + self.download_video(url, filename) |
| 129 | + |
| 130 | + if thid == 0 and self.stats: |
| 131 | + pb.update((i - start) / (end - start)) |
| 132 | + |
| 133 | + if thid == 0 and self.stats: |
| 134 | + pb.update(1) |
| 135 | + |
| 136 | + def print_stats(self): |
| 137 | + |
| 138 | + print("====== ApertureDB VideoDownloader Stats ======") |
| 139 | + |
| 140 | + times = np.array(self.times_arr) |
| 141 | + print("Avg Video download time(s):", np.mean(times)) |
| 142 | + print("Img download time std:", np.std (times)) |
| 143 | + print("Avg download throughput (videos/s)):", |
| 144 | + 1 / np.mean(times) * self.numthreads) |
| 145 | + |
| 146 | + print("Total time(s):", self.ingestion_time) |
| 147 | + print("Overall throughput (videos/s):", |
| 148 | + self.total_elements / self.ingestion_time) |
| 149 | + print("Total errors encountered:", self.error_counter) |
| 150 | + print("=============================================") |
0 commit comments