|
| 1 | +# coding: utf-8 |
| 2 | + |
| 3 | +# Copyright (c) 2021-2060 Curtis G. Northcutt |
| 4 | +# This file is part of cgnorthcutt/label-errors. |
| 5 | +# |
| 6 | +# cleanlab is free software: you can redistribute it and/or modify |
| 7 | +# it under the terms of the GNU General Public License as published by |
| 8 | +# the Free Software Foundation, either version 3 of the License, or |
| 9 | +# (at your option) any later version. |
| 10 | +# |
| 11 | +# cgnorthcutt/label-errors is distributed in the hope that it will be useful, |
| 12 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | +# GNU General Public License for more details. |
| 15 | +# |
| 16 | +# You should have received a copy of the GNU General Public License
| 17 | +# along with cgnorthcutt/label-errors.  If not, see <https://www.gnu.org/licenses/>.
| 18 | +
| 19 | +# This agreement applies to this version and all previous versions of
| 20 | +# cgnorthcutt/label-errors.
| 20 | + |
| 21 | +""" |
| 22 | +Preprocess the tfrecord AudioSet feature embeddings into numpy data files. |
| 23 | +
|
| 24 | +Resources used: |
| 25 | +1. https://github.com/tensorflow/models/tree/master/research/audioset |
| 26 | +2. https://research.google.com/audioset/download.html |
| 27 | +3. https://github.com/audioset/ontology |
| 28 | +""" |
| 29 | + |
| 30 | + |
| 31 | +import argparse |
| 32 | +import os |
| 33 | +import numpy as np |
| 34 | +import tensorflow as tf # version 1.15.4 |
| 35 | +import multiprocessing |
| 36 | +import tqdm |
| 37 | +import pickle |
| 38 | +from keras.preprocessing.sequence import pad_sequences |
| 39 | + |
# Command-line interface. NOTE: the description previously said
# 'PyTorch ImageNet Training' -- a copy-paste leftover; this script
# preprocesses AudioSet tfrecord embeddings, not ImageNet images.
parser = argparse.ArgumentParser(description='AudioSet tfrecord preprocessing')
parser.add_argument(
    '--audioset-dir', metavar='AUDIOSET_DIR',
    help='Specify path to ../audioset/audioset_v1_embeddings/',
)
| 45 | + |
| 46 | + |
def read_data(path, include_times=False):
    """Read one AudioSet tfrecord file into plain Python/numpy containers.

    Parameters
    ----------
    path : str
        Path to a single .tfrecord file of AudioSet v1 embeddings.
    include_times : bool
        If True, also collect per-example start/end times (seconds).

    Returns
    -------
    list
        ``[video_ids, labels, features]`` where ``video_ids`` is a list of
        str, ``labels`` a list of int lists, and ``features`` a list of
        uint8 arrays of shape (n_frames, 128). When ``include_times`` is
        True, two more lists (start times, end times) are appended.
    """
    result = [[], [], []]
    if include_times:
        result += [[], []]
    for example in tf.python_io.tf_record_iterator(path):
        tf_example = tf.train.Example.FromString(example)
        vid_id = tf_example.features.feature['video_id'].bytes_list.value[
            0].decode(encoding='UTF-8')
        label = tf_example.features.feature['labels'].int64_list.value
        if include_times:
            result[3].append(tf_example.features.feature[
                'start_time_seconds'].float_list.value)
            result[4].append(tf_example.features.feature[
                'end_time_seconds'].float_list.value)
        tf_seq_example = tf.train.SequenceExample.FromString(example)
        tf_feature = tf_seq_example.feature_lists.feature_list[
            'audio_embedding'].feature
        # Each frame stores its 128-dim embedding as raw bytes. Decode the
        # bytes directly as uint8 instead of the original hex-string
        # round-trip (bytes -> hex -> int pairs), which produced identical
        # values at many times the cost.
        audio_frames = [
            np.frombuffer(frame.bytes_list.value[0], dtype=np.uint8)
            for frame in tf_feature
        ]
        result[0].append(vid_id)
        result[1].append(list(label))
        # astype also copies, making the stacked array writable
        # (np.frombuffer views are read-only).
        result[2].append(np.stack(audio_frames).astype(np.uint8))
    return result
| 76 | + |
| 77 | + |
def pad(feature_matrix, maxlen=10):
    """Pre-pad/pre-truncate a (n_frames, 128) matrix to (maxlen, 128) uint8.

    Reimplements the original ``keras.preprocessing.sequence.pad_sequences``
    call (defaults: ``padding='pre'``, ``truncating='pre'``) in pure numpy,
    so the per-item worker no longer needs Keras: if the clip has more than
    ``maxlen`` frames, keep the LAST ``maxlen``; if fewer, zero-fill the
    leading rows.

    Parameters
    ----------
    feature_matrix : np.ndarray
        Array of shape (n_frames, n_dims).
    maxlen : int
        Number of output rows (default 10 = 10 seconds of AudioSet frames).

    Returns
    -------
    np.ndarray of shape (maxlen, n_dims), dtype uint8.
    """
    n_dims = feature_matrix.shape[1]
    out = np.zeros((maxlen, n_dims), dtype=np.uint8)
    kept = feature_matrix[-maxlen:]  # last maxlen frames (truncating='pre')
    out[maxlen - len(kept):] = kept  # leading zeros (padding='pre')
    return out
| 80 | + |
| 81 | + |
def preprocess_data(path, prefix='bal_train'):
    """Read every tfrecord in ``path`` and pickle ids/labels/features.

    Parameters
    ----------
    path : str
        Directory containing the .tfrecord shards for one split.
    prefix : str
        Filename prefix for the three output pickles, written to the
        current working directory: ``<prefix>_features.p``,
        ``<prefix>_video_ids.p``, ``<prefix>_labels.p``.
    """
    # os.path.join is robust to a missing trailing separator on `path`
    # (the original `path + fn` silently built bogus paths in that case).
    fns = [os.path.join(path, fn) for fn in os.listdir(path)]
    with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
        results = list(tqdm.tqdm(p.imap(read_data, fns), total=len(fns)))

    print('\nAll files read in. Now post-processing.')
    # Flatten the per-file [ids, labels, features] triples into three lists.
    video_ids = [v for r in results for v in r[0]]
    labels = [l for r in results for l in r[1]]
    features = [f for r in results for f in r[2]]
    del results  # Free memory
    # Make all inputs exactly the same shape.
    print("Padding with 0 to make all features shape (10,128) of type uint8.")
    with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
        features = list(tqdm.tqdm(p.imap(pad, features), total=len(features)))

    print('Saving pickled results.')
    with open(prefix + '_features.p', 'wb') as wf:
        pickle.dump(features, wf, pickle.HIGHEST_PROTOCOL)
    with open(prefix + '_video_ids.p', 'wb') as wf:
        pickle.dump(video_ids, wf, pickle.HIGHEST_PROTOCOL)
    with open(prefix + '_labels.p', 'wb') as wf:
        pickle.dump(labels, wf, pickle.HIGHEST_PROTOCOL)

    print('Preprocessing complete.')
| 106 | + |
| 107 | + |
def main(audioset_dir):
    """Preprocess all three AudioSet splits found under ``audioset_dir``.

    Parameters
    ----------
    audioset_dir : str
        Path to ``.../audioset_v1_embeddings/``, containing the ``eval``,
        ``bal_train`` and ``unbal_train`` subdirectories.
    """
    for kind in ["eval", "bal_train", "unbal_train"]:
        # os.path.join tolerates a missing trailing separator on
        # audioset_dir (plain `audioset_dir + kind` would not); the final
        # "" keeps the trailing separator that preprocess_data expects.
        preprocess_data(os.path.join(audioset_dir, kind, ""), prefix=kind)
| 111 | + |
| 112 | + |
if __name__ == '__main__':
    # parse_args() returns a Namespace, not a parser -- name it accordingly.
    args = parser.parse_args()
    if args.audioset_dir is None:
        # Fixed: the example path previously ended with a stray doubled
        # quote ( ...embeddings/'' ).
        parser.error("Specify the path to the audioset embeddings "
                     "directory.\nFor example, if the data is stored in "
                     "'/datasets/audioset/audioset_v1_embeddings/' "
                     "you should call this script like this:\npython "
                     "audioset_preprocessing.py --audioset-dir "
                     "'/datasets/audioset/audioset_v1_embeddings/'")
    main(audioset_dir=args.audioset_dir)
0 commit comments