
Commit 5796589

script to create AudioSet numpy files.
1 parent 4260126 commit 5796589

1 file changed: 122 additions, 0 deletions
@@ -0,0 +1,122 @@
# coding: utf-8

# Copyright (c) 2021-2060 Curtis G. Northcutt
# This file is part of cgnorthcutt/label-errors.
#
# cleanlab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# cgnorthcutt/label-errors is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License

# This agreement applies to this version and all previous versions of
# cgnorthcutt/label-errors.

"""
Preprocess the tfrecord AudioSet feature embeddings into numpy data files.

Resources used:
1. https://github.com/tensorflow/models/tree/master/research/audioset
2. https://research.google.com/audioset/download.html
3. https://github.com/audioset/ontology
"""


import argparse
import os
import numpy as np
import tensorflow as tf  # version 1.15.4
import multiprocessing
import tqdm
import pickle
from keras.preprocessing.sequence import pad_sequences

parser = argparse.ArgumentParser(
    description='Preprocess AudioSet tfrecord embeddings into numpy files')
parser.add_argument(
    '--audioset-dir', metavar='AUDIOSET_DIR',
    help='Specify path to ../audioset/audioset_v1_embeddings/',
)


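# read_data parses one tfrecord shard into [video_ids, labels, features]
# (plus per-segment start/end times when include_times=True). Each feature is
# an (n_frames, 128) uint8 array decoded from the hex-encoded bytes of the
# 'audio_embedding' sequence feature.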
def read_data(path, include_times=False):
    result = [[], [], []]
    if include_times:
        result += [[], []]
    for example in tf.python_io.tf_record_iterator(path):
        tf_example = tf.train.Example.FromString(example)
        vid_id = tf_example.features.feature['video_id'].bytes_list.value[
            0].decode(encoding='UTF-8')
        label = tf_example.features.feature['labels'].int64_list.value
        if include_times:
            result[3].append(tf_example.features.feature[
                'start_time_seconds'].float_list.value)
            result[4].append(tf_example.features.feature[
                'end_time_seconds'].float_list.value)
        tf_seq_example = tf.train.SequenceExample.FromString(example)
        tf_feature = tf_seq_example.feature_lists.feature_list[
            'audio_embedding'].feature
        n_frames = len(tf_feature)
        audio_frames = []
        # Iterate through frames.
        for i in range(n_frames):
            hexembed = tf_feature[i].bytes_list.value[0].hex()
            arrayembed = [int(hexembed[i:i + 2], 16) for i in
                          range(0, len(hexembed), 2)]
            audio_frames.append(arrayembed)
        result[0].append(vid_id)
        result[1].append(list(label))
        result[2].append(np.stack(audio_frames).astype(np.uint8))
    return result


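# pad zero-pads (or truncates) an (n_frames, 128) embedding matrix to exactly
# (maxlen, 128). pad_sequences pads each sequence to maxlen, so the matrix is
# transposed so that each of the 128 embedding dimensions is one sequence.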
def pad(feature_matrix, maxlen=10):
    return pad_sequences(feature_matrix.T, maxlen=maxlen).T.astype(np.uint8)


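# preprocess_data reads every tfrecord shard in path in parallel, pads all
# features to shape (10, 128), and pickles the features, video ids, and labels
# to '<prefix>_features.p', '<prefix>_video_ids.p', and '<prefix>_labels.p'.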
def preprocess_data(path, prefix='bal_train'):
    fns = [path + fn for fn in os.listdir(path)]
    with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
        results = list(tqdm.tqdm(p.imap(read_data, fns), total=len(fns)))

    print('\nAll files read in. Now post-processing.')
    video_ids = [v for r in results for v in r[0]]
    labels = [l for r in results for l in r[1]]
    features = [f for r in results for f in r[2]]
    del results  # Free memory
    # Make all inputs exactly the same shape.
    print("Padding with 0 to make all features shape (10,128) of type uint8.")
    with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
        features = list(tqdm.tqdm(p.imap(pad, features), total=len(features)))

    print('Saving pickled results.')
    with open(prefix + '_features.p', 'wb') as wf:
        pickle.dump(features, wf, pickle.HIGHEST_PROTOCOL)
    with open(prefix + '_video_ids.p', 'wb') as wf:
        pickle.dump(video_ids, wf, pickle.HIGHEST_PROTOCOL)
    with open(prefix + '_labels.p', 'wb') as wf:
        pickle.dump(labels, wf, pickle.HIGHEST_PROTOCOL)

    print('Preprocessing complete.')


def main(audioset_dir):
    for kind in ["eval", "bal_train", "unbal_train"]:
        preprocess_data(audioset_dir + kind + "/", prefix=kind)


if __name__ == '__main__':
    arg_parser = parser.parse_args()
    if arg_parser.audioset_dir is None:
        parser.error("Specify the path to the audioset embeddings "
                     "directory.\nFor example, if the data is stored in "
                     "'/datasets/audioset/audioset_v1_embeddings/' "
                     "you should call this script like this:\npython "
                     "audioset_preprocessing.py --audioset-dir "
                     "'/datasets/audioset/audioset_v1_embeddings/'")
    main(audioset_dir=arg_parser.audioset_dir)
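
For reference, a minimal sketch of how the pickled outputs of this script could be loaded back into numpy arrays. The file names and the (10, 128) uint8 feature shape come from the script above; the load_split helper and the stacking step are illustrative assumptions, not part of this commit.

# Minimal loading sketch (assumption: run from the directory where this
# script wrote its pickle files).
import pickle

import numpy as np


def load_split(prefix='bal_train'):
    # Hypothetical helper, not part of this commit.
    with open(prefix + '_features.p', 'rb') as rf:
        features = pickle.load(rf)   # list of (10, 128) uint8 arrays
    with open(prefix + '_video_ids.p', 'rb') as rf:
        video_ids = pickle.load(rf)  # list of YouTube video id strings
    with open(prefix + '_labels.p', 'rb') as rf:
        labels = pickle.load(rf)     # list of lists of AudioSet label indices
    # Stack the per-clip matrices into one (n_clips, 10, 128) uint8 array.
    return np.stack(features), video_ids, labels


if __name__ == '__main__':
    X, video_ids, labels = load_split('bal_train')
    print(X.shape, X.dtype, len(video_ids), len(labels))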

0 commit comments
