-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathfeature.py
More file actions
90 lines (75 loc) · 3.26 KB
/
feature.py
File metadata and controls
90 lines (75 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import numpy as np
import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()
import tensorflow_hub as hub
import librosa
# FRILL "no-frontend" model loaded from TF Hub when this module is imported.
# make_nonsemantic_frill_nofrontend_feat calls it on precomputed log-mel
# patches and reads the 'embedding' entry of its output.
frill_nofrontend_model = hub.load('https://tfhub.dev/google/nonsemantic-speech-benchmark/frill-nofrontend/1')
def stabilized_log(data, additive_offset, floor):
    """Take a floor-clamped, offset natural log (TF port of mfcc_mel.StabilizedLog).

    Args:
        data: Tensor (or tensor-like) of values to take the log of.
        additive_offset: Value added to the clamped input before the log.
        floor: Lower bound applied element-wise to `data` before the offset
            is added.

    Returns:
        Tensor holding ``log(max(data, floor) + additive_offset)``.
    """
    # Clamp first, so the offset is always added to a value >= floor.
    clamped = tf.math.maximum(data, floor)
    shifted = clamped + additive_offset
    return tf.math.log(shifted)
def log_mel_spectrogram(data,
                        audio_sample_rate,
                        num_mel_bins=64,
                        log_additive_offset=0.001,
                        log_floor=1e-12,
                        window_length_secs=0.025,
                        hop_length_secs=0.010,
                        fft_length=None):
    """Compute a log-mel spectrogram (TF port of mfcc_mel.LogMelSpectrogram).

    The input is cast to float64, transformed with a Hann-windowed STFT,
    projected onto a mel filterbank spanning 125 Hz to 7500 Hz, and passed
    through `stabilized_log`.

    Args:
        data: Audio samples; cast to float64 before the STFT.
        audio_sample_rate: Sample rate of `data` in Hz.
        num_mel_bins: Number of mel bands in the output.
        log_additive_offset: Offset forwarded to `stabilized_log`.
        log_floor: Floor forwarded to `stabilized_log`.
        window_length_secs: Analysis window length in seconds.
        hop_length_secs: Step between successive windows in seconds.
        fft_length: FFT size. When falsy, it is derived from the window
            length as a power of two large enough to hold one window.

    Returns:
        Tensor of stabilized log-mel values, computed in float64, with
        `num_mel_bins` entries along the last axis.
    """
    # Convert the window and hop durations from seconds to whole samples.
    frame_len = int(round(audio_sample_rate * window_length_secs))
    frame_hop = int(round(audio_sample_rate * hop_length_secs))

    if not fft_length:
        # Round log2(window length) up to get the power-of-two FFT size.
        exponent = int(np.ceil(np.log(frame_len) / np.log(2.0)))
        fft_length = 2 ** exponent

    stft = tf.signal.stft(
        tf.cast(data, tf.dtypes.float64),
        frame_length=frame_len,
        frame_step=frame_hop,
        fft_length=fft_length,
        window_fn=tf.signal.hann_window,
    )
    # Keep only the magnitude of the complex STFT.
    magnitudes = tf.abs(stft)

    mel_weights = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=num_mel_bins,
        num_spectrogram_bins=fft_length // 2 + 1,
        sample_rate=audio_sample_rate,
        lower_edge_hertz=125.0,
        upper_edge_hertz=7500.0,
        dtype=tf.dtypes.float64,
    )
    # Project the linear-frequency bins onto the mel bands.
    mel_energies = magnitudes @ mel_weights
    return stabilized_log(mel_energies, log_additive_offset, log_floor)
def compute_frontend_features(samples, sr, frame_hop, n_required=16000, num_mel_bins=64, frame_width=96):
    """Turn raw audio samples into overlapping windows of log-mel frames.

    Args:
        samples: Audio samples with dtype int16, float64 or float32. int16
            input is scaled by the int16 maximum while being converted to
            float32; float64 input is cast to float32.
        sr: Sample rate in Hz, forwarded to `log_mel_spectrogram`.
        frame_hop: Step, in spectrogram frames, between successive windows.
        n_required: Minimum number of samples; shorter input is zero-padded
            at the end up to this length.
        num_mel_bins: Number of mel bands, forwarded to
            `log_mel_spectrogram`.
        frame_width: Number of spectrogram frames per output window.

    Returns:
        Tensor produced by framing the log-mel spectrogram along axis 0 into
        windows of `frame_width` frames taken every `frame_hop` frames.

    Raises:
        AssertionError: If the samples are not float32 after conversion.
    """
    if samples.dtype == np.int16:
        int16_peak = np.iinfo(np.int16).max
        samples = tf.cast(samples, np.float32) / int16_peak
    elif samples.dtype == np.float64:
        samples = tf.cast(samples, np.float32)
    assert samples.dtype == np.float32, samples.dtype

    sample_count = tf.size(samples)

    def _pad_to_required():
        # Append zeros so the signal reaches n_required samples.
        return tf.pad(samples, [(0, n_required - sample_count)])

    def _keep_unchanged():
        return samples

    padded = tf.cond(sample_count < n_required, _pad_to_required, _keep_unchanged)

    log_mel = log_mel_spectrogram(padded, sr, num_mel_bins=num_mel_bins)
    return tf.signal.frame(log_mel, frame_length=frame_width, frame_step=frame_hop, axis=0)
def make_nonsemantic_frill_nofrontend_feat(filename):
    """Build a fixed-length FRILL feature vector for one audio file.

    The file is loaded as 16 kHz mono audio, converted to log-mel windows
    with a hop of 17 frames, and passed through `frill_nofrontend_model`.
    The per-window embeddings are summarized by their mean and standard
    deviation.

    Args:
        filename: Path of the audio file, passed to `librosa.load`.

    Returns:
        1-D numpy array: the mean over windows of the embeddings followed by
        their standard deviation over windows.

    Raises:
        AssertionError: If the model input windows are not shaped
            (96, 64, 1) after the leading axis.
    """
    sample_rate = 16000
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True, res_type="kaiser_fast")

    windows = compute_frontend_features(audio, sample_rate, frame_hop=17)
    # Add a trailing axis of size 1 and hand the model float32 numpy input.
    model_input = tf.expand_dims(windows, axis=-1).numpy().astype(np.float32)
    assert model_input.shape[1:] == (96, 64, 1)

    embedding_matrix = frill_nofrontend_model(model_input)['embedding'].numpy()
    mean_over_windows = embedding_matrix.mean(axis=0)
    std_over_windows = embedding_matrix.std(axis=0)
    return np.concatenate((mean_over_windows, std_over_windows))
# Extract features
def get_features_of_list_audio(path, X):
    """Extract FRILL features for every audio file listed in `X`.

    Args:
        path: Directory that each row's 'file_path' value is joined onto.
        X: Table exposing `iterrows()` whose rows have a 'file_path' entry
            (presumably a pandas DataFrame of cough recordings; confirm
            against the caller).

    Returns:
        numpy array built from one
        `make_nonsemantic_frill_nofrontend_feat` result per row, in row
        order.
    """
    features = [
        make_nonsemantic_frill_nofrontend_feat(os.path.join(path, record['file_path']))
        for _, record in X.iterrows()
    ]
    return np.array(features)