-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtrain.py
More file actions
120 lines (95 loc) · 4.43 KB
/
train.py
File metadata and controls
120 lines (95 loc) · 4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# train.py
import tensorflow as tf
import numpy as np
import argparse
import yaml
import os
from voiceprint_id.data.audio_processor import AudioProcessor
from voiceprint_id.data.data_augmentation import AudioAugmentation
from voiceprint_id.data.dataset_loader import DatasetLoader
def main():
    """CLI entry point: parse arguments, load the YAML config, and dispatch
    to the trainer selected by ``--model``.
    """
    arg_parser = argparse.ArgumentParser(description='Train VoicePrint ID Models')
    arg_parser.add_argument('--config', type=str, default='config.yaml', help='Config file path')
    arg_parser.add_argument('--model', type=str, choices=['speaker', 'emotion', 'language', 'spoof'], required=True)
    arg_parser.add_argument('--data_dir', type=str, required=True, help='Dataset directory')
    arg_parser.add_argument('--epochs', type=int, default=100, help='Number of training epochs')
    args = arg_parser.parse_args()

    with open(args.config, 'r') as cfg_file:
        config = yaml.safe_load(cfg_file)

    print(f"Training {args.model} model...")

    # Lambdas keep the lookup lazy: a trainer name is only resolved when the
    # selected entry is invoked, exactly like the original if/elif chain.
    # NOTE(review): train_language_model and train_spoof_model are not defined
    # in this file — selecting them raises NameError; confirm they exist.
    dispatch = {
        'speaker': lambda: train_speaker_model(config, args.data_dir, args.epochs),
        'emotion': lambda: train_emotion_model(config, args.data_dir, args.epochs),
        'language': lambda: train_language_model(config, args.data_dir, args.epochs),
        'spoof': lambda: train_spoof_model(config, args.data_dir, args.epochs),
    }
    dispatch[args.model]()
def train_speaker_model(config, data_dir, epochs):
    """Train and save the speaker-identification classifier.

    Args:
        config: Parsed YAML config dict; reads config['audio']['sample_rate']
            and the config['training'] hyperparameters.
        data_dir: Root directory of the LibriSpeech-style dataset.
        epochs: Maximum number of training epochs (early stopping may end
            training sooner).
    """
    from voiceprint_id.models.speaker_models import SpeakerClassifier
    # BUG FIX: FeatureUtils was referenced below without ever being imported
    # (NameError at runtime).
    # TODO(review): confirm the actual module path of FeatureUtils.
    from voiceprint_id.data.feature_utils import FeatureUtils

    loader = DatasetLoader(data_dir)
    audio_data, speaker_labels = loader.load_librispeech_speaker()

    feature_utils = FeatureUtils()
    features = feature_utils.create_feature_matrix(audio_data, config['audio']['sample_rate'])
    labels_encoded = np.asarray(feature_utils.encode_labels(speaker_labels))
    num_speakers = len(np.unique(labels_encoded))

    model = SpeakerClassifier(num_speakers=num_speakers)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=config['training']['learning_rate']),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # BUG FIX: sklearn's train_test_split was called without being imported
    # (NameError). Use a deterministic NumPy shuffle split instead; the fixed
    # seed matches the intent of the old random_state=42.
    rng = np.random.RandomState(42)
    order = rng.permutation(len(features))
    n_val = max(1, int(len(features) * config['training']['validation_split']))
    val_idx, train_idx = order[:n_val], order[n_val:]
    X_train, X_val = features[train_idx], features[val_idx]
    y_train, y_val = labels_encoded[train_idx], labels_encoded[val_idx]

    model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=config['training']['batch_size'],
        validation_data=(X_val, y_val),
        callbacks=[
            tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
            tf.keras.callbacks.ReduceLROnPlateau(patience=5)
        ]
    )

    # BUG FIX: save_weights fails if the target directory does not exist.
    os.makedirs('models', exist_ok=True)
    model.save_weights('models/speaker_classifier.h5')
    print("Speaker model trained and saved.")
def train_emotion_model(config, data_dir, epochs):
    """Train and save the emotion-recognition CNN on RAVDESS data.

    Args:
        config: Parsed YAML config dict; reads config['audio']['sample_rate']
            and the config['training'] hyperparameters.
        data_dir: Root directory of the RAVDESS-style dataset.
        epochs: Maximum number of training epochs (early stopping may end
            training sooner).
    """
    from voiceprint_id.models.emotion_models import EmotionCNN

    loader = DatasetLoader(data_dir)
    audio_data, emotion_labels = loader.load_ravdess_emotion()

    # BUG FIX: sklearn's LabelEncoder was used without being imported
    # (NameError). np.unique with return_inverse=True yields the same
    # sorted-unique integer codes as LabelEncoder().fit_transform().
    _, emotion_labels_encoded = np.unique(emotion_labels, return_inverse=True)
    num_emotions = len(np.unique(emotion_labels_encoded))

    model = EmotionCNN(num_emotions=num_emotions)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=config['training']['learning_rate']),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    audio_processor = AudioProcessor()
    features = []
    for audio in audio_data:
        mfcc = audio_processor.extract_mfcc(audio, config['audio']['sample_rate'])
        # Fix every clip to exactly 300 frames: truncate long clips, then
        # zero-pad short ones, so all samples share one input shape.
        mfcc = mfcc[:, :300]
        if mfcc.shape[1] < 300:
            mfcc = np.pad(mfcc, ((0, 0), (0, 300 - mfcc.shape[1])), mode='constant')
        # Trailing channel axis for the CNN input.
        features.append(np.expand_dims(mfcc, axis=-1))
    features = np.array(features)

    # BUG FIX: sklearn's train_test_split was called without being imported
    # (NameError). Use a deterministic NumPy shuffle split instead; the fixed
    # seed matches the intent of the old random_state=42.
    rng = np.random.RandomState(42)
    order = rng.permutation(len(features))
    n_val = max(1, int(len(features) * config['training']['validation_split']))
    val_idx, train_idx = order[:n_val], order[n_val:]
    X_train, X_val = features[train_idx], features[val_idx]
    y_train, y_val = emotion_labels_encoded[train_idx], emotion_labels_encoded[val_idx]

    model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=config['training']['batch_size'],
        validation_data=(X_val, y_val),
        callbacks=[
            tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
            tf.keras.callbacks.ReduceLROnPlateau(patience=5)
        ]
    )

    # BUG FIX: save_weights fails if the target directory does not exist.
    os.makedirs('models', exist_ok=True)
    model.save_weights('models/emotion_classifier.h5')
    print("Emotion model trained and saved.")
# Script entry point: only run training when executed directly, not on import.
if __name__ == "__main__":
    main()