diff --git a/data.py b/data.py index 3001b34..400693d 100644 --- a/data.py +++ b/data.py @@ -25,10 +25,11 @@ def __init__(self, mode, label_words_dict, wav_list, add_noise, preprocess_fun, """ self.mode = mode self.label_words_dict = label_words_dict - self.wav_list = wav_list + self.wav_list = wav_list[0] + self.label_list = wav_list[1] self.add_noise = add_noise self.sr = sr - self.n_silence = int(len(wav_list) * 0.09) + self.n_silence = int(len(self.wav_list) * 0.09) self.preprocess_fun = preprocess_fun self.preprocess_param = preprocess_param @@ -100,8 +101,7 @@ def __getitem__(self, idx): if self.mode == 'test': return {'spec': wav_tensor, 'id': self.wav_list[idx]} - label = self.label_words_dict[self.wav_list[idx].split("/")[-2]] if self.wav_list[idx].split( - "/")[-2] in self.label_words_dict else len(self.label_words_dict) + label = self.label_words_dict.get(self.label_list[idx], len(self.label_words_dict)) return {'spec': wav_tensor, 'id': self.wav_list[idx], 'label': label} @@ -132,15 +132,18 @@ def get_wav_list(words, unknown_ratio=0.2): # sample full train list sampled_train_list = [] + sampled_train_labels = [] for w in full_train_list: l = w.split("/")[-2] if l not in words: if random.random() < unknown_ratio: sampled_train_list.append(w) + sampled_train_labels.append('unknown') else: sampled_train_list.append(w) + sampled_train_labels.append(l) - return sampled_train_list, full_test_list + return sampled_train_list, sampled_train_labels, full_test_list def get_sub_list(num, sub_path): @@ -148,17 +151,28 @@ def get_sub_list(num, sub_path): df = pd.read_csv(sub_path) words = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown'] each_num = int(num * 0.085) + labels = [] for w in words: tmp = df['fname'][df['label'] == w].sample(each_num).tolist() lst += ["../input/test/audio/" + x for x in tmp] - return lst + for _ in range(len(tmp)): + labels.append(w) + return lst, labels def get_semi_list(words, sub_path, 
unknown_ratio=0.2, test_ratio=0.2): - train_list, _ = get_wav_list(words=words, unknown_ratio=unknown_ratio) - test_list = get_sub_list(num=int(len(train_list) * test_ratio), sub_path=sub_path) - lst = train_list + test_list - return sample(lst, len(lst)) + train_list, train_labels, _ = get_wav_list(words=words, unknown_ratio=unknown_ratio) + test_list, test_labels = get_sub_list(num=int(len(train_list) * test_ratio), sub_path=sub_path) + file_list = train_list + test_list + label_list = train_labels + test_labels + assert(len(file_list) == len(label_list)) + + random.seed(2018) + file_list = sample(file_list, len(file_list)) + random.seed(2018) + label_list = sample(label_list, len(label_list)) + + return file_list, label_list def preprocess_mfcc(wave): @@ -189,4 +203,4 @@ def preprocess_wav(wav, normalization=True): if normalization: mean = data.mean() data -= mean - return data \ No newline at end of file + return data diff --git a/trainer.py b/trainer.py index 7318aa7..cb38705 100644 --- a/trainer.py +++ b/trainer.py @@ -65,18 +65,18 @@ def get_model(model=model_class, m=MGPU, pretrained=pretrained): optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, speechmodel.parameters()), lr=learning_rate, momentum=0.9, weight_decay=0.00001) speechmodel.train() if semi_train_path: - train_list = get_semi_list(words=label_to_int.keys(), sub_path=semi_train_path, + train_list, label_list = get_semi_list(words=label_to_int.keys(), sub_path=semi_train_path, test_ratio=choice([0.2, 0.25, 0.3, 0.35])) print("semi training list length: ", len(train_list)) else: - train_list, _ = get_wav_list(words=label_to_int.keys()) + train_list, label_list, _ = get_wav_list(words=label_to_int.keys()) if pretraining: traindataset = PreDataset(label_words_dict=label_to_int, add_noise=True, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param, resize_shape=reshape_size, is_1d=is_1d) else: - traindataset = SpeechDataset(mode='train', label_words_dict=label_to_int, 
wav_list=train_list, + traindataset = SpeechDataset(mode='train', label_words_dict=label_to_int, wav_list=(train_list, label_list), add_noise=True, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param, resize_shape=reshape_size, is_1d=is_1d) trainloader = DataLoader(traindataset, BATCH_SIZE, shuffle=True) @@ -108,8 +108,8 @@ def get_model(model=model_class, m=MGPU, pretrained=pretrained): trained_models = ["model/model_%s_%s.pth" % (CODER, b) for b in range(bagging_num)] # prediction - _, test_list = get_wav_list(words=label_to_int.keys()) - testdataset = SpeechDataset(mode='test', label_words_dict=label_to_int, wav_list=test_list, + _, _, test_list = get_wav_list(words=label_to_int.keys()) + testdataset = SpeechDataset(mode='test', label_words_dict=label_to_int, wav_list=(test_list, []), add_noise=False, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param, resize_shape=reshape_size, is_1d=is_1d) testloader = DataLoader(testdataset, BATCH_SIZE, shuffle=False)