diff --git a/README.md b/README.md index b47cd5f..aeb53de 100644 --- a/README.md +++ b/README.md @@ -33,27 +33,36 @@ Leader-board for the ZeroSpeech 2020 challenge can be found [here](https://zeros 3. Preprocess audio and extract train/test log-Mel spectrograms: ``` - python preprocess.py in_dir=/path/to/dataset dataset=[2019/english or 2019/surprise] + python preprocess.py in_dir=/path/to/dataset dataset=[2019/english or 2019/surprise or buckeye or swbd] ``` - Note: `in_dir` must be the path to the `2019` folder. - For `dataset` choose between `2019/english` or `2019/surprise`. - Other datasets will be added in the future. + For `dataset` choose between `2019/english`, `2019/surprise`, `buckeye` or `swbd`. + Note: `in_dir` must be the path to the `2019` folder or the original + Buckeye dataset directory. Other datasets will be added in the future. Example usage: ``` python preprocess.py in_dir=../datasets/2020/2019 dataset=2019/english ``` + or + ``` + python preprocess.py in_dir=/path/to/swb300-wavs/ dataset=swbd preprocessing=8khz + ``` ## Training 1. Train the VQ-CPC model (or download pretrained weights [here](https://github.com/bshall/VectorQuantizedCPC/releases/tag/v0.1)): ``` - python train_cpc.py checkpoint_dir=path/to/checkpoint_dir dataset=[2019/english or 2019/surprise] + python train_cpc.py checkpoint_dir=path/to/checkpoint_dir dataset=[2019/english or 2019/surprise or buckeye or swbd] ``` Example usage: ``` python train_cpc.py checkpoint_dir=checkpoints/cpc/2019english dataset=2019/english ``` + or + ``` + python train_cpc.py checkpoint_dir=checkpoints/cpc/buckeye dataset=buckeye training.sample_frames=64 model.encoder.n_embeddings=256 training.scheduler.warmup_epochs=250 + python train_cpc.py checkpoint_dir=checkpoints/cpc/swbd1 dataset=swbd training.sample_frames=64 preprocessing=8khz + ``` 2. Train the vocoder: ``` @@ -95,10 +104,16 @@ Voice conversion samples are available [here](https://bshall.github.io/VectorQua 1. 
Encode test data for evaluation: ``` - python encode.py checkpoint=path/to/checkpoint out_dir=path/to/out_dir dataset=[2019/english or 2019/surprise] + python encode.py checkpoint=path/to/checkpoint out_dir=path/to/out_dir dataset=[2019/english or 2019/surprise or buckeye or swbd] + ``` + Example usage: + ``` + python encode.py checkpoint=checkpoints/cpc/2019english/model.ckpt-22000.pt out_dir=submission/2019/english/test dataset=2019/english ``` + or ``` - e.g. python encode.py checkpoint=checkpoints/2019english/model.ckpt-500000.pt out_dir=submission/2019/english/test dataset=2019/english + python encode.py checkpoint=checkpoints/cpc/2019english/model.ckpt-22000.pt split=val save_indices=True out_dir=outputs/buckeye/val_zs2019/ dataset=buckeye + python encode.py checkpoint=checkpoints/cpc/swbd1/model.ckpt-22000.pt split=val save_indices=True out_dir=outputs/swbd/val_swbd1/ dataset=swbd preprocessing=8khz ``` 2. Run ABX evaluation script (see [bootphon/zerospeech2020](https://github.com/bootphon/zerospeech2020)). 
diff --git a/config/dataset/2019/english.yaml b/config/dataset/2019/english.yaml index 66a2dab..e594cb4 100644 --- a/config/dataset/2019/english.yaml +++ b/config/dataset/2019/english.yaml @@ -1,5 +1,3 @@ dataset: - dataset: 2019 - language: english path: 2019/english n_speakers: 102 diff --git a/config/dataset/2019/surprise.yaml b/config/dataset/2019/surprise.yaml index a2bd290..4094a96 100644 --- a/config/dataset/2019/surprise.yaml +++ b/config/dataset/2019/surprise.yaml @@ -1,5 +1,3 @@ dataset: - dataset: 2019 - language: surprise path: 2019/surprise n_speakers: 113 \ No newline at end of file diff --git a/config/dataset/buckeye.yaml b/config/dataset/buckeye.yaml new file mode 100644 index 0000000..a7e2a6f --- /dev/null +++ b/config/dataset/buckeye.yaml @@ -0,0 +1,3 @@ +dataset: + path: buckeye + n_speakers: 32 diff --git a/config/dataset/swbd.yaml b/config/dataset/swbd.yaml new file mode 100644 index 0000000..3a68a91 --- /dev/null +++ b/config/dataset/swbd.yaml @@ -0,0 +1,3 @@ +dataset: + path: swbd + n_speakers: 1131 diff --git a/config/encode.yaml b/config/encode.yaml index 95f9ec9..2ca9cd4 100644 --- a/config/encode.yaml +++ b/config/encode.yaml @@ -3,6 +3,9 @@ defaults: - preprocessing: default - model: default +split: test checkpoint: ??? out_dir: ??? 
-save_auxiliary: False \ No newline at end of file +save_auxiliary: False +save_indices: False +save_embedding: False diff --git a/config/preprocessing/8khz.yaml b/config/preprocessing/8khz.yaml new file mode 100644 index 0000000..182b63e --- /dev/null +++ b/config/preprocessing/8khz.yaml @@ -0,0 +1,10 @@ +preprocessing: + sr: 8000 + n_fft: 1024 + n_mels: 40 + fmin: 50 + preemph: 0.97 + top_db: 80 + hop_length: 80 + win_length: 200 + bits: 8 diff --git a/dataset.py b/dataset.py index e762272..6dc63e6 100755 --- a/dataset.py +++ b/dataset.py @@ -37,7 +37,7 @@ def __getitem__(self, index): mels = list() paths = random.sample(paths, self.n_utterances_per_speaker) for path in paths: - path = self.root.parent / path + path = self.root / path mel = np.load(path.with_suffix(".mel.npy")) pos = random.randint(0, mel.shape[1] - self.n_sample_frames) mel = mel[:, pos:pos + self.n_sample_frames] diff --git a/encode.py b/encode.py index 0f70177..ae614a4 100644 --- a/encode.py +++ b/encode.py @@ -17,7 +17,7 @@ def encode_dataset(cfg): out_dir.mkdir(exist_ok=True, parents=True) root_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path - with open(root_path / "test.json") as file: + with open((root_path / cfg.split).with_suffix(".json")) as file: metadata = json.load(file) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -32,6 +32,10 @@ def encode_dataset(cfg): encoder.eval() + if cfg.save_embedding: + embedding_path = out_dir / "embedding.npy" + np.save(embedding_path, encoder.codebook.embedding.cpu().numpy()) + if cfg.save_auxiliary: auxiliary = [] @@ -41,7 +45,7 @@ def hook(module, input, output): encoder.encoder[-1].register_forward_hook(hook) for _, _, _, path in tqdm(metadata): - path = root_path.parent / path + path = root_path / path mel = torch.from_numpy(np.load(path.with_suffix(".mel.npy"))).unsqueeze(0).to(device) with torch.no_grad(): z, c, indices = encoder.encode(mel) @@ -52,15 +56,24 @@ def hook(module, input, output): with 
open(out_path.with_suffix(".txt"), "w") as file: np.savetxt(file, z, fmt="%.16f") + if cfg.save_indices: + indices_path = out_dir / "indices" + indices_path.mkdir(exist_ok=True, parents=True) + out_path = indices_path / path.stem + indices = indices.squeeze().cpu().numpy() + if not indices.shape==(): + with open(out_path.with_suffix(".txt"), "w") as file: + np.savetxt(file, indices, fmt="%d") + if cfg.save_auxiliary: - aux_path = out_dir.parent / "auxiliary_embedding1" + aux_path = out_dir / "auxiliary_embedding1" aux_path.mkdir(exist_ok=True, parents=True) out_path = aux_path / path.stem c = c.squeeze().cpu().numpy() with open(out_path.with_suffix(".txt"), "w") as file: np.savetxt(file, c, fmt="%.16f") - aux_path = out_dir.parent / "auxiliary_embedding2" + aux_path = out_dir / "auxiliary_embedding2" aux_path.mkdir(exist_ok=True, parents=True) out_path = aux_path / path.stem aux = auxiliary.pop().squeeze().cpu().numpy() diff --git a/preprocess.py b/preprocess.py index 3085708..ba05ff1 100644 --- a/preprocess.py +++ b/preprocess.py @@ -54,18 +54,23 @@ def process_wav(wav_path, out_path, sr=160000, preemph=0.97, n_fft=2048, n_mels= @hydra.main(config_path="config/preprocessing.yaml") def preprocess_dataset(cfg): in_dir = Path(utils.to_absolute_path(cfg.in_dir)) - out_dir = Path(utils.to_absolute_path("datasets")) / str(cfg.dataset.dataset) + out_dir = Path(utils.to_absolute_path("datasets")) / str(cfg.dataset.path) out_dir.mkdir(parents=True, exist_ok=True) executor = ProcessPoolExecutor(max_workers=cpu_count()) - for split in ["train", "test"]: + for split in ["train", "test", "val"]: print("Extracting features for {} set".format(split)) futures = [] - split_path = out_dir / cfg.dataset.language / split + split_path = out_dir / split + if not split_path.with_suffix(".json").exists(): + print("Skipping {} (no json file)".format(split)) + continue with open(split_path.with_suffix(".json")) as file: metadata = json.load(file) for in_path, start, duration, out_path in 
metadata: wav_path = in_dir / in_path + assert wav_path.with_suffix(".wav").exists(), "'{}' does not exist".format( + wav_path.with_suffix(".wav")) out_path = out_dir / out_path out_path.parent.mkdir(parents=True, exist_ok=True) futures.append(executor.submit(