From dfa475f2cd233773e713bb638da4e518bb82a9c5 Mon Sep 17 00:00:00 2001
From: Herman Kamper
Date: Fri, 28 Aug 2020 08:30:52 +0200
Subject: [PATCH 1/9] Added validation set

---
 preprocess.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/preprocess.py b/preprocess.py
index 3085708..7aa0b68 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -58,10 +58,13 @@ def preprocess_dataset(cfg):
     out_dir.mkdir(parents=True, exist_ok=True)
 
     executor = ProcessPoolExecutor(max_workers=cpu_count())
-    for split in ["train", "test"]:
+    for split in ["train", "test", "val"]:
         print("Extracting features for {} set".format(split))
         futures = []
         split_path = out_dir / cfg.dataset.language / split
+        if not split_path.with_suffix(".json").exists():
+            print("Skipping {} (no json file)".format(split))
+            continue
         with open(split_path.with_suffix(".json")) as file:
             metadata = json.load(file)
         for in_path, start, duration, out_path in metadata:

From cd2a9ebdce8cadcc713d1662fd4f9a81442f831b Mon Sep 17 00:00:00 2001
From: Herman Kamper
Date: Fri, 28 Aug 2020 08:34:12 +0200
Subject: [PATCH 2/9] Changes for dataset where language isn't specified

---
 dataset.py    | 2 +-
 preprocess.py | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/dataset.py b/dataset.py
index e762272..6dc63e6 100755
--- a/dataset.py
+++ b/dataset.py
@@ -37,7 +37,7 @@ def __getitem__(self, index):
         mels = list()
         paths = random.sample(paths, self.n_utterances_per_speaker)
         for path in paths:
-            path = self.root.parent / path
+            path = self.root / path
             mel = np.load(path.with_suffix(".mel.npy"))
             pos = random.randint(0, mel.shape[1] - self.n_sample_frames)
             mel = mel[:, pos:pos + self.n_sample_frames]
diff --git a/preprocess.py b/preprocess.py
index 7aa0b68..53f09ae 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -61,7 +61,10 @@ def preprocess_dataset(cfg):
     for split in ["train", "test", "val"]:
         print("Extracting features for {} set".format(split))
         futures = []
-        split_path = out_dir / cfg.dataset.language / split
+        if "language" in cfg.dataset:
+            split_path = out_dir / cfg.dataset.language / split
+        else:
+            split_path = out_dir / split
         if not split_path.with_suffix(".json").exists():
             print("Skipping {} (no json file)".format(split))
             continue

From 1803a2c75489dd193d26a73f3b942e076a044972 Mon Sep 17 00:00:00 2001
From: Herman Kamper
Date: Fri, 28 Aug 2020 08:39:16 +0200
Subject: [PATCH 3/9] Changes to fix path issues

---
 encode.py     | 19 ++++++++++++++++---
 preprocess.py |  2 ++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/encode.py b/encode.py
index 0f70177..be79734 100644
--- a/encode.py
+++ b/encode.py
@@ -32,6 +32,10 @@ def encode_dataset(cfg):
 
     encoder.eval()
 
+    if cfg.save_embedding:
+        embedding_path = out_dir / "embedding.npy"
+        np.save(embedding_path, encoder.codebook.embedding.cpu().numpy())
+
     if cfg.save_auxiliary:
         auxiliary = []
 
@@ -41,7 +45,7 @@ def hook(module, input, output):
         encoder.encoder[-1].register_forward_hook(hook)
 
     for _, _, _, path in tqdm(metadata):
-        path = root_path.parent / path
+        path = root_path / path
         mel = torch.from_numpy(np.load(path.with_suffix(".mel.npy"))).unsqueeze(0).to(device)
         with torch.no_grad():
             z, c, indices = encoder.encode(mel)
@@ -52,15 +56,24 @@ def hook(module, input, output):
         with open(out_path.with_suffix(".txt"), "w") as file:
             np.savetxt(file, z, fmt="%.16f")
 
+        if cfg.save_indices:
+            indices_path = out_dir / "indices"
+            indices_path.mkdir(exist_ok=True, parents=True)
+            out_path = indices_path / path.stem
+            indices = indices.squeeze().cpu().numpy()
+            if not indices.shape == ():
+                with open(out_path.with_suffix(".txt"), "w") as file:
+                    np.savetxt(file, indices, fmt="%d")
+
         if cfg.save_auxiliary:
-            aux_path = out_dir.parent / "auxiliary_embedding1"
+            aux_path = out_dir / "auxiliary_embedding1"
             aux_path.mkdir(exist_ok=True, parents=True)
             out_path = aux_path / path.stem
             c = c.squeeze().cpu().numpy()
             with open(out_path.with_suffix(".txt"), "w") as file:
                 np.savetxt(file, c, fmt="%.16f")
 
-            aux_path = out_dir.parent / "auxiliary_embedding2"
+            aux_path = out_dir / "auxiliary_embedding2"
             aux_path.mkdir(exist_ok=True, parents=True)
             out_path = aux_path / path.stem
             aux = auxiliary.pop().squeeze().cpu().numpy()
diff --git a/preprocess.py b/preprocess.py
index 53f09ae..80b2881 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -72,6 +72,8 @@ def preprocess_dataset(cfg):
             metadata = json.load(file)
         for in_path, start, duration, out_path in metadata:
             wav_path = in_dir / in_path
+            assert wav_path.with_suffix(".wav").exists(), "'{}' does not exist".format(
+                wav_path.with_suffix(".wav"))
             out_path = out_dir / out_path
             out_path.parent.mkdir(parents=True, exist_ok=True)
             futures.append(executor.submit(

From d3b86dbda4ca450f9f09f0f4b00a38b61854a7d5 Mon Sep 17 00:00:00 2001
From: Herman Kamper
Date: Fri, 28 Aug 2020 09:02:20 +0200
Subject: [PATCH 4/9] Removed special language flag

---
 preprocess.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/preprocess.py b/preprocess.py
index 80b2881..1929e01 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -61,10 +61,7 @@ def preprocess_dataset(cfg):
     for split in ["train", "test", "val"]:
         print("Extracting features for {} set".format(split))
         futures = []
-        if "language" in cfg.dataset:
-            split_path = out_dir / cfg.dataset.language / split
-        else:
-            split_path = out_dir / split
+        split_path = out_dir / split
         if not split_path.with_suffix(".json").exists():
             print("Skipping {} (no json file)".format(split))
             continue

From 10d0abaff084a9343714ae563668a5b0a005bb76 Mon Sep 17 00:00:00 2001
From: Herman Kamper
Date: Fri, 28 Aug 2020 09:10:11 +0200
Subject: [PATCH 5/9] Added buckeye config

---
 config/dataset/2019/english.yaml  | 2 --
 config/dataset/2019/surprise.yaml | 2 --
 config/dataset/buckeye.yaml       | 3 +++
 3 files changed, 3 insertions(+), 4 deletions(-)
 create mode 100644 config/dataset/buckeye.yaml

diff --git a/config/dataset/2019/english.yaml b/config/dataset/2019/english.yaml
index 66a2dab..e594cb4 100644
--- a/config/dataset/2019/english.yaml
+++ b/config/dataset/2019/english.yaml
@@ -1,5 +1,3 @@
 dataset:
-  dataset: 2019
-  language: english
   path: 2019/english
   n_speakers: 102
diff --git a/config/dataset/2019/surprise.yaml b/config/dataset/2019/surprise.yaml
index a2bd290..4094a96 100644
--- a/config/dataset/2019/surprise.yaml
+++ b/config/dataset/2019/surprise.yaml
@@ -1,5 +1,3 @@
 dataset:
-  dataset: 2019
-  language: surprise
   path: 2019/surprise
   n_speakers: 113
\ No newline at end of file
diff --git a/config/dataset/buckeye.yaml b/config/dataset/buckeye.yaml
new file mode 100644
index 0000000..a7e2a6f
--- /dev/null
+++ b/config/dataset/buckeye.yaml
@@ -0,0 +1,3 @@
+dataset:
+  path: buckeye
+  n_speakers: 32

From cb165071280dee77d1c6284acec632269246b537 Mon Sep 17 00:00:00 2001
From: Herman Kamper
Date: Fri, 28 Aug 2020 09:47:57 +0200
Subject: [PATCH 6/9] Added Buckeye dataset and cleaned up paths

---
 README.md          | 23 ++++++++++++++++-------
 config/encode.yaml |  5 ++++-
 encode.py          |  2 +-
 preprocess.py      |  2 +-
 4 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index b47cd5f..8fb942d 100644
--- a/README.md
+++ b/README.md
@@ -33,11 +33,11 @@ Leader-board for the ZeroSpeech 2020 challenge can be found [here](https://zeros
 
 3. Preprocess audio and extract train/test log-Mel spectrograms:
    ```
-   python preprocess.py in_dir=/path/to/dataset dataset=[2019/english or 2019/surprise]
+   python preprocess.py in_dir=/path/to/dataset dataset=[2019/english or 2019/surprise or buckeye]
    ```
-   Note: `in_dir` must be the path to the `2019` folder.
-   For `dataset` choose between `2019/english` or `2019/surprise`.
-   Other datasets will be added in the future.
+   For `dataset` choose between `2019/english`, `2019/surprise` or `buckeye`.
+   Note: `in_dir` must be the path to the `2019` folder or the original
+   Buckeye dataset directory. Other datasets will be added in the future.
 
    Example usage:
    ```
@@ -48,12 +48,16 @@ Leader-board for the ZeroSpeech 2020 challenge can be found [here](https://zeros
 
 1. Train the VQ-CPC model (or download pretrained weights [here](https://github.com/bshall/VectorQuantizedCPC/releases/tag/v0.1)):
    ```
-   python train_cpc.py checkpoint_dir=path/to/checkpoint_dir dataset=[2019/english or 2019/surprise]
+   python train_cpc.py checkpoint_dir=path/to/checkpoint_dir dataset=[2019/english or 2019/surprise or buckeye]
    ```
    Example usage:
    ```
    python train_cpc.py checkpoint_dir=checkpoints/cpc/2019english dataset=2019/english
    ```
+   or
+   ```
+   python train_cpc.py checkpoint_dir=checkpoints/cpc/buckeye dataset=buckeye training.sample_frames=64 model.encoder.n_embeddings=256 training.scheduler.warmup_epochs=250
+   ```
 
 2. Train the vocoder:
    ```
@@ -95,10 +99,15 @@ Voice conversion samples are available [here](https://bshall.github.io/VectorQua
 
 1. Encode test data for evaluation:
    ```
-   python encode.py checkpoint=path/to/checkpoint out_dir=path/to/out_dir dataset=[2019/english or 2019/surprise]
+   python encode.py checkpoint=path/to/checkpoint out_dir=path/to/out_dir dataset=[2019/english or 2019/surprise or buckeye]
+   ```
+   Example usage:
+   ```
+   python encode.py checkpoint=checkpoints/cpc/english2019/model.ckpt-22000.pt out_dir=submission/2019/english/test dataset=2019/english
    ```
+   or
    ```
-   e.g. python encode.py checkpoint=checkpoints/2019english/model.ckpt-500000.pt out_dir=submission/2019/english/test dataset=2019/english
+   python encode.py checkpoint=checkpoints/cpc/english2019/model.ckpt-22000.pt split=val save_indices=True out_dir=outputs/buckeye/val_zs2019/ dataset=buckeye
    ```
 
 2. Run ABX evaluation script (see [bootphon/zerospeech2020](https://github.com/bootphon/zerospeech2020)).
diff --git a/config/encode.yaml b/config/encode.yaml
index 95f9ec9..2ca9cd4 100644
--- a/config/encode.yaml
+++ b/config/encode.yaml
@@ -3,6 +3,9 @@ defaults:
   - preprocessing: default
   - model: default
 
+split: test
 checkpoint: ???
 out_dir: ???
-save_auxiliary: False
\ No newline at end of file
+save_auxiliary: False
+save_indices: False
+save_embedding: False
diff --git a/encode.py b/encode.py
index be79734..ae614a4 100644
--- a/encode.py
+++ b/encode.py
@@ -17,7 +17,7 @@ def encode_dataset(cfg):
     out_dir.mkdir(exist_ok=True, parents=True)
 
     root_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
-    with open(root_path / "test.json") as file:
+    with open((root_path / cfg.split).with_suffix(".json")) as file:
         metadata = json.load(file)
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
diff --git a/preprocess.py b/preprocess.py
index 1929e01..ba05ff1 100644
--- a/preprocess.py
+++ b/preprocess.py
@@ -54,7 +54,7 @@ def process_wav(wav_path, out_path, sr=16000, preemph=0.97, n_fft=2048, n_mels=
 @hydra.main(config_path="config/preprocessing.yaml")
 def preprocess_dataset(cfg):
     in_dir = Path(utils.to_absolute_path(cfg.in_dir))
-    out_dir = Path(utils.to_absolute_path("datasets")) / str(cfg.dataset.dataset)
+    out_dir = Path(utils.to_absolute_path("datasets")) / str(cfg.dataset.path)
     out_dir.mkdir(parents=True, exist_ok=True)
 
     executor = ProcessPoolExecutor(max_workers=cpu_count())

From b7f9b5c08faa29c2fad9e4e9d8f5dc91ebe7f071 Mon Sep 17 00:00:00 2001
From: Herman Kamper
Date: Fri, 28 Aug 2020 11:27:45 +0200
Subject: [PATCH 7/9] Added Switchboard configs

---
 config/dataset/swbd.yaml       |  3 +++
 config/preprocessing/8khz.yaml | 10 ++++++++++
 2 files changed, 13 insertions(+)
 create mode 100644 config/dataset/swbd.yaml
 create mode 100644 config/preprocessing/8khz.yaml

diff --git a/config/dataset/swbd.yaml b/config/dataset/swbd.yaml
new file mode 100644
index 0000000..3a68a91
--- /dev/null
+++ b/config/dataset/swbd.yaml
@@ -0,0 +1,3 @@
+dataset:
+  path: swbd
+  n_speakers: 1131
diff --git a/config/preprocessing/8khz.yaml b/config/preprocessing/8khz.yaml
new file mode 100644
index 0000000..182b63e
--- /dev/null
+++ b/config/preprocessing/8khz.yaml
@@ -0,0 +1,10 @@
+preprocessing:
+  sr: 8000
+  n_fft: 1024
+  n_mels: 40
+  fmin: 50
+  preemph: 0.97
+  top_db: 80
+  hop_length: 80
+  win_length: 200
+  bits: 8

From 7120f35123eebb3bc8c254700c2f06b028277fb1 Mon Sep 17 00:00:00 2001
From: Herman Kamper
Date: Fri, 28 Aug 2020 11:45:29 +0200
Subject: [PATCH 8/9] Updated readme

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 8fb942d..5c8d8a8 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,10 @@ Leader-board for the ZeroSpeech 2020 challenge can be found [here](https://zeros
    ```
    python preprocess.py in_dir=../datasets/2020/2019 dataset=2019/english
    ```
+   or
+   ```
+   python preprocess.py in_dir=/home/kamperh/endgame/projects/stellenbosch/vqseg/datasets/swb300-wavs/ dataset=swbd preprocessing=8khz
+   ```
 
 ## Training
 

From 1fefeadf092e8a1744e8ca1e0879674540181a70 Mon Sep 17 00:00:00 2001
From: Herman Kamper
Date: Tue, 1 Sep 2020 09:26:22 +0200
Subject: [PATCH 9/9] Small changes to readme

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 5c8d8a8..aeb53de 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,7 @@ Leader-board for the ZeroSpeech 2020 challenge can be found [here](https://zeros
    or
    ```
    python train_cpc.py checkpoint_dir=checkpoints/cpc/buckeye dataset=buckeye training.sample_frames=64 model.encoder.n_embeddings=256 training.scheduler.warmup_epochs=250
+   python train_cpc.py checkpoint_dir=checkpoints/cpc/swbd1 dataset=swbd training.sample_frames=64 preprocessing=8khz
    ```
 
 2. Train the vocoder:
@@ -112,6 +113,7 @@ Voice conversion samples are available [here](https://bshall.github.io/VectorQua
    or
    ```
    python encode.py checkpoint=checkpoints/cpc/english2019/model.ckpt-22000.pt split=val save_indices=True out_dir=outputs/buckeye/val_zs2019/ dataset=buckeye
+   python encode.py checkpoint=checkpoints/cpc/swbd1/model.ckpt-22000.pt split=val save_indices=True out_dir=outputs/swbd/val_swbd1/ dataset=swbd preprocessing=8khz
    ```
 
 2. Run ABX evaluation script (see [bootphon/zerospeech2020](https://github.com/bootphon/zerospeech2020)).