Commit 37f73e9

remove IPEX for embedding model, update steps for CI

1 parent 96024c4 commit 37f73e9

File tree

5 files changed: +23 -49 lines changed


AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/inference_custom.py

Lines changed: 0 additions & 2 deletions
@@ -69,12 +69,10 @@ def __init__(self, ipex_op=False, bf16=False, int8_model=False):
                 print("BF16 enabled")
                 self.language_id.mods["compute_features"] = ipex.optimize(self.language_id.mods["compute_features"], dtype=torch.bfloat16)
                 self.language_id.mods["mean_var_norm"] = ipex.optimize(self.language_id.mods["mean_var_norm"], dtype=torch.bfloat16)
-                #self.language_id.mods["embedding_model"] = ipex.optimize(self.language_id.mods["embedding_model"], dtype=torch.bfloat16)
                 self.language_id.mods["classifier"] = ipex.optimize(self.language_id.mods["classifier"], dtype=torch.bfloat16)
             else:
                 self.language_id.mods["compute_features"] = ipex.optimize(self.language_id.mods["compute_features"])
                 self.language_id.mods["mean_var_norm"] = ipex.optimize(self.language_id.mods["mean_var_norm"])
-                #self.language_id.mods["embedding_model"] = ipex.optimize(self.language_id.mods["embedding_model"])
                 self.language_id.mods["classifier"] = ipex.optimize(self.language_id.mods["classifier"])
 
         # Torchscript to resolve performance issues with reorder operations
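Note on the change above: `ipex.optimize` from Intel Extension for PyTorch returns an optimized copy of the module it is given; this commit stops applying it to the `embedding_model` stage while the other pipeline stages stay optimized. A minimal sketch of the same API pattern, with a toy module standing in for the sample's SpeechBrain stages:

```python
# Minimal sketch of the ipex.optimize pattern used above. The toy module is
# illustrative only; the sample optimizes SpeechBrain pipeline stages instead.
import torch
import intel_extension_for_pytorch as ipex

stage = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU()).eval()

# BF16 path: returns a module with prepacked weights and fused ops where
# possible; pair it with autocast at inference time.
stage = ipex.optimize(stage, dtype=torch.bfloat16)

with torch.no_grad(), torch.cpu.amp.autocast(dtype=torch.bfloat16):
    out = stage(torch.randn(1, 16))
```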

AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/README.md

Lines changed: 10 additions & 18 deletions
@@ -112,20 +112,16 @@ cd ./Training
 
 ### Option 1: Run in Jupyter Notebook
 
-1. Install Jupyter Notebook.
-   ```
-   pip install notebook
-   ```
-2. Launch Jupyter Notebook.
+1. Launch Jupyter Notebook.
    ```
    jupyter notebook --ip 0.0.0.0 --port 8888 --allow-root
    ```
-3. Follow the instructions to open the URL with the token in your browser.
-4. Locate and select the Training Notebook.
+2. Follow the instructions to open the URL with the token in your browser.
+3. Locate and select the Training Notebook.
    ```
    lang_id_training.ipynb
    ```
-5. Follow the instructions in the Notebook.
+4. Follow the instructions in the Notebook.
 
 
 ### Option 2: Run in a Console
@@ -217,10 +213,10 @@ After training, the output should be inside the `results/epaca/1987` folder. By
    cp label_encoder.txt ../.
    ```
 
-4. Change to the latest `CKPT` folder, and copy the classifier.ckpt and embedding_model.ckpt files into the `/Inference/lang_id_commonvoice_model/` folder which is two directories up.
+4. Change to the latest `CKPT` folder, and copy the classifier.ckpt and embedding_model.ckpt files into the `/Inference/lang_id_commonvoice_model/` folder which is two directories up. By default, the command below will navigate into the single CKPT folder that is present, but you can change it to the specific folder name.
    ```bash
    # Navigate into the CKPT folder
-   cd CKPT<DATE_OF_RUN>
+   cd CKPT*
 
    cp classifier.ckpt ../../.
    cp embedding_model.ckpt ../../
@@ -253,20 +249,16 @@ To run inference, you must have already run all of the training scripts, generat
 
 ### Option 1: Run in Jupyter Notebook
 
-1. If you have not already done so, install Jupyter Notebook.
-   ```
-   pip install notebook
-   ```
-2. Launch Jupyter Notebook.
+1. Launch Jupyter Notebook.
    ```
    jupyter notebook --ip 0.0.0.0 --port 8889 --allow-root
    ```
-3. Follow the instructions to open the URL with the token in your browser.
-4. Locate and select the inference Notebook.
+2. Follow the instructions to open the URL with the token in your browser.
+3. Locate and select the inference Notebook.
    ```
    lang_id_inference.ipynb
    ```
-5. Follow the instructions in the Notebook.
+4. Follow the instructions in the Notebook.
 
 ### Option 2: Run in a Console
 
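A side note on the `cd CKPT*` shortcut introduced here: it relies on shell glob expansion, so it behaves as intended only while exactly one `CKPT` folder exists under `save/`. If several checkpoints accumulate, picking the newest one explicitly is more robust; a hedged sketch (the paths assume the `save/` directory from step 4 of the README, and the snippet is not part of the committed sample):

```python
# Hedged alternative to `cd CKPT*`: select the newest checkpoint folder
# explicitly. Assumes the current directory is lang_id_commonvoice_model/save/,
# as in step 4 of the README; not part of the committed sample.
import glob
import os
import shutil

ckpt_dirs = sorted(glob.glob("CKPT*"), key=os.path.getmtime)
latest = ckpt_dirs[-1]  # most recently modified CKPT folder

for name in ("classifier.ckpt", "embedding_model.ckpt"):
    # Two levels up from the CKPT folder is lang_id_commonvoice_model/,
    # i.e. one level up from save/.
    shutil.copy(os.path.join(latest, name), os.path.join("..", name))
```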
AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/lang_id_training.ipynb

Lines changed: 1 addition & 1 deletion
@@ -177,7 +177,7 @@
     "\n",
     "# 4)\n",
     "# Navigate into the CKPT folder\n",
-    "!cd CKPT<DATE_OF_RUN> #@TODO: set this to your CKPT folder\n",
+    "!cd CKPT* # Set this to your CKPT folder. By default it will navigate into the one that is present.\n",
     "!cp classifier.ckpt ../../.\n",
     "!cp embedding_model.ckpt ../../\n",
     "!cd ../.."

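One caveat when adapting this cell: IPython runs each `!` line in its own subshell, so the `!cd CKPT*` does not change the working directory for the `!cp` lines that follow it. Chaining the commands in a single shell invocation sidesteps that; a hedged sketch (illustrative, not part of the commit):

```python
# Illustrative only: run cd and the copies in one shell so the cd sticks.
# Each separate `!` line in a notebook gets its own subshell.
import subprocess

subprocess.run(
    "cd CKPT* && cp classifier.ckpt ../../. && cp embedding_model.ckpt ../../",
    shell=True,
    check=True,
)
```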
AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/initialize.sh

Lines changed: 4 additions & 1 deletion
@@ -11,10 +11,13 @@ cd ..
 export PYTHONPATH=$PYTHONPATH:$(pwd)/speechbrain
 
 # Install huggingface datasets and other requirements
-conda install -y datasets tqdm librosa
+conda install -y datasets tqdm librosa jupyter ipykernel ipywidgets
 
 # Install webdataset
 python -m pip install webdataset==0.2.100
 
 # Install libraries for MP3 to WAV conversion
 python -m pip install pydub
+
+# Install notebook to run Jupyter notebooks
+python -m pip install notebook

AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/sample.json

Lines changed: 8 additions & 27 deletions
@@ -12,36 +12,17 @@
   {
     "id": "Language_Identification_E2E",
     "env": [
+      "export COMMON_VOICE_PATH=/data/commonVoice"
     ],
     "steps": [
-      "export COMMON_VOICE_PATH=/data/commonVoice",
-      "sudo apt-get update && apt-get install ffmpeg libgl1",
-      "git clone https://github.com/oneapi-src/oneAPI-samples.git",
-      "cd oneAPI-samples/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification",
+      "apt-get update && apt-get install ffmpeg libgl1 -y",
       "source initialize.sh",
-      "cd /Training",
-      "cp speechbrain/recipes/VoxLingua107/lang_id/create_wds_shards.py create_wds_shards.py",
-      "cp speechbrain/recipes/VoxLingua107/lang_id/train.py train.py",
-      "cp speechbrain/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml train_ecapa.yaml",
-      "patch < create_wds_shards.patch",
-      "patch < train_ecapa.patch",
-      "python prepareAllCommonVoice.py -path $COMMON_VOICE_PATH -max_samples 2000 --createCsv --train --dev --test",
-      "python create_wds_shards.py ${COMMON_VOICE_PATH}/processed_data/train ${COMMON_VOICE_PATH}/processed_data/commonVoice_shards/train",
-      "python create_wds_shards.py ${COMMON_VOICE_PATH}/processed_data/dev ${COMMON_VOICE_PATH}/processed_data/commonVoice_shards/dev",
-      "python train.py train_ecapa.yaml --device cpu",
-      "cp -R results/epaca/1987 ../Inference/lang_id_commonvoice_model",
-      "cd ../Inference/lang_id_commonvoice_model/save",
-      "cp label_encoder.txt ../.",
-      "cd CKPT<DATE_OF_RUN>",
-      "cp classifier.ckpt ../../.",
-      "cp embedding_model.ckpt ../../",
-      "cd ../..",
-      "cd /Inference",
-      "python inference_commonVoice.py -p ${COMMON_VOICE_PATH}/processed_data/test",
-      "python inference_custom.py -p data_custom -d 3 -s 50 --vad",
-      "python inference_custom.py -p data_custom -d 3 -s 50 --vad --ipex --verbose",
-      "python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/processed_data/dev",
-      "python inference_custom.py -p data_custom -d 3 -s 50 --vad --int8_model --verbose"
+      "cd ./Dataset",
+      "python get_dataset.py --output_dir ${COMMON_VOICE_PATH}",
+      "cd ../Training",
+      "jupyter nbconvert --execute --to notebook --inplace --debug lang_id_training.ipynb",
+      "cd ../Inference",
+      "jupyter nbconvert --execute --to notebook --inplace --debug lang_id_inference.ipynb"
     ]
   }
 ]
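The reworked CI steps drive both notebooks through `jupyter nbconvert --execute --to notebook --inplace`, which runs every cell headlessly and writes the executed notebook back over the original (`--debug` raises the log level). A minimal sketch of the same flow; the Python wrapper is illustrative, the CI itself runs the shell commands above:

```python
# Illustrative wrapper around the CI steps above: execute each notebook
# headlessly and overwrite it with the executed result.
import subprocess

for workdir, notebook in [
    ("Training", "lang_id_training.ipynb"),
    ("Inference", "lang_id_inference.ipynb"),
]:
    subprocess.run(
        ["jupyter", "nbconvert", "--execute", "--to", "notebook",
         "--inplace", notebook],
        cwd=workdir,
        check=True,  # surface a failing cell as a CI failure
    )
```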
