
Commit 7057cb4

Merge pull request #2550 from alexsin368/lang_id_modernization
Lang id modernization
2 parents 6f041c9 + 3ff787d commit 7057cb4

20 files changed: +385 −288 lines
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+import os
+import shutil
+import argparse
+from datasets import load_dataset
+from tqdm import tqdm
+
+language_to_code = {
+    "japanese": "ja",
+    "swedish": "sv-SE"
+}
+
+def download_dataset(output_dir):
+    for lang, lang_code in language_to_code.items():
+        print(f"Processing dataset for language: {lang_code}")
+
+        # Load the dataset for the specific language
+        dataset = load_dataset("mozilla-foundation/common_voice_11_0", lang_code, split="train", trust_remote_code=True)
+
+        # Create a language-specific output folder
+        output_folder = os.path.join(output_dir, lang, lang_code, "clips")
+        os.makedirs(output_folder, exist_ok=True)
+
+        # Extract and copy MP3 files
+        for sample in tqdm(dataset, desc=f"Extracting and copying MP3 files for {lang}"):
+            audio_path = sample['audio']['path']
+            shutil.copy(audio_path, output_folder)
+
+        print("Extraction and copy complete.")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Extract and copy audio files from a dataset to a specified directory.")
+    parser.add_argument("--output_dir", type=str, default="/data/commonVoice", help="Base output directory for saving the files. Default is /data/commonVoice")
+    args = parser.parse_args()
+
+    download_dataset(args.output_dir)
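
A typical invocation of the new download script (the file's path is not shown in this view, so the script name below is a placeholder; the default output directory comes from the argparse definition above):

    # Placeholder script name; substitute the actual file added by this commit.
    python download_dataset.py --output_dir /data/commonVoice

Each entry in language_to_code then gets its own <output_dir>/<language>/<lang_code>/clips folder of copied clips.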
Lines changed: 2 additions & 4 deletions
@@ -1,7 +1,5 @@
 #!/bin/bash
 
-rm -R RIRS_NOISES
-rm -R tmp
-rm -R speechbrain
-rm -f rirs_noises.zip noise.csv reverb.csv vad_file.txt
+echo "Deleting .wav files, tmp"
 rm -f ./*.wav
+rm -R tmp

AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/inference_commonVoice.py

Lines changed: 3 additions & 7 deletions
@@ -29,7 +29,7 @@ def __init__(self, dirpath, filename):
         self.sampleRate = 0
         self.waveData = ''
         self.wavesize = 0
-        self.waveduriation = 0
+        self.waveduration = 0
         if filename.endswith(".wav") or filename.endswith(".wmv"):
             self.wavefile = filename
             self.wavepath = dirpath + os.sep + filename
@@ -173,12 +173,12 @@ def main(argv):
         data = datafile(testDataDirectory, filename)
         predict_list = []
         use_entire_audio_file = False
-        if data.waveduration < sample_dur:
+        if int(data.waveduration) <= sample_dur:
             # Use entire audio file if the duration is less than the sampling duration
             use_entire_audio_file = True
             sample_list = [0 for _ in range(sample_size)]
         else:
-            start_time_list = list(range(sample_size - int(data.waveduration) + 1))
+            start_time_list = list(range(0, int(data.waveduration) - sample_dur))
             sample_list = []
             for i in range(sample_size):
                 sample_list.append(random.sample(start_time_list, 1)[0])
@@ -198,10 +198,6 @@ def main(argv):
             predict_list.append(' ')
             pass
 
-        # Clean up
-        if use_entire_audio_file:
-            os.remove("./" + data.filename)
-
         # Pick the top rated prediction result
         occurence_count = Counter(predict_list)
         total_count = sum(occurence_count.values())
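
The two changes above fix the start-time sampling: the old code built the range of valid start offsets from sample_size (the number of windows) rather than from the clip duration, which could yield an empty or wrong range. A minimal sketch of the corrected logic, with stand-in values (waveduration, sample_dur, and sample_size mirror the variables in the diff):

    import random

    waveduration = 12.7   # clip length in seconds (stand-in value)
    sample_dur = 3        # seconds per sampled window
    sample_size = 5       # number of windows to draw

    if int(waveduration) <= sample_dur:
        # Clip is no longer than one window: score the whole file on every draw.
        sample_list = [0 for _ in range(sample_size)]
    else:
        # Valid start offsets leave room for one full window before the clip ends.
        start_time_list = list(range(0, int(waveduration) - sample_dur))
        sample_list = [random.sample(start_time_list, 1)[0] for _ in range(sample_size)]

    print(sample_list)  # e.g. [4, 0, 7, 2, 9]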

AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/inference_custom.py

Lines changed: 45 additions & 22 deletions
@@ -30,7 +30,7 @@ def __init__(self, dirpath, filename):
         self.sampleRate = 0
         self.waveData = ''
         self.wavesize = 0
-        self.waveduriation = 0
+        self.waveduration = 0
         if filename.endswith(".wav") or filename.endswith(".wmv"):
             self.wavefile = filename
             self.wavepath = dirpath + os.sep + filename
@@ -61,41 +61,45 @@ def __init__(self, ipex_op=False, bf16=False, int8_model=False):
             self.model_int8 = load(source_model_int8_path, self.language_id)
             self.model_int8.eval()
         elif ipex_op:
+            self.language_id.eval()
+
             # Optimize for inference with IPEX
             print("Optimizing inference with IPEX")
-            self.language_id.eval()
-            sampleInput = (torch.load("./sample_input_features.pt"), torch.load("./sample_input_wav_lens.pt"))
             if bf16:
                 print("BF16 enabled")
                 self.language_id.mods["compute_features"] = ipex.optimize(self.language_id.mods["compute_features"], dtype=torch.bfloat16)
                 self.language_id.mods["mean_var_norm"] = ipex.optimize(self.language_id.mods["mean_var_norm"], dtype=torch.bfloat16)
-                self.language_id.mods["embedding_model"] = ipex.optimize(self.language_id.mods["embedding_model"], dtype=torch.bfloat16)
                 self.language_id.mods["classifier"] = ipex.optimize(self.language_id.mods["classifier"], dtype=torch.bfloat16)
             else:
                 self.language_id.mods["compute_features"] = ipex.optimize(self.language_id.mods["compute_features"])
                 self.language_id.mods["mean_var_norm"] = ipex.optimize(self.language_id.mods["mean_var_norm"])
-                self.language_id.mods["embedding_model"] = ipex.optimize(self.language_id.mods["embedding_model"])
                 self.language_id.mods["classifier"] = ipex.optimize(self.language_id.mods["classifier"])
 
             # Torchscript to resolve performance issues with reorder operations
+            print("Applying Torchscript")
+            sampleWavs = torch.load("./sample_wavs.pt")
+            sampleWavLens = torch.ones(sampleWavs.shape[0])
             with torch.no_grad():
-                I2 = self.language_id.mods["embedding_model"](*sampleInput)
+                I1 = self.language_id.mods["compute_features"](sampleWavs)
+                I2 = self.language_id.mods["mean_var_norm"](I1, sampleWavLens)
+                I3 = self.language_id.mods["embedding_model"](I2, sampleWavLens)
+
                 if bf16:
                     with torch.cpu.amp.autocast():
-                        self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=(torch.rand(1,32000)))
-                        self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=sampleInput)
-                        self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=sampleInput)
-                        self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I2)
+                        self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=sampleWavs)
+                        self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=(I1, sampleWavLens))
+                        self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=(I2, sampleWavLens))
+                        self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I3)
 
                         self.language_id.mods["compute_features"] = torch.jit.freeze(self.language_id.mods["compute_features"])
                         self.language_id.mods["mean_var_norm"] = torch.jit.freeze(self.language_id.mods["mean_var_norm"])
                         self.language_id.mods["embedding_model"] = torch.jit.freeze(self.language_id.mods["embedding_model"])
                         self.language_id.mods["classifier"] = torch.jit.freeze( self.language_id.mods["classifier"])
                 else:
-                    self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=(torch.rand(1,32000)))
-                    self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=sampleInput)
-                    self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=sampleInput)
-                    self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I2)
+                    self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=sampleWavs)
+                    self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=(I1, sampleWavLens))
+                    self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=(I2, sampleWavLens))
+                    self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I3)
 
                     self.language_id.mods["compute_features"] = torch.jit.freeze(self.language_id.mods["compute_features"])
                     self.language_id.mods["mean_var_norm"] = torch.jit.freeze(self.language_id.mods["mean_var_norm"])
@@ -114,11 +118,11 @@ def predict(self, data_path="", ipex_op=False, bf16=False, int8_model=False, ver
             with torch.no_grad():
                 if bf16:
                     with torch.cpu.amp.autocast():
-                        prediction = self.language_id.classify_batch(signal)
+                        prediction = self.language_id.classify_batch(signal)
                 else:
-                    prediction = self.language_id.classify_batch(signal)
+                    prediction = self.language_id.classify_batch(signal)
         else: # default
-            prediction = self.language_id.classify_batch(signal)
+            prediction = self.language_id.classify_batch(signal)
 
         inference_end_time = time()
         inference_latency = inference_end_time - inference_start_time
@@ -195,13 +199,13 @@ def main(argv):
         with open(OUTPUT_SUMMARY_CSV_FILE, 'w') as f:
             writer = csv.writer(f)
             writer.writerow(["Audio File",
-                             "Input Frequency",
+                             "Input Frequency (Hz)",
                              "Expected Language",
                              "Top Consensus",
                              "Top Consensus %",
                              "Second Consensus",
                              "Second Consensus %",
-                             "Average Latency",
+                             "Average Latency (s)",
                              "Result"])
 
         total_samples = 0
@@ -273,12 +277,12 @@ def main(argv):
             predict_list = []
             use_entire_audio_file = False
             latency_sum = 0.0
-            if data.waveduration < sample_dur:
+            if int(data.waveduration) <= sample_dur:
                 # Use entire audio file if the duration is less than the sampling duration
                 use_entire_audio_file = True
                 sample_list = [0 for _ in range(sample_size)]
             else:
-                start_time_list = list(range(sample_size - int(data.waveduration) + 1))
+                start_time_list = list(range(int(data.waveduration) - sample_dur))
                 sample_list = []
                 for i in range(sample_size):
                     sample_list.append(random.sample(start_time_list, 1)[0])
@@ -346,17 +350,36 @@ def main(argv):
                     avg_latency,
                     result
                 ])
+            else:
+                # Write results to a .csv file
+                with open(OUTPUT_SUMMARY_CSV_FILE, 'a') as f:
+                    writer = csv.writer(f)
+                    writer.writerow([
+                        filename,
+                        sample_rate_for_csv,
+                        "N/A",
+                        top_occurance,
+                        str(topPercentage) + "%",
+                        sec_occurance,
+                        str(secPercentage) + "%",
+                        avg_latency,
+                        "N/A"
+                    ])
+
 
         if ground_truth_compare:
             # Summary of results
             print("\n\n Correctly predicted %d/%d\n" %(correct_predictions, total_samples))
-            print("\n See %s for summary\n" %(OUTPUT_SUMMARY_CSV_FILE))
+
+        print("\n See %s for summary\n" %(OUTPUT_SUMMARY_CSV_FILE))
 
     elif os.path.isfile(path):
         print("\nIt is a normal file", path)
     else:
         print("It is a special file (socket, FIFO, device file)" , path)
 
+    print("Done.\n")
+
 if __name__ == "__main__":
     import sys
     sys.exit(main(sys.argv))
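
The heart of the __init__ changes above is how the example inputs for torch.jit.trace are produced: instead of loading a pre-computed feature tensor pair, the new code pushes a saved waveform batch through each pipeline stage, so every traced module receives inputs with its real runtime shapes. A condensed sketch of that pattern, assuming a SpeechBrain-style model object language_id whose stages live in language_id.mods (a sketch under those assumptions, not a drop-in for the file above):

    import torch

    sampleWavs = torch.load("./sample_wavs.pt")       # saved example waveform batch
    sampleWavLens = torch.ones(sampleWavs.shape[0])   # relative lengths, 1.0 = full clip

    with torch.no_grad():
        # Run each stage once so the next stage's example input has the right shape.
        I1 = language_id.mods["compute_features"](sampleWavs)
        I2 = language_id.mods["mean_var_norm"](I1, sampleWavLens)
        I3 = language_id.mods["embedding_model"](I2, sampleWavLens)

        # Trace, then freeze, each stage with its shape-correct example inputs.
        for name, example in [("compute_features", sampleWavs),
                              ("mean_var_norm", (I1, sampleWavLens)),
                              ("embedding_model", (I2, sampleWavLens)),
                              ("classifier", I3)]:
            traced = torch.jit.trace(language_id.mods[name], example_inputs=example)
            language_id.mods[name] = torch.jit.freeze(traced)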

AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/initialize.sh

Lines changed: 0 additions & 23 deletions
This file was deleted.

AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/interfaces.patch

Lines changed: 0 additions & 11 deletions
This file was deleted.

AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/lang_id_inference.ipynb

Lines changed: 40 additions & 6 deletions
@@ -47,15 +47,15 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "!python inference_commonVoice.py -p /data/commonVoice/test"
+    "!python inference_commonVoice.py -p ${COMMON_VOICE_PATH}/processed_data/test"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
     "## inference_custom.py for Custom Data \n",
-    "To generate an overall results output summary, the audio_ground_truth_labels.csv file needs to be modified with the name of the audio file and expected audio label (i.e. en for English). By default, this is disabled but if desired, the *--ground_truth_compare* can be used. To run inference on custom data, you must specify a folder with WAV files and pass the path in as an argument. "
+    "To run inference on custom data, you must specify a folder with .wav files and pass the path in as an argument. You can do so by creating a folder named `data_custom` and then copy 1 or 2 .wav files from your test dataset into it. .mp3 files will NOT work. "
     ]
    },
    {
@@ -65,7 +65,7 @@
     "### Randomly select audio clips from audio files for prediction\n",
     "python inference_custom.py -p DATAPATH -d DURATION -s SIZE\n",
     "\n",
-    "An output file output_summary.csv will give the summary of the results."
+    "An output file `output_summary.csv` will give the summary of the results."
     ]
    },
    {
@@ -104,6 +104,8 @@
     "### Optimizations with Intel® Extension for PyTorch (IPEX) \n",
     "python inference_custom.py -p data_custom -d 3 -s 50 --vad --ipex --verbose \n",
     "\n",
+    "This will apply ipex.optimize to the model(s) and TorchScript. You can also add the --bf16 option along with --ipex to run in the BF16 data type, supported on 4th Gen Intel® Xeon® Scalable processors and newer.\n",
+    "\n",
     "Note that the *--verbose* option is required to view the latency measurements. "
     ]
    },
@@ -121,7 +123,7 @@
     "metadata": {},
     "source": [
     "## Quantization with Intel® Neural Compressor (INC)\n",
-    "To improve inference latency, Intel® Neural Compressor (INC) can be used to quantize the trained model from FP32 to INT8 by running quantize_model.py. The *-datapath* argument can be used to specify a custom evaluation dataset but by default it is set to */data/commonVoice/dev* which was generated from the data preprocessing scripts in the *Training* folder. "
+    "To improve inference latency, Intel® Neural Compressor (INC) can be used to quantize the trained model from FP32 to INT8 by running quantize_model.py. The *-datapath* argument can be used to specify a custom evaluation dataset but by default it is set to `$COMMON_VOICE_PATH/processed_data/dev` which was generated from the data preprocessing scripts in the `Training` folder. "
     ]
    },
    {
@@ -130,14 +132,46 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "!python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/dev"
+    "!python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/processed_data/dev"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+    "After quantization, the model will be stored in lang_id_commonvoice_model_INT8 and neural_compressor.utils.pytorch.load will have to be used to load the quantized model for inference. If self.language_id is the original model and data_path is the path to the audio file:\n",
+    "\n",
+    "```\n",
+    "from neural_compressor.utils.pytorch import load\n",
+    "model_int8 = load(\"./lang_id_commonvoice_model_INT8\", self.language_id)\n",
+    "signal = self.language_id.load_audio(data_path)\n",
+    "prediction = self.model_int8(signal)\n",
+    "```"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
-    "After quantization, the model will be stored in *lang_id_commonvoice_model_INT8* and *neural_compressor.utils.pytorch.load* will have to be used to load the quantized model for inference. "
+    "The code above is integrated into inference_custom.py. You can now run inference on your data using this INT8 model:"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+    "!python inference_custom.py -p data_custom -d 3 -s 50 --vad --int8_model --verbose"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "metadata": {},
+    "source": [
+    "### (Optional) Comparing Predictions with Ground Truth\n",
+    "\n",
+    "You can choose to modify audio_ground_truth_labels.csv to include the name of the audio file and expected audio label (like, en for English), then run inference_custom.py with the --ground_truth_compare option. By default, this is disabled."
     ]
    },
    {
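
Two invocations that the notebook text above describes but does not include as cells; both are assumptions built from the documented flags, with data_custom as the sample folder:

    # IPEX with BF16 (4th Gen Intel Xeon Scalable or newer), per the markdown above
    python inference_custom.py -p data_custom -d 3 -s 50 --vad --ipex --bf16 --verbose

    # Compare predictions against audio_ground_truth_labels.csv
    python inference_custom.py -p data_custom -d 3 -s 50 --vad --ground_truth_compare --verbose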

AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/quantize_model.py

Lines changed: 1 addition & 3 deletions
@@ -18,8 +18,6 @@
 from neural_compressor.utils.pytorch import load
 from speechbrain.pretrained import EncoderClassifier
 
-DEFAULT_EVAL_DATA_PATH = "/data/commonVoice/dev"
-
 def prepare_dataset(path):
     data_list = []
     for dir_name in os.listdir(path):
@@ -33,7 +31,7 @@ def main(argv):
     import argparse
     parser = argparse.ArgumentParser()
     parser.add_argument('-p', type=str, required=True, help="Path to the model to be optimized")
-    parser.add_argument('-datapath', type=str, default=DEFAULT_EVAL_DATA_PATH, help="Path to evaluation dataset")
+    parser.add_argument('-datapath', type=str, required=True, help="Path to evaluation dataset")
     args = parser.parse_args()
 
     model_path = args.p
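
With the default removed, -datapath must now be passed explicitly. A typical call, using the evaluation-set path from the updated notebook cell above:

    python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/processed_data/dev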
