Skip to content

Commit ee6f0b5

Browse files
committed
fix multiprocessing in torch dataProvider
1 parent 23ecf0c commit ee6f0b5

File tree

6 files changed

+19
-7
lines changed

6 files changed

+19
-7
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## [1.1.4] - 2022-09-29
2+
### Changed
3+
- Improved `mltu.torch.dataProvider.DataProvider` to handle `multiprocessing`: it now falls back to `multithreading` when `multiprocessing` fails to start
4+
15
## [1.1.3] - 2022-09-29
26
### Changed
37
- Removed `Librosa` library dependency in requirements, now it is optional and required only with modules that use librosa
Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1-
torch==1.13.1+cu117
1+
torch>=1.13.1+cu117
22
transformers==4.33.1
3-
onnx
3+
mltu==1.1.4
4+
onnx
5+
onnxruntime

Tutorials/10_wav2vec2_torch/test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,5 +39,6 @@ def predict(self, audio: np.ndarray):
3939

4040
accum_cer.append(cer)
4141
accum_wer.append(wer)
42+
print(label)
4243

4344
pbar.set_description(f"Average CER: {np.average(accum_cer):.4f}, Average WER: {np.average(accum_wer):.4f}")

Tutorials/10_wav2vec2_torch/train.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,15 +65,15 @@ def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024):
6565
],
6666
transformers=[
6767
LabelIndexer(vocab),
68-
LabelPadding(max_word_length=configs.max_label_length, padding_value=len(vocab)),
6968
],
7069
use_cache=False,
7170
batch_postprocessors=[
72-
AudioPadding(max_audio_length=configs.max_audio_length, padding_value=0, use_on_batch=True)
71+
AudioPadding(max_audio_length=configs.max_audio_length, padding_value=0, use_on_batch=True),
72+
LabelPadding(padding_value=len(vocab), use_on_batch=True),
7373
],
7474
use_multiprocessing=True,
7575
max_queue_size=10,
76-
workers=64,
76+
workers=configs.train_workers,
7777
)
7878
train_dataProvider, test_dataProvider = data_provider.split(split=0.9)
7979

mltu/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "1.1.3"
1+
__version__ = "1.1.4"
22

33
from .annotations.images import Image
44
from .annotations.images import CVImage

mltu/torch/dataProvider.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,12 @@ def start_executor(self) -> None:
181181

182182
if not hasattr(self, "_executor"):
183183
if self.use_multiprocessing:
184-
self._executor = ProcessExecutor(self.process_data, self.workers)
184+
try:
185+
self._executor = ProcessExecutor(self.process_data, self.workers)
186+
except:
187+
self.use_multiprocessing = False
188+
self.logger.error("Failed to start multiprocessing, switching to multithreading")
189+
self._executor = ThreadExecutor(self.process_data, self.workers)
185190
else:
186191
self._executor = ThreadExecutor(self.process_data, self.workers)
187192

0 commit comments

Comments
 (0)