fix: tflite, initial states, results

nglehuy · nglehuy · commit db66008b3d2b · 2025-05-25T15:45:23.000+07:00
diff --git a/.pylintrc b/.pylintrc
@@ -125,6 +125,7 @@ disable=too-few-public-methods,
         abstract-method,
         too-many-ancestors,
         import-outside-toplevel,
+        too-many-positional-arguments,
         
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
diff --git a/README.md b/README.md
@@ -147,7 +147,7 @@ See [tflite_convertion](./docs/tutorials/tflite.md)
 
 ## Pretrained Models
 
-Go to [drive](https://drive.google.com/drive/folders/1BD0AK30n8hc-yR28C5FW3LqzZxtLOQfl?usp=sharing)
+See the results in each example folder, e.g. [./examples/models/transducer/conformer/results/sentencepiece/README.md](./examples/models/transducer/conformer/results/sentencepiece/README.md)
 
 ## Corpus Sources
 
@@ -165,6 +165,7 @@ Go to [drive](https://drive.google.com/drive/folders/1BD0AK30n8hc-yR28C5FW3LqzZx
 | Vivos                                  | [https://ailab.hcmus.edu.vn/vivos](https://www.kaggle.com/datasets/kynthesis/vivos-vietnamese-speech-corpus-for-asr) | 15h       |
 | InfoRe Technology 1                    | [InfoRe1 (passwd: BroughtToYouByInfoRe)](https://files.huylenguyen.com/datasets/infore/25hours.zip)                  | 25h       |
 | InfoRe Technology 2 (used in VLSP2019) | [InfoRe2 (passwd: BroughtToYouByInfoRe)](https://files.huylenguyen.com/datasets/infore/audiobooks.zip)               | 415h      |
+| VietBud500                             | [https://huggingface.co/datasets/linhtran92/viet_bud500](https://huggingface.co/datasets/linhtran92/viet_bud500)     | 500h      |
 
 ## How to contribute
 
diff --git a/examples/models/ctc/conformer/results/sentencepiece/README.md b/examples/models/ctc/conformer/results/sentencepiece/README.md
@@ -1,18 +1,20 @@
 - [\[English\] LibriSpeech](#english-librispeech)
   - [I. Small + SentencePiece 256](#i-small--sentencepiece-256)
+  - [II. Small + Streaming + SentencePiece 256](#ii-small--streaming--sentencepiece-256)
 
 # [English] LibriSpeech
 
 ## I. Small + SentencePiece 256
 
-| Category          | Description                                                |
-| :---------------- | :--------------------------------------------------------- |
-| Config            | [small.yml.j2](../../small.yml.j2)                         |
-| Tensorflow        | **2.18.0**                                                 |
-| Device            | Google Cloud TPUs v4-8                                     |
-| Mixed Precision   | strict                                                     |
-| Global Batch Size | 8 * 4 * 8 = 256 (as 4 TPUs, 8 Gradient Accumulation Steps) |
-| Max Epochs        | 450                                                        |
+| Category          | Description                                                                              |
+| :---------------- | :--------------------------------------------------------------------------------------- |
+| Config            | [small.yml.j2](../../small.yml.j2)                                                       |
+| Tensorflow        | **2.18.0**                                                                               |
+| Device            | Google Cloud TPUs v4-8                                                                   |
+| Mixed Precision   | strict                                                                                   |
+| Global Batch Size | 8 * 4 * 8 = 256 (as 4 TPUs, 8 Gradient Accumulation Steps)                               |
+| Max Epochs        | 450                                                                                      |
+| Pretrained        | [Link](https://www.kaggle.com/models/lordh9072/tfasr-conformer-ctc/tensorFlow2/v3-small) |
 
 **Config:**
 
@@ -30,17 +32,18 @@
 | 170   | test-clean | greedy   | 0.0967171 | 0.031954  | 0.0958403 | 0.168307 | 0.831693 |
 | 170   | test-other | greedy   | 0.201612  | 0.0812955 | 0.197415  | 0.330207 | 0.669793 |
 
-<!--
+
 ## II. Small + Streaming + SentencePiece 256
 
-| Category          | Description                                                |
-| :---------------- | :--------------------------------------------------------- |
-| Config            | [small-streaming.yml.j2](../../small-streaming.yml.j2)     |
-| Tensorflow        | **2.18.0**                                                 |
-| Device            | Google Cloud TPUs v4-8                                     |
-| Mixed Precision   | strict                                                     |
-| Global Batch Size | 8 * 4 * 8 = 256 (as 4 TPUs, 8 Gradient Accumulation Steps) |
-| Max Epochs        | 450                                                        |
+| Category          | Description                                                                                        |
+| :---------------- | :------------------------------------------------------------------------------------------------- |
+| Config            | [small-streaming.yml.j2](../../small-streaming.yml.j2)                                             |
+| Tensorflow        | **2.18.0**                                                                                         |
+| Device            | Google Cloud TPUs v4-8                                                                             |
+| Mixed Precision   | strict                                                                                             |
+| Global Batch Size | 8 * 4 * 8 = 256 (as 4 TPUs, 8 Gradient Accumulation Steps)                                         |
+| Max Epochs        | 450                                                                                                |
+| Pretrained        | [Link](https://www.kaggle.com/models/lordh9072/tfasr-conformer-ctc/tensorFlow2/v3-small-streaming) |
 
 **Config:**
 
@@ -51,8 +54,28 @@
 {{config}}
 ```
 
+**Tensorboard:**
+
+<table>
+  <tr>
+    <td align="center">
+      <img src="./figs/librispeech-small-streaming-epoch-loss.jpg" width="200px"><br>
+      <sub><strong>Epoch Loss</strong></sub>
+    </td>
+    <td align="center">
+      <img src="./figs/librispeech-small-streaming-batch-loss.jpg" width="200px"><br>
+      <sub><strong>Batch Loss</strong></sub>
+    </td>
+    <td align="center">
+      <img src="./figs/librispeech-small-streaming-lr.jpg" width="200px"><br>
+      <sub><strong>Learning Rate</strong></sub>
+    </td>
+  </tr>
+</table>
+
 **Results:**
 
-| Epoch | Dataset | decoding | wer  | cer  | mer  | wil  | wip  |
-| :---- | :------ | :------- | :--- | :--- | :--- | :--- | :--- |
--->
+| Epoch | Dataset    | decoding | wer       | cer       | mer       | wil     | wip     |
+| :---- | :--------- | :------- | :-------- | :-------- | :-------- | :------ | :------ |
+| 60    | test-clean | greedy   | 0.0848106 | 0.0286257 | 0.0841686 | 0.14896 | 0.85104 |
+| 60    | test-other | greedy   | 0.217221  | 0.0913044 | 0.213409  | 0.3555  | 0.6445  |
diff --git a/examples/models/ctc/conformer/results/sentencepiece/figs/librispeech-small-streaming-batch-loss.jpg b/examples/models/ctc/conformer/results/sentencepiece/figs/librispeech-small-streaming-batch-loss.jpg
diff --git a/examples/models/ctc/conformer/results/sentencepiece/figs/librispeech-small-streaming-epoch-loss.jpg b/examples/models/ctc/conformer/results/sentencepiece/figs/librispeech-small-streaming-epoch-loss.jpg
diff --git a/examples/models/ctc/conformer/results/sentencepiece/figs/librispeech-small-streaming-lr.jpg b/examples/models/ctc/conformer/results/sentencepiece/figs/librispeech-small-streaming-lr.jpg
diff --git a/examples/models/transducer/conformer/results/sentencepiece/README.md b/examples/models/transducer/conformer/results/sentencepiece/README.md
@@ -10,14 +10,15 @@
 
 ## I. Small + SentencePiece 1k
 
-| Category          | Description                                                |
-| :---------------- | :--------------------------------------------------------- |
-| Config            | [small.yml.j2](../../small.yml.j2)                         |
-| Tensorflow        | **2.18.0**                                                 |
-| Device            | Google Cloud TPUs v4-8                                     |
-| Mixed Precision   | strict                                                     |
-| Global Batch Size | 4 * 4 * 8 = 128 (as 4 TPUs, 8 Gradient Accumulation Steps) |
-| Max Epochs        | 300                                                        |
+| Category          | Description                                                                                     |
+| :---------------- | :---------------------------------------------------------------------------------------------- |
+| Config            | [small.yml.j2](../../small.yml.j2)                                                              |
+| Tensorflow        | **2.18.0**                                                                                      |
+| Device            | Google Cloud TPUs v4-8                                                                          |
+| Mixed Precision   | strict                                                                                          |
+| Global Batch Size | 4 * 4 * 8 = 128 (as 4 TPUs, 8 Gradient Accumulation Steps)                                      |
+| Max Epochs        | 300                                                                                             |
+| Pretrained        | [Link](https://www.kaggle.com/models/lordh9072/tfasr-conformer-transducer/tensorFlow2/v3-small) |
 
 **Config:**
 
@@ -37,14 +38,15 @@
 
 ## II. Small + Streaming + SentencePiece 1k
 
-| Category          | Description                                                |
-| :---------------- | :--------------------------------------------------------- |
-| Config            | [small-streaming.yml.j2](../../small-streaming.yml.j2)     |
-| Tensorflow        | **2.18.0**                                                 |
-| Device            | Google Cloud TPUs v4-8                                     |
-| Mixed Precision   | strict                                                     |
-| Global Batch Size | 4 * 4 * 8 = 128 (as 4 TPUs, 8 Gradient Accumulation Steps) |
-| Max Epochs        | 300                                                        |
+| Category          | Description                                                                                               |
+| :---------------- | :-------------------------------------------------------------------------------------------------------- |
+| Config            | [small-streaming.yml.j2](../../small-streaming.yml.j2)                                                    |
+| Tensorflow        | **2.18.0**                                                                                                |
+| Device            | Google Cloud TPUs v4-8                                                                                    |
+| Mixed Precision   | strict                                                                                                    |
+| Global Batch Size | 4 * 4 * 8 = 128 (as 4 TPUs, 8 Gradient Accumulation Steps)                                                |
+| Max Epochs        | 300                                                                                                       |
+| Pretrained        | [Link](https://www.kaggle.com/models/lordh9072/tfasr-conformer-transducer/tensorFlow2/v3-small-streaming) |
 
 **Config:**
 
@@ -57,25 +59,26 @@
 
 **Results:**
 
-| Epoch | Dataset    | decoding | wer      | cer       | mer      | wil      | wip      |
-| :---- | :--------- | :------- | :------- | :-------- | :------- | :------- | :------- |
-| 45    | test-clean | greedy   | 0.110564 | 0.0460022 | 0.109064 | 0.186109 | 0.813891 |
-| 45    | test-other | greedy   | 0.267772 | 0.139369  | 0.260952 | 0.417361 | 0.582639 |
+| Epoch | Dataset    | decoding | wer       | cer       | mer       | wil      | wip      |
+| :---- | :--------- | :------- | :-------- | :-------- | :-------- | :------- | :------- |
+| 45    | test-clean | greedy   | 0.0797322 | 0.0312862 | 0.0790049 | 0.137228 | 0.862772 |
+| 45    | test-other | greedy   | 0.211872  | 0.104173  | 0.207305  | 0.341269 | 0.658731 |
 
 <!-- ----------------------------------------------------- VN ------------------------------------------------------ -->
 
 # [Vietnamese] VietBud500
 
 ## I. Small + Streaming + SentencePiece 1k
 
-| Category          | Description                                                |
-| :---------------- | :--------------------------------------------------------- |
-| Config            | [small-streaming.yml.j2](../../small-streaming.yml.j2)     |
-| Tensorflow        | **2.18.0**                                                 |
-| Device            | Google Cloud TPUs v4-8                                     |
-| Mixed Precision   | strict                                                     |
-| Global Batch Size | 8 * 4 * 8 = 256 (as 4 TPUs, 8 Gradient Accumulation Steps) |
-| Max Epochs        | 300                                                        |
+| Category          | Description                                                                                                       |
+| :---------------- | :---------------------------------------------------------------------------------------------------------------- |
+| Config            | [small-streaming.yml.j2](../../small-streaming.yml.j2)                                                            |
+| Tensorflow        | **2.18.0**                                                                                                        |
+| Device            | Google Cloud TPUs v4-8                                                                                            |
+| Mixed Precision   | strict                                                                                                            |
+| Global Batch Size | 8 * 4 * 8 = 256 (as 4 TPUs, 8 Gradient Accumulation Steps)                                                        |
+| Max Epochs        | 300                                                                                                               |
+| Pretrained        | [Link](https://www.kaggle.com/models/lordh9072/tfasr-vietbud500-conformer-transducer/tensorFlow2/small-streaming) |
 
 **Config:**
 
@@ -109,6 +112,4 @@
 
 | Epoch | decoding | wer      | cer      | mer     | wil      | wip      |
 | :---- | :------- | :------- | :------- | :------ | :------- | :------- |
-| 52    | greedy   | 0.053723 | 0.034548 | 0.05362 | 0.086421 | 0.913579 |
-
-**Pretrained Model**: [Link](https://www.kaggle.com/models/lordh9072/tfasr-vietbud500-conformer-transducer/tensorFlow2/small-streaming)
+| 52    | greedy   | 0.053723 | 0.034548 | 0.05362 | 0.086421 | 0.913579 |
diff --git a/tensorflow_asr/models/base_model.py b/tensorflow_asr/models/base_model.py
@@ -317,10 +317,10 @@ def get_initial_tokens(self, batch_size=1):
         return tf.ones([batch_size, 1], dtype=tf.int32) * self.tokenizer.blank
 
     def get_initial_encoder_states(self, batch_size=1):
-        return None
+        return []
 
     def get_initial_decoder_states(self, batch_size=1):
-        return None
+        return []
 
     def recognize(self, inputs: schemas.PredictInput, **kwargs) -> schemas.PredictOutput:
         """Greedy decoding function that used in self.predict_step"""
@@ -351,8 +351,8 @@ def tflite_func(inputs: schemas.PredictInput):
             inputs=tf.TensorSpec([batch_size, None], dtype=tf.float32),
             inputs_length=tf.TensorSpec([batch_size], dtype=tf.int32),
             previous_tokens=tf.TensorSpec.from_tensor(self.get_initial_tokens(batch_size)),
-            previous_encoder_states=tf.TensorSpec.from_tensor(self.get_initial_encoder_states(batch_size)),
-            previous_decoder_states=tf.TensorSpec.from_tensor(self.get_initial_decoder_states(batch_size)),
+            previous_encoder_states=tf.nest.map_structure(tf.TensorSpec.from_tensor, self.get_initial_encoder_states(batch_size)),
+            previous_decoder_states=tf.nest.map_structure(tf.TensorSpec.from_tensor, self.get_initial_decoder_states(batch_size)),
         )
 
         return tf.function(
diff --git a/tensorflow_asr/models/ctc/base_ctc.py b/tensorflow_asr/models/ctc/base_ctc.py
@@ -92,10 +92,10 @@ def call_next(
         return outputs, outputs_length, next_encoder_states, next_decoder_states
 
     def get_initial_encoder_states(self, batch_size=1):
-        return None
+        return []
 
     def get_initial_decoder_states(self, batch_size=1):
-        return None
+        return []
 
     # -------------------------------- GREEDY -------------------------------------
 
diff --git a/tensorflow_asr/models/encoders/conformer.py b/tensorflow_asr/models/encoders/conformer.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" http://arxiv.org/abs/2005.08100 """
+"""http://arxiv.org/abs/2005.08100"""
 
 from tensorflow_asr import keras, tf
 from tensorflow_asr.models.activations.glu import GLU
@@ -21,6 +21,7 @@
 from tensorflow_asr.models.layers.multihead_attention import MultiHeadAttention, MultiHeadRelativeAttention
 from tensorflow_asr.models.layers.positional_encoding import RelativeSinusoidalPositionalEncoding, SinusoidalPositionalEncoding
 from tensorflow_asr.models.layers.residual import Residual
+from tensorflow_asr.utils import data_util
 
 L2 = keras.regularizers.l2(1e-6)
 
@@ -664,7 +665,9 @@ def __init__(
             self.content_attention_bias, self.positional_attention_bias = None, None
 
     def get_initial_state(self, batch_size: int):
-        return [block.get_initial_state(batch_size) for block in self.conformer_blocks]
+        states = [block.get_initial_state(batch_size) for block in self.conformer_blocks]
+        states = [s for s in states if s is not None]
+        return states
 
     def call(
         self,
@@ -684,7 +687,7 @@ def call(
                 (outputs, relative_position_encoding),
                 content_attention_bias=self.content_attention_bias,
                 positional_attention_bias=self.positional_attention_bias,
-                initial_state=None if initial_state is None else initial_state[i],
+                initial_state=data_util.get(initial_state, i, None),
                 training=training,
                 use_causal_mask=self._use_attention_causal_mask,
                 use_auto_mask=self._use_attention_auto_mask,
diff --git a/tensorflow_asr/models/encoders/transformer.py b/tensorflow_asr/models/encoders/transformer.py
@@ -20,6 +20,7 @@
 from tensorflow_asr.models.layers.positional_encoding import RelativeSinusoidalPositionalEncoding, SinusoidalPositionalEncoding
 from tensorflow_asr.models.layers.residual import Residual
 from tensorflow_asr.models.layers.subsampling import Conv1dSubsampling, Conv2dSubsampling, VggSubsampling
+from tensorflow_asr.utils import data_util
 
 
 @keras.utils.register_keras_serializable(package=__name__)
@@ -330,7 +331,7 @@ def call(
                 [outputs, relative_position_encoding],
                 content_attention_bias=self.content_attention_bias,
                 positional_attention_bias=self.positional_attention_bias,
-                initial_state=None if initial_state is None else initial_state[i],
+                initial_state=data_util.get(initial_state, i, None),
                 training=training,
                 use_causal_mask=self._use_attention_causal_mask,
                 use_auto_mask=self._use_attention_auto_mask,
diff --git a/tensorflow_asr/models/transducer/base_transducer.py b/tensorflow_asr/models/transducer/base_transducer.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" https://arxiv.org/pdf/1811.06621.pdf """
+"""https://arxiv.org/pdf/1811.06621.pdf"""
 
 import collections
 import typing
@@ -464,7 +464,7 @@ def call_next(
             return ytu, new_states
 
     def get_initial_encoder_states(self, batch_size=1):
-        return None
+        return []
 
     def get_initial_decoder_states(self, batch_size=1):
         return self.predict_net.get_initial_state(batch_size)
diff --git a/tensorflow_asr/scripts/save.py b/tensorflow_asr/scripts/save.py
@@ -12,12 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 import os
 
 from tensorflow_asr import keras, tf, tokenizers
 from tensorflow_asr.configs import Config
 from tensorflow_asr.models.base_model import BaseModel
-from tensorflow_asr.utils import cli_util, env_util, file_util
+from tensorflow_asr.utils import cli_util, env_util, keras_util
+
+logger = logging.getLogger(__name__)
 
 
 def main(
@@ -36,16 +39,18 @@ def main(
     tokenizer = tokenizers.get(config)
     tokenizer.make()
 
-    model: BaseModel = keras.Model.from_config(config.model_config)
+    logger.info(f"Configs: {str(config)}")
+
+    model: BaseModel = keras_util.model_from_config(config.model_config)
     model.tokenizer = tokenizer
     model.make(batch_size=bs)
     if h5 and tf.io.gfile.exists(h5):
-        model.load_weights(h5, by_name=file_util.is_hdf5_filepath(h5))
+        model.load_weights(h5, skip_mismatch=False)
     model.summary()
 
     model.save(output, save_format=save_format)
     loaded_model: BaseModel = keras.models.load_model(output)
-    print(loaded_model.to_json())
+    logger.info(loaded_model.to_json())
     loaded_model.summary()
 
 
diff --git a/tensorflow_asr/scripts/tflite.py b/tensorflow_asr/scripts/tflite.py
diff --git a/tensorflow_asr/utils/data_util.py b/tensorflow_asr/utils/data_util.py