Skip to content

Commit b190992

Browse files
authored
Fix Melotts csim bug by changing torch.sum op (#2894)
CSim can now run AI Hub binaries successfully after changing `y_lengths = torch.sum(w_ceil, [1, 2])` to `y_lengths = torch.sum(torch.sum(w_ceil, dim=2), dim=1)` in the encoder; this works around a QNN bug. After conversion to a context binary, QNN mishandles the following lines and computes `y_lengths` incorrectly (where `w_ceil.shape = [1, 1, 512]`): `y_lengths = torch.sum(w_ceil, [1, 2])`, `y_lengths = torch.tensor([w_ceil.detach().numpy().sum()], dtype=torch.float32)`, and `y_lengths = torch.tensor([w_ceil.squeeze().cumsum(dim=0)[-1]], dtype=torch.float32)`.
1 parent 0f28d4f commit b190992

File tree

7 files changed

+106
-54
lines changed

7 files changed

+106
-54
lines changed

qai_hub_models/models/_shared/hf_whisper/model.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
from __future__ import annotations
77

8+
import base64
9+
import os
810
from abc import abstractmethod
911
from typing import Any, cast
1012

@@ -19,10 +21,12 @@
1921
from transformers.models.whisper.modeling_whisper import WhisperDecoder, WhisperEncoder
2022
from typing_extensions import Self
2123

24+
from qai_hub_models.configs.metadata_yaml import ModelMetadata
2225
from qai_hub_models.models._shared.hf_whisper.model_adaptation import (
2326
monkey_patch_model,
2427
)
2528
from qai_hub_models.models.common import Precision, TargetRuntime
29+
from qai_hub_models.utils.asset_loaders import CachedWebModelAsset
2630
from qai_hub_models.utils.base_model import (
2731
BaseModel,
2832
CollectionModel,
@@ -52,6 +56,12 @@
5256

5357
# Mask neg
5458
MASK_NEG = -100.0
59+
TIKTOKEN_URL = CachedWebModelAsset(
60+
"https://raw.githubusercontent.com/openai/whisper/839639a223b92ad61851baae9ad8a695ccb41ce5/whisper/assets/multilingual.tiktoken",
61+
"hf_whisper_shared",
62+
1,
63+
"multilingual.tiktoken",
64+
)
5565

5666

5767
class HfWhisperEncoder(BaseModel):
@@ -363,6 +373,26 @@ def from_pretrained(cls) -> Self:
363373
decoder = HfWhisperDecoder(config, whisper.get_decoder())
364374
return cls(encoder, decoder, config, cls.get_hf_whisper_version())
365375

376+
def write_supplementary_files(
377+
self, output_dir: str | os.PathLike, metadata: ModelMetadata
378+
) -> None:
379+
whisper_tiktoken = TIKTOKEN_URL.fetch()
380+
381+
with open(whisper_tiktoken, "rb") as f:
382+
lines = f.readlines()
383+
384+
with open(os.path.join(output_dir, "vocab.bin"), "wb") as f:
385+
for line in lines:
386+
l = line.split()
387+
if len(l) < 2:
388+
continue
389+
token = base64.b64decode(line.split()[0])
390+
if b"\0" in token:
391+
f.write(token)
392+
else:
393+
f.write(token)
394+
f.write(b"\0")
395+
366396

367397
def get_feature_extractor(
368398
hf_whisper_version: str = "openai/whisper-small",

qai_hub_models/models/_shared/melotts/app.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -231,11 +231,11 @@ def tts_to_file(
231231
length_scale_pt,
232232
noise_scale_w_pt,
233233
)
234-
235234
# Flow input
236235
y_mask = torch.unsqueeze(
237-
torch.arange(MAX_SEQ_LEN * 3) < y_lengths[:, None], dim=1
236+
torch.arange(MAX_SEQ_LEN * 3) < y_lengths.unsqueeze(dim=-1), dim=1
238237
).to(torch.float32)
238+
239239
attn_mask = x_mask.unsqueeze(dim=2) * y_mask.unsqueeze(dim=-1)
240240
attn = generate_path(w_ceil, attn_mask)
241241
attn_squeezed = attn.squeeze(1).to(torch.float32)
@@ -352,7 +352,7 @@ def get_calibration_data(
352352
)
353353

354354
y_mask = torch.unsqueeze(
355-
torch.arange(MAX_SEQ_LEN * 3) < y_lengths[:, None], dim=1
355+
torch.arange(MAX_SEQ_LEN * 3) < y_lengths.unsqueeze(dim=-1), dim=1
356356
).to(torch.float32)
357357
attn_mask = x_mask.unsqueeze(dim=2) * y_mask.unsqueeze(dim=-1)
358358
attn = generate_path(w_ceil, attn_mask)
@@ -398,7 +398,7 @@ def get_calibration_data(
398398
)
399399

400400
y_mask = torch.unsqueeze(
401-
torch.arange(MAX_SEQ_LEN * 3) < y_lengths[:, None], dim=1
401+
torch.arange(MAX_SEQ_LEN * 3) < y_lengths.unsqueeze(dim=-1), dim=1
402402
).to(torch.float32)
403403
attn_mask = x_mask.unsqueeze(dim=2) * y_mask.unsqueeze(dim=-1)
404404
attn = generate_path(w_ceil, attn_mask)

qai_hub_models/models/_shared/melotts/meloTTS_encoder.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,9 @@ def forward(
106106

107107
x = self.encoder(x * x_mask, x_mask, g=g)
108108

109-
stats = self.proj(x) * x_mask
110-
m, logs = torch.split(stats, self.out_channels, dim=1)
109+
stats = self.proj(x)
110+
m, logs = torch.chunk(stats, 2, dim=1)
111+
# m, logs = torch.split(stats, self.out_channels, dim=1) # this line has the same effect as above line
111112
return x, m, logs, x_mask
112113

113114

qai_hub_models/models/_shared/melotts/meloTTS_metadata_json.py

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class VoiceSpec(BaseQAIHMConfig):
3838
sample_rate: int = 44100
3939
language_code: int = 0
4040
description: str
41+
capabilities: TTSCapabilities
4142

4243

4344
class TTSCapabilities(BaseQAIHMConfig):
@@ -53,15 +54,22 @@ class TTSCapabilities(BaseQAIHMConfig):
5354
supports_resampling: bool = False
5455

5556

57+
class QNNVersion(BaseQAIHMConfig):
58+
"""Version of QNN SDK."""
59+
60+
major: int
61+
minor: int
62+
patch: int = 0
63+
64+
5665
class RuntimeInfo(BaseQAIHMConfig):
5766
"""Runtime configuration information."""
5867

5968
language: str
60-
qnn_version_major: int
61-
qnn_version_minor: int
62-
qnn_version_patch: int
69+
qnn_version: QNNVersion
6370
arch_bit: int = 64
6471
scratch_mem_size_req: int = 3200000
72+
is_model_quantized: bool = False
6573

6674

6775
class ModelAssets(BaseQAIHMConfig):
@@ -92,7 +100,6 @@ class TTSMetadata(BaseQAIHMConfig):
92100
version: str = "1.0.0"
93101
description: str
94102
voices: list[VoiceSpec]
95-
capabilities: TTSCapabilities
96103
model_type: str = "melo"
97104
runtime: RuntimeInfo | None = None
98105
assets: ModelAssets | None = None
@@ -107,11 +114,11 @@ def from_melo_tts_model(
107114
model_name: str,
108115
display_name: str,
109116
description: str,
117+
tool_versions: ToolVersions,
110118
voice_specs: list[VoiceSpec] | None = None,
111119
capabilities: TTSCapabilities | None = None,
112120
runtime: RuntimeInfo | None = None,
113121
assets: ModelAssets | None = None,
114-
tool_versions: ToolVersions | None = None,
115122
) -> TTSMetadata:
116123
"""
117124
Construct a ``TTSMetadata`` object from the information
@@ -127,6 +134,8 @@ def from_melo_tts_model(
127134
Human-readable name.
128135
description
129136
Short description of the model.
137+
tool_versions
138+
Optional tool-version information.
130139
voice_specs
131140
List of :class:`VoiceSpec` describing each voice.
132141
capabilities
@@ -135,8 +144,6 @@ def from_melo_tts_model(
135144
Optional runtime information; if omitted a minimal default is used.
136145
assets
137146
Optional asset paths; if omitted a minimal default is used.
138-
tool_versions
139-
Optional tool-version information.
140147
141148
Returns
142149
-------
@@ -146,23 +153,18 @@ def from_melo_tts_model(
146153
if capabilities is None:
147154
capabilities = TTSCapabilities()
148155
if runtime is None:
149-
# Default runtime - QNN version is taken from ``tool_versions`` if present.
150-
qnn_version = {"major": 2, "minor": 33, "patch": 0}
151-
if tool_versions and tool_versions.qairt is not None:
152-
qnn_version = {
153-
"major": int(tool_versions.qairt.framework.major),
154-
"minor": int(tool_versions.qairt.framework.minor),
155-
"patch": int(
156+
assert tool_versions.qairt is not None
157+
runtime = RuntimeInfo(
158+
language=LANGUAGE_MAP[language],
159+
qnn_version=QNNVersion(
160+
major=int(tool_versions.qairt.framework.major),
161+
minor=int(tool_versions.qairt.framework.minor),
162+
patch=int(
156163
tool_versions.qairt.framework.patch
157164
if tool_versions.qairt.framework.patch
158165
else 0
159166
),
160-
}
161-
runtime = RuntimeInfo(
162-
language=LANGUAGE_MAP[language],
163-
qnn_version_major=qnn_version["major"],
164-
qnn_version_minor=qnn_version["minor"],
165-
qnn_version_patch=qnn_version["patch"],
167+
),
166168
)
167169
if assets is None:
168170
assets = ModelAssets()
@@ -173,6 +175,7 @@ def from_melo_tts_model(
173175
language=LANGUAGE_MAP[language],
174176
language_name=language.capitalize(),
175177
description=f"Default voice for {language.capitalize()}",
178+
capabilities=capabilities,
176179
)
177180
]
178181

@@ -181,7 +184,6 @@ def from_melo_tts_model(
181184
display_name=display_name,
182185
description=description,
183186
voices=voice_specs,
184-
capabilities=capabilities,
185187
runtime=runtime,
186188
assets=assets,
187189
)
@@ -249,6 +251,6 @@ def create_tts_metadata(
249251
model_name=model_name,
250252
display_name=display_name,
251253
description=description,
252-
assets=assets,
253254
tool_versions=metadata.tool_versions,
255+
assets=assets,
254256
)

qai_hub_models/models/_shared/melotts/model.py

Lines changed: 16 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def get_input_spec() -> InputSpec:
126126
}
127127

128128
def _sample_inputs_impl(
129-
self, input_spec: InputSpec | None = None
129+
self, input_spec: InputSpec | None = None, **kwargs: Any
130130
) -> SampleInputsType:
131131
"""
132132
This is a default implementation that returns a single random data array
@@ -212,6 +212,7 @@ def forward(
212212
# This does not use a minimum of 0 because some models only have 1 speaker. That would result in a clamp(0, 0) operator, which is invalid in QNN.
213213
sid = torch.clamp(sid, max=self.model.emb_g.num_embeddings - 1)
214214
g = self.model.emb_g(sid).unsqueeze(-1)
215+
215216
x, m_p, logs_p, x_mask = self.encoder.forward(
216217
x, x_lengths, tone, language, bert, ja_bert, g=g
217218
)
@@ -223,9 +224,12 @@ def forward(
223224
logw = logw.masked_fill(x_mask == 0, -1e9)
224225

225226
w = torch.exp(logw + torch.log(self.scale * length_scale)) * x_mask
226-
w_ceil = torch.ceil(w)
227-
y_lengths = torch.sum(w_ceil, [1, 2])
228-
227+
w_ceil = torch.ceil(w) # shape: [1, 1, 512]
228+
# y_lengths = torch.sum(w_ceil, [1, 2]) # after converting to context binary, QNN can't sum correctly
229+
# y_lengths = torch.tensor([w_ceil.detach().numpy().sum() ], dtype=torch.float32) # QNN can't sum correctly
230+
# y_lengths = torch.tensor([w_ceil.squeeze().cumsum(dim=0)[-1] ], dtype=torch.float32) # QNN can't sum correctly
231+
y_lengths = torch.sum(torch.sum(w_ceil, dim=2), dim=1) # QNN sums correctly
232+
# TODO https://jira-dc.qualcomm.com/jira/projects/AISW/issues/AISW-175294
229233
return y_lengths, x_mask, m_p, logs_p, g, w_ceil
230234

231235
def sdp_forward(
@@ -251,25 +255,21 @@ def sdp_forward(
251255
shape of (1, 1, MAX_SEQ_LEN)
252256
"""
253257
sdp = self.model.sdp
254-
x = torch.detach(x)
255258
assert hasattr(sdp, "pre") and callable(sdp.pre)
256-
x = sdp.pre(x)
257259
assert hasattr(sdp, "cond") and callable(sdp.cond)
260+
assert hasattr(sdp, "convs") and callable(sdp.convs)
261+
assert hasattr(sdp, "proj") and callable(sdp.proj)
262+
assert hasattr(sdp, "flows") and isinstance(sdp.flows, Iterable)
263+
x = torch.detach(x)
264+
x = sdp.pre(x)
258265
if g is not None:
259266
g = torch.detach(g)
260267
x = x + sdp.cond(g)
261-
assert hasattr(sdp, "convs") and callable(sdp.convs)
262268
x = sdp.convs(x, x_mask)
263-
assert hasattr(sdp, "proj") and callable(sdp.proj)
264269
x = sdp.proj(x) * x_mask
265270

266-
assert hasattr(sdp, "flows") and isinstance(sdp.flows, Iterable)
267271
flows = list(sdp.flows)[::-1]
268-
flows = [
269-
*flows[:-2],
270-
flows[-1],
271-
]
272-
272+
flows = [*flows[:-2], flows[-1]]
273273
z = self.sdp_noise[:, :, : x.size(2)] * noise_scale_w
274274

275275
half_channels = None
@@ -304,16 +304,13 @@ def get_hub_compile_options(
304304
device: Device | None = None,
305305
context_graph_name: str | None = None,
306306
) -> str:
307-
if target_runtime.qairt_version_changes_compilation:
308-
other_compile_options += " --quantize_io false "
309307
compile_options = super().get_hub_compile_options(
310308
target_runtime,
311309
precision,
312310
other_compile_options,
313311
device,
314312
context_graph_name="encoder",
315313
)
316-
# # Must use --truncate_64bit_io when input tensors have type int64.
317314
if target_runtime != TargetRuntime.ONNX:
318315
compile_options += " --truncate_64bit_tensors --truncate_64bit_io "
319316
return compile_options
@@ -427,7 +424,7 @@ def get_hub_compile_options(
427424
context_graph_name: str | None = None,
428425
) -> str:
429426
if target_runtime.qairt_version_changes_compilation:
430-
other_compile_options += " --quantize_io false "
427+
other_compile_options += " --quantize_io "
431428
return super().get_hub_compile_options(
432429
target_runtime,
433430
precision,
@@ -494,7 +491,7 @@ def get_hub_compile_options(
494491
context_graph_name: str | None = None,
495492
) -> str:
496493
if target_runtime.qairt_version_changes_compilation:
497-
other_compile_options += " --quantize_io false "
494+
other_compile_options += " --quantize_io "
498495
return super().get_hub_compile_options(
499496
target_runtime,
500497
precision,
@@ -943,8 +940,6 @@ def get_hub_compile_options(
943940
device: Device | None = None,
944941
context_graph_name: str | None = None,
945942
) -> str:
946-
if target_runtime.qairt_version_changes_compilation:
947-
other_compile_options += " --quantize_io false "
948943
compile_options = super().get_hub_compile_options(
949944
target_runtime,
950945
precision,
@@ -956,10 +951,6 @@ def get_hub_compile_options(
956951
compile_options += " --truncate_64bit_tensors --truncate_64bit_io "
957952
return compile_options
958953

959-
@staticmethod
960-
def calibration_dataset_name() -> str:
961-
return "common_voice_text"
962-
963954
@staticmethod
964955
def component_precision() -> Precision:
965956
return Precision.float

0 commit comments

Comments (0)