Skip to content

Commit 8680d1a

Browse files
quic-dhirajku and asmigosw
authored and committed
Added check to not pass Custom_IO yaml when model weight and pkv are both in bfloat16.
Added a patch in cloud_infer to map the bfloat16 dtype (type key 11) to np.float16 for AI200 inference. Signed-off-by: Dhiraj Kumar Sah <dhirajku@qti.qualcomm.com> Signed-off-by: Asmita Goswami <asmigosw@qti.qualcomm.com>
1 parent 2915e49 commit 8680d1a

File tree

2 files changed

+9
-1
lines changed

2 files changed

+9
-1
lines changed

QEfficient/base/modeling_qeff.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -538,12 +538,19 @@ def _compile(
538538
command.append(f"-network-specialization-config={specializations_json}")
539539

540540
# Write custom_io.yaml file
541+
model_in_bfloat16 = self.config.torch_dtype == torch.bfloat16
542+
pkv_in_bfloat16 = any("past_" in key and "bfloat16" in value for key, value in custom_io.items())
541543
if custom_io is not None:
542544
custom_io_yaml = compile_dir / "custom_io.yaml"
543545
with open(custom_io_yaml, "w") as fp:
544546
for io_name, dtype in custom_io.items():
545547
fp.write(f" - IOName: {io_name}\n Precision: {dtype}\n\n")
546-
command.append(f"-custom-IO-list-file={custom_io_yaml}")
548+
if model_in_bfloat16 and pkv_in_bfloat16:
549+
logger.warning(
550+
"Model and Past KV types are both bfloat16. Custom IO list file will be ignored during compile."
551+
)
552+
else:
553+
command.append(f"-custom-IO-list-file={custom_io_yaml}")
547554

548555
command.append(f"-aic-binary-dir={qpc_path}")
549556
logger.info(f"Running compiler: {' '.join(command)}")

QEfficient/generation/cloud_infer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ def __init__(
6565

6666
# Build dtype mapping once (depends on aicapi constants)
6767
self.aic_to_np_dtype_mapping = {
68+
getattr(aicapi, "BFLOAT16_TYPE", 11): np.dtype(np.float16),
6869
aicapi.FLOAT_TYPE: np.dtype(np.float32),
6970
aicapi.FLOAT_16_TYPE: np.dtype(np.float16),
7071
aicapi.INT8_Q_TYPE: np.dtype(np.int8),

0 commit comments

Comments (0)