Skip to content

Commit 1ae3379

Browse files
author
shubhagr-quic
authored
Support for Prefix caching Feature in QNN Compilation Path. (quic#262)
Signed-off-by: Shubham Agrawal <quic_shubhagr@quicinc.com>
1 parent a4f3249 commit 1ae3379

File tree

7 files changed

+91
-9
lines changed

7 files changed

+91
-9
lines changed

QEfficient/base/modeling_qeff.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ def _qnn_compile(
339339
mxfp6_matmul: bool = False,
340340
mxint8_kv_cache: bool = False,
341341
qnn_config: Optional[str] = None,
342+
kv_cache_batch_size: Optional[int] = None,
342343
) -> str:
343344
"""
344345
Interface for QNN compiler
@@ -356,6 +357,7 @@ def _qnn_compile(
356357
:mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
357358
:mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
358359
:qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
360+
:kv_cache_batch_size (int): KV cache batch size used for the prefix caching feature. ``Defaults to None.``
359361
"""
360362
if onnx_path is None and self.onnx_path is None:
361363
self.export()
@@ -415,6 +417,7 @@ def _qnn_compile(
415417
full_batch_size=full_batch_size,
416418
qnn_config=qnn_config,
417419
qnn_binary_dir=qpc_path,
420+
kv_cache_batch_size=kv_cache_batch_size,
418421
)
419422

420423
self.qpc_path = qpc_path

QEfficient/cloud/compile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@
8989
default=False,
9090
help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
9191
If not provided, the default configuration will be used.\
92-
Sample Config: QEfficient/cloud/compile/qnn_config.json",
92+
Sample Config: QEfficient/compile/qnn_config.json",
9393
)
9494
parser.add_argument(
9595
"qnn_config",

QEfficient/cloud/infer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ def main(
223223
default=False,
224224
help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
225225
If not provided, the default configuration will be used.\
226-
Sample Config: QEfficient/cloud/compile/qnn_config.json",
226+
Sample Config: QEfficient/compile/qnn_config.json",
227227
)
228228
parser.add_argument(
229229
"qnn_config",

QEfficient/compile/qnn_compiler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,7 @@ def compile(
338338
full_batch_size=None,
339339
qnn_config: Optional[str] = None,
340340
qnn_binary_dir: Optional[str] = None,
341+
kv_cache_batch_size: Optional[int] = None,
341342
**kwargs,
342343
) -> str:
343344
"""
@@ -362,6 +363,7 @@ def compile(
362363
:mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.``
363364
:qnn_config (str): Path to ``qnn_config.json`` file (formatted as a string). ``Defaults to None.``
364365
:qnn_binary_dir (str): Path for saving qnn binaries.
366+
:kv_cache_batch_size (int): KV cache batch size used for the prefix caching feature. ``Defaults to None.``
365367
366368
Returns:
367369
:str: Path to compiled ``qpc`` package.
@@ -386,6 +388,7 @@ def compile(
386388
file_path=custom_io_file_path,
387389
full_batch_size=full_batch_size,
388390
kv_precision=kv_precision,
391+
kv_cache_batch_size=kv_cache_batch_size,
389392
)
390393

391394
if not os.path.isfile(custom_io_file_path):

QEfficient/transformers/models/modeling_auto.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1503,6 +1503,7 @@ def compile(
15031503
mxfp6_matmul=mxfp6_matmul,
15041504
mxint8_kv_cache=mxint8_kv_cache,
15051505
qnn_config=qnn_config,
1506+
kv_cache_batch_size=kv_cache_batch_size,
15061507
)
15071508
else:
15081509
# Custom IO

QEfficient/utils/generate_qnn_network_specialization_config.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,27 @@ def fetch_nodes_info(
2424
context_length: int,
2525
file_path: str = "custom_io_config.yaml",
2626
full_batch_size: Optional[int] = None,
27-
decode_only: Optional[bool] = False,
2827
kv_precision: Optional[str] = "float16",
28+
kv_cache_batch_size: Optional[int] = None,
2929
) -> None:
30+
"""
31+
Generates the network specialization custom IO config file for the convertor stage in QNN compilation.
32+
Reads the ONNX graph and creates a custom IO configuration file according to the passed parameters, then
33+
saves it as a YAML file at the path provided in the file_path argument.
34+
35+
``Mandatory`` Args:
36+
:onnx_graph_path (str): Generated ``ONNX`` Model Path.
37+
:batch_size (int): Batch size to compile the model for.
38+
:sequence_length (int): Sequence length for the model to compile.
39+
:context_length (int): Maximum context length to compile the model.
40+
41+
``Optional`` Args:
42+
:file_path (str): File path to save the generated custom IO config. ``Defaults to custom_io_config.yaml.``
43+
:full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
44+
:kv_precision (str): Sets kv precision for compilation. ``Defaults to float16.``
45+
:kv_cache_batch_size (int): KV cache batch size used for the prefix caching feature. ``Defaults to None.``
46+
"""
47+
3048
# Load the ONNX model
3149
onnx_model = onnx.load(onnx_graph_path)
3250

@@ -46,7 +64,9 @@ def fetch_nodes_info(
4664
if full_batch_size:
4765
input_info["Shape"] = f"(1, 1), ({full_batch_size}, 1)"
4866
else:
49-
input_info["Shape"] = "(1, 1)"
67+
raise AttributeError(
68+
"ERROR: Full batch size is required for populating batch_index in custom_io_config.yaml"
69+
)
5070
else:
5171
shapes = []
5272
for input_shape in node.type.tensor_type.shape.dim:
@@ -67,11 +87,14 @@ def fetch_nodes_info(
6787
for shape in shapes:
6888
if isinstance(shape, str):
6989
if "full_batch_size" in shape:
70-
if full_batch_size:
90+
if ("past_key" in node.name or "past_value" in node.name) and kv_cache_batch_size:
91+
shapeList.append(kv_cache_batch_size)
92+
elif full_batch_size:
7193
shapeList.append(full_batch_size)
7294
else:
73-
print("ERROR: Full batch size is required to generate custom_io_config.yaml")
74-
exit()
95+
raise AttributeError(
96+
"ERROR: Full batch size is required to generate custom_io_config.yaml"
97+
)
7598
elif "batch_size" in shape:
7699
shapeList.append(batch_size)
77100
elif shape in ["ctx_len", "max_context_len"]:
@@ -107,7 +130,7 @@ def fetch_nodes_info(
107130
.replace("[", "(")
108131
.replace("]", ")")
109132
)
110-
shape = shape_2 if decode_only else shape_1 + "," + shape_2
133+
shape = shape_1 + "," + shape_2
111134
elif ("batch_size" in shapes or "full_batch_size" in shapes) and (
112135
"ctx_len" in shapes or "max_context_len" in shapes
113136
):
@@ -153,6 +176,21 @@ def generate_data_format_config(
153176
model_dlc_name: Optional[str] = "model",
154177
file_path: str = "qnn_data_format_config.json",
155178
) -> None:
179+
"""
180+
Generates the data format config for the context binary generation stage of the QNN compilation path.
181+
It defines the tensor format for KV nodes when precision is set to mxint8.
182+
Reads the ONNX graph, creates a data format configuration file, and saves it as a JSON file at the path
183+
provided in the file_path argument.
184+
185+
``Mandatory`` Args:
186+
:onnx_graph_path (str): Generated ``ONNX`` Model Path.
187+
188+
``Optional`` Args:
189+
:data_format (str): Tensor format for KV nodes. ``Defaults to QNN_TENSOR_DATA_FORMAT_MX.``
190+
:model_dlc_name (str): DLC Name generated by the convertor stage in QNN Compilation. ``Defaults to model.``
191+
:file_path (str): File path to save the generated data format config. ``Defaults to qnn_data_format_config.json.``
192+
"""
193+
156194
# Load the ONNX model
157195
onnx_model = onnx.load(onnx_graph_path)
158196

tests/transformers/models/test_prefix_caching.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,15 @@
55
#
66
# -----------------------------------------------------------------------------
77

8+
import os
9+
810
import numpy as np
911
import pytest
1012
from transformers import AutoTokenizer
1113

1214
from QEfficient.generation.text_generation_inference import TextGeneration
1315
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
16+
from QEfficient.utils._utils import create_json
1417

1518
test_models = ["gpt2"]
1619

@@ -27,14 +30,48 @@ def test_simple_prefix_caching(model_name):
2730
kv_cache_batch_size=4,
2831
num_cores=14,
2932
)
33+
prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path)
34+
35+
36+
@pytest.mark.on_qaic
37+
@pytest.mark.qnn
38+
@pytest.mark.parametrize("model_name", test_models)
39+
def test_simple_prefix_caching_qnn(model_name):
40+
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, continuous_batching=True)
41+
qnn_config = {
42+
"convertor_args_extension": "",
43+
"context_binary_generator_args_extension": "--log_level debug",
44+
"qnn_compilation_backend": {
45+
"compiler_enable_depth_first": True,
46+
"compiler_printDDRStats": False,
47+
"compiler_printPerfMetrics": False,
48+
},
49+
"SKIP_QNN_CONVERTOR_STEP": False,
50+
}
51+
qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
52+
create_json(qnn_config_json_path, qnn_config)
53+
54+
qeff_model.compile(
55+
prefill_seq_len=128,
56+
ctx_len=256,
57+
full_batch_size=2,
58+
kv_cache_batch_size=4,
59+
num_cores=14,
60+
enable_qnn=True,
61+
qnn_config=qnn_config_json_path,
62+
)
63+
prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path)
64+
os.remove(qnn_config_json_path)
65+
3066

67+
def prefix_caching_inference(model_name, qpc_path):
3168
prefixes = ["Once upon a time ", "Once upon a time "]
3269
suffixes1 = ["in a land far away", "there was a small village"]
3370
suffixes2 = ["a little girl", "in a bustling city"]
3471

3572
tokenizer = AutoTokenizer.from_pretrained(model_name)
3673

37-
generator = TextGeneration(tokenizer=tokenizer, qpc_path=qeff_model.qpc_path, full_batch_size=2, ctx_len=256)
74+
generator = TextGeneration(tokenizer=tokenizer, qpc_path=qpc_path, full_batch_size=2, ctx_len=256)
3875

3976
prompts = [pref + suff for pref, suff in zip(prefixes, suffixes1)]
4077

0 commit comments

Comments
 (0)