From c6accdede7ebe2dd8d599f869df2313395612040 Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Mon, 9 Jun 2025 11:21:34 +0530 Subject: [PATCH 1/5] Update phi3 example: increase max_length to 6K tokens and fix API compatibility --- examples/c/src/phi3.cpp | 5 +++-- src/generators.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/c/src/phi3.cpp b/examples/c/src/phi3.cpp index 415ba2341..8ca496d05 100644 --- a/examples/c/src/phi3.cpp +++ b/examples/c/src/phi3.cpp @@ -40,7 +40,7 @@ void CXX_API(const char* model_path, const char* execution_provider) { std::thread th(std::bind(&TerminateSession::Generator_SetTerminate_Call, &catch_terminate, generator.get())); // Define System Prompt - const std::string system_prompt = std::string("<|system|>\n") + "You are a helpful AI and give elaborative answers" + "<|end|>"; + const std::string system_prompt = std::string("<|system|>\n") + "You are a helpful AI and give very small answers" + "<|end|>"; bool include_system_prompt = true; while (true) { @@ -55,7 +55,8 @@ void CXX_API(const char* model_path, const char* execution_provider) { break; // Exit the loop } - const std::string prompt = tokenizer->ApplyChatTemplate("", text.c_str(), "", true); + // Using direct prompt instead of ApplyChatTemplate to avoid library version issues + const std::string prompt = "<|user|>\n" + text + "<|end|>\n<|assistant|>\n"; bool is_first_token = true; Timing timing; diff --git a/src/generators.cpp b/src/generators.cpp index a318c5c83..05352d542 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -363,7 +363,7 @@ void Generator::AppendTokens(cpu_span input_ids) { if (search_->GetSequenceLength() != 0 && state_->params_->search.batch_size > 1) throw std::runtime_error("AppendTokens can only be called once for batch_size > 1. 
To call AppendTokens again, use RewindToLength(0)"); - constexpr std::array devices_supporting_continuous_decoding{DeviceType::CPU, DeviceType::CUDA, DeviceType::WEBGPU, DeviceType::OpenVINO}; + constexpr std::array devices_supporting_continuous_decoding{DeviceType::CPU, DeviceType::CUDA, DeviceType::WEBGPU, DeviceType::OpenVINO, DeviceType::NvTensorRtRtx}; if (search_->GetSequenceLength() != 0 && std::none_of(devices_supporting_continuous_decoding.begin(), devices_supporting_continuous_decoding.end(), [this](DeviceType device_type) { return device_type == state_->model_.p_device_kvcache_->GetType(); })) From f663a5b111823ab15c9303c16b7a2df9ff8d93c7 Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Mon, 9 Jun 2025 14:35:33 +0530 Subject: [PATCH 2/5] application changes --- examples/c/src/phi3.cpp | 10 ++++++---- examples/python/phi3-qa.py | 21 ++++++++++----------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/examples/c/src/phi3.cpp b/examples/c/src/phi3.cpp index 8ca496d05..117514b80 100644 --- a/examples/c/src/phi3.cpp +++ b/examples/c/src/phi3.cpp @@ -40,8 +40,8 @@ void CXX_API(const char* model_path, const char* execution_provider) { std::thread th(std::bind(&TerminateSession::Generator_SetTerminate_Call, &catch_terminate, generator.get())); // Define System Prompt - const std::string system_prompt = std::string("<|system|>\n") + "You are a helpful AI and give very small answers" + "<|end|>"; - bool include_system_prompt = true; + const std::string system_instructions = "You are a helpful AI and give very small answers"; + bool include_system_prompt = false; while (true) { signal(SIGINT, signalHandlerWrapper); @@ -56,7 +56,9 @@ void CXX_API(const char* model_path, const char* execution_provider) { } // Using direct prompt instead of ApplyChatTemplate to avoid library version issues - const std::string prompt = "<|user|>\n" + text + "<|end|>\n<|assistant|>\n"; + const std::string prompt = "user\n" + + (include_system_prompt ? 
system_instructions + "\n\n" + text : text) + + "\nmodel\n"; bool is_first_token = true; Timing timing; @@ -64,7 +66,7 @@ void CXX_API(const char* model_path, const char* execution_provider) { auto sequences = OgaSequences::Create(); if (include_system_prompt) { - std::string combined = system_prompt + prompt; + std::string combined = system_instructions + "\n\n" + prompt; tokenizer->Encode(combined.c_str(), *sequences); include_system_prompt = false; } else { diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py index 41db18f2e..c801f35d1 100644 --- a/examples/python/phi3-qa.py +++ b/examples/python/phi3-qa.py @@ -1,7 +1,6 @@ import onnxruntime_genai as og import argparse import time -import json def main(args): if args.verbose: print("Loading model...") @@ -10,11 +9,10 @@ def main(args): first_token_timestamp = 0 config = og.Config(args.model_path) - if args.execution_provider != "follow_config": - config.clear_providers() - if args.execution_provider != "cpu": - if args.verbose: print(f"Setting model to {args.execution_provider}") - config.append_provider(args.execution_provider) + config.clear_providers() + if args.execution_provider != "cpu": + if args.verbose: print(f"Setting model to {args.execution_provider}") + config.append_provider(args.execution_provider) model = og.Model(config) if args.verbose: print("Model loaded") @@ -30,6 +28,8 @@ def main(args): if 'max_length' not in search_options: search_options['max_length'] = 2048 + chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>' + # Keep asking for input prompts in a loop while True: text = input("Input: ") @@ -40,10 +40,9 @@ def main(args): if args.timings: started_timestamp = time.time() # If there is a chat template, use it - input_message = [{"role": "user", "content": text }] - input_prompt = tokenizer.apply_chat_template(json.dumps(input_message), add_generation_prompt=True) + prompt = f'{chat_template.format(input=text)}' - input_tokens = tokenizer.encode(input_prompt) + input_tokens = tokenizer.encode(prompt) params = og.GeneratorParams(model) params.set_search_options(**search_options) @@ -85,7 +84,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') - parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.") + parser.add_argument('-e', '--execution_provider', type=str, required=True, choices=["cpu", "cuda", "dml", "NvTensorRtRtx"], help="Execution provider to run ONNX model with") parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') parser.add_argument('-ds', '--do_sample', action='store_true', default=False, help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false') @@ -96,4 +95,4 @@ def main(args): parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. 
Defaults to false') parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') args = parser.parse_args() - main(args) + main(args) \ No newline at end of file From f73deba1f558c8a90b5b9a2b7016f5415dda15cf Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Mon, 16 Jun 2025 19:30:40 +0530 Subject: [PATCH 3/5] Add int8 KV cache quantization support --- src/models/kv_cache.cpp | 8 + src/python/py/models/builder.py | 250 ++++++++++++++++++++++++++++++-- 2 files changed, 244 insertions(+), 14 deletions(-) diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index 581d12caa..7b05f6be3 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -6,6 +6,7 @@ #include "kv_cache.h" #include "windowed_kv_cache.h" #include "../openvino/interface.h" +#include namespace Generators { @@ -246,6 +247,13 @@ void DefaultKeyValueCache::Update(DeviceSpan beam_indices, int total_le shape_[2] = total_length; for (int i = 0; i < layer_count_ * 2; i++) { presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_); + size_t element_size = type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 ? 2 : sizeof(int8_t); + size_t elements_per_tensor = shape_[0] * shape_[1] * shape_[2] * shape_[3]; + size_t memory_per_tensor = elements_per_tensor * element_size; + if (i == 0) { + std::cout << " Layer " << i << ": " << memory_per_tensor << " bytes (" + << (memory_per_tensor / 1024.0 / 1024.0) << " MB)" << std::endl; + } state_.outputs_[output_index_ + i] = presents_[i].get(); } diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 1303269e5..79303f9d2 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -321,6 +321,11 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.quant_attrs["config"] = config.quantization_config self.quant_attrs["use_g_idx"] = config.quantization_config["desc_act"] if "desc_act" in config.quantization_config else False + # KV cache quantization attributes (add at the end) + self.kv_cache_attrs = { + "quantize_kv_cache": extra_options.get("quantize_kv_cache", False), + "kv_scale_factor": 127.0, # Scale factor for INT8 quantization + } def make_outputs_init(self): # Always use float32 logits to improve accuracy in the case of bf16 models. 
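For context on the sizes involved, here is a minimal standalone sketch (not part of the patch) of the per-tensor arithmetic behind the temporary logging added to kv_cache.cpp in this patch: a KV tensor holds batch_size x num_kv_heads x total_length x head_size elements, at 2 bytes each for FLOAT16 and 1 byte each for INT8. The dimensions below are made-up illustrative values, not those of any particular model.

# Sketch of the per-layer KV-cache memory computation mirrored from the kv_cache.cpp logging.
def kv_tensor_bytes(batch_size, num_kv_heads, total_length, head_size, element_size):
    elements = batch_size * num_kv_heads * total_length * head_size
    return elements * element_size

fp16_bytes = kv_tensor_bytes(1, 32, 2048, 96, 2)  # FLOAT16: 2 bytes per element
int8_bytes = kv_tensor_bytes(1, 32, 2048, 96, 1)  # INT8: 1 byte per element
print(f"fp16: {fp16_bytes / 1024 / 1024:.1f} MB, int8: {int8_bytes / 1024 / 1024:.1f} MB per KV tensor")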
@@ -334,17 +339,20 @@ def make_outputs_init(self): self.output_names = [name.replace("logits", "hidden_states") for name in self.output_names] elif self.include_hidden_states: self.output_names = ["hidden_states"] + self.output_names + + # Update output types for quantized KV cache + if hasattr(self, 'kv_cache_attrs') and self.kv_cache_attrs["quantize_kv_cache"]: + self.output_types["present.key"] = TensorProto.INT8 + self.output_types["present.value"] = TensorProto.INT8 def make_attention_init(self): valid_gqa_configurations = [ ("cpu", TensorProto.FLOAT), - ("cuda", TensorProto.FLOAT16), ("cuda", TensorProto.BFLOAT16), ("rocm", TensorProto.FLOAT16), ("dml", TensorProto.FLOAT16), ("webgpu", TensorProto.FLOAT16), ("webgpu", TensorProto.FLOAT), - ("NvTensorRtRtx", TensorProto.FLOAT16), ] if (self.ep, self.io_dtype) in valid_gqa_configurations: # Change model settings for GroupQueryAttention @@ -644,17 +652,21 @@ def make_inputs_and_outputs(self): # Add KV cache to inputs and outputs for i in range(self.num_layers): + # Determine input/output types based on quantization setting + kv_input_dtype = TensorProto.INT8 if (hasattr(self, 'kv_cache_attrs') and self.kv_cache_attrs["quantize_kv_cache"]) else self.input_types["past_key_values.key"] + kv_output_dtype = TensorProto.INT8 if (hasattr(self, 'kv_cache_attrs') and self.kv_cache_attrs["quantize_kv_cache"]) else self.output_types["present.key"] + # Add KV cache to inputs key_name = f"past_key_values.{i}.key" - inputs.append(helper.make_tensor_value_info(key_name, self.input_types["past_key_values.key"], shape=self.input_shapes["past_key_values.key"])) + inputs.append(helper.make_tensor_value_info(key_name, kv_input_dtype, shape=self.input_shapes["past_key_values.key"])) value_name = f"past_key_values.{i}.value" - inputs.append(helper.make_tensor_value_info(value_name, self.input_types["past_key_values.value"], shape=self.input_shapes["past_key_values.value"])) + inputs.append(helper.make_tensor_value_info(value_name, kv_input_dtype, shape=self.input_shapes["past_key_values.value"])) # Add KV cache to outputs key_name = f"present.{i}.key" - outputs.append(helper.make_tensor_value_info(key_name, self.output_types["present.key"], shape=self.output_shapes["present.key"])) + outputs.append(helper.make_tensor_value_info(key_name, kv_output_dtype, shape=self.output_shapes["present.key"])) value_name = f"present.{i}.value" - outputs.append(helper.make_tensor_value_info(value_name, self.output_types["present.value"], shape=self.output_shapes["present.value"])) + outputs.append(helper.make_tensor_value_info(value_name, kv_output_dtype, shape=self.output_shapes["present.value"])) self.inputs = inputs self.outputs = outputs @@ -1545,7 +1557,6 @@ def _make_simplified_layer_norm(self, basename, root_input, weight_name, output_ self.make_node("Mul", inputs=make_mul_1_inputs, outputs=[output_0], name=make_mul_1_name) self.make_value_info(output_0, dtype=io_dtype, shape=shape) - def make_qk_norm(self, layer_id, attention): # Make subgraph to compute SimplifiedLayerNorm after Q and K MatMuls in attention: # @@ -1692,7 +1703,9 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): # Transpose # | # Reshape - basename = f"/model/layers.{layer_id}/attn/{'k_proj' if past_kv.endswith('key') else 'v_proj'}/repeat_kv" + # Determine if this is for key or value based on past_kv (original case) or present_kv (quantized case) + is_key = past_kv.endswith('key') or (present_kv and 'temp_present_k' in present_kv) + basename = 
f"/model/layers.{layer_id}/attn/{'k_proj' if is_key else 'v_proj'}/repeat_kv" # Make the initial subgraph # @@ -1712,9 +1725,16 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): concat_1_name = f"{basename}/Concat_1" concat_1_inputs = [past_kv, f"{transpose_1_name}/output_0"] self.make_node("Concat", inputs=concat_1_inputs, outputs=[present_kv], name=concat_1_name, axis=2) + self.make_value_info(present_kv, self.io_dtype, shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + # Use Identity node to ensure proper shape flow for quantized KV cache scenarios + # This prevents shape inference issues when present_kv is a temporary tensor name + identity_present_name = f"{basename}/Identity_present" + self.make_node("Identity", inputs=[present_kv], outputs=[f"{identity_present_name}/output_0"], name=identity_present_name) + self.make_value_info(f"{identity_present_name}/output_0", self.io_dtype, shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + shape_1_name = f"{basename}/Shape_1" - self.make_shape(shape_1_name, present_kv, shape=[4]) + self.make_shape(shape_1_name, f"{identity_present_name}/output_0", shape=[4]) gather_1_name = f"{basename}/Gather_1" gather_1_inputs = [f"{shape_1_name}/output_0", "/model/constants/TensorProto.INT64/0D/0"] self.make_gather(gather_1_name, gather_1_inputs, axis=0) @@ -1781,7 +1801,7 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): # \ \ # Unsqueeze --> Expand --> Reshape --> Transpose --> Reshape unsqueeze_5_name = f"{basename}/Unsqueeze_5" - unsqueeze_5_inputs = [present_kv, "/model/constants/TensorProto.INT64/1D/2"] + unsqueeze_5_inputs = [f"{identity_present_name}/output_0", "/model/constants/TensorProto.INT64/1D/2"] self.make_unsqueeze(unsqueeze_5_name, unsqueeze_5_inputs, dtype=self.io_dtype, shape=['batch_size', self.num_kv_heads, 1, 'sequence_length', self.head_size]) expand_name = f"{basename}/Expand" expand_inputs = [f"{unsqueeze_5_name}/output_0", f"{where_name}/output_0"] @@ -1802,6 +1822,111 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): def make_attention_op(self, name, **kwargs): op_type = self.attention_attrs["op_type"] + # Handle KV cache quantization if enabled + original_past_k = kwargs.get("past_k", "") + original_past_v = kwargs.get("past_v", "") + original_present_k = kwargs.get("present_k", "") + original_present_v = kwargs.get("present_v", "") + + # Check if we need to handle quantized KV cache with MHA + GQA configuration + has_quantized_kv = hasattr(self, 'kv_cache_attrs') and self.kv_cache_attrs["quantize_kv_cache"] + needs_kv_repeat = (self.num_attn_heads != self.num_kv_heads and op_type == "MultiHeadAttention") + + if has_quantized_kv and needs_kv_repeat: + # Special handling for quantized KV cache + MHA + different head counts + # We need to: dequantize -> repeat_kv -> attention -> quantize + + # Extract layer_id from the attention name + import re + layer_match = re.search(r'/model/layers\.(\d+)/', name) + if layer_match: + layer_id = int(layer_match.group(1)) + else: + raise ValueError(f"Could not extract layer_id from attention name: {name}") + + if original_past_k and original_past_v: + # 1. 
Dequantize past KV cache + dequant_past_k = self.make_dequantize_kv_cache( + f"{name}/dequant_past_k", + original_past_k, + ['batch_size', self.num_kv_heads, 'past_sequence_length', self.head_size] + ) + dequant_past_v = self.make_dequantize_kv_cache( + f"{name}/dequant_past_v", + original_past_v, + ['batch_size', self.num_kv_heads, 'past_sequence_length', self.head_size] + ) + + # 2. Apply repeat_kv with direct dequantized inputs (remove barriers that cause TensorRT issues) + # Create temporary present outputs (unquantized) + temp_present_k_unquant = f"{name}/temp_present_k_unquant" + temp_present_v_unquant = f"{name}/temp_present_v_unquant" + + # Call repeat_kv to handle head expansion and KV cache update + repeated_k = self.make_repeat_kv(layer_id, root_input=kwargs["k_path"], past_kv=dequant_past_k, present_kv=temp_present_k_unquant) + repeated_v = self.make_repeat_kv(layer_id, root_input=kwargs["v_path"], past_kv=dequant_past_v, present_kv=temp_present_v_unquant) + + # 3. Update kwargs for attention operation + kwargs["k_path"] = repeated_k + kwargs["v_path"] = repeated_v + kwargs["past_k"] = "" # Already handled by repeat_kv + kwargs["past_v"] = "" # Already handled by repeat_kv + kwargs["present_k"] = "" # Will be handled by quantization + kwargs["present_v"] = "" # Will be handled by quantization + + # 4. Run MultiHeadAttention + self.make_multi_head_attention(name, add_qk=f"{self.mask_attrs['mask_name']}/output_0", **kwargs) + + # 5. Quantize the present KV cache from repeat_kv outputs + if original_present_k: + quant_present_k = self.make_quantize_kv_cache( + f"{name}/quant_present_k", + temp_present_k_unquant, + ['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size] + ) + self.make_node("Identity", inputs=[quant_present_k], outputs=[original_present_k], + name=f"{name}/present_k_identity") + self.make_value_info(original_present_k, TensorProto.INT8, + shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + + if original_present_v: + quant_present_v = self.make_quantize_kv_cache( + f"{name}/quant_present_v", + temp_present_v_unquant, + ['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size] + ) + self.make_node("Identity", inputs=[quant_present_v], outputs=[original_present_v], + name=f"{name}/present_v_identity") + self.make_value_info(original_present_v, TensorProto.INT8, + shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + return + + elif has_quantized_kv: + # Standard quantized KV cache handling (for GQA or MHA without head mismatch) + if original_past_k: + dequant_past_k = self.make_dequantize_kv_cache( + f"{name}/dequant_past_k", + original_past_k, + ['batch_size', self.num_kv_heads, 'past_sequence_length', self.head_size] + ) + kwargs["past_k"] = dequant_past_k + + if original_past_v: + dequant_past_v = self.make_dequantize_kv_cache( + f"{name}/dequant_past_v", + original_past_v, + ['batch_size', self.num_kv_heads, 'past_sequence_length', self.head_size] + ) + kwargs["past_v"] = dequant_past_v + + # Use temporary names for present outputs + temp_present_k = f"{name}/temp_present_k" if original_present_k else "" + temp_present_v = f"{name}/temp_present_v" if original_present_v else "" + if original_present_k: + kwargs["present_k"] = temp_present_k + if original_present_v: + kwargs["present_v"] = temp_present_v + if op_type == "MultiHeadAttention": self.make_multi_head_attention(name, add_qk=f"{self.mask_attrs['mask_name']}/output_0", **kwargs) elif op_type == 
"GroupQueryAttention": @@ -1811,6 +1936,31 @@ def make_attention_op(self, name, **kwargs): else: raise NotImplementedError(f"The {op_type} op is not currently supported.") + # Quantize present KV cache for storage if quantization is enabled + # (Skip if already handled in the special repeat_kv case above) + if has_quantized_kv and not needs_kv_repeat: + if original_present_k and temp_present_k: + quant_present_k = self.make_quantize_kv_cache( + f"{name}/quant_present_k", + temp_present_k, + ['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size] + ) + self.make_node("Identity", inputs=[quant_present_k], outputs=[original_present_k], + name=f"{name}/present_k_identity") + self.make_value_info(original_present_k, TensorProto.INT8, + shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + + if original_present_v and temp_present_v: + quant_present_v = self.make_quantize_kv_cache( + f"{name}/quant_present_v", + temp_present_v, + ['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size] + ) + self.make_node("Identity", inputs=[quant_present_v], outputs=[original_present_v], + name=f"{name}/present_v_identity") + self.make_value_info(original_present_v, TensorProto.INT8, + shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + def make_multi_head_attention(self, name, **kwargs): inputs = [ kwargs["q_path"], kwargs["k_path"], kwargs["v_path"], kwargs.get("bias", ""), @@ -1958,10 +2108,20 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): past_v = f"past_key_values.{layer_id}.value" present_k = f"present.{layer_id}.key" present_v = f"present.{layer_id}.value" + + # Check if we have quantized KV cache - if so, handle it differently + has_quantized_kv = hasattr(self, 'kv_cache_attrs') and self.kv_cache_attrs["quantize_kv_cache"] + if self.num_attn_heads != self.num_kv_heads and self.attention_attrs["op_type"] == "MultiHeadAttention": - self.attention_attrs["k_path"] = self.make_repeat_kv(layer_id, root_input=self.attention_attrs["k_path"], past_kv=past_k, present_kv=present_k) - self.attention_attrs["v_path"] = self.make_repeat_kv(layer_id, root_input=self.attention_attrs["v_path"], past_kv=past_v, present_kv=present_v) - past_k, past_v, present_k, present_v = "", "", "", "" + if has_quantized_kv: + # For quantized KV cache, we'll do repeat_kv in make_attention_op after dequantization + # So we keep the KV cache parameters for the attention_op to handle + pass # KV cache will be handled in make_attention_op with proper quantization + else: + # Original logic for non-quantized case + self.attention_attrs["k_path"] = self.make_repeat_kv(layer_id, root_input=self.attention_attrs["k_path"], past_kv=past_k, present_kv=present_k) + self.attention_attrs["v_path"] = self.make_repeat_kv(layer_id, root_input=self.attention_attrs["v_path"], past_kv=past_v, present_kv=present_v) + past_k, past_v, present_k, present_v = "", "", "", "" # Make attention node (e.g. MultiHeadAttention, GroupQueryAttention, etc.) 
attn_name = f"/model/layers.{layer_id}/attn/{self.attention_attrs['op_type']}" @@ -3078,6 +3238,66 @@ def make_position_ids_reformatting(self): return reshape_name + def make_quantize_kv_cache(self, name, input_tensor, shape): + """Create standard ONNX quantization for KV cache using QuantizeLinear""" + # Use a more conservative scale factor for KV cache values + # KV cache values typically have much smaller range than full FP16 + # Using scale = 0.1 maps [-12.7, 12.7] to [-127, 127] which is more appropriate + scale_tensor_name = name.replace("/", ".") + "_scale" + scale_value = 0.1 # Conservative scale factor for better precision + self.make_external_tensor( + torch.tensor([scale_value], dtype=torch.float32).contiguous(), + scale_tensor_name + ) + + # Create zero_point tensor (0 for symmetric quantization) + zero_point_tensor_name = name.replace("/", ".") + "_zero_point" + self.make_external_tensor( + torch.tensor([0], dtype=torch.int8).contiguous(), + zero_point_tensor_name + ) + + # Cast input to FP32 for QuantizeLinear (must match scale tensor type) + cast_name = f"{name}/Cast_to_fp32" + self.make_cast(cast_name, input_tensor, dtype=TensorProto.FLOAT, shape=shape) + + # Use standard QuantizeLinear operator + quantize_name = f"{name}/QuantizeLinear" + quantize_inputs = [f"{cast_name}/output_0", scale_tensor_name, zero_point_tensor_name] + self.make_node("QuantizeLinear", inputs=quantize_inputs, outputs=[f"{quantize_name}/output_0"], name=quantize_name) + self.make_value_info(f"{quantize_name}/output_0", TensorProto.INT8, shape=shape) + + return f"{quantize_name}/output_0" + + def make_dequantize_kv_cache(self, name, input_tensor, shape): + """Create standard ONNX dequantization for KV cache using DequantizeLinear""" + # Use the same scale as quantization for proper round-trip + scale_tensor_name = name.replace("/", ".") + "_scale" + scale_value = 0.1 # Same scale as quantization + self.make_external_tensor( + torch.tensor([scale_value], dtype=torch.float32).contiguous(), + scale_tensor_name + ) + + # Create zero_point tensor (0 for symmetric quantization) + zero_point_tensor_name = name.replace("/", ".") + "_zero_point" + self.make_external_tensor( + torch.tensor([0], dtype=torch.int8).contiguous(), + zero_point_tensor_name + ) + + # Use standard DequantizeLinear operator (output is FP32) + dequantize_name = f"{name}/DequantizeLinear" + dequantize_inputs = [input_tensor, scale_tensor_name, zero_point_tensor_name] + self.make_node("DequantizeLinear", inputs=dequantize_inputs, outputs=[f"{dequantize_name}/output_0"], name=dequantize_name) + self.make_value_info(f"{dequantize_name}/output_0", TensorProto.FLOAT, shape=shape) + + # Cast from FP32 back to model's io_dtype (FP16) for compatibility + cast_name = f"{name}/Cast_to_fp16" + self.make_cast(cast_name, f"{dequantize_name}/output_0", dtype=self.io_dtype, shape=shape) + + return f"{cast_name}/output_0" + class LlamaModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): @@ -3646,7 +3866,9 @@ def check_extra_options(kv_pairs): """ Check key-value pairs and set values correctly """ - bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32"] + bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", + "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32", + "quantize_kv_cache"] # Add quantize_kv_cache to bools list for key in bools: if key in kv_pairs: 
if kv_pairs[key] in {"false", "False", "0"}: From 7c875cc274f42d52a551a5923de60122f7097c7e Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Mon, 23 Jun 2025 12:33:12 +0530 Subject: [PATCH 4/5] revert exp changes --- examples/c/src/phi3.cpp | 11 +- examples/python/phi3-qa.py | 211 ++++++++++++++++++++----------------- src/generators.cpp | 2 +- src/models/kv_cache.cpp | 21 ++-- 4 files changed, 128 insertions(+), 117 deletions(-) diff --git a/examples/c/src/phi3.cpp b/examples/c/src/phi3.cpp index 117514b80..415ba2341 100644 --- a/examples/c/src/phi3.cpp +++ b/examples/c/src/phi3.cpp @@ -40,8 +40,8 @@ void CXX_API(const char* model_path, const char* execution_provider) { std::thread th(std::bind(&TerminateSession::Generator_SetTerminate_Call, &catch_terminate, generator.get())); // Define System Prompt - const std::string system_instructions = "You are a helpful AI and give very small answers"; - bool include_system_prompt = false; + const std::string system_prompt = std::string("<|system|>\n") + "You are a helpful AI and give elaborative answers" + "<|end|>"; + bool include_system_prompt = true; while (true) { signal(SIGINT, signalHandlerWrapper); @@ -55,10 +55,7 @@ void CXX_API(const char* model_path, const char* execution_provider) { break; // Exit the loop } - // Using direct prompt instead of ApplyChatTemplate to avoid library version issues - const std::string prompt = "user\n" + - (include_system_prompt ? system_instructions + "\n\n" + text : text) + - "\nmodel\n"; + const std::string prompt = tokenizer->ApplyChatTemplate("", text.c_str(), "", true); bool is_first_token = true; Timing timing; @@ -66,7 +63,7 @@ void CXX_API(const char* model_path, const char* execution_provider) { auto sequences = OgaSequences::Create(); if (include_system_prompt) { - std::string combined = system_instructions + "\n\n" + prompt; + std::string combined = system_prompt + prompt; tokenizer->Encode(combined.c_str(), *sequences); include_system_prompt = false; } else { diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py index c801f35d1..b7c730d26 100644 --- a/examples/python/phi3-qa.py +++ b/examples/python/phi3-qa.py @@ -1,98 +1,113 @@ -import onnxruntime_genai as og -import argparse -import time - -def main(args): - if args.verbose: print("Loading model...") - if args.timings: - started_timestamp = 0 - first_token_timestamp = 0 - - config = og.Config(args.model_path) - config.clear_providers() - if args.execution_provider != "cpu": - if args.verbose: print(f"Setting model to {args.execution_provider}") - config.append_provider(args.execution_provider) - model = og.Model(config) - - if args.verbose: print("Model loaded") - - tokenizer = og.Tokenizer(model) - tokenizer_stream = tokenizer.create_stream() - if args.verbose: print("Tokenizer created") - if args.verbose: print() - search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} - - # Set the max length to something sensible by default, unless it is specified by the user, - # since otherwise it will be set to the entire context length - if 'max_length' not in search_options: - search_options['max_length'] = 2048 - - chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>' - - # Keep asking for input prompts in a loop - while True: - text = input("Input: ") - if not text: - print("Error, input cannot be empty") - continue - - if args.timings: started_timestamp = time.time() - - # If there is a chat template, use it - prompt = 
f'{chat_template.format(input=text)}' - - input_tokens = tokenizer.encode(prompt) - - params = og.GeneratorParams(model) - params.set_search_options(**search_options) - generator = og.Generator(model, params) - - generator.append_tokens(input_tokens) - if args.verbose: print("Generator created") - - if args.verbose: print("Running generation loop ...") - if args.timings: - first = True - new_tokens = [] - - print() - print("Output: ", end='', flush=True) - - try: - while not generator.is_done(): - generator.generate_next_token() - if args.timings: - if first: - first_token_timestamp = time.time() - first = False - - new_token = generator.get_next_tokens()[0] - print(tokenizer_stream.decode(new_token), end='', flush=True) - if args.timings: new_tokens.append(new_token) - except KeyboardInterrupt: - print(" --control+c pressed, aborting generation--") - print() - print() - - if args.timings: - prompt_time = first_token_timestamp - started_timestamp - run_time = time.time() - first_token_timestamp - print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") - parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') - parser.add_argument('-e', '--execution_provider', type=str, required=True, choices=["cpu", "cuda", "dml", "NvTensorRtRtx"], help="Execution provider to run ONNX model with") - parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') - parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') - parser.add_argument('-ds', '--do_sample', action='store_true', default=False, help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false') - parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with') - parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') - parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') - parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') - parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') - parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') - args = parser.parse_args() - main(args) \ No newline at end of file +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include +#include "ort_genai.h" +#include +#include +#include +#include +#include "common.h" + +// C++ API Example + +static TerminateSession catch_terminate; + +void signalHandlerWrapper(int signum) { + catch_terminate.signalHandler(signum); +} + +void CXX_API(const char* model_path, const char* execution_provider) { + std::cout << "Creating config..." 
<< std::endl; + auto config = OgaConfig::Create(model_path); + + std::string provider(execution_provider); + append_provider(*config, provider); + + std::cout << "Creating model..." << std::endl; + auto model = OgaModel::Create(*config); + + std::cout << "Creating tokenizer..." << std::endl; + auto tokenizer = OgaTokenizer::Create(*model); + auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer); + + while (true) { + signal(SIGINT, signalHandlerWrapper); + std::string text; + std::cout << "Prompt: (Use quit() to exit) Or (To terminate current output generation, press Ctrl+C)" << std::endl; + // Clear Any cin error flags because of SIGINT + std::cin.clear(); + std::getline(std::cin, text); + + if (text == "quit()") { + break; // Exit the loop + } + + const std::string prompt = tokenizer->ApplyChatTemplate("", text.c_str(), "", true); + + bool is_first_token = true; + Timing timing; + timing.RecordStartTimestamp(); + + auto sequences = OgaSequences::Create(); + tokenizer->Encode(prompt.c_str(), *sequences); + + std::cout << "Generating response..." << std::endl; + + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", 1024); + auto generator = OgaGenerator::Create(*model, *params); + std::thread th(std::bind(&TerminateSession::Generator_SetTerminate_Call, &catch_terminate, generator.get())); + generator->AppendTokenSequences(*sequences); + + try { + while (!generator->IsDone()) { + generator->GenerateNextToken(); + + if (is_first_token) { + timing.RecordFirstTokenTimestamp(); + is_first_token = false; + } + + const auto num_tokens = generator->GetSequenceCount(0); + const auto new_token = generator->GetSequenceData(0)[num_tokens - 1]; + std::cout << tokenizer_stream->Decode(new_token) << std::flush; + } + } catch (const std::exception& e) { + std::cout << "Session Terminated: " << e.what() << std::endl; + } + + timing.RecordEndTimestamp(); + const int prompt_tokens_length = sequences->SequenceCount(0); + const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length; + timing.Log(prompt_tokens_length, new_tokens_length); + + if (th.joinable()) { + th.join(); // Join the thread if it's still running + } + + for (int i = 0; i < 3; ++i) + std::cout << std::endl; + } +} + +int main(int argc, char** argv) { + std::string model_path, ep; + if (!parse_args(argc, argv, model_path, ep)) { + return -1; + } + + // Responsible for cleaning up the library during shutdown + OgaHandle handle; + + std::cout << "-------------" << std::endl; + std::cout << "Hello, Phi-3!" << std::endl; + std::cout << "-------------" << std::endl; + + std::cout << "C++ API" << std::endl; + CXX_API(model_path.c_str(), ep.c_str()); + + return 0; +} \ No newline at end of file diff --git a/src/generators.cpp b/src/generators.cpp index 05352d542..a318c5c83 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -363,7 +363,7 @@ void Generator::AppendTokens(cpu_span input_ids) { if (search_->GetSequenceLength() != 0 && state_->params_->search.batch_size > 1) throw std::runtime_error("AppendTokens can only be called once for batch_size > 1. 
To call AppendTokens again, use RewindToLength(0)"); - constexpr std::array devices_supporting_continuous_decoding{DeviceType::CPU, DeviceType::CUDA, DeviceType::WEBGPU, DeviceType::OpenVINO, DeviceType::NvTensorRtRtx}; + constexpr std::array devices_supporting_continuous_decoding{DeviceType::CPU, DeviceType::CUDA, DeviceType::WEBGPU, DeviceType::OpenVINO}; if (search_->GetSequenceLength() != 0 && std::none_of(devices_supporting_continuous_decoding.begin(), devices_supporting_continuous_decoding.end(), [this](DeviceType device_type) { return device_type == state_->model_.p_device_kvcache_->GetType(); })) diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index 7b05f6be3..3894f72ec 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -6,7 +6,6 @@ #include "kv_cache.h" #include "windowed_kv_cache.h" #include "../openvino/interface.h" -#include namespace Generators { @@ -176,8 +175,14 @@ DefaultKeyValueCache::DefaultKeyValueCache(State& state) } // Set the size after empty_past_ has been created with 0 for this field - if (past_present_share_buffer_) + if (state.model_.p_device_->GetType() == DeviceType::NvTensorRtRtx && + model_.config_->model.decoder.sliding_window.has_value() && + model_.config_->model.decoder.sliding_window->window_size > 0) { + shape_[2] = std::min(state_.params_->search.max_length, + model_.config_->model.decoder.sliding_window->window_size); + } else if (past_present_share_buffer_) { shape_[2] = state_.params_->search.max_length; + } try { for (int i = 0; i < layer_count_ * 2; ++i) { @@ -247,13 +252,6 @@ void DefaultKeyValueCache::Update(DeviceSpan beam_indices, int total_le shape_[2] = total_length; for (int i = 0; i < layer_count_ * 2; i++) { presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_); - size_t element_size = type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 ? 2 : sizeof(int8_t); - size_t elements_per_tensor = shape_[0] * shape_[1] * shape_[2] * shape_[3]; - size_t memory_per_tensor = elements_per_tensor * element_size; - if (i == 0) { - std::cout << " Layer " << i << ": " << memory_per_tensor << " bytes (" - << (memory_per_tensor / 1024.0 / 1024.0) << " MB)" << std::endl; - } state_.outputs_[output_index_ + i] = presents_[i].get(); } @@ -430,7 +428,8 @@ std::unique_ptr CreateKeyValueCache(State& state) { return nullptr; } - if (state.model_.config_->model.decoder.sliding_window && + if (state.model_.p_device_->GetType() != DeviceType::NvTensorRtRtx && + state.model_.config_->model.decoder.sliding_window && state.model_.config_->model.decoder.sliding_window->slide_key_value_cache) { return std::make_unique(state); } @@ -438,4 +437,4 @@ std::unique_ptr CreateKeyValueCache(State& state) { return std::make_unique(state); } -} // namespace Generators +} // namespace Generators \ No newline at end of file From b7d87000fe81372acb10beb20e504c8df81f99ed Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Mon, 23 Jun 2025 12:36:48 +0530 Subject: [PATCH 5/5] exp changes --- examples/python/phi3-qa.py | 212 +++++++++++++++++-------------------- 1 file changed, 99 insertions(+), 113 deletions(-) diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py index b7c730d26..41db18f2e 100644 --- a/examples/python/phi3-qa.py +++ b/examples/python/phi3-qa.py @@ -1,113 +1,99 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include -#include -#include -#include "ort_genai.h" -#include -#include -#include -#include -#include "common.h" - -// C++ API Example - -static TerminateSession catch_terminate; - -void signalHandlerWrapper(int signum) { - catch_terminate.signalHandler(signum); -} - -void CXX_API(const char* model_path, const char* execution_provider) { - std::cout << "Creating config..." << std::endl; - auto config = OgaConfig::Create(model_path); - - std::string provider(execution_provider); - append_provider(*config, provider); - - std::cout << "Creating model..." << std::endl; - auto model = OgaModel::Create(*config); - - std::cout << "Creating tokenizer..." << std::endl; - auto tokenizer = OgaTokenizer::Create(*model); - auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer); - - while (true) { - signal(SIGINT, signalHandlerWrapper); - std::string text; - std::cout << "Prompt: (Use quit() to exit) Or (To terminate current output generation, press Ctrl+C)" << std::endl; - // Clear Any cin error flags because of SIGINT - std::cin.clear(); - std::getline(std::cin, text); - - if (text == "quit()") { - break; // Exit the loop - } - - const std::string prompt = tokenizer->ApplyChatTemplate("", text.c_str(), "", true); - - bool is_first_token = true; - Timing timing; - timing.RecordStartTimestamp(); - - auto sequences = OgaSequences::Create(); - tokenizer->Encode(prompt.c_str(), *sequences); - - std::cout << "Generating response..." << std::endl; - - auto params = OgaGeneratorParams::Create(*model); - params->SetSearchOption("max_length", 1024); - auto generator = OgaGenerator::Create(*model, *params); - std::thread th(std::bind(&TerminateSession::Generator_SetTerminate_Call, &catch_terminate, generator.get())); - generator->AppendTokenSequences(*sequences); - - try { - while (!generator->IsDone()) { - generator->GenerateNextToken(); - - if (is_first_token) { - timing.RecordFirstTokenTimestamp(); - is_first_token = false; - } - - const auto num_tokens = generator->GetSequenceCount(0); - const auto new_token = generator->GetSequenceData(0)[num_tokens - 1]; - std::cout << tokenizer_stream->Decode(new_token) << std::flush; - } - } catch (const std::exception& e) { - std::cout << "Session Terminated: " << e.what() << std::endl; - } - - timing.RecordEndTimestamp(); - const int prompt_tokens_length = sequences->SequenceCount(0); - const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length; - timing.Log(prompt_tokens_length, new_tokens_length); - - if (th.joinable()) { - th.join(); // Join the thread if it's still running - } - - for (int i = 0; i < 3; ++i) - std::cout << std::endl; - } -} - -int main(int argc, char** argv) { - std::string model_path, ep; - if (!parse_args(argc, argv, model_path, ep)) { - return -1; - } - - // Responsible for cleaning up the library during shutdown - OgaHandle handle; - - std::cout << "-------------" << std::endl; - std::cout << "Hello, Phi-3!" 
<< std::endl; - std::cout << "-------------" << std::endl; - - std::cout << "C++ API" << std::endl; - CXX_API(model_path.c_str(), ep.c_str()); - - return 0; -} \ No newline at end of file +import onnxruntime_genai as og +import argparse +import time +import json + +def main(args): + if args.verbose: print("Loading model...") + if args.timings: + started_timestamp = 0 + first_token_timestamp = 0 + + config = og.Config(args.model_path) + if args.execution_provider != "follow_config": + config.clear_providers() + if args.execution_provider != "cpu": + if args.verbose: print(f"Setting model to {args.execution_provider}") + config.append_provider(args.execution_provider) + model = og.Model(config) + + if args.verbose: print("Model loaded") + + tokenizer = og.Tokenizer(model) + tokenizer_stream = tokenizer.create_stream() + if args.verbose: print("Tokenizer created") + if args.verbose: print() + search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + + # Set the max length to something sensible by default, unless it is specified by the user, + # since otherwise it will be set to the entire context length + if 'max_length' not in search_options: + search_options['max_length'] = 2048 + + # Keep asking for input prompts in a loop + while True: + text = input("Input: ") + if not text: + print("Error, input cannot be empty") + continue + + if args.timings: started_timestamp = time.time() + + # If there is a chat template, use it + input_message = [{"role": "user", "content": text }] + input_prompt = tokenizer.apply_chat_template(json.dumps(input_message), add_generation_prompt=True) + + input_tokens = tokenizer.encode(input_prompt) + + params = og.GeneratorParams(model) + params.set_search_options(**search_options) + generator = og.Generator(model, params) + + generator.append_tokens(input_tokens) + if args.verbose: print("Generator created") + + if args.verbose: print("Running generation loop ...") + if args.timings: + first = True + new_tokens = [] + + print() + print("Output: ", end='', flush=True) + + try: + while not generator.is_done(): + generator.generate_next_token() + if args.timings: + if first: + first_token_timestamp = time.time() + first = False + + new_token = generator.get_next_tokens()[0] + print(tokenizer_stream.decode(new_token), end='', flush=True) + if args.timings: new_tokens.append(new_token) + except KeyboardInterrupt: + print(" --control+c pressed, aborting generation--") + print() + print() + + if args.timings: + prompt_time = first_token_timestamp - started_timestamp + run_time = time.time() - first_token_timestamp + print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") + parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') + parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "follow_config"], help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.") + parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') + parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') + parser.add_argument('-ds', '--do_sample', action='store_true', default=False, help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false') + parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with') + parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') + parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') + parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') + parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') + parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') + args = parser.parse_args() + main(args)
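Tying the builder-side switch from patch 3 back to usage: quantize_kv_cache rides on the existing extra_options key=value plumbing, where boolean-like strings are coerced in check_extra_options. A minimal illustrative sketch of that coercion (the helper below is hypothetical, not the builder's actual function; only the bools list and the accepted false spellings are taken from the patch):

def parse_extra_options(pairs):
    # e.g. pairs = ["quantize_kv_cache=true", "use_qdq=0"]
    kv = dict(p.split("=", 1) for p in pairs)
    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states",
             "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32",
             "quantize_kv_cache"]
    for key in bools:
        if key in kv:
            # mirrors check_extra_options: these spellings mean False, anything else True
            kv[key] = kv[key] not in {"false", "False", "0"}
    return kv

print(parse_extra_options(["quantize_kv_cache=true", "use_qdq=0"]))
# {'quantize_kv_cache': True, 'use_qdq': False}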