From c6accdede7ebe2dd8d599f869df2313395612040 Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Mon, 9 Jun 2025 11:21:34 +0530 Subject: [PATCH 1/5] Update phi3 example: increase max_length to 6K tokens and fix API compatibility --- examples/c/src/phi3.cpp | 5 +++-- src/generators.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/c/src/phi3.cpp b/examples/c/src/phi3.cpp index 415ba2341..8ca496d05 100644 --- a/examples/c/src/phi3.cpp +++ b/examples/c/src/phi3.cpp @@ -40,7 +40,7 @@ void CXX_API(const char* model_path, const char* execution_provider) { std::thread th(std::bind(&TerminateSession::Generator_SetTerminate_Call, &catch_terminate, generator.get())); // Define System Prompt - const std::string system_prompt = std::string("<|system|>\n") + "You are a helpful AI and give elaborative answers" + "<|end|>"; + const std::string system_prompt = std::string("<|system|>\n") + "You are a helpful AI and give very small answers" + "<|end|>"; bool include_system_prompt = true; while (true) { @@ -55,7 +55,8 @@ void CXX_API(const char* model_path, const char* execution_provider) { break; // Exit the loop } - const std::string prompt = tokenizer->ApplyChatTemplate("", text.c_str(), "", true); + // Using direct prompt instead of ApplyChatTemplate to avoid library version issues + const std::string prompt = "<|user|>\n" + text + "<|end|>\n<|assistant|>\n"; bool is_first_token = true; Timing timing; diff --git a/src/generators.cpp b/src/generators.cpp index a318c5c83..05352d542 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -363,7 +363,7 @@ void Generator::AppendTokens(cpu_span input_ids) { if (search_->GetSequenceLength() != 0 && state_->params_->search.batch_size > 1) throw std::runtime_error("AppendTokens can only be called once for batch_size > 1. 
To call AppendTokens again, use RewindToLength(0)"); - constexpr std::array devices_supporting_continuous_decoding{DeviceType::CPU, DeviceType::CUDA, DeviceType::WEBGPU, DeviceType::OpenVINO}; + constexpr std::array devices_supporting_continuous_decoding{DeviceType::CPU, DeviceType::CUDA, DeviceType::WEBGPU, DeviceType::OpenVINO, DeviceType::NvTensorRtRtx}; if (search_->GetSequenceLength() != 0 && std::none_of(devices_supporting_continuous_decoding.begin(), devices_supporting_continuous_decoding.end(), [this](DeviceType device_type) { return device_type == state_->model_.p_device_kvcache_->GetType(); })) From f663a5b111823ab15c9303c16b7a2df9ff8d93c7 Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Mon, 9 Jun 2025 14:35:33 +0530 Subject: [PATCH 2/5] application changes --- examples/c/src/phi3.cpp | 10 ++++++---- examples/python/phi3-qa.py | 21 ++++++++++----------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/examples/c/src/phi3.cpp b/examples/c/src/phi3.cpp index 8ca496d05..117514b80 100644 --- a/examples/c/src/phi3.cpp +++ b/examples/c/src/phi3.cpp @@ -40,8 +40,8 @@ void CXX_API(const char* model_path, const char* execution_provider) { std::thread th(std::bind(&TerminateSession::Generator_SetTerminate_Call, &catch_terminate, generator.get())); // Define System Prompt - const std::string system_prompt = std::string("<|system|>\n") + "You are a helpful AI and give very small answers" + "<|end|>"; - bool include_system_prompt = true; + const std::string system_instructions = "You are a helpful AI and give very small answers"; + bool include_system_prompt = false; while (true) { signal(SIGINT, signalHandlerWrapper); @@ -56,7 +56,9 @@ void CXX_API(const char* model_path, const char* execution_provider) { } // Using direct prompt instead of ApplyChatTemplate to avoid library version issues - const std::string prompt = "<|user|>\n" + text + "<|end|>\n<|assistant|>\n"; + const std::string prompt = "user\n" + + (include_system_prompt ? 
system_instructions + "\n\n" + text : text) + + "\nmodel\n"; bool is_first_token = true; Timing timing; @@ -64,7 +66,7 @@ void CXX_API(const char* model_path, const char* execution_provider) { auto sequences = OgaSequences::Create(); if (include_system_prompt) { - std::string combined = system_prompt + prompt; + std::string combined = system_instructions + "\n\n" + prompt; tokenizer->Encode(combined.c_str(), *sequences); include_system_prompt = false; } else { diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py index 41db18f2e..c801f35d1 100644 --- a/examples/python/phi3-qa.py +++ b/examples/python/phi3-qa.py @@ -1,7 +1,6 @@ import onnxruntime_genai as og import argparse import time -import json def main(args): if args.verbose: print("Loading model...") @@ -10,11 +9,10 @@ def main(args): first_token_timestamp = 0 config = og.Config(args.model_path) - if args.execution_provider != "follow_config": - config.clear_providers() - if args.execution_provider != "cpu": - if args.verbose: print(f"Setting model to {args.execution_provider}") - config.append_provider(args.execution_provider) + config.clear_providers() + if args.execution_provider != "cpu": + if args.verbose: print(f"Setting model to {args.execution_provider}") + config.append_provider(args.execution_provider) model = og.Model(config) if args.verbose: print("Model loaded") @@ -30,6 +28,8 @@ def main(args): if 'max_length' not in search_options: search_options['max_length'] = 2048 + chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>' + # Keep asking for input prompts in a loop while True: text = input("Input: ") @@ -40,10 +40,9 @@ def main(args): if args.timings: started_timestamp = time.time() # If there is a chat template, use it - input_message = [{"role": "user", "content": text }] - input_prompt = tokenizer.apply_chat_template(json.dumps(input_message), add_generation_prompt=True) + prompt = f'{chat_template.format(input=text)}' - input_tokens = tokenizer.encode(input_prompt) + input_tokens = tokenizer.encode(prompt) params = og.GeneratorParams(model) params.set_search_options(**search_options) @@ -85,7 +84,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') - parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "follow_config"], help="Execution provider to run the ONNX Runtime session with. Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.") + parser.add_argument('-e', '--execution_provider', type=str, required=True, choices=["cpu", "cuda", "dml", "NvTensorRtRtx"], help="Execution provider to run ONNX model with") parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') parser.add_argument('-ds', '--do_sample', action='store_true', default=False, help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false') @@ -96,4 +95,4 @@ def main(args): parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. 
Defaults to false') parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') args = parser.parse_args() - main(args) + main(args) \ No newline at end of file From f73deba1f558c8a90b5b9a2b7016f5415dda15cf Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Mon, 16 Jun 2025 19:30:40 +0530 Subject: [PATCH 3/5] Add int8 KV cache quantization support --- src/models/kv_cache.cpp | 8 + src/python/py/models/builder.py | 250 ++++++++++++++++++++++++++++++-- 2 files changed, 244 insertions(+), 14 deletions(-) diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index 581d12caa..7b05f6be3 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -6,6 +6,7 @@ #include "kv_cache.h" #include "windowed_kv_cache.h" #include "../openvino/interface.h" +#include namespace Generators { @@ -246,6 +247,13 @@ void DefaultKeyValueCache::Update(DeviceSpan beam_indices, int total_le shape_[2] = total_length; for (int i = 0; i < layer_count_ * 2; i++) { presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_); + size_t element_size = type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 ? 2 : sizeof(int8_t); + size_t elements_per_tensor = shape_[0] * shape_[1] * shape_[2] * shape_[3]; + size_t memory_per_tensor = elements_per_tensor * element_size; + if (i == 0) { + std::cout << " Layer " << i << ": " << memory_per_tensor << " bytes (" + << (memory_per_tensor / 1024.0 / 1024.0) << " MB)" << std::endl; + } state_.outputs_[output_index_ + i] = presents_[i].get(); } diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 1303269e5..79303f9d2 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -321,6 +321,11 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.quant_attrs["config"] = config.quantization_config self.quant_attrs["use_g_idx"] = config.quantization_config["desc_act"] if "desc_act" in config.quantization_config else False + # KV cache quantization attributes (add at the end) + self.kv_cache_attrs = { + "quantize_kv_cache": extra_options.get("quantize_kv_cache", False), + "kv_scale_factor": 127.0, # Scale factor for INT8 quantization + } def make_outputs_init(self): # Always use float32 logits to improve accuracy in the case of bf16 models. 
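For context on the sizes involved, here is a minimal standalone sketch (not part of the patch) of the per-tensor arithmetic behind the temporary logging added to kv_cache.cpp in this patch: a KV tensor holds batch_size x num_kv_heads x total_length x head_size elements, at 2 bytes each for FLOAT16 and 1 byte each for INT8. The dimensions below are made-up illustrative values, not those of any particular model.

# Sketch of the per-layer KV-cache memory computation mirrored from the kv_cache.cpp logging.
def kv_tensor_bytes(batch_size, num_kv_heads, total_length, head_size, element_size):
    elements = batch_size * num_kv_heads * total_length * head_size
    return elements * element_size

fp16_bytes = kv_tensor_bytes(1, 32, 2048, 96, 2)  # FLOAT16: 2 bytes per element
int8_bytes = kv_tensor_bytes(1, 32, 2048, 96, 1)  # INT8: 1 byte per element
print(f"fp16: {fp16_bytes / 1024 / 1024:.1f} MB, int8: {int8_bytes / 1024 / 1024:.1f} MB per KV tensor")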
@@ -334,17 +339,20 @@ def make_outputs_init(self): self.output_names = [name.replace("logits", "hidden_states") for name in self.output_names] elif self.include_hidden_states: self.output_names = ["hidden_states"] + self.output_names + + # Update output types for quantized KV cache + if hasattr(self, 'kv_cache_attrs') and self.kv_cache_attrs["quantize_kv_cache"]: + self.output_types["present.key"] = TensorProto.INT8 + self.output_types["present.value"] = TensorProto.INT8 def make_attention_init(self): valid_gqa_configurations = [ ("cpu", TensorProto.FLOAT), - ("cuda", TensorProto.FLOAT16), ("cuda", TensorProto.BFLOAT16), ("rocm", TensorProto.FLOAT16), ("dml", TensorProto.FLOAT16), ("webgpu", TensorProto.FLOAT16), ("webgpu", TensorProto.FLOAT), - ("NvTensorRtRtx", TensorProto.FLOAT16), ] if (self.ep, self.io_dtype) in valid_gqa_configurations: # Change model settings for GroupQueryAttention @@ -644,17 +652,21 @@ def make_inputs_and_outputs(self): # Add KV cache to inputs and outputs for i in range(self.num_layers): + # Determine input/output types based on quantization setting + kv_input_dtype = TensorProto.INT8 if (hasattr(self, 'kv_cache_attrs') and self.kv_cache_attrs["quantize_kv_cache"]) else self.input_types["past_key_values.key"] + kv_output_dtype = TensorProto.INT8 if (hasattr(self, 'kv_cache_attrs') and self.kv_cache_attrs["quantize_kv_cache"]) else self.output_types["present.key"] + # Add KV cache to inputs key_name = f"past_key_values.{i}.key" - inputs.append(helper.make_tensor_value_info(key_name, self.input_types["past_key_values.key"], shape=self.input_shapes["past_key_values.key"])) + inputs.append(helper.make_tensor_value_info(key_name, kv_input_dtype, shape=self.input_shapes["past_key_values.key"])) value_name = f"past_key_values.{i}.value" - inputs.append(helper.make_tensor_value_info(value_name, self.input_types["past_key_values.value"], shape=self.input_shapes["past_key_values.value"])) + inputs.append(helper.make_tensor_value_info(value_name, kv_input_dtype, shape=self.input_shapes["past_key_values.value"])) # Add KV cache to outputs key_name = f"present.{i}.key" - outputs.append(helper.make_tensor_value_info(key_name, self.output_types["present.key"], shape=self.output_shapes["present.key"])) + outputs.append(helper.make_tensor_value_info(key_name, kv_output_dtype, shape=self.output_shapes["present.key"])) value_name = f"present.{i}.value" - outputs.append(helper.make_tensor_value_info(value_name, self.output_types["present.value"], shape=self.output_shapes["present.value"])) + outputs.append(helper.make_tensor_value_info(value_name, kv_output_dtype, shape=self.output_shapes["present.value"])) self.inputs = inputs self.outputs = outputs @@ -1545,7 +1557,6 @@ def _make_simplified_layer_norm(self, basename, root_input, weight_name, output_ self.make_node("Mul", inputs=make_mul_1_inputs, outputs=[output_0], name=make_mul_1_name) self.make_value_info(output_0, dtype=io_dtype, shape=shape) - def make_qk_norm(self, layer_id, attention): # Make subgraph to compute SimplifiedLayerNorm after Q and K MatMuls in attention: # @@ -1692,7 +1703,9 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): # Transpose # | # Reshape - basename = f"/model/layers.{layer_id}/attn/{'k_proj' if past_kv.endswith('key') else 'v_proj'}/repeat_kv" + # Determine if this is for key or value based on past_kv (original case) or present_kv (quantized case) + is_key = past_kv.endswith('key') or (present_kv and 'temp_present_k' in present_kv) + basename = 
f"/model/layers.{layer_id}/attn/{'k_proj' if is_key else 'v_proj'}/repeat_kv" # Make the initial subgraph # @@ -1712,9 +1725,16 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): concat_1_name = f"{basename}/Concat_1" concat_1_inputs = [past_kv, f"{transpose_1_name}/output_0"] self.make_node("Concat", inputs=concat_1_inputs, outputs=[present_kv], name=concat_1_name, axis=2) + self.make_value_info(present_kv, self.io_dtype, shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + # Use Identity node to ensure proper shape flow for quantized KV cache scenarios + # This prevents shape inference issues when present_kv is a temporary tensor name + identity_present_name = f"{basename}/Identity_present" + self.make_node("Identity", inputs=[present_kv], outputs=[f"{identity_present_name}/output_0"], name=identity_present_name) + self.make_value_info(f"{identity_present_name}/output_0", self.io_dtype, shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + shape_1_name = f"{basename}/Shape_1" - self.make_shape(shape_1_name, present_kv, shape=[4]) + self.make_shape(shape_1_name, f"{identity_present_name}/output_0", shape=[4]) gather_1_name = f"{basename}/Gather_1" gather_1_inputs = [f"{shape_1_name}/output_0", "/model/constants/TensorProto.INT64/0D/0"] self.make_gather(gather_1_name, gather_1_inputs, axis=0) @@ -1781,7 +1801,7 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): # \ \ # Unsqueeze --> Expand --> Reshape --> Transpose --> Reshape unsqueeze_5_name = f"{basename}/Unsqueeze_5" - unsqueeze_5_inputs = [present_kv, "/model/constants/TensorProto.INT64/1D/2"] + unsqueeze_5_inputs = [f"{identity_present_name}/output_0", "/model/constants/TensorProto.INT64/1D/2"] self.make_unsqueeze(unsqueeze_5_name, unsqueeze_5_inputs, dtype=self.io_dtype, shape=['batch_size', self.num_kv_heads, 1, 'sequence_length', self.head_size]) expand_name = f"{basename}/Expand" expand_inputs = [f"{unsqueeze_5_name}/output_0", f"{where_name}/output_0"] @@ -1802,6 +1822,111 @@ def make_repeat_kv(self, layer_id, root_input, past_kv, present_kv, **kwargs): def make_attention_op(self, name, **kwargs): op_type = self.attention_attrs["op_type"] + # Handle KV cache quantization if enabled + original_past_k = kwargs.get("past_k", "") + original_past_v = kwargs.get("past_v", "") + original_present_k = kwargs.get("present_k", "") + original_present_v = kwargs.get("present_v", "") + + # Check if we need to handle quantized KV cache with MHA + GQA configuration + has_quantized_kv = hasattr(self, 'kv_cache_attrs') and self.kv_cache_attrs["quantize_kv_cache"] + needs_kv_repeat = (self.num_attn_heads != self.num_kv_heads and op_type == "MultiHeadAttention") + + if has_quantized_kv and needs_kv_repeat: + # Special handling for quantized KV cache + MHA + different head counts + # We need to: dequantize -> repeat_kv -> attention -> quantize + + # Extract layer_id from the attention name + import re + layer_match = re.search(r'/model/layers\.(\d+)/', name) + if layer_match: + layer_id = int(layer_match.group(1)) + else: + raise ValueError(f"Could not extract layer_id from attention name: {name}") + + if original_past_k and original_past_v: + # 1. 
Dequantize past KV cache + dequant_past_k = self.make_dequantize_kv_cache( + f"{name}/dequant_past_k", + original_past_k, + ['batch_size', self.num_kv_heads, 'past_sequence_length', self.head_size] + ) + dequant_past_v = self.make_dequantize_kv_cache( + f"{name}/dequant_past_v", + original_past_v, + ['batch_size', self.num_kv_heads, 'past_sequence_length', self.head_size] + ) + + # 2. Apply repeat_kv with direct dequantized inputs (remove barriers that cause TensorRT issues) + # Create temporary present outputs (unquantized) + temp_present_k_unquant = f"{name}/temp_present_k_unquant" + temp_present_v_unquant = f"{name}/temp_present_v_unquant" + + # Call repeat_kv to handle head expansion and KV cache update + repeated_k = self.make_repeat_kv(layer_id, root_input=kwargs["k_path"], past_kv=dequant_past_k, present_kv=temp_present_k_unquant) + repeated_v = self.make_repeat_kv(layer_id, root_input=kwargs["v_path"], past_kv=dequant_past_v, present_kv=temp_present_v_unquant) + + # 3. Update kwargs for attention operation + kwargs["k_path"] = repeated_k + kwargs["v_path"] = repeated_v + kwargs["past_k"] = "" # Already handled by repeat_kv + kwargs["past_v"] = "" # Already handled by repeat_kv + kwargs["present_k"] = "" # Will be handled by quantization + kwargs["present_v"] = "" # Will be handled by quantization + + # 4. Run MultiHeadAttention + self.make_multi_head_attention(name, add_qk=f"{self.mask_attrs['mask_name']}/output_0", **kwargs) + + # 5. Quantize the present KV cache from repeat_kv outputs + if original_present_k: + quant_present_k = self.make_quantize_kv_cache( + f"{name}/quant_present_k", + temp_present_k_unquant, + ['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size] + ) + self.make_node("Identity", inputs=[quant_present_k], outputs=[original_present_k], + name=f"{name}/present_k_identity") + self.make_value_info(original_present_k, TensorProto.INT8, + shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + + if original_present_v: + quant_present_v = self.make_quantize_kv_cache( + f"{name}/quant_present_v", + temp_present_v_unquant, + ['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size] + ) + self.make_node("Identity", inputs=[quant_present_v], outputs=[original_present_v], + name=f"{name}/present_v_identity") + self.make_value_info(original_present_v, TensorProto.INT8, + shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + return + + elif has_quantized_kv: + # Standard quantized KV cache handling (for GQA or MHA without head mismatch) + if original_past_k: + dequant_past_k = self.make_dequantize_kv_cache( + f"{name}/dequant_past_k", + original_past_k, + ['batch_size', self.num_kv_heads, 'past_sequence_length', self.head_size] + ) + kwargs["past_k"] = dequant_past_k + + if original_past_v: + dequant_past_v = self.make_dequantize_kv_cache( + f"{name}/dequant_past_v", + original_past_v, + ['batch_size', self.num_kv_heads, 'past_sequence_length', self.head_size] + ) + kwargs["past_v"] = dequant_past_v + + # Use temporary names for present outputs + temp_present_k = f"{name}/temp_present_k" if original_present_k else "" + temp_present_v = f"{name}/temp_present_v" if original_present_v else "" + if original_present_k: + kwargs["present_k"] = temp_present_k + if original_present_v: + kwargs["present_v"] = temp_present_v + if op_type == "MultiHeadAttention": self.make_multi_head_attention(name, add_qk=f"{self.mask_attrs['mask_name']}/output_0", **kwargs) elif op_type == 
"GroupQueryAttention": @@ -1811,6 +1936,31 @@ def make_attention_op(self, name, **kwargs): else: raise NotImplementedError(f"The {op_type} op is not currently supported.") + # Quantize present KV cache for storage if quantization is enabled + # (Skip if already handled in the special repeat_kv case above) + if has_quantized_kv and not needs_kv_repeat: + if original_present_k and temp_present_k: + quant_present_k = self.make_quantize_kv_cache( + f"{name}/quant_present_k", + temp_present_k, + ['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size] + ) + self.make_node("Identity", inputs=[quant_present_k], outputs=[original_present_k], + name=f"{name}/present_k_identity") + self.make_value_info(original_present_k, TensorProto.INT8, + shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + + if original_present_v and temp_present_v: + quant_present_v = self.make_quantize_kv_cache( + f"{name}/quant_present_v", + temp_present_v, + ['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size] + ) + self.make_node("Identity", inputs=[quant_present_v], outputs=[original_present_v], + name=f"{name}/present_v_identity") + self.make_value_info(original_present_v, TensorProto.INT8, + shape=['batch_size', self.num_kv_heads, 'total_sequence_length', self.head_size]) + def make_multi_head_attention(self, name, **kwargs): inputs = [ kwargs["q_path"], kwargs["k_path"], kwargs["v_path"], kwargs.get("bias", ""), @@ -1958,10 +2108,20 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): past_v = f"past_key_values.{layer_id}.value" present_k = f"present.{layer_id}.key" present_v = f"present.{layer_id}.value" + + # Check if we have quantized KV cache - if so, handle it differently + has_quantized_kv = hasattr(self, 'kv_cache_attrs') and self.kv_cache_attrs["quantize_kv_cache"] + if self.num_attn_heads != self.num_kv_heads and self.attention_attrs["op_type"] == "MultiHeadAttention": - self.attention_attrs["k_path"] = self.make_repeat_kv(layer_id, root_input=self.attention_attrs["k_path"], past_kv=past_k, present_kv=present_k) - self.attention_attrs["v_path"] = self.make_repeat_kv(layer_id, root_input=self.attention_attrs["v_path"], past_kv=past_v, present_kv=present_v) - past_k, past_v, present_k, present_v = "", "", "", "" + if has_quantized_kv: + # For quantized KV cache, we'll do repeat_kv in make_attention_op after dequantization + # So we keep the KV cache parameters for the attention_op to handle + pass # KV cache will be handled in make_attention_op with proper quantization + else: + # Original logic for non-quantized case + self.attention_attrs["k_path"] = self.make_repeat_kv(layer_id, root_input=self.attention_attrs["k_path"], past_kv=past_k, present_kv=present_k) + self.attention_attrs["v_path"] = self.make_repeat_kv(layer_id, root_input=self.attention_attrs["v_path"], past_kv=past_v, present_kv=present_v) + past_k, past_v, present_k, present_v = "", "", "", "" # Make attention node (e.g. MultiHeadAttention, GroupQueryAttention, etc.) 
attn_name = f"/model/layers.{layer_id}/attn/{self.attention_attrs['op_type']}" @@ -3078,6 +3238,66 @@ def make_position_ids_reformatting(self): return reshape_name + def make_quantize_kv_cache(self, name, input_tensor, shape): + """Create standard ONNX quantization for KV cache using QuantizeLinear""" + # Use a more conservative scale factor for KV cache values + # KV cache values typically have much smaller range than full FP16 + # Using scale = 0.1 maps [-12.7, 12.7] to [-127, 127] which is more appropriate + scale_tensor_name = name.replace("/", ".") + "_scale" + scale_value = 0.1 # Conservative scale factor for better precision + self.make_external_tensor( + torch.tensor([scale_value], dtype=torch.float32).contiguous(), + scale_tensor_name + ) + + # Create zero_point tensor (0 for symmetric quantization) + zero_point_tensor_name = name.replace("/", ".") + "_zero_point" + self.make_external_tensor( + torch.tensor([0], dtype=torch.int8).contiguous(), + zero_point_tensor_name + ) + + # Cast input to FP32 for QuantizeLinear (must match scale tensor type) + cast_name = f"{name}/Cast_to_fp32" + self.make_cast(cast_name, input_tensor, dtype=TensorProto.FLOAT, shape=shape) + + # Use standard QuantizeLinear operator + quantize_name = f"{name}/QuantizeLinear" + quantize_inputs = [f"{cast_name}/output_0", scale_tensor_name, zero_point_tensor_name] + self.make_node("QuantizeLinear", inputs=quantize_inputs, outputs=[f"{quantize_name}/output_0"], name=quantize_name) + self.make_value_info(f"{quantize_name}/output_0", TensorProto.INT8, shape=shape) + + return f"{quantize_name}/output_0" + + def make_dequantize_kv_cache(self, name, input_tensor, shape): + """Create standard ONNX dequantization for KV cache using DequantizeLinear""" + # Use the same scale as quantization for proper round-trip + scale_tensor_name = name.replace("/", ".") + "_scale" + scale_value = 0.1 # Same scale as quantization + self.make_external_tensor( + torch.tensor([scale_value], dtype=torch.float32).contiguous(), + scale_tensor_name + ) + + # Create zero_point tensor (0 for symmetric quantization) + zero_point_tensor_name = name.replace("/", ".") + "_zero_point" + self.make_external_tensor( + torch.tensor([0], dtype=torch.int8).contiguous(), + zero_point_tensor_name + ) + + # Use standard DequantizeLinear operator (output is FP32) + dequantize_name = f"{name}/DequantizeLinear" + dequantize_inputs = [input_tensor, scale_tensor_name, zero_point_tensor_name] + self.make_node("DequantizeLinear", inputs=dequantize_inputs, outputs=[f"{dequantize_name}/output_0"], name=dequantize_name) + self.make_value_info(f"{dequantize_name}/output_0", TensorProto.FLOAT, shape=shape) + + # Cast from FP32 back to model's io_dtype (FP16) for compatibility + cast_name = f"{name}/Cast_to_fp16" + self.make_cast(cast_name, f"{dequantize_name}/output_0", dtype=self.io_dtype, shape=shape) + + return f"{cast_name}/output_0" + class LlamaModel(Model): def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): @@ -3646,7 +3866,9 @@ def check_extra_options(kv_pairs): """ Check key-value pairs and set values correctly """ - bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32"] + bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", + "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32", + "quantize_kv_cache"] # Add quantize_kv_cache to bools list for key in bools: if key in kv_pairs: 
if kv_pairs[key] in {"false", "False", "0"}: From 7c875cc274f42d52a551a5923de60122f7097c7e Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Mon, 23 Jun 2025 12:33:12 +0530 Subject: [PATCH 4/5] revert exp changes --- examples/c/src/phi3.cpp | 11 +- examples/python/phi3-qa.py | 211 ++++++++++++++++++++----------------- src/generators.cpp | 2 +- src/models/kv_cache.cpp | 21 ++-- 4 files changed, 128 insertions(+), 117 deletions(-) diff --git a/examples/c/src/phi3.cpp b/examples/c/src/phi3.cpp index 117514b80..415ba2341 100644 --- a/examples/c/src/phi3.cpp +++ b/examples/c/src/phi3.cpp @@ -40,8 +40,8 @@ void CXX_API(const char* model_path, const char* execution_provider) { std::thread th(std::bind(&TerminateSession::Generator_SetTerminate_Call, &catch_terminate, generator.get())); // Define System Prompt - const std::string system_instructions = "You are a helpful AI and give very small answers"; - bool include_system_prompt = false; + const std::string system_prompt = std::string("<|system|>\n") + "You are a helpful AI and give elaborative answers" + "<|end|>"; + bool include_system_prompt = true; while (true) { signal(SIGINT, signalHandlerWrapper); @@ -55,10 +55,7 @@ void CXX_API(const char* model_path, const char* execution_provider) { break; // Exit the loop } - // Using direct prompt instead of ApplyChatTemplate to avoid library version issues - const std::string prompt = "user\n" + - (include_system_prompt ? system_instructions + "\n\n" + text : text) + - "\nmodel\n"; + const std::string prompt = tokenizer->ApplyChatTemplate("", text.c_str(), "", true); bool is_first_token = true; Timing timing; @@ -66,7 +63,7 @@ void CXX_API(const char* model_path, const char* execution_provider) { auto sequences = OgaSequences::Create(); if (include_system_prompt) { - std::string combined = system_instructions + "\n\n" + prompt; + std::string combined = system_prompt + prompt; tokenizer->Encode(combined.c_str(), *sequences); include_system_prompt = false; } else { diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py index c801f35d1..b7c730d26 100644 --- a/examples/python/phi3-qa.py +++ b/examples/python/phi3-qa.py @@ -1,98 +1,113 @@ -import onnxruntime_genai as og -import argparse -import time - -def main(args): - if args.verbose: print("Loading model...") - if args.timings: - started_timestamp = 0 - first_token_timestamp = 0 - - config = og.Config(args.model_path) - config.clear_providers() - if args.execution_provider != "cpu": - if args.verbose: print(f"Setting model to {args.execution_provider}") - config.append_provider(args.execution_provider) - model = og.Model(config) - - if args.verbose: print("Model loaded") - - tokenizer = og.Tokenizer(model) - tokenizer_stream = tokenizer.create_stream() - if args.verbose: print("Tokenizer created") - if args.verbose: print() - search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} - - # Set the max length to something sensible by default, unless it is specified by the user, - # since otherwise it will be set to the entire context length - if 'max_length' not in search_options: - search_options['max_length'] = 2048 - - chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>' - - # Keep asking for input prompts in a loop - while True: - text = input("Input: ") - if not text: - print("Error, input cannot be empty") - continue - - if args.timings: started_timestamp = time.time() - - # If there is a chat template, use it - prompt = 
f'{chat_template.format(input=text)}' - - input_tokens = tokenizer.encode(prompt) - - params = og.GeneratorParams(model) - params.set_search_options(**search_options) - generator = og.Generator(model, params) - - generator.append_tokens(input_tokens) - if args.verbose: print("Generator created") - - if args.verbose: print("Running generation loop ...") - if args.timings: - first = True - new_tokens = [] - - print() - print("Output: ", end='', flush=True) - - try: - while not generator.is_done(): - generator.generate_next_token() - if args.timings: - if first: - first_token_timestamp = time.time() - first = False - - new_token = generator.get_next_tokens()[0] - print(tokenizer_stream.decode(new_token), end='', flush=True) - if args.timings: new_tokens.append(new_token) - except KeyboardInterrupt: - print(" --control+c pressed, aborting generation--") - print() - print() - - if args.timings: - prompt_time = first_token_timestamp - started_timestamp - run_time = time.time() - first_token_timestamp - print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") - parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') - parser.add_argument('-e', '--execution_provider', type=str, required=True, choices=["cpu", "cuda", "dml", "NvTensorRtRtx"], help="Execution provider to run ONNX model with") - parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') - parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') - parser.add_argument('-ds', '--do_sample', action='store_true', default=False, help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false') - parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with') - parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') - parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') - parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') - parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') - parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') - args = parser.parse_args() - main(args) \ No newline at end of file +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include +#include "ort_genai.h" +#include +#include +#include +#include +#include "common.h" + +// C++ API Example + +static TerminateSession catch_terminate; + +void signalHandlerWrapper(int signum) { + catch_terminate.signalHandler(signum); +} + +void CXX_API(const char* model_path, const char* execution_provider) { + std::cout << "Creating config..." 
<< std::endl; + auto config = OgaConfig::Create(model_path); + + std::string provider(execution_provider); + append_provider(*config, provider); + + std::cout << "Creating model..." << std::endl; + auto model = OgaModel::Create(*config); + + std::cout << "Creating tokenizer..." << std::endl; + auto tokenizer = OgaTokenizer::Create(*model); + auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer); + + while (true) { + signal(SIGINT, signalHandlerWrapper); + std::string text; + std::cout << "Prompt: (Use quit() to exit) Or (To terminate current output generation, press Ctrl+C)" << std::endl; + // Clear Any cin error flags because of SIGINT + std::cin.clear(); + std::getline(std::cin, text); + + if (text == "quit()") { + break; // Exit the loop + } + + const std::string prompt = tokenizer->ApplyChatTemplate("", text.c_str(), "", true); + + bool is_first_token = true; + Timing timing; + timing.RecordStartTimestamp(); + + auto sequences = OgaSequences::Create(); + tokenizer->Encode(prompt.c_str(), *sequences); + + std::cout << "Generating response..." << std::endl; + + auto params = OgaGeneratorParams::Create(*model); + params->SetSearchOption("max_length", 1024); + auto generator = OgaGenerator::Create(*model, *params); + std::thread th(std::bind(&TerminateSession::Generator_SetTerminate_Call, &catch_terminate, generator.get())); + generator->AppendTokenSequences(*sequences); + + try { + while (!generator->IsDone()) { + generator->GenerateNextToken(); + + if (is_first_token) { + timing.RecordFirstTokenTimestamp(); + is_first_token = false; + } + + const auto num_tokens = generator->GetSequenceCount(0); + const auto new_token = generator->GetSequenceData(0)[num_tokens - 1]; + std::cout << tokenizer_stream->Decode(new_token) << std::flush; + } + } catch (const std::exception& e) { + std::cout << "Session Terminated: " << e.what() << std::endl; + } + + timing.RecordEndTimestamp(); + const int prompt_tokens_length = sequences->SequenceCount(0); + const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length; + timing.Log(prompt_tokens_length, new_tokens_length); + + if (th.joinable()) { + th.join(); // Join the thread if it's still running + } + + for (int i = 0; i < 3; ++i) + std::cout << std::endl; + } +} + +int main(int argc, char** argv) { + std::string model_path, ep; + if (!parse_args(argc, argv, model_path, ep)) { + return -1; + } + + // Responsible for cleaning up the library during shutdown + OgaHandle handle; + + std::cout << "-------------" << std::endl; + std::cout << "Hello, Phi-3!" << std::endl; + std::cout << "-------------" << std::endl; + + std::cout << "C++ API" << std::endl; + CXX_API(model_path.c_str(), ep.c_str()); + + return 0; +} \ No newline at end of file diff --git a/src/generators.cpp b/src/generators.cpp index 05352d542..a318c5c83 100644 --- a/src/generators.cpp +++ b/src/generators.cpp @@ -363,7 +363,7 @@ void Generator::AppendTokens(cpu_span input_ids) { if (search_->GetSequenceLength() != 0 && state_->params_->search.batch_size > 1) throw std::runtime_error("AppendTokens can only be called once for batch_size > 1. 
To call AppendTokens again, use RewindToLength(0)"); - constexpr std::array devices_supporting_continuous_decoding{DeviceType::CPU, DeviceType::CUDA, DeviceType::WEBGPU, DeviceType::OpenVINO, DeviceType::NvTensorRtRtx}; + constexpr std::array devices_supporting_continuous_decoding{DeviceType::CPU, DeviceType::CUDA, DeviceType::WEBGPU, DeviceType::OpenVINO}; if (search_->GetSequenceLength() != 0 && std::none_of(devices_supporting_continuous_decoding.begin(), devices_supporting_continuous_decoding.end(), [this](DeviceType device_type) { return device_type == state_->model_.p_device_kvcache_->GetType(); })) diff --git a/src/models/kv_cache.cpp b/src/models/kv_cache.cpp index 7b05f6be3..3894f72ec 100644 --- a/src/models/kv_cache.cpp +++ b/src/models/kv_cache.cpp @@ -6,7 +6,6 @@ #include "kv_cache.h" #include "windowed_kv_cache.h" #include "../openvino/interface.h" -#include namespace Generators { @@ -176,8 +175,14 @@ DefaultKeyValueCache::DefaultKeyValueCache(State& state) } // Set the size after empty_past_ has been created with 0 for this field - if (past_present_share_buffer_) + if (state.model_.p_device_->GetType() == DeviceType::NvTensorRtRtx && + model_.config_->model.decoder.sliding_window.has_value() && + model_.config_->model.decoder.sliding_window->window_size > 0) { + shape_[2] = std::min(state_.params_->search.max_length, + model_.config_->model.decoder.sliding_window->window_size); + } else if (past_present_share_buffer_) { shape_[2] = state_.params_->search.max_length; + } try { for (int i = 0; i < layer_count_ * 2; ++i) { @@ -247,13 +252,6 @@ void DefaultKeyValueCache::Update(DeviceSpan beam_indices, int total_le shape_[2] = total_length; for (int i = 0; i < layer_count_ * 2; i++) { presents_[i] = OrtValue::CreateTensor(Allocator(), shape_, type_); - size_t element_size = type_ == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 ? 2 : sizeof(int8_t); - size_t elements_per_tensor = shape_[0] * shape_[1] * shape_[2] * shape_[3]; - size_t memory_per_tensor = elements_per_tensor * element_size; - if (i == 0) { - std::cout << " Layer " << i << ": " << memory_per_tensor << " bytes (" - << (memory_per_tensor / 1024.0 / 1024.0) << " MB)" << std::endl; - } state_.outputs_[output_index_ + i] = presents_[i].get(); } @@ -430,7 +428,8 @@ std::unique_ptr CreateKeyValueCache(State& state) { return nullptr; } - if (state.model_.config_->model.decoder.sliding_window && + if (state.model_.p_device_->GetType() != DeviceType::NvTensorRtRtx && + state.model_.config_->model.decoder.sliding_window && state.model_.config_->model.decoder.sliding_window->slide_key_value_cache) { return std::make_unique(state); } @@ -438,4 +437,4 @@ std::unique_ptr CreateKeyValueCache(State& state) { return std::make_unique(state); } -} // namespace Generators +} // namespace Generators \ No newline at end of file From b7d87000fe81372acb10beb20e504c8df81f99ed Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Mon, 23 Jun 2025 12:36:48 +0530 Subject: [PATCH 5/5] exp changes --- examples/python/phi3-qa.py | 212 +++++++++++++++++-------------------- 1 file changed, 99 insertions(+), 113 deletions(-) diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py index b7c730d26..41db18f2e 100644 --- a/examples/python/phi3-qa.py +++ b/examples/python/phi3-qa.py @@ -1,113 +1,99 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include -#include -#include -#include "ort_genai.h" -#include -#include -#include -#include -#include "common.h" - -// C++ API Example - -static TerminateSession catch_terminate; - -void signalHandlerWrapper(int signum) { - catch_terminate.signalHandler(signum); -} - -void CXX_API(const char* model_path, const char* execution_provider) { - std::cout << "Creating config..." << std::endl; - auto config = OgaConfig::Create(model_path); - - std::string provider(execution_provider); - append_provider(*config, provider); - - std::cout << "Creating model..." << std::endl; - auto model = OgaModel::Create(*config); - - std::cout << "Creating tokenizer..." << std::endl; - auto tokenizer = OgaTokenizer::Create(*model); - auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer); - - while (true) { - signal(SIGINT, signalHandlerWrapper); - std::string text; - std::cout << "Prompt: (Use quit() to exit) Or (To terminate current output generation, press Ctrl+C)" << std::endl; - // Clear Any cin error flags because of SIGINT - std::cin.clear(); - std::getline(std::cin, text); - - if (text == "quit()") { - break; // Exit the loop - } - - const std::string prompt = tokenizer->ApplyChatTemplate("", text.c_str(), "", true); - - bool is_first_token = true; - Timing timing; - timing.RecordStartTimestamp(); - - auto sequences = OgaSequences::Create(); - tokenizer->Encode(prompt.c_str(), *sequences); - - std::cout << "Generating response..." << std::endl; - - auto params = OgaGeneratorParams::Create(*model); - params->SetSearchOption("max_length", 1024); - auto generator = OgaGenerator::Create(*model, *params); - std::thread th(std::bind(&TerminateSession::Generator_SetTerminate_Call, &catch_terminate, generator.get())); - generator->AppendTokenSequences(*sequences); - - try { - while (!generator->IsDone()) { - generator->GenerateNextToken(); - - if (is_first_token) { - timing.RecordFirstTokenTimestamp(); - is_first_token = false; - } - - const auto num_tokens = generator->GetSequenceCount(0); - const auto new_token = generator->GetSequenceData(0)[num_tokens - 1]; - std::cout << tokenizer_stream->Decode(new_token) << std::flush; - } - } catch (const std::exception& e) { - std::cout << "Session Terminated: " << e.what() << std::endl; - } - - timing.RecordEndTimestamp(); - const int prompt_tokens_length = sequences->SequenceCount(0); - const int new_tokens_length = generator->GetSequenceCount(0) - prompt_tokens_length; - timing.Log(prompt_tokens_length, new_tokens_length); - - if (th.joinable()) { - th.join(); // Join the thread if it's still running - } - - for (int i = 0; i < 3; ++i) - std::cout << std::endl; - } -} - -int main(int argc, char** argv) { - std::string model_path, ep; - if (!parse_args(argc, argv, model_path, ep)) { - return -1; - } - - // Responsible for cleaning up the library during shutdown - OgaHandle handle; - - std::cout << "-------------" << std::endl; - std::cout << "Hello, Phi-3!" 
<< std::endl; - std::cout << "-------------" << std::endl; - - std::cout << "C++ API" << std::endl; - CXX_API(model_path.c_str(), ep.c_str()); - - return 0; -} \ No newline at end of file +import onnxruntime_genai as og +import argparse +import time +import json + +def main(args): + if args.verbose: print("Loading model...") + if args.timings: + started_timestamp = 0 + first_token_timestamp = 0 + + config = og.Config(args.model_path) + if args.execution_provider != "follow_config": + config.clear_providers() + if args.execution_provider != "cpu": + if args.verbose: print(f"Setting model to {args.execution_provider}") + config.append_provider(args.execution_provider) + model = og.Model(config) + + if args.verbose: print("Model loaded") + + tokenizer = og.Tokenizer(model) + tokenizer_stream = tokenizer.create_stream() + if args.verbose: print("Tokenizer created") + if args.verbose: print() + search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args} + + # Set the max length to something sensible by default, unless it is specified by the user, + # since otherwise it will be set to the entire context length + if 'max_length' not in search_options: + search_options['max_length'] = 2048 + + # Keep asking for input prompts in a loop + while True: + text = input("Input: ") + if not text: + print("Error, input cannot be empty") + continue + + if args.timings: started_timestamp = time.time() + + # If there is a chat template, use it + input_message = [{"role": "user", "content": text }] + input_prompt = tokenizer.apply_chat_template(json.dumps(input_message), add_generation_prompt=True) + + input_tokens = tokenizer.encode(input_prompt) + + params = og.GeneratorParams(model) + params.set_search_options(**search_options) + generator = og.Generator(model, params) + + generator.append_tokens(input_tokens) + if args.verbose: print("Generator created") + + if args.verbose: print("Running generation loop ...") + if args.timings: + first = True + new_tokens = [] + + print() + print("Output: ", end='', flush=True) + + try: + while not generator.is_done(): + generator.generate_next_token() + if args.timings: + if first: + first_token_timestamp = time.time() + first = False + + new_token = generator.get_next_tokens()[0] + print(tokenizer_stream.decode(new_token), end='', flush=True) + if args.timings: new_tokens.append(new_token) + except KeyboardInterrupt: + print(" --control+c pressed, aborting generation--") + print() + print() + + if args.timings: + prompt_time = first_token_timestamp - started_timestamp + run_time = time.time() - first_token_timestamp + print(f"Prompt length: {len(input_tokens)}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens)/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end AI Question/Answer example for gen-ai") + parser.add_argument('-m', '--model_path', type=str, required=True, help='Onnx model folder path (must contain genai_config.json and model.onnx)') + parser.add_argument('-e', '--execution_provider', type=str, required=False, default='follow_config', choices=["cpu", "cuda", "dml", "follow_config"], help="Execution provider to run the ONNX Runtime session with. 
Defaults to follow_config that uses the execution provider listed in the genai_config.json instead.") + parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt') + parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt') + parser.add_argument('-ds', '--do_sample', action='store_true', default=False, help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false') + parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with') + parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from') + parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with') + parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with') + parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false') + parser.add_argument('-g', '--timings', action='store_true', default=False, help='Print timing information for each generation step. Defaults to false') + args = parser.parse_args() + main(args)
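Tying the builder-side switch from patch 3 back to usage: quantize_kv_cache rides on the existing extra_options key=value plumbing, where boolean-like strings are coerced in check_extra_options. A minimal illustrative sketch of that coercion (the helper below is hypothetical, not the builder's actual function; only the bools list and the accepted false spellings are taken from the patch):

def parse_extra_options(pairs):
    # e.g. pairs = ["quantize_kv_cache=true", "use_qdq=0"]
    kv = dict(p.split("=", 1) for p in pairs)
    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states",
             "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32",
             "quantize_kv_cache"]
    for key in bools:
        if key in kv:
            # mirrors check_extra_options: these spellings mean False, anything else True
            kv[key] = kv[key] not in {"false", "False", "0"}
    return kv

print(parse_extra_options(["quantize_kv_cache=true", "use_qdq=0"]))
# {'quantize_kv_cache': True, 'use_qdq': False}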