Skip to content

Commit 13fe160

Browse files
Update how Hugging Face's config files are processed (microsoft#1693)
### Description This PR updates how Hugging Face's config files are processed while creating the `genai_config.json` file for ONNX Runtime GenAI. ### Motivation and Context The `config.json` file typically stores model attributes while the `generation_config.json` file typically stores search attributes. Some fields can overlap between the two files, and the latter file typically contains more detailed information than the former file for a particular attribute (e.g. the [`config.json`](https://huggingface.co/google/gemma-3-270m-it/blob/main/config.json#L10) file for `google/gemma-3-270m-it` says that `eos_token_id = 1` but the [`generation_config.json`](https://huggingface.co/google/gemma-3-270m-it/blob/main/generation_config.json#L4) file for the same model says that `eos_token_id = [1, 106]`). To ensure the right attributes are stored, we compare the values stored in `generation_config.json` with the traditional default values loaded in `config.json`. If they do not match, then the values in `config.json` are rewritten with the `generation_config.json` values. With this PR, models such as [`google/gemma-3-270m-it`](https://huggingface.co/google/gemma-3-270m-it) and [`janhq/Jan-v1-4B`](https://huggingface.co/janhq/Jan-v1-4B) can be set with their default search values.
1 parent bee5dca commit 13fe160

File tree

1 file changed

+25
-5
lines changed

1 file changed

+25
-5
lines changed

src/python/py/models/builder.py

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -374,10 +374,26 @@ def make_attention_init(self):
374374
self.past_present_share_buffer = self.attention_attrs["op_type"] == "GroupQueryAttention"
375375

376376
def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
377+
# Create config with attributes from config.json and generation_config.json (if latter file exists)
378+
config = AutoConfig.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=True, **extra_kwargs)
377379
try:
378-
config = GenerationConfig.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=True, **extra_kwargs)
380+
# Override search attributes in config based on values in generation_config.json
381+
gen_config = GenerationConfig.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=True, **extra_kwargs)
382+
defaults = {
383+
"bos_token_id": None,
384+
"do_sample": False,
385+
"eos_token_id": None,
386+
"pad_token_id": None,
387+
"temperature": 1.0,
388+
"top_k": 50,
389+
"top_p": 1.0,
390+
}
391+
for key, default_val in defaults.items():
392+
val = getattr(gen_config, key)
393+
if val != default_val:
394+
setattr(config, key, getattr(gen_config, key))
379395
except:
380-
config = AutoConfig.from_pretrained(model_name_or_path, token=self.hf_token, trust_remote_code=True, **extra_kwargs)
396+
pass
381397

382398
inputs = dict(zip(self.input_names, self.input_names))
383399
inputs.update({
@@ -393,9 +409,12 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
393409
# Remove 'hidden_states' from 'outputs' entry in config since ORT GenAI doesn't use it
394410
del outputs["hidden_states"]
395411

412+
bos_token_id = config.bos_token_id if hasattr(config, "bos_token_id") and config.bos_token_id is not None else 1
413+
eos_token_id = config.eos_token_id
414+
pad_token_id = config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id[0] if isinstance(config.eos_token_id, list) else config.eos_token_id
396415
genai_config = {
397416
"model": {
398-
"bos_token_id": config.bos_token_id if hasattr(config, "bos_token_id") and config.bos_token_id is not None else 1, # config.bos_token_id not present in ChatGLM model configs.
417+
"bos_token_id": bos_token_id,
399418
"context_length": self.context_length,
400419
"decoder": {
401420
"session_options" : {
@@ -411,8 +430,8 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
411430
"num_hidden_layers": self.num_layers,
412431
"num_key_value_heads": self.num_kv_heads,
413432
},
414-
"eos_token_id": config.eos_token_id,
415-
"pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id[0] if isinstance(config.eos_token_id, list) else config.eos_token_id,
433+
"eos_token_id": eos_token_id,
434+
"pad_token_id": pad_token_id,
416435
"type": self.model_type[ : self.model_type.find("For") if "For" in self.model_type else len(self.model_type)].lower(),
417436
"vocab_size": self.vocab_size,
418437
},
@@ -4268,6 +4287,7 @@ def create_model(model_name, input_path, output_dir, precision, execution_provid
42684287
# List architecture options in alphabetical order
42694288
if config.architectures[0] == "ChatGLMForConditionalGeneration" or config.architectures[0] == "ChatGLMModel":
42704289
# The quantized ChatGLM model has ChatGLMForConditionalGeneration as its architecture, whereas the HF model has the latter (ChatGLMModel)
4290+
config.bos_token_id = 1
42714291
config.hidden_act = "swiglu"
42724292
onnx_model = ChatGLMModel(config, io_dtype, onnx_dtype, execution_provider, cache_dir, extra_options)
42734293
elif config.architectures[0] == "Ernie4_5_ForCausalLM":

0 commit comments

Comments
 (0)