Skip to content

Commit 773d5ac

Browse files
committed
add Jiutian
1 parent 9eb0333 commit 773d5ac

File tree

8 files changed

+85
-1
lines changed

8 files changed

+85
-1
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ set(core_files src/backend.cpp
8888
models/instella.cpp
8989
models/internlm.cpp
9090
models/jina.cpp
91+
models/jiutian.cpp
9192
models/llama.cpp
9293
models/m_a_p.cpp
9394
models/megrez.cpp

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ pure C++ implementation based on [@ggerganov](https://github.com/ggerganov)'s [g
1313

1414
**What's New:**
1515

16+
* 2025-07-29: Jiutian
1617
* 2025-07-10: SmolLM-3
1718
* 2025-07-05: Pangu-Pro-MoE
1819
* 2025-07-04: ERNIE-MoE

convert.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,8 @@ class ModelType(Enum):
205205

206206
Exaone4 = 0x2800
207207

208+
JiuTian = 0x2900
209+
208210
BCE_Embedding = 0x10000100
209211
BCE_ReRanker = 0x10000101
210212
BGE_M3 = 0x10000102
@@ -7171,6 +7173,26 @@ def get_block(prefix: str):
71717173

71727174
return weights + dac_weights
71737175

7176+
7177+
class JiuTianConverter(BaseConverter):
    """Converter for Jiutian (`JiutianForCausalLM`) checkpoints.

    Jiutian is Qwen2-compatible on the weight side, so tensor naming is
    delegated to :class:`QWen2Converter`; only the serialized config differs.
    """
    MODEL_TYPE = ModelType.JiuTian

    @staticmethod
    def dump_config(f, config, ggml_type):
        # The runtime reuses the Qwen2 graph, which expects biased QKV projections.
        assert config.qkv_bias
        dump_llama_like_config(f, config, ggml_type)

        # Extra fields appended after the common llama-like header; the order
        # must match the `Config` struct read by models/jiutian.cpp.
        extra_ints = [
            config.num_key_value_heads,
            1 if config.tie_word_embeddings else 0,
        ]
        f.write(struct.pack("i" * len(extra_ints), *extra_ints))
        f.write(struct.pack("<f", config.rope_theta))

    @staticmethod
    def get_weight_names(config):
        # Identical tensor layout to Qwen2.
        return QWen2Converter.get_weight_names(config)
7195+
71747196
def convert_grok_1_base(args, vocab, ggml_type):
71757197
def ffn_size(emb_size, widening_factor):
71767198
_ffn_size = int(widening_factor * emb_size) * 2 // 3
@@ -7758,6 +7780,8 @@ def main():
77587780
ERNIEMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
77597781
elif arch == 'PanguProMoEForCausalLM':
77607782
PanguMoEConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
7783+
elif arch == 'JiutianForCausalLM':
7784+
JiuTianConverter.convert(config, model_files, vocab, ggml_type, args.save_path)
77617785
elif arch == 'deepseek-r1-distill-qwen3':
77627786
QWen3Converter.MODEL_TYPE = ModelType.DeepSeek_R1_Distill_QWen3
77637787
QWen3Converter.convert(config, model_files, vocab, ggml_type, args.save_path)

docs/models.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,12 @@
9292
* [x] v2.5: [Chat-1.8B](https://huggingface.co/internlm/internlm2_5-1_8b-chat), [Chat-7B](https://huggingface.co/internlm/internlm2_5-7b-chat), [Chat-7B-1M](https://huggingface.co/internlm/internlm2_5-7b-chat-1m), [Chat-20B](https://huggingface.co/internlm/internlm2_5-20b-chat)
9393
* [x] v3: [Instruct-8B](https://huggingface.co/internlm/internlm3-8b-instruct)
9494

95+
* Jiutian (`JiutianForCausalLM`)
96+
* [x] [Math-8B](https://huggingface.co/JT-LM/JT-Math-8B-Instruct/tree/00a347fdae86ddd9e616aa0771492c6aff735697),
97+
[Math-8B-Thinking](https://huggingface.co/JT-LM/JT-Math-8B-Thinking/tree/87d8db3e39c65fa123c59a97266a3ec02ebf6bd6),
98+
[Coder-8B-Instruct](https://huggingface.co/JT-LM/JT-Coder-8B-Instruct/tree/9160d51e9acaae266cfef8493ea25d15e7ed6904),
99+
[DA-8B](https://huggingface.co/JT-LM/JT-DA-8B/tree/8bd5bb1a76305dcc777786b65c239b362cee808e)
100+
95101
* Ling (`BailingMoeForCausalLM`)
96102
* [x] [Lite](https://huggingface.co/inclusionAI/Ling-lite/tree/a80ae6c479251f1ae33dda517ab83cdc6a312f99), [Coder-Lite](https://huggingface.co/inclusionAI/Ling-Coder-lite/tree/4a8647acf9d3855d599adaaaf4bf6ca14239d2ab)
97103

models/jiutian.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#include "qwen.h"
2+
#include "../src/models_priv.h"
3+
4+
namespace chatllm::jiutian
5+
{
6+
struct Config : public BaseConfig
7+
{
8+
int num_key_value_heads;
9+
int tie_word_embeddings;
10+
float rope_theta;
11+
};
12+
13+
static qwen::v2::Config convert(const Config &config)
14+
{
15+
qwen::v2::Config r;
16+
*(BaseConfig *)&r = *(BaseConfig *)&config;
17+
r.num_key_value_heads = config.num_key_value_heads;
18+
r.sliding_window = -1;
19+
r.rope_theta = config.rope_theta;
20+
return r;
21+
}
22+
23+
typedef qwen::v2::Tokenizer Tokenizer;
24+
25+
class ConditionalGeneration : public qwen::v2::ConditionalGeneration
26+
{
27+
public:
28+
ConditionalGeneration(const Config &config, const RuntimeConfig &runtime_config)
29+
: qwen::v2::ConditionalGeneration(convert(config), runtime_config, MODEL_TYPE_JIUTIAN, config.tie_word_embeddings != 0)
30+
{}
31+
};
32+
33+
REGISTER_MODEL_LOADER(JIUTIAN, jiutian, 1);
34+
}

scripts/models.json

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3055,5 +3055,21 @@
30553055
}
30563056
}
30573057
}
3058+
},
3059+
"jiutian-coder": {
3060+
"brief": "A series of high-performance and energy-efficient code large language models (LLMs) developed by the JiuTian team.",
3061+
"default": "8b",
3062+
"license": "Apache License 2.0",
3063+
"variants": {
3064+
"8b": {
3065+
"default": "q8",
3066+
"quantized": {
3067+
"q8": {
3068+
"size": 8317413728,
3069+
"url": "chatllm_quantized_jiutian/jt-coder-8b-it.bin"
3070+
}
3071+
}
3072+
}
3073+
}
30583074
}
30593075
}

src/chat.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1385,7 +1385,7 @@ namespace chatllm
13851385
n_dims = 2;
13861386

13871387
CHATLLM_CHECK(ndim == n_dims)
1388-
<< "tensor " << name << " ndim mismatch: expect " << n_dims << " but got " << ndim;
1388+
<< "tensor " << name << " ndim mismatch: expect " << n_dims << " but got " << ndim << ". expected shape: " << shape_to_string(tensor);
13891389

13901390
if (partial)
13911391
{

src/models_priv.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,8 @@ namespace chatllm
165165

166166
MODEL_TYPE_EXAONE4 = 0x2800,
167167

168+
MODEL_TYPE_JIUTIAN = 0x2900,
169+
168170
MODEL_TYPE_BCE_Embedding = 0x10000100,
169171
MODEL_TYPE_BCE_ReRanker = 0x10000101,
170172
MODEL_TYPE_BGE_M3 = 0x10000102,

0 commit comments

Comments
 (0)