diff --git a/keras_hub/src/models/qwen3_moe/__init__.py b/keras_hub/src/models/qwen3_moe/__init__.py
new file mode 100644
index 0000000000..65619bdb77
--- /dev/null
+++ b/keras_hub/src/models/qwen3_moe/__init__.py
@@ -0,0 +1,5 @@
+from keras_hub.src.models.qwen3_moe.qwen3_moe_backbone import Qwen3MoeBackbone
+from keras_hub.src.models.qwen3_moe.qwen3_moe_presets import backbone_presets
+from keras_hub.src.utils.preset_utils import register_presets
+
+register_presets(backbone_presets, Qwen3MoeBackbone)
diff --git a/keras_hub/src/models/qwen3_moe/qwen3_moe_presets.py b/keras_hub/src/models/qwen3_moe/qwen3_moe_presets.py
new file mode 100644
index 0000000000..5f3b8c1393
--- /dev/null
+++ b/keras_hub/src/models/qwen3_moe/qwen3_moe_presets.py
@@ -0,0 +1,30 @@
+"""Qwen3 MoE model preset configurations."""
+
+backbone_presets = {
+    "qwen3_moe_30b_a3b_en": {
+        "metadata": {
+            "description": (
+                "Mixture-of-Experts (MoE) model with 30.5 billion total"
+                " parameters and 3.3 billion activated, built on 48 layers"
+                " and using 32 query and 4 key/value attention heads"
+                " with 128 experts (8 active)."
+            ),
+            "params": 30532122624,
+            "path": "qwen3_moe",
+        },
+        "kaggle_handle": "kaggle://keras/qwen-3-moe/keras/qwen3_moe_30b_a3b_en/2",
+    },
+    "qwen3_moe_235b_a22b_en": {
+        "metadata": {
+            "description": (
+                "Mixture-of-Experts (MoE) model with 235 billion total"
+                " parameters and 22 billion activated, built on 94 layers"
+                " and using 64 query and 4 key/value attention heads"
+                " with 128 experts (8 active)."
+            ),
+            "params": 235093634560,
+            "path": "qwen3_moe",
+        },
+        "kaggle_handle": "kaggle://keras/qwen-3-moe/keras/qwen3_moe_235b_a22b_en/1",
+    },
+}
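
Note: once `register_presets` runs at import time, the two presets become loadable by name through the standard KerasHub `from_preset` entry point. A minimal usage sketch, assuming `Qwen3MoeBackbone` is also exported under `keras_hub.models` like the other backbones (that export is not part of this diff):

```python
import keras_hub

# Build the 30B-A3B architecture from the registered preset config.
# `load_weights=False` skips downloading the full checkpoint, which is
# enough to check that the preset name and metadata resolve correctly.
backbone = keras_hub.models.Qwen3MoeBackbone.from_preset(
    "qwen3_moe_30b_a3b_en",
    load_weights=False,
)
backbone.summary()
```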