
Commit 2a3111b

wenbinc-Bin, zRzRzRzRzRzRzR, Isotr0py, luccafong, and zhangch9 authored and committed
Glm45 (#1744)
Enable glm4.5-moe. This PR contains three parts:
1. Cherry-pick official commits.
2. Minor changes to enable it on vllm-fork.

Signed-off-by: zRzRzRzRzRzRzR <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
Signed-off-by: Lu Fang <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
Signed-off-by: Jee Jee Li <[email protected]>
Signed-off-by: Chen, Wenbin <[email protected]>
Signed-off-by: Chen <[email protected]>
Co-authored-by: Yuxuan Zhang <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
Co-authored-by: Lu Fang <[email protected]>
Co-authored-by: Chenhui Zhang <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
Co-authored-by: Jee Jee Li <[email protected]>
1 parent 0097070 commit 2a3111b

38 files changed: +3761 −46 lines


benchmarks/kernels/benchmark_moe.py

Lines changed: 5 additions & 1 deletion
@@ -575,7 +575,11 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"):
+    elif config.architectures[0] in (
+        "DeepseekV3ForCausalLM",
+        "DeepseekV2ForCausalLM",
+        "Glm4MoeForCausalLM",
+    ):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
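For context, this branch makes the benchmark derive GLM-4.5's expert shapes the same way it does for the DeepSeek MoE models. A minimal sketch of that lookup, assuming a Glm4Moe config exposes `n_routed_experts`, `num_experts_per_tok`, and `moe_intermediate_size` as the hunk implies; the helper name and the Mixtral-style fallback branch are illustrative, not part of the script:

# Illustrative helper (not in benchmark_moe.py): derive MoE shapes from a
# HF config the way the hunk above does for GLM-4.5 / DeepSeek models.
from transformers import AutoConfig

def moe_shapes(model: str, tp_size: int) -> tuple[int, int, int]:
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    if config.architectures[0] in (
        "DeepseekV3ForCausalLM",
        "DeepseekV2ForCausalLM",
        "Glm4MoeForCausalLM",
    ):
        num_experts = config.n_routed_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
    else:  # assumed Mixtral-style fallback, shown only for illustration
        num_experts = config.num_local_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
    # w1 and w3 are fused, so each TP shard holds 2 * intermediate // tp columns
    return num_experts, topk, 2 * intermediate_size // tp_size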

benchmarks/kernels/benchmark_moe_permute_unpermute.py

Lines changed: 1 addition & 0 deletions
@@ -318,6 +318,7 @@ def main(args: argparse.Namespace):
     elif (
         config.architectures[0] == "DeepseekV3ForCausalLM"
         or config.architectures[0] == "DeepseekV2ForCausalLM"
+        or config.architectures[0] == "Glm4MoeForCausalLM"
     ):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok

docs/models/supported_models.md

Lines changed: 7 additions & 4 deletions
@@ -307,7 +307,7 @@ Specified using `--task generate`.
 | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ |
 | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ |
 | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | |
-| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
+| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `zai-org/chatglm2-6b`, `zai-org/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ |
 | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ |
 | `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ |
 | `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ |
@@ -321,8 +321,8 @@ Specified using `--task generate`.
 | `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ |
 | `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ |
 | `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ |
-| `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
-| `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
+| `GlmForCausalLM` | GLM-4 | `zai-org/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ |
+| `Glm4ForCausalLM` | GLM-4-0414 | `zai-org/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ |
 | `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ |
 | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ |
 | `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | | ✅︎ |
@@ -521,7 +521,10 @@ Specified using `--task generate`.
 | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc. | | | |
 | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | | ✅︎ | ✅︎ |
 | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I<sup>+</sup> | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ |
-| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. | ✅︎ | ✅︎ | ✅︎ |
+| `GLM4VForCausalLM`<sup>^</sup> | GLM-4V | T + I | `zai-org/glm-4v-9b`, `zai-org/cogagent-9b-20241220` etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Glm4MoeForCausalLM` | GLM-4.5 | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ |
+| `Glm4v_moeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ |
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ |
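With `Glm4MoeForCausalLM` listed as supported, GLM-4.5 loads through the usual vLLM entry points. A minimal offline-inference sketch follows; the tensor-parallel size and context length are placeholders chosen for illustration, not values taken from this commit:

from vllm import LLM, SamplingParams

# Sketch only: pick tensor_parallel_size / max_model_len to fit your GPUs;
# a model of GLM-4.5's size will not fit on a single small device.
# trust_remote_code is included defensively for older transformers versions.
llm = LLM(
    model="zai-org/GLM-4.5",
    tensor_parallel_size=8,
    max_model_len=8192,
    trust_remote_code=True,
)

outputs = llm.generate(
    ["Summarize what a mixture-of-experts layer does."],
    SamplingParams(temperature=0.7, max_tokens=128),
)
print(outputs[0].outputs[0].text)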

examples/offline_inference/vision_language.py

Lines changed: 40 additions & 2 deletions
@@ -221,7 +221,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
 # GLM-4v
 def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
-    model_name = "THUDM/glm-4v-9b"
+    model_name = "zai-org/glm-4v-9b"
 
     engine_args = EngineArgs(
         model=model_name,
@@ -248,6 +248,42 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
     )
 
 
+# GLM-4.1V
+def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "zai-org/GLM-4.1V-9B-Thinking"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        mm_processor_kwargs={
+            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
+            "fps": 1,
+        },
+        limit_mm_per_prompt={modality: 1},
+        enforce_eager=True,
+    )
+
+    if modality == "image":
+        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+    elif modality == "video":
+        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
+    prompts = [
+        (
+            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
+            f"{placeholder}"
+            f"{question}<|assistant|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # H2OVL-Mississippi
 def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
@@ -1083,6 +1119,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
     "fuyu": run_fuyu,
     "gemma3": run_gemma3,
     "glm4v": run_glm4v,
+    "glm4_1v": run_glm4_1v,
     "h2ovl_chat": run_h2ovl,
     "idefics3": run_idefics3,
     "internvl_chat": run_internvl,
@@ -1140,10 +1177,11 @@ def get_multi_modal_input(args):
     if args.modality == "video":
         # Input video and question
         video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays
+        metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata
         vid_questions = ["Why is this video funny?"]
 
         return {
-            "data": video,
+            "data": [(video, metadata)] if args.model_type == "glm4_1v" else video,
            "questions": vid_questions,
         }
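The last hunk is the notable behavioural change: for `glm4_1v` the example now passes the video frames together with their metadata instead of the bare frame array. A small sketch of that input shape, using the same `VideoAsset` names seen above (the surrounding engine and prompt setup is omitted):

from vllm.assets.video import VideoAsset

asset = VideoAsset(name="baby_reading", num_frames=16)
frames = asset.np_ndarrays  # raw frames, what the other models receive
metadata = asset.metadata   # timing/fps info the GLM-4.1V processor needs

# GLM-4.1V gets a list of (frames, metadata) tuples; every other model keeps
# the bare frame array, exactly as the conditional in the hunk above shows.
video_input = [(frames, metadata)]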

tests/distributed/test_pipeline_parallel.py

Lines changed: 2 additions & 2 deletions
@@ -153,7 +153,7 @@ def iter_params(self, model_id: str):
     "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
     "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
     "bigscience/bloomz-1b1": PPTestSettings.fast(),
-    "THUDM/chatglm3-6b": PPTestSettings.fast(),
+    "zai-org/chatglm3-6b": PPTestSettings.fast(),
     "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
     "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
     "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
@@ -220,7 +220,7 @@ def iter_params(self, model_id: str):
     "Salesforce/blip2-opt-6.7b": PPTestSettings.fast(),
     "facebook/chameleon-7b": PPTestSettings.fast(),
     "adept/fuyu-8b": PPTestSettings.fast(),
-    "THUDM/glm-4v-9b": PPTestSettings.fast(),
+    "zai-org/glm-4v-9b": PPTestSettings.fast(),
     "OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
     "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
     "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),

tests/entrypoints/openai/test_video.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ async def client(server):
 @pytest.fixture(scope="session")
 def base64_encoded_video() -> dict[str, str]:
     return {
-        video_url: encode_video_base64(fetch_video(video_url))
+        video_url: encode_video_base64(fetch_video(video_url)[0])
         for video_url in TEST_VIDEO_URLS
     }
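The added `[0]` reflects that `fetch_video` now returns the frames together with their metadata rather than the frames alone. A short sketch of the updated call; the import path and the URL are assumptions, since the test's imports are not shown in this hunk:

# Assumed import path (the hunk above does not show the test's imports).
from vllm.multimodal.utils import encode_video_base64, fetch_video

video_url = "https://example.com/clip.mp4"  # placeholder URL
frames, metadata = fetch_video(video_url)   # now a (frames, metadata) pair
payload = {video_url: encode_video_base64(frames)}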

tests/lora/test_add_lora.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.utils import merge_async_iterators
 
-MODEL_PATH = "THUDM/chatglm3-6b"
+MODEL_PATH = "zai-org/chatglm3-6b"
 LORA_RANK = 64
 DEFAULT_MAX_LORAS = 4 * 3

tests/lora/test_chatglm3_tp.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 
 from ..utils import create_new_process_for_each_test, multi_gpu_test
 
-MODEL_PATH = "THUDM/chatglm3-6b"
+MODEL_PATH = "zai-org/chatglm3-6b"
 
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501

tests/models/language/generation/test_common.py

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@
             marks=[pytest.mark.core_model, pytest.mark.cpu_model],
         ),
         pytest.param(
-            "THUDM/chatglm3-6b",  # chatglm (text-only)
+            "zai-org/chatglm3-6b",  # chatglm (text-only)
         ),
         pytest.param(
             "meta-llama/Llama-3.2-1B-Instruct",  # llama

tests/models/multimodal/generation/test_common.py

Lines changed: 29 additions & 1 deletion
@@ -290,7 +290,7 @@
         num_logprobs=10,
     ),
     "glm4v": VLMTestInfo(
-        models=["THUDM/glm-4v-9b"],
+        models=["zai-org/glm-4v-9b"],
         test_type=VLMTestType.IMAGE,
         prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
         single_image_prompts=IMAGE_ASSETS.prompts({
@@ -308,6 +308,34 @@
         num_logprobs=10,
         marks=[large_gpu_mark(min_gb=32)],
     ),
+    "glm4_1v": VLMTestInfo(
+        models=["zai-org/GLM-4.1V-9B-Thinking"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",  # noqa: E501
+        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",  # noqa: E501
+        max_model_len=2048,
+        max_num_seqs=2,
+        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
+        num_logprobs=10,
+        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        auto_cls=AutoModelForImageTextToText,
+    ),
+    "glm4_1v-video": VLMTestInfo(
+        models=["zai-org/GLM-4.1V-9B-Thinking"],
+        # GLM4.1V require include video metadata for input
+        test_type=VLMTestType.CUSTOM_INPUTS,
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
+        custom_test_opts=[CustomTestOptions(
+            inputs=custom_inputs.video_with_metadata_glm4_1v(),
+            limit_mm_per_prompt={"video": 1},
+        )],
+        # This is needed to run on machine with 24GB VRAM
+        vllm_runner_kwargs={"gpu_memory_utilization": 0.95},
+    ),
     "h2ovl": VLMTestInfo(
         models = [
             "h2oai/h2ovl-mississippi-800m",
