@@ -398,12 +398,12 @@ def mtmd_input_chunk_get_id(chunk: mtmd_input_chunk_p) -> c_char_p:
398398 """
399399 ...
400400
401- # // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
401+ # // number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
402402# MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
403403@ctypes_function_mtmd ("mtmd_input_chunk_get_n_pos" , [mtmd_input_chunk_p_ctypes ], c_int32 )
404404def mtmd_input_chunk_get_n_pos (chunk : mtmd_input_chunk_p ) -> c_int32 :
405405 """
406- number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
406+ Number of temporal positions of the chunk (equal to max(t,h,w) for M-RoPE; equal to n_tokens otherwise)
407407 """
408408 ...
409409
@@ -457,11 +457,12 @@ def mtmd_image_tokens_get_ny(image_tokens: mtmd_image_tokens_p) -> c_size_t:
457457def mtmd_image_tokens_get_id (image_tokens : mtmd_image_tokens_p ) -> c_char_p :
458458 ...
459459
460- # // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
460+ # // number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
461461# MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
462462@ctypes_function_mtmd (
463463 "mtmd_image_tokens_get_n_pos" , [mtmd_image_tokens_p_ctypes ], c_int32 )
464464def mtmd_image_tokens_get_n_pos (image_tokens : mtmd_image_tokens_p ) -> c_int32 :
465+ """Number of temporal positions of the image tokens (equal to max(t,h,w) for M-RoPE; equal to n_tokens otherwise). Marked TODO: deprecate in the upstream header."""
465466 ...
466467
467468# // tokenize an input text prompt and a list of bitmaps (images/audio)
0 commit comments