
Commit 2ebaa43

cleanup debug logs and hardcoded portions
1 parent bfc234d commit 2ebaa43


convert_hf_to_gguf.py

Lines changed: 65 additions & 7 deletions
@@ -296,9 +296,17 @@ def prepare_tensors(self):
                     break
 
             for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)):
+                # Debug tensor shape tracking
+                if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
+                    print(f"DEBUG: Pre-numpy {new_name} torch shape: {data_torch.shape}")
+
                 # TODO: why do we squeeze here?
                 # data = data_torch.squeeze().numpy()
                 data = data_torch.numpy()
+
+                # Debug numpy shape
+                if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
+                    print(f"DEBUG: Post-numpy {new_name} numpy shape: {data.shape}")
 
                 # if data ends up empty, it means data_torch was a scalar tensor -> restore
                 if len(data.shape) == 0:
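
Aside, not part of the patch: these debug prints, together with the ones in the next hunk, track the SSM tensor shapes because GGUF metadata lists dimensions in reversed (ggml) order. A minimal standalone sketch of that reversal, reusing the converter's own shape_str expression; the example shapes ([12288, 4] and [1, 128]) come from this commit's comments, not from real model data:

# Standalone sketch of the dimension reversal done by the converter.
for shape in [(12288, 4), (1, 128)]:
    shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
    print(f"raw {shape} -> ggml order {shape_str}")
# raw (12288, 4) -> ggml order {4, 12288}
# raw (1, 128) -> ggml order {128, 1}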
@@ -384,6 +392,11 @@ def prepare_tensors(self):
 
                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
 
+                # Debug shape before and after reversal
+                if any(x in new_name for x in ["ssm_a", "ssm_d", "ssm_conv1d.weight"]):
+                    print(f"DEBUG: {new_name} raw shape: {shape}")
+                    print(f"DEBUG: {new_name} reversed: {list(reversed(shape))}")
+
                 # reverse shape to make it similar to the internal ggml dimension order
                 shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
 
@@ -7919,6 +7932,41 @@ def __init__(self, *args, **kwargs):
         # Determine attention layers
         self._attn_layers = self._get_attn_layers()
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Override Mamba2 tensor transformation with Nemotron-H specific logic"""
+
+        if name.startswith("model.backbone") or name.startswith("model.lm_head"):
+            # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
+            name = name.removeprefix("model.")
+
+        if name.endswith(".dt_bias"):
+            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+
+        new_name = self.map_tensor_name(name)
+
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            # For conv1d weights: [12288, 1, 4] -> squeeze -> [12288, 4] -> transpose -> [4, 12288]
+            data_torch = data_torch.squeeze()  # Remove dim 1
+            if len(data_torch.shape) == 2:
+                data_torch = data_torch.t().contiguous()  # [12288, 4] -> [4, 12288]
+        elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
+            gguf.MODEL_TENSOR.SSM_A,
+            gguf.MODEL_TENSOR.SSM_D,
+        ]):
+            # For SSM A/D: NVIDIA [128] -> llama.cpp expects [128, 1]
+            # But ensure exactly [128, 1] not [1, 128] to avoid GGML reversal issues
+            if len(data_torch.shape) == 1:  # [128]
+                data_torch = data_torch.unsqueeze(1)  # -> [128, 1] explicitly
+        elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
+            data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
+
+        # Apply A_log transformation
+        if name.endswith(".A_log"):
+            logger.debug("A_log --> A ==> " + new_name)
+            data_torch = -torch.exp(data_torch)
+
+        yield (new_name, data_torch)
+
     def set_gguf_parameters(self):
         """Override to skip Mamba2 parameter validation that doesn't apply to hybrid architecture"""
         d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
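
Aside, not part of the patch: the conv1d branch of the new modify_tensors override above can be sanity-checked on a dummy tensor. A minimal sketch assuming the [12288, 1, 4] weight shape named in its comment:

import torch

data_torch = torch.zeros(12288, 1, 4)        # dummy stand-in for a conv1d weight
data_torch = data_torch.squeeze()            # drop the singleton dim -> [12288, 4]
if len(data_torch.shape) == 2:
    data_torch = data_torch.t().contiguous() # transpose -> [4, 12288]
print(tuple(data_torch.shape))               # (4, 12288)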
@@ -7983,27 +8031,34 @@ def modify_tensors(self, data_torch, name, bid):
         # NVIDIA GROUND TRUTH TENSOR TRANSFORMATIONS
 
         # Conv1d: NVIDIA [12288, 1, 4] -> llama.cpp [4, 12288]
+        # IMPORTANT: GGUF reverses dimensions, so we need [12288, 4] to get {4, 12288} in metadata
         if "conv1d.weight" in layer_component:
             original_shape = data_torch.shape
             if len(data_torch.shape) == 3:  # [12288, 1, 4]
-                # Remove middle dimension and transpose: [12288, 1, 4] -> [12288, 4] -> [4, 12288]
-                data_torch = data_torch.squeeze(1).t().contiguous()  # -> [4, 12288]
+                # Remove middle dimension: [12288, 1, 4] -> [12288, 4] (no transpose for GGUF reversal)
+                data_torch = data_torch.squeeze(1).contiguous()  # -> [12288, 4]
             elif len(data_torch.shape) == 2:  # [12288, 4]
-                data_torch = data_torch.t().contiguous()  # [12288, 4] -> [4, 12288]
-            # Ensure final shape is exactly [4, 12288]
-            assert data_torch.shape == (4, 12288), f"Conv1d wrong final shape: {data_torch.shape}"
+                data_torch = data_torch.contiguous()  # Keep [12288, 4] (no transpose for GGUF reversal)
+            # Ensure final shape is exactly [12288, 4] (will become {4, 12288} after GGUF reversal)
+            assert data_torch.shape == (12288, 4), f"Conv1d wrong final shape: {data_torch.shape}"
             print(f"DEBUG: Conv1d {layer_component} {original_shape} -> {data_torch.shape}")
 
         # A_log: NVIDIA [128] -> llama.cpp [128, 1] with -exp transform
+        # IMPORTANT: GGUF reverses dimensions, so we need [1, 128] to get {128, 1} in metadata
         if layer_component.endswith("A_log"):
+            original_shape = data_torch.shape
             data_torch = -torch.exp(data_torch)  # Apply -exp transformation
             if len(data_torch.shape) == 1:  # [128]
-                data_torch = data_torch.reshape(128, 1)  # -> [128, 1] explicitly
+                data_torch = data_torch.reshape(1, 128)  # -> [1, 128] for GGUF reversal
+            print(f"DEBUG: A_log {layer_component} {original_shape} -> {data_torch.shape}")
 
         # D: NVIDIA [128] -> llama.cpp [128, 1]
+        # IMPORTANT: GGUF reverses dimensions, so we need [1, 128] to get {128, 1} in metadata
         if layer_component.endswith("D"):
+            original_shape = data_torch.shape
             if len(data_torch.shape) == 1:  # [128]
-                data_torch = data_torch.reshape(128, 1)  # -> [128, 1] explicitly
+                data_torch = data_torch.reshape(1, 128)  # -> [1, 128] for GGUF reversal
+            print(f"DEBUG: D {layer_component} {original_shape} -> {data_torch.shape}")
 
         # Grouped RMSNorm: NVIDIA [10240] -> llama.cpp [1280, 8]
         if layer_component == "mixer.norm.weight":
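
Aside, not part of the patch: the A_log handling above can be sketched in isolation; the tensor is written as [1, 128] precisely so that the reversed GGUF metadata reads {128, 1}. Dummy values, assuming the [128] shape from the comments:

import torch

a_log = torch.zeros(128)      # dummy stand-in for the A_log tensor
a = -torch.exp(a_log)         # A_log -> A (every element becomes -1.0 here)
if len(a.shape) == 1:
    a = a.reshape(1, 128)     # stored as [1, 128] before writing
print(tuple(a.shape))         # (1, 128); GGUF metadata will report {128, 1}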
@@ -8052,6 +8107,9 @@ def modify_tensors(self, data_torch, name, bid):
             # Fallback to default mapping
             return super().modify_tensors(data_torch, name, bid)
 
+        # Debug: verify final tensor shape before returning (accounting for GGUF reversal)
+        if any(x in layer_component for x in ["A_log", "D", "conv1d.weight"]):
+            print(f"DEBUG: Final tensor {new_name} shape: {data_torch.shape} (will reverse to GGUF metadata)")
         return [(new_name, data_torch)]
 
         # Default to parent processing
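
Aside (illustrative, not from the patch): the final debug check keys on substrings of layer_component. "mixer.conv1d.weight" and "mixer.norm.weight" appear elsewhere in this diff; "mixer.A_log" and "mixer.D" below are assumed names for the A_log and D components:

components = ["mixer.A_log", "mixer.D", "mixer.conv1d.weight", "mixer.norm.weight"]
watched = ["A_log", "D", "conv1d.weight"]   # substrings used by the debug check
print([c for c in components if any(x in c for x in watched)])
# ['mixer.A_log', 'mixer.D', 'mixer.conv1d.weight']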
