$ sysctl kern.version
kern.version: Darwin Kernel Version 23.6.0: Fri Jul 5 17:55:37 PDT 2024; root:xnu-10063.141.1~2/RELEASE_ARM64_T6030
Fetching 8 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 47060.91it/s]
voxcpm_model_path: /Users/fedor/.cache/huggingface/hub/models--openbmb--VoxCPM2/snapshots/e8b928065859f2869644c1e2881cbd21f888c659, zipenhancer_model_path: None, enable_denoiser: False
/Users/fedor/voxcpm_env/lib/python3.10/site-packages/torch/nn/utils/weight_norm.py:144: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.
WeightNorm.apply(module, name, dim)
Loading AudioVAE from pytorch: /Users/fedor/.cache/huggingface/hub/models--openbmb--VoxCPM2/snapshots/e8b928065859f2869644c1e2881cbd21f888c659/audiovae.pth
Running on device: cpu, dtype: bfloat16
Loading model from safetensors: /Users/fedor/.cache/huggingface/hub/models--openbmb--VoxCPM2/snapshots/e8b928065859f2869644c1e2881cbd21f888c659/model.safetensors
Warning: torch.compile disabled - VoxCPMModel can only be optimized on CUDA device
Loaded VoxCPM2Model
Warm up VoxCPMModel...
0%| | 0/10 [00:00<?, ?it/s]
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Cell In[1], line 6
1 from voxcpm import VoxCPM
2 import soundfile as sf
----> 6 model = VoxCPM.from_pretrained(
7 "openbmb/VoxCPM2",
8 load_denoiser=False,
9 device='cpu'
10 )
File ~/VoxCPM/src/voxcpm/core.py:162, in VoxCPM.from_pretrained(cls, hf_model_id, load_denoiser, zipenhancer_model_id, cache_dir, local_files_only, optimize, device, lora_config, lora_weights_path, **kwargs)
154 else:
155 # Otherwise, try from_pretrained (Hub); exit on failure
156 local_path = snapshot_download(
157 repo_id=repo_id,
158 cache_dir=cache_dir,
159 local_files_only=local_files_only,
160 )
--> 162 return cls(
163 voxcpm_model_path=local_path,
164 zipenhancer_model_path=zipenhancer_model_id if load_denoiser else None,
165 enable_denoiser=load_denoiser,
166 optimize=optimize,
167 device=device,
168 lora_config=lora_config,
169 lora_weights_path=lora_weights_path,
170 **kwargs,
171 )
File ~/VoxCPM/src/voxcpm/core.py:97, in VoxCPM.__init__(self, voxcpm_model_path, zipenhancer_model_path, enable_denoiser, optimize, device, lora_config, lora_weights_path)
95 if optimize:
96 print("Warm up VoxCPMModel...", file=sys.stderr)
---> 97 self.tts_model.generate(
98 target_text="Hello, this is the first test sentence.",
99 max_len=10,
100 )
File ~/VoxCPM/src/voxcpm/model/voxcpm2.py:444, in VoxCPM2Model.generate(self, *args, **kwargs)
443 def generate(self, *args, **kwargs) -> torch.Tensor:
--> 444 return next(self._generate(*args, streaming=False, **kwargs))
File ~/voxcpm_env/lib/python3.10/site-packages/torch/utils/_contextlib.py:40, in _wrap_generator.<locals>.generator_context(*args, **kwargs)
37 try:
38 # Issuing `None` to a generator fires it up
39 with ctx_factory():
---> 40 response = gen.send(None)
42 while True:
43 try:
44 # Forward the response to our caller and get its next request
File ~/VoxCPM/src/voxcpm/model/voxcpm2.py:644, in VoxCPM2Model._generate(self, target_text, prompt_text, prompt_wav_path, reference_wav_path, min_len, max_len, inference_timesteps, cfg_value, retry_badcase, retry_badcase_max_times, retry_badcase_ratio_threshold, trim_silence_vad, streaming, streaming_prefix_len)
642 break
643 else:
--> 644 latent_pred, pred_audio_feat, context_len = next(inference_result)
645 if retry_badcase:
646 if pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold:
File ~/voxcpm_env/lib/python3.10/site-packages/torch/utils/_contextlib.py:40, in _wrap_generator.<locals>.generator_context(*args, **kwargs)
37 try:
38 # Issuing `None` to a generator fires it up
39 with ctx_factory():
---> 40 response = gen.send(None)
42 while True:
43 try:
44 # Forward the response to our caller and get its next request
File ~/VoxCPM/src/voxcpm/model/voxcpm2.py:1081, in VoxCPM2Model._inference(self, text, text_mask, feat, feat_mask, min_len, max_len, inference_timesteps, cfg_value, streaming, streaming_prefix_len)
1078 if i > min_len and stop_flag == 1:
1079 break
-> 1081 lm_hidden = self.base_lm.forward_step(
1082 curr_embed[:, 0, :], torch.tensor([self.base_lm.kv_cache.step()], device=curr_embed.device)
1083 ).clone()
1085 lm_hidden = self.fsq_layer(lm_hidden)
1086 curr_residual_input = self.fusion_concat_proj(torch.cat((lm_hidden, curr_embed[:, 0, :]), dim=-1))
File ~/VoxCPM/src/voxcpm/modules/minicpm4/model.py:406, in MiniCPMModel.forward_step(self, inputs_embeds, position_id)
403 hidden_states = inputs_embeds
405 for i, decoder_layer in enumerate(self.layers):
--> 406 hidden_states = decoder_layer.forward_step(
407 hidden_states,
408 position_emb,
409 position_id,
410 self.kv_cache.get_layer_cache(i),
411 )
413 hidden_states = self.norm(hidden_states)
414 return hidden_states
File ~/VoxCPM/src/voxcpm/modules/minicpm4/model.py:298, in MiniCPMDecoderLayer.forward_step(self, hidden_states, position_emb, position_id, kv_cache)
296 hidden_states = self.input_layernorm(hidden_states)
297 # Self Attention
--> 298 hidden_states = self.self_attn.forward_step(
299 hidden_states=hidden_states,
300 position_emb=position_emb,
301 position_id=position_id,
302 kv_cache=kv_cache,
303 )
305 if self.use_mup:
306 hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
File ~/VoxCPM/src/voxcpm/modules/minicpm4/model.py:206, in MiniCPMAttention.forward_step(self, hidden_states, position_emb, position_id, kv_cache)
204 key_cache = key_cache.contiguous()
205 value_cache = value_cache.contiguous()
--> 206 attn_output = torch.nn.functional.scaled_dot_product_attention(
207 query_states,
208 key_cache,
209 value_cache,
210 attn_mask=attn_mask,
211 enable_gqa=True,
212 )
214 attn_output = attn_output.transpose(1, 2).contiguous()
215 attn_output = attn_output.reshape(bsz, self.num_heads * self.head_dim)
IndexError: Dimension out of range (expected to be in range of [-1, 0], but got -2)
Here's how I am installing VoxCPM
Then I open Jupyter Lab and run the following code
Output:
Just in case, here's my pip list output: https://pastebin.com/b2RsQW0C