-
Notifications
You must be signed in to change notification settings - Fork 67
Open
Description
I tried running this: https://coqui-tts.readthedocs.io/en/latest/models/xtts.html#streaming-manually, but it didn't work. I got the following error: AttributeError: 'int' object has no attribute '_pad_token_tensor'
Here's the script I ran:
"""Reproduction script: XTTS v2 manual streaming inference.

Loads a fine-tuned XTTS checkpoint, derives speaker conditioning latents
from one reference wav, streams synthesized audio chunk-by-chunk to the
sound card via sounddevice, and finally saves the concatenated waveform
to disk at the model's 24 kHz output rate.

Fixes vs. the original paste: duplicate `torch`/`torchaudio` imports
removed, all imports grouped at the top, and the audio output stream is
now released in a `finally` block so a failure mid-stream does not leak
the device handle.
"""
import time

import sounddevice as sd
import torch
import torchaudio
from tqdm import tqdm  # NOTE(review): unused in this script; kept from original
from transformers import pytorch_utils  # NOTE(review): unused; kept from original
from underthesea import sent_tokenize  # NOTE(review): unused; kept from original

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Device selection. NOTE(review): `XTTS_MODEL.cuda()` below still
# hard-requires a GPU even when this resolves to "cpu" — confirm intent.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Model paths
# xtts_checkpoint = "/media/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-July-16-2025_03+13PM-8e59ec3/best_model_404878.pth"
xtts_checkpoint = "/media/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-July-16-2025_03+13PM-8e59ec3/optimized_model.pth"
xtts_config = "/media/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-July-16-2025_03+13PM-8e59ec3/config.json"
xtts_vocab = "/media/XTTSv2-Finetuning-for-New-Languages/checkpoints/XTTS_v2.0_original_model_files/vocab.json"

# Load the fine-tuned checkpoint together with the original vocab file.
config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
XTTS_MODEL.cuda()
print("Model loaded successfully!")

# Speaker conditioning latents from a single reference recording.
audio_path = "/media/coqui-ai-TTS/Adito/audio-reference_adito_0152.wav"
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=audio_path,
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)

print("Inference...")
t0 = time.time()
# `inference_stream` yields audio chunks as they are generated.
chunks = XTTS_MODEL.inference_stream(
    "Indonesia has thousands of islands that hold incredible natural beauty, from the white sand beaches of Lombok to the unique culture of Toraja, making it one of the most diverse and unforgettable travel destinations.",
    "en",
    gpt_cond_latent,
    speaker_embedding
)

# Play each chunk as soon as it arrives; the finally block guarantees the
# audio device is released even if generation or playback raises.
stream = sd.OutputStream(samplerate=24000, channels=1, dtype='float32')
stream.start()
wav_chunks = []
try:
    for i, chunk in enumerate(chunks):
        if i == 0:
            print(f"Time to first chunk: {time.time() - t0:.2f} sec")
        print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
        wav_chunks.append(chunk)
        data = chunk.squeeze().cpu().numpy().astype('float32')
        bytes_data = data.tobytes()
        print(f'ini hasil byte chunk ke {i}: {bytes_data}')
        stream.write(data)
finally:
    stream.stop()
    stream.close()

# Concatenate whatever chunks were produced and save the full utterance.
wav = torch.cat(wav_chunks, dim=0)
torchaudio.save("xtts_streaming_multi_.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels