Skip to content

Streaming manually problem #64

@kin0303

Description

@kin0303

I tried running this: https://coqui-tts.readthedocs.io/en/latest/models/xtts.html#streaming-manually, but it doesn't work. I got the following error: AttributeError: 'int' object has no attribute '_pad_token_tensor'

Here's the script I ran:

import torch
from transformers import pytorch_utils
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Device configuration: prefer the first GPU, fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Model paths (fine-tuned XTTS v2 checkpoint + config + original vocab).
xtts_checkpoint = "/media/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-July-16-2025_03+13PM-8e59ec3/optimized_model.pth"
xtts_config = "/media/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-July-16-2025_03+13PM-8e59ec3/config.json"
xtts_vocab = "/media/XTTSv2-Finetuning-for-New-Languages/checkpoints/XTTS_v2.0_original_model_files/vocab.json"

# Load the model: parse the JSON config, build the model, then restore weights.
config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
# FIX: the original called XTTS_MODEL.cuda() unconditionally, which crashes on a
# CPU-only machine even though `device` above already has a CPU fallback.
# Move the model to whichever device was actually selected.
XTTS_MODEL.to(device)

print("Model loaded successfully!")

# Reference audio used to clone the target voice.
audio_path = "/media/coqui-ai-TTS/Adito/audio-reference_adito_0152.wav"

# Compute the GPT conditioning latents and the speaker embedding from the
# reference clip, using the lengths/normalization configured in the checkpoint.
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=audio_path,
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)

import time
import sounddevice as sd
# NOTE: torch and torchaudio are already imported at the top of the script;
# the original re-imported them here redundantly.

print("Inference...")
t0 = time.time()
# inference_stream yields audio chunks as they are generated (24 kHz mono).
chunks = XTTS_MODEL.inference_stream(
    "Indonesia has thousands of islands that hold incredible natural beauty, from the white sand beaches of Lombok to the unique culture of Toraja, making it one of the most diverse and unforgettable travel destinations.",
    "en",
    gpt_cond_latent,
    speaker_embedding
)

# Play each chunk as soon as it arrives to minimize perceived latency.
stream = sd.OutputStream(samplerate=24000, channels=1, dtype='float32')
stream.start()

wav_chunks = []
try:
    for i, chunk in enumerate(chunks):
        if i == 0:
            print(f"Time to first chunk: {time.time() - t0:.2f} sec")

        print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
        wav_chunks.append(chunk)
        data = chunk.squeeze().cpu().numpy().astype('float32')
        bytes_data = data.tobytes()
        print(f'ini hasil byte chunk ke {i}: {bytes_data}')
        stream.write(data)
finally:
    # FIX: the original leaked the audio stream if inference or playback raised
    # mid-loop; always stop and close the device.
    stream.stop()
    stream.close()

# FIX: torch.cat raises on an empty list; only save when audio was produced.
if wav_chunks:
    wav = torch.cat(wav_chunks, dim=0)
    # squeeze().unsqueeze(0) normalizes the tensor to the (channels, samples)
    # shape torchaudio.save expects.
    torchaudio.save("xtts_streaming_multi_.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
else:
    print("No audio chunks were generated.")

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions