Skip to content

Streaming manually problem #64

@kin0303

Description

@kin0303

I tried running this: https://coqui-tts.readthedocs.io/en/latest/models/xtts.html#streaming-manually, but it doesn't work. I got the following error: AttributeError: 'int' object has no attribute '_pad_token_tensor'

Here's the script I ran:

import torch
from transformers import pytorch_utils
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Device configuration: prefer the first GPU, fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Model paths (fine-tuned XTTS v2 checkpoint + config + original vocab).
xtts_checkpoint = "/media/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-July-16-2025_03+13PM-8e59ec3/optimized_model.pth"
xtts_config = "/media/XTTSv2-Finetuning-for-New-Languages/checkpoints/GPT_XTTS_FT-July-16-2025_03+13PM-8e59ec3/config.json"
xtts_vocab = "/media/XTTSv2-Finetuning-for-New-Languages/checkpoints/XTTS_v2.0_original_model_files/vocab.json"

# Load the model: parse the JSON config, build the model, then restore weights.
config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
# FIX: the original called XTTS_MODEL.cuda() unconditionally, which crashes on a
# CPU-only machine even though `device` above already has a CPU fallback.
# Move the model to whichever device was actually selected.
XTTS_MODEL.to(device)

print("Model loaded successfully!")

# Reference audio used to clone the target voice.
audio_path = "/media/coqui-ai-TTS/Adito/audio-reference_adito_0152.wav"

# Compute the GPT conditioning latents and the speaker embedding from the
# reference clip, using the lengths/normalization configured in the checkpoint.
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=audio_path,
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)

import time
import sounddevice as sd
# NOTE: torch and torchaudio are already imported at the top of the script;
# the original re-imported them here redundantly.

print("Inference...")
t0 = time.time()
# inference_stream yields audio chunks as they are generated (24 kHz mono).
chunks = XTTS_MODEL.inference_stream(
    "Indonesia has thousands of islands that hold incredible natural beauty, from the white sand beaches of Lombok to the unique culture of Toraja, making it one of the most diverse and unforgettable travel destinations.",
    "en",
    gpt_cond_latent,
    speaker_embedding
)

# Play each chunk as soon as it arrives to minimize perceived latency.
stream = sd.OutputStream(samplerate=24000, channels=1, dtype='float32')
stream.start()

wav_chunks = []
try:
    for i, chunk in enumerate(chunks):
        if i == 0:
            print(f"Time to first chunk: {time.time() - t0:.2f} sec")

        print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
        wav_chunks.append(chunk)
        data = chunk.squeeze().cpu().numpy().astype('float32')
        bytes_data = data.tobytes()
        print(f'ini hasil byte chunk ke {i}: {bytes_data}')
        stream.write(data)
finally:
    # FIX: the original leaked the audio stream if inference or playback raised
    # mid-loop; always stop and close the device.
    stream.stop()
    stream.close()

# FIX: torch.cat raises on an empty list; only save when audio was produced.
if wav_chunks:
    wav = torch.cat(wav_chunks, dim=0)
    # squeeze().unsqueeze(0) normalizes the tensor to the (channels, samples)
    # shape torchaudio.save expects.
    torchaudio.save("xtts_streaming_multi_.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
else:
    print("No audio chunks were generated.")

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions