ValueError: Labels' sequence length 949 cannot exceed the maximum allowed length of 448 tokens. #2458
Replies: 1 comment
-
Hi @Sabakhupenia, sorry to disappoint you, I am not Sanchit Gandhi himself! (A response from him would probably just redirect you to this page anyway.) I'll try my best, since I ran into a similar error recently. What should resolve your problem is the following snippet, adapted from the official example script (https://github.com/huggingface/transformers/blob/main/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py):

# firstly, it would be better for you to use full_dataset, rather than
# train_dataset and eval_dataset
#
# filter training data that is shorter than min_input_length or longer than
# max_input_length
max_input_length = max_duration_in_seconds * sampling_rate
min_input_length = min_duration_in_seconds * sampling_rate

def is_audio_in_length_range(length):
    return min_input_length < length < max_input_length

max_label_length = model.config.max_length

def filter_labels(labels):
    """Filter label sequences longer than max length"""
    return len(labels) < max_label_length

vectorized_datasets = vectorized_datasets.filter(filter_labels, input_columns=["labels"])

if training_args.do_train:
    vectorized_datasets["train"] = vectorized_datasets["train"].filter(
        is_audio_in_length_range,
        input_columns=["input_length"],
    )

Once you apply this filter, the error should be resolved.
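One more thing worth double-checking in your script below: the 448 in the error message is the decoder's hard limit, model.config.max_target_positions, which is fixed by the size of Whisper's positional embeddings. Setting model.config.max_length = 1024 does not raise that limit, and it also means your label filter lets through sequences of 449 to 1023 tokens, which then fail inside the forward pass exactly like this. A minimal, untested sketch of filtering against the hard limit instead, reusing the tokenizer and the "sentence" column from your code (the function name label_fits is just illustrative):

# filter against the decoder's hard positional limit (448 for Whisper),
# not against config.max_length
max_label_length = model.config.max_target_positions

def label_fits(batch):
    return len(tokenizer(batch["sentence"]).input_ids) < max_label_length

train_dataset = train_dataset.filter(label_fits)
eval_dataset = eval_dataset.filter(label_fits)

If too many examples get dropped this way, the usual fix is to split long recordings and their transcripts into shorter segments (Whisper is trained on 30-second windows) rather than to raise the limit.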
-
Hello, I am trying to fine-tune Whisper on my custom dataset, but I encountered an error during the process. I read through @sanchit-gandhi's comments on GitHub and Hugging Face issues, but I couldn’t fully understand where I might be making a mistake.
The issue is that when I filter out everything longer than 448 tokens, my dataset becomes too small, and I can't fine-tune the model effectively with what is left.
Is there anyone here who can help me with this task?
Or perhaps @sanchit-gandhi himself 😁 — I’d really appreciate your assistance!
# Install necessary libraries
!pip install datasets transformers soundfile huggingface_hub evaluate
# Import libraries
import os
import tarfile
import pandas as pd
from datasets import Dataset, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from huggingface_hub import hf_hub_download
import evaluate
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import shutil
import numpy as np
# Set Hugging Face token
HF_TOKEN = "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"  # Replace with your actual token
# Download dataset from Hugging Face
REPO_ID = 'SabaKhupenia/whispergeodataset_1'
# Download 'clips.tar.gz'
clips_tar_path = hf_hub_download(
    repo_id=REPO_ID,
    filename='clips.tar.gz',
    token=HF_TOKEN,
    repo_type='dataset',
)
# Download 'metadata.tsv'
metadata_path = hf_hub_download(
    repo_id=REPO_ID,
    filename='metadata.tsv',
    token=HF_TOKEN,
    repo_type='dataset',
)
# Remove 'clips' directory if it exists
if os.path.exists('clips'):
    shutil.rmtree('clips')
# Extract the audio clips into the current directory
with tarfile.open(clips_tar_path, 'r:gz') as tar:
    tar.extractall()
# Load metadata.tsv into a DataFrame
metadata_df = pd.read_csv(metadata_path, sep='\t', encoding='utf-8')
# Add full path to audio files
metadata_df['audio'] = metadata_df['path'].apply(lambda x: os.path.join('clips', x))
# Verify that the audio files exist
assert os.path.exists(metadata_df['audio'].iloc[0]), f"Audio file not found: {metadata_df['audio'].iloc[0]}"
# Print total number of samples
print(f"Total number of samples: {len(metadata_df)}")
# Calculate total size before processing
def get_file_size(file_path):
    return os.path.getsize(file_path)
total_size_before = sum(get_file_size(path) for path in metadata_df['audio'])
total_size_before_gb = total_size_before / (1024 ** 3)
print(f"Total size of dataset before processing: {total_size_before_gb:.2f} GB")
# Convert to a Hugging Face Dataset
dataset = Dataset.from_pandas(metadata_df)
# Cast 'audio' column to Audio feature
dataset = dataset.cast_column('audio', Audio(sampling_rate=16000))
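# (optional sanity check: the Audio cast resamples lazily when a row is read,
#  so decoding one example should show a 16 kHz array)
sample = dataset[0]["audio"]
print(sample["sampling_rate"], len(sample["array"]) / sample["sampling_rate"], "seconds")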
# Initialize the feature extractor
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "SabaKhupenia/whisper-turbo-KA_GE",
    token=HF_TOKEN
)
# Initialize the tokenizer
tokenizer = WhisperTokenizer.from_pretrained(
    "SabaKhupenia/whisper-turbo-KA_GE",
    language="Georgian",
    task="transcribe",
    token=HF_TOKEN
)
# Initialize the processor
processor = WhisperProcessor.from_pretrained(
    "SabaKhupenia/whisper-turbo-KA_GE",
    language="Georgian",
    task="transcribe",
    token=HF_TOKEN
)
# Load model
model = WhisperForConditionalGeneration.from_pretrained(
    "SabaKhupenia/whisper-turbo-KA_GE",
    token=HF_TOKEN
)
# Set max_length immediately after loading the model
model.config.max_length = 1024  # Set to your desired maximum length
max_label_length = model.config.max_length
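# (note: max_length only caps generation length; the 448-token limit in the error
#  comes from the decoder's positional embeddings and can be checked directly)
print("max_target_positions:", model.config.max_target_positions)
print("max_length:", model.config.max_length)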
# Set model generation config
model.generation_config.language = "Georgian"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None
# Split the dataset
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset['train']
eval_dataset = dataset['test']
# Compute label lengths before mapping
def compute_label_length(batch):
    # Encode target text to label ids
    labels = tokenizer(batch["sentence"]).input_ids
    batch["labels_length"] = len(labels)
    return batch
# Apply to training and evaluation datasets
train_dataset = train_dataset.map(compute_label_length)
eval_dataset = eval_dataset.map(compute_label_length)
# Filter datasets based on label length
def filter_labels(batch):
    """Filter label sequences longer than max length"""
    return batch['labels_length'] < max_label_length
train_dataset = train_dataset.filter(filter_labels)
eval_dataset = eval_dataset.filter(filter_labels)
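# (optional diagnostic: how many examples survive the filter, and the longest label
#  that remains, using the labels_length column before it is dropped below)
print("train examples kept:", len(train_dataset))
print("eval examples kept:", len(eval_dataset))
print("longest remaining label:", max(train_dataset["labels_length"]))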
# Remove 'labels_length' column as it's no longer needed
train_dataset = train_dataset.remove_columns(['labels_length'])
eval_dataset = eval_dataset.remove_columns(['labels_length'])
# Prepare the datasets
def prepare_dataset(batch):
    audio = batch["audio"]
    # Compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    # Encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch
columns_to_remove = ['audio', 'sentence', 'client_id', 'path']
# Process training dataset
train_dataset = train_dataset.map(
    prepare_dataset,
    remove_columns=columns_to_remove,
    num_proc=1  # Use single processing to avoid issues during debugging
)
# Process evaluation dataset
eval_dataset = eval_dataset.map(
    prepare_dataset,
    remove_columns=columns_to_remove,
    num_proc=1
)
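# (optional: inspect one prepared example; the feature matrix should span the full
#  30-second window and the label length should stay under the decoder limit)
example = train_dataset[0]
print("input_features shape:", np.asarray(example["input_features"]).shape)
print("label length:", len(example["labels"]))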
# Calculate total size after processing (approximate)
def get_dataset_size(dataset):
    total_size = 0
    for example in dataset:
        # columns come back as plain Python lists after map, so convert before sizing
        input_features = np.asarray(example['input_features'], dtype=np.float32)
        labels = np.asarray(example['labels'])
        total_size += input_features.nbytes
        total_size += labels.nbytes
    return total_size
total_size_train = get_dataset_size(train_dataset)
total_size_train_gb = total_size_train / (1024 ** 3)
print(f"Total size of training dataset after processing: {total_size_train_gb:.2f} GB")
total_size_eval = get_dataset_size(eval_dataset)
total_size_eval_gb = total_size_eval / (1024 ** 3)
print(f"Total size of evaluation dataset after processing: {total_size_eval_gb:.2f} GB")
total_size_after_gb = total_size_train_gb + total_size_eval_gb
print(f"Total size of dataset after processing: {total_size_after_gb:.2f} GB")
# Update the data collator to include attention masks
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int
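
    # (the __call__ body seems to have been lost when the code was pasted; this is
    #  roughly the version from the standard Whisper fine-tuning example, without
    #  whatever attention-mask changes were intended above)
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels, since they need different padding methods
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # pad the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 so it is ignored by the loss
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if a bos token was prepended during tokenization, cut it here,
        # since the model prepends it again during training
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch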
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)
# Load metric
metric = evaluate.load("wer")
# Define compute_metrics function
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
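
    # (the rest of this function also seems to have been cut off when pasting; the
    #  usual version from the Whisper fine-tuning example continues as follows)
    # replace -100 with the pad_token_id so the labels can be decoded
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # decode predictions and references without grouping tokens, then compute WER
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}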
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-turbo-KA_GE-fine-tuned",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=model.config.max_length,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)
# Initialize trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)
# Save processor
processor.save_pretrained(training_args.output_dir)
# Start training
trainer.train()
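# (once training finishes, the fine-tuned model can be saved next to the processor
#  that was saved above, e.g. with Trainer.save_model)
trainer.save_model(training_args.output_dir)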