'utf-8' codec can't decode byte 0xe2 in position 4785: invalid continuation byte #2552
Unanswered
LeeKIngKIng
asked this question in
Q&A
Replies: 0 comments
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.
-
When I transcribe the audio, the error happens.
UnicodeDecodeError Traceback (most recent call last)
Cell In[1], line 138
132 add_subtitles_with_watermark(compose_clip, subtitles, output_path, watermark_path, top_padding, left_padding)
134 # Clean up temporary audio file
135 # os.remove(audio_path)
--> 138 process_video_folder('德玛西亚人在塔在啦啦啦啦哈哈哈哈')
Cell In[1], line 105, in process_video_folder(words_str, video_folder, output_folder)
101 extract_audio('video_folder/4.mp4', audio_path)
103 # Transcribe audio
104 # transcriptions = transcribe_audio(audio_path)
--> 105 words = transcribe_words(audio_path)
106 print(words)
107 return
Cell In[1], line 31, in transcribe_words(audio)
28 import whisper
30 model = whisper.load_model("medium")
---> 31 result = model.transcribe(audio)
32 return json.dumps(result)
File ~/anaconda3/lib/python3.12/site-packages/whisper/transcribe.py:146, in transcribe(model, audio, verbose, temperature, compression_ratio_threshold, logprob_threshold, no_speech_threshold, condition_on_previous_text, initial_prompt, word_timestamps, prepend_punctuations, append_punctuations, clip_timestamps, hallucination_silence_threshold, **decode_options)
142 print(
143         "Detecting language using up to the first 30 seconds. Use `--language` to specify the language"
144     )
145 mel_segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype)
--> 146 _, probs = model.detect_language(mel_segment)
147 decode_options["language"] = max(probs, key=probs.get)
148 if verbose is not None:
File ~/anaconda3/lib/python3.12/site-packages/torch/utils/_contextlib.py:116, in context_decorator.&lt;locals&gt;.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/anaconda3/lib/python3.12/site-packages/whisper/decoding.py:35, in detect_language(model, mel, tokenizer)
22 """
23 Detect the spoken language in the audio, and return them as list of strings, along with the ids
24 of the most probable language tokens and the probability distribution over all language tokens.
(...)
32 list of dictionaries containing the probability distribution over all languages.
33 """
34 if tokenizer is None:
---> 35 tokenizer = get_tokenizer(
36 model.is_multilingual, num_languages=model.num_languages
37 )
38 if (
39 tokenizer.language is None
40 or tokenizer.language_token not in tokenizer.sot_sequence
41 ):
42 raise ValueError(
43 "This model doesn't have language tokens so it can't perform lang id"
44 )
File ~/anaconda3/lib/python3.12/site-packages/whisper/tokenizer.py:391, in get_tokenizer(multilingual, num_languages, language, task)
388 language = None
389 task = None
--> 391 encoding = get_encoding(name=encoding_name, num_languages=num_languages)
393 return Tokenizer(
394 encoding=encoding, num_languages=num_languages, language=language, task=task
395 )
File ~/anaconda3/lib/python3.12/site-packages/whisper/tokenizer.py:335, in get_encoding(name, num_languages)
330 @lru_cache(maxsize=None)
331 def get_encoding(name: str = "gpt2", num_languages: int = 99):
332     vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
333 ranks = {
334 base64.b64decode(token): int(rank)
--> 335 for token, rank in (line.split() for line in open(vocab_path) if line)
336 }
337 n_vocab = len(ranks)
338 special_tokens = {}
File ~/anaconda3/lib/python3.12/site-packages/whisper/tokenizer.py:335, in &lt;dictcomp&gt;(.0)
330 @lru_cache(maxsize=None)
331 def get_encoding(name: str = "gpt2", num_languages: int = 99):
332     vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
333 ranks = {
334 base64.b64decode(token): int(rank)
--> 335 for token, rank in (line.split() for line in open(vocab_path) if line)
336 }
337 n_vocab = len(ranks)
338 special_tokens = {}
File :322, in decode(self, input, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe2 in position 4785: invalid continuation byte
Beta Was this translation helpful? Give feedback.
All reactions