2 changes: 1 addition & 1 deletion nemo_skills/inference/model/__init__.py
@@ -93,7 +93,7 @@ def get_model(server_type, tokenizer=None, model_class: str | None = None, **kwa

def get_code_execution_model(server_type, tokenizer=None, code_execution=None, sandbox=None, **kwargs):
"""A helper function to make it easier to set server through cmd."""
model = get_model(server_type=server_type, tokenizer=tokenizer, **kwargs)
model = get_model(server_type=server_type, tokenizer=tokenizer, require_tokenizer=True, **kwargs)
if code_execution is None:
code_execution = {}
code_execution_config = CodeExecutionConfig(**code_execution)
10 changes: 8 additions & 2 deletions nemo_skills/inference/model/base.py
@@ -79,6 +79,8 @@ def __init__(
# Directory paths for data and output
data_dir: str = "",
output_dir: str | None = None,
# Request tokenizer initialization independent of soft_fail
require_tokenizer: bool = False,
):
self._tunnel = None
self.model_name_or_path = model
@@ -126,7 +128,7 @@ def __init__(
else:
self.base_url = base_url

if enable_soft_fail:
if enable_soft_fail or require_tokenizer:
self.tokenizer = self._get_tokenizer(tokenizer)
else:
self.tokenizer = None
@@ -202,7 +204,11 @@ def _initialize_tokenizer(self, tokenizer: str | None) -> WrapperAutoTokenizer |
if tokenizer is None:
return None
if isinstance(tokenizer, str):
return WrapperAutoTokenizer(tokenizer)
try:
return WrapperAutoTokenizer(tokenizer)
except OSError:
LOG.warning(f"Tokenizer not found at '{tokenizer}', trying fallback to server /tokenize endpoint")
return None
Comment on lines 207 to 211

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, let's look at the actual code at lines 207-211
head -n 215 nemo_skills/inference/model/base.py | tail -n 20



🏁 Script executed:

# Find where AutoTokenizer comes from
rg "AutoTokenizer" nemo_skills/inference/model/base.py -B 3 -A 3



🏁 Script executed:

# Check the entire tokenizer initialization context
sed -n '190,220p' nemo_skills/inference/model/base.py



🏁 Script executed:

# Look for any documentation or requirements that specify Transformers version
fd -e "txt" -e "in" -e "yaml" | xargs rg -l "transformers" | head -5



🏁 Script executed:

# Find and examine WrapperAutoTokenizer implementation
find . -name "utils.py" -path "*/inference/model/*" -exec cat {} \;



🏁 Script executed:

# Check the main requirements to understand Transformers version
cat requirements/main.txt | grep -i transformers



🏁 Script executed:

# Also check stem requirements
cat requirements/stem.txt | grep -i transformers



🏁 Script executed:

# Search for documentation or examples of AutoTokenizer exceptions in the codebase
rg "AutoTokenizer" -A 5 -B 5 | grep -i "except\|raise\|error" | head -20



🏁 Script executed:

# Check if there are other try-except blocks handling tokenizer initialization elsewhere
rg "from_pretrained" -A 3 -B 1 | grep -E "(except|OSError|ValueError|ImportError)"



🌐 Web query:

AutoTokenizer.from_pretrained exceptions transformers library

💡 Result:

Common AutoTokenizer.from_pretrained(...) exceptions in 🤗 Transformers, what they mean, and the usual fix:

  1. OSError: We couldn't connect to 'https://huggingface.co' ... couldn't find it in the cached files ... not the path to a directory containing ... config.json
    Cause: no internet / blocked Hub access, or you’re pointing to a local folder that doesn’t actually contain the needed tokenizer files.
    Fix: ensure network access (or pre-download), or pass a correct local directory that contains tokenizer artifacts (e.g., tokenizer.json, tokenizer_config.json, vocab files). (discuss.huggingface.co)

  2. huggingface_hub.utils._validators.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name' ...
    Cause: you passed something that is not a Hub repo ID (often a file path like .../tf_model.h5, an S3 URI, or a malformed string).
    Fix: pass either a valid Hub ID ("org/model") or a local folder path (directory), not a single weights file / invalid URI. (stackoverflow.com)

  3. ValueError: Unrecognized model ... Should have a 'model_type' key in its config.json ...
    Cause: the local folder/repo you’re loading is missing a proper config.json (or it lacks model_type), or you’re using an incompatible/old export.
    Fix: verify the folder has a valid config.json with model_type (and that you saved/pushed the model/tokenizer correctly). (discuss.huggingface.co)

  4. ValueError: Loading <repo> requires you to execute the configuration file in that repo ... set trust_remote_code=True
    Cause: the model/tokenizer relies on custom Python code from the repo.
    Fix: only if you trust the repo, pass trust_remote_code=True (and ensure you pass it everywhere needed in your loading flow). (discuss.huggingface.co)

  5. Auth/token-related breakages (e.g., private/gated repos), including reports around token= behavior changing
    Cause: missing/incorrect auth token, or a version-specific regression/change in how token is propagated during tokenizer loading.
    Fix: confirm you’re logged in / passing a valid token; if it “suddenly” started failing after an upgrade, try pinning/adjusting transformers/huggingface_hub versions and retest. (github.com)


🏁 Script executed:

# Check how other parts of the codebase handle similar initialization errors
rg "from_pretrained|WrapperAutoTokenizer" -B 3 -A 5 | head -50



🏁 Script executed:

# Check if there are any HFValidationError imports or usages
rg "HFValidationError|huggingface_hub" nemo_skills/ -B 2 -A 2



🌐 Web query:

transformers AutoTokenizer.from_pretrained exception types ValueError OSError ImportError

💡 Result:

transformers.AutoTokenizer.from_pretrained(...) (and the underlying tokenizer .from_pretrained(...)) commonly fails with these exception types:

  • OSError (historically also raised as EnvironmentError, which is an alias of OSError on Python 3): raised when the tokenizer files can’t be resolved/loaded (bad model id, wrong directory, missing required tokenizer files, etc.). The HF code explicitly does raise EnvironmentError("Can't load tokenizer for ..."). [1]

  • ValueError: raised for “logical” issues such as:

    • passing a single file/URL to tokenizers that require a directory/model id (explicit raise ValueError(...)). [1]
    • AutoTokenizer can’t map the model/config to a tokenizer (unrecognized configuration), or a required tokenizer implementation is unavailable (e.g., SentencePiece-backed slow tokenizer class is None so AutoTokenizer raises a ValueError instructing to install sentencepiece). [2]
    • fast tokenizer backend can’t be instantiated (no tokenizer.json, no slow tokenizer available to convert, etc.), raising a ValueError with the “Couldn't instantiate the backend tokenizer…” message. [3]
  • ImportError: can be raised indirectly when the load path requires optional deps that aren’t installed (most often during slow→fast conversion), e.g. requires_protobuf(...) raising ImportError: ... requires the protobuf library .... [4]

Sources: [1] [2] [3] [4]


Narrow exception handling for tokenizer initialization.
Catching a broad Exception can mask unexpected bugs and silently degrade behavior; replace it with the specific exception types that AutoTokenizer.from_pretrained() raises.

🛠️ Proposed refinement
-            except (OSError, Exception) as e:
+            except (OSError, ValueError, ImportError) as e:
                 LOG.debug(f"Failed to initialize tokenizer from '{tokenizer}': {e}")
                 return None

AutoTokenizer.from_pretrained() raises OSError (missing files, network issues), ValueError (unrecognized model config, missing dependencies), and ImportError (optional dep not installed). These three types cover the documented failure modes for tokenizer loading.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
-            try:
-                return WrapperAutoTokenizer(tokenizer)
-            except (OSError, Exception) as e:
-                LOG.debug(f"Failed to initialize tokenizer from '{tokenizer}': {e}")
-                return None
+            try:
+                return WrapperAutoTokenizer(tokenizer)
+            except (OSError, ValueError, ImportError) as e:
+                LOG.debug(f"Failed to initialize tokenizer from '{tokenizer}': {e}")
+                return None
🧰 Tools
🪛 Ruff (0.14.13)

209-209: Do not catch blind exception: Exception

(BLE001)

🤖 Prompt for AI Agents
In `@nemo_skills/inference/model/base.py` around lines 207-211, the try/except in the tokenizer initialization around
WrapperAutoTokenizer(tokenizer) catches a broad Exception; narrow it to the documented failure modes by catching
OSError, ValueError, and ImportError specifically (instead of Exception), so that failures from
AutoTokenizer.from_pretrained()/WrapperAutoTokenizer are handled but other unexpected exceptions still surface.
Update the except clause accordingly in the code that constructs WrapperAutoTokenizer(tokenizer).

Comment on lines +207 to +211

Only catching OSError may miss other exceptions during tokenizer initialization (e.g., ImportError, ValueError). If the goal is to gracefully fall back to the server endpoint, consider catching broader exceptions.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

Comment on lines +207 to +211

Catching only OSError doesn't follow CONTRIBUTING.md guidelines about not being overly defensive. If require_tokenizer is True, the code should fail loudly when tokenizer initialization fails, not silently fall back. The runtime check on line 279 of code_execution.py will catch this later, but it happens during generation (after model setup), which could cause issues in production.

Consider checking the flag here and only catching when fallback is acceptable (when the flag is False).
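A minimal sketch of that idea, written as a standalone helper so it stays self-contained (the name init_tokenizer is hypothetical, and the plain Hugging Face AutoTokenizer stands in for the project's WrapperAutoTokenizer): swallow the load failure only when the caller has not marked the tokenizer as required, and otherwise let the exception surface.

import logging

from transformers import AutoTokenizer

LOG = logging.getLogger(__name__)


def init_tokenizer(name_or_path: str, require_tokenizer: bool = False):
    # Hypothetical sketch, not the PR's code: fall back to server-side
    # tokenization only when the tokenizer is optional.
    try:
        return AutoTokenizer.from_pretrained(name_or_path)
    except (OSError, ValueError, ImportError) as exc:
        if require_tokenizer:
            # The caller declared the tokenizer mandatory, so fail loudly here
            # instead of deferring to a runtime check during generation.
            raise
        LOG.warning("Tokenizer '%s' unavailable (%s); falling back to the server /tokenize endpoint", name_or_path, exc)
        return None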


@abc.abstractmethod
def _build_chat_request_params(self, **kwargs) -> dict:
47 changes: 33 additions & 14 deletions nemo_skills/inference/model/code_execution.py
@@ -154,6 +154,8 @@ async def _generate_single(
# if there's an unfinished code block
if output.count(code_end) + 1 == output.count(code_begin):
output += code_end
# Count tokens for the manually added code_end
num_generated_tokens += len(self.model.tokenizer.encode(code_end))

In non-streaming mode, the code_end tokens are added on top of num_generated_tokens, which came from output_dict.get("num_generated_tokens", 0). If the server's count already includes the manually appended code_end, those tokens would be double-counted.
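One hedged way to avoid the possible double count, sketched as a hypothetical helper and assuming the tokenizer wrapper's encode() returns a list of token ids (the same assumption the streaming path makes), is to re-derive the segment's token count from the final text rather than adding the code_end tokens on top of the server-reported number:

def close_block_and_count(output: str, code_begin: str, code_end: str, tokenizer, server_count: int):
    # Hypothetical helper, not the PR's code: close an unfinished code block
    # and count the segment's tokens exactly once.
    if output.count(code_end) + 1 == output.count(code_begin):
        output += code_end
        # Re-encoding the whole segment means the appended code_end cannot be
        # counted both locally and in the server-reported total.
        return output, len(tokenizer.encode(output))
    return output, server_count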


# Update the prompt based on format
if is_openai_format:
@@ -162,16 +164,14 @@ async def _generate_single(
else:
request["prompt"] += output

# if it's the extra iteration, we don't execute the code block and just finish

if generation_index == effective_max_code_executions:
break
# adjusting requested tokens to account for what has been generated already
request["tokens_to_generate"] -= num_generated_tokens
total_num_generated_tokens += num_generated_tokens
generation_time += int(time.time() - generation_time_start)
# TODO: currently we don't account for tokens in the code output that we add to the prompt
# in most cases the output should be small though

# if it's the extra iteration, we don't execute the code block and just finish
if generation_index == effective_max_code_executions:
break
if request["tokens_to_generate"] <= 0:
break
# .rfind(code_end, 0, -1) searches for the second-to-last occurrence of code_end and checks
@@ -195,6 +195,12 @@ async def _generate_single(
if "process_status" in execution_dict and execution_dict["process_status"] == "timeout":
num_code_timeouts += 1

# Account for tokens in the code output
code_output_tokens = len(self.model.tokenizer.encode(code_output))
request["tokens_to_generate"] -= code_output_tokens
total_num_generated_tokens += code_output_tokens
if request["tokens_to_generate"] <= 0:
break
if is_openai_format:
request["prompt"][-2]["content"] += code_output
else:
@@ -270,6 +276,12 @@ async def generate_async(

Not every server supports that, so make sure to override this method directly if that's not the case.
"""
if self.model.tokenizer is None:
raise RuntimeError(
"Tokenizer is required for CodeExecutionWrapper to correctly count tokens. "
"Please initialize the model with require_tokenizer=True or provide a valid tokenizer."
)

if top_logprobs is not None: # TODO: add this
raise NotImplementedError("top_logprobs is not supported yet.")

@@ -363,24 +375,25 @@ async def _stream_single(
model_token_iterator = await self.model.generate_async(prompt=current_full_prompt, **request)

current_output_segment = ""
num_generated_tokens = 0
async for chunk in model_token_iterator:
yield chunk
current_output_segment += chunk["generation"]
num_generated_tokens += 1

request["tokens_to_generate"] -= num_generated_tokens
if request["tokens_to_generate"] <= 0:
break
if not current_output_segment:
break

# openai and trtllm don't show what stop word was triggered, so we assume that it was `code_end`
# if there's an unfinished code block
if current_output_segment.count(code_end) + 1 == current_output_segment.count(code_begin):
current_output_segment += code_end
yield {"generation": code_end}

# Calculate token count for this segment (after adding code_end if needed)
num_generated_tokens = len(self.model.tokenizer.encode(current_output_segment))

Token count uses encode() on the segment string, which may differ from the actual token count because tokenization is context-dependent. Consider verifying that this matches the LLM's actual token usage.
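A tiny self-contained illustration of this point (the gpt2 checkpoint is just an assumed example; any Hugging Face tokenizer shows the same effect): encoding text in separate chunks can give a different count than encoding the joined text, so len(encode(segment)) is only an approximation of the server's true usage.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # assumed example checkpoint
a, b = "Hello wor", "ld!"
chunked = len(tok.encode(a)) + len(tok.encode(b))
joined = len(tok.encode(a + b))
# The counts can differ because BPE merges across the chunk boundary
# ("wor" + "ld" can become a single " world"-style token when encoded together).
print(chunked, joined)

Depending on how WrapperAutoTokenizer invokes the underlying tokenizer, encode() may also prepend special tokens (e.g., a BOS token) on every call unless add_special_tokens=False is passed, which would further inflate per-segment counts.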


request["tokens_to_generate"] -= num_generated_tokens
if request["tokens_to_generate"] <= 0:
break
if not current_output_segment:
break

# Update the prompt based on format
if is_openai_format:
current_full_prompt.append({"role": "assistant", "content": current_output_segment})
Expand Down Expand Up @@ -417,6 +430,12 @@ async def _stream_single(
)
yield {"generation": formatted_code_output} # Yield the entire formatted code output as one chunk

# Account for tokens in the code output
code_output_tokens = len(self.model.tokenizer.encode(formatted_code_output))
request["tokens_to_generate"] -= code_output_tokens
if request["tokens_to_generate"] <= 0:
break

# Append executed code's output to the prompt
if is_openai_format:
current_full_prompt[-2]["content"] += formatted_code_output