Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
torch>=1.13.1
huggingface-hub==0.29.1
transformers==4.49.0
huggingface-hub>=0.34.0,<1.0
transformers>=4.51.0
Comment on lines +2 to +3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Verify dependency compatibility and check for breaking changes.

The dependency versions have been updated with significant changes:

  • huggingface-hub: from exact pin 0.29.1 to range >=0.34.0,<1.0 (several minor-version jumps; note that 0.x minor bumps may contain breaking changes)
  • transformers: from exact pin 4.49.0 to minimum >=4.51.0 (minor version jump)

These changes relax version constraints, which improves flexibility but may introduce compatibility issues. Verify that:

  • The new versions are compatible with the codebase and don't introduce breaking changes.
  • The version ranges are tested and known to work.
  • Other dependencies (torch, peft, etc.) remain compatible with the new versions.

Run the following script to check for known security vulnerabilities and verify the latest stable versions:


🏁 Script executed:

#!/bin/bash
# Description: Check for security advisories and latest versions

# Check for security advisories in huggingface-hub
gh api graphql -f query='
{
  securityVulnerabilities(first: 5, ecosystem: PIP, package: "huggingface-hub") {
    nodes {
      advisory {
        summary
        severity
        publishedAt
      }
      vulnerableVersionRange
      firstPatchedVersion {
        identifier
      }
    }
  }
}'

# Check for security advisories in transformers
gh api graphql -f query='
{
  securityVulnerabilities(first: 5, ecosystem: PIP, package: "transformers") {
    nodes {
      advisory {
        summary
        severity
        publishedAt
      }
      vulnerableVersionRange
      firstPatchedVersion {
        identifier
      }
    }
  }
}'

# Verify latest versions
curl -s https://pypi.org/pypi/huggingface-hub/json | jq -r '.info.version'
curl -s https://pypi.org/pypi/transformers/json | jq -r '.info.version'

Length of output: 1382


Restrict the transformers version range to avoid ReDoS vulnerabilities and breaking changes
Transformers versions earlier than 4.53.0 are vulnerable to multiple ReDoS issues; update the requirement to transformers>=4.53.0,<5.0.0.
huggingface-hub >=0.34.0,<1.0 has no known advisories.

🤖 Prompt for AI Agents
In requirements.txt around lines 2 to 3, the transformers pin is too permissive
and includes vulnerable versions; update the transformers requirement to
"transformers>=4.53.0,<5.0.0" (leave huggingface-hub as
"huggingface-hub>=0.34.0,<1.0") so the package range excludes known
ReDoS-vulnerable releases while preventing major breaking upgrades.

datasets>=2.14.3
accelerate>=0.27.2
loguru==0.7.0
Expand Down
57 changes: 57 additions & 0 deletions src/core/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"Qwen/Qwen2.5-32B-Instruct",
"Qwen/Qwen2.5-72B",
"Qwen/Qwen2.5-72B-Instruct",
"Qwen/Qwen3-4B-Instruct-2507",
# yi 1.5
"01-ai/Yi-1.5-6B",
"01-ai/Yi-1.5-6B-Chat",
Expand Down Expand Up @@ -50,3 +51,59 @@
"microsoft/Phi-4-mini-instruct",
"microsoft/phi-4",
]

# Maps each supported Hugging Face model repo ID to the name of the chat
# template it uses.  Grouped per model family so every size/variant of a
# family stays in sync; the flat mapping is derived from the groups below.

# Qwen2.5 ships the same chat template for every parameter count.
_QWEN25_SIZES = ("0.5B", "1.5B", "3B", "7B", "14B", "32B", "72B")

_FAMILY_MODELS = {
    "qwen1.5": [
        name
        for size in _QWEN25_SIZES
        for name in (f"Qwen/Qwen2.5-{size}", f"Qwen/Qwen2.5-{size}-Instruct")
    ],
    "qwen3": ["Qwen/Qwen3-4B-Instruct-2507"],
    "yi": [
        "01-ai/Yi-1.5-6B",
        "01-ai/Yi-1.5-6B-Chat",
        "01-ai/Yi-1.5-9B",
        "01-ai/Yi-1.5-9B-Chat",
        "01-ai/Yi-1.5-34B",
        "01-ai/Yi-1.5-34B-Chat",
    ],
    "mistral": [
        "mistralai/Mistral-7B-v0.3",
        "mistralai/Mistral-7B-Instruct-v0.3",
        "mistralai/Ministral-8B-Instruct-2410",
    ],
    "mixtral": [
        "mistralai/Mixtral-8x7B-v0.1",
        "mistralai/Mixtral-8x7B-Instruct-v0.1",
    ],
    "gemma": [
        "google/gemma-2-2b",
        "google/gemma-2-9b",
        "google/gemma-2-27b",
        "google/gemma-2-2b-it",
        "google/gemma-2-9b-it",
        "google/gemma-2-27b-it",
    ],
    "llama3": [
        "meta-llama/Meta-Llama-3-8B",
        "meta-llama/Meta-Llama-3-8B-Instruct",
        "meta-llama/Meta-Llama-3-70B",
        "meta-llama/Meta-Llama-3-70B-Instruct",
        "meta-llama/Meta-Llama-3.1-8B",
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "meta-llama/Meta-Llama-3.1-70B",
        "meta-llama/Meta-Llama-3.1-70B-Instruct",
    ],
    "phi3": [
        "microsoft/Phi-3.5-mini-instruct",
        "microsoft/Phi-3-mini-4k-instruct",
        "microsoft/Phi-3-medium-4k-instruct",
    ],
    "phi4": [
        "microsoft/Phi-4-mini-instruct",
        "microsoft/phi-4",
    ],
}

# Flatten: one entry per model ID, pointing at its family's template name.
MODEL_TEMPLATE_MAP = {
    model: template
    for template, models in _FAMILY_MODELS.items()
    for model in models
}
17 changes: 10 additions & 7 deletions src/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,15 @@ def __getitem__(self, index):
target_mask = [0] * len(input_ids)

# setting tool information
if "tools" in data.keys() and data["tools"]:
if "tools" in data.keys():
tools = json.loads(data["tools"])
tool_prompt = tool_formater(tools)
tool_text = self.tool_format.format(content=tool_prompt)
tool_tokens = self.tokenizer.encode(tool_text, add_special_tokens=False)
input_ids = input_ids + tool_tokens
target_mask = target_mask + [0] * len(tool_tokens)
if tools:
# tool_prompt = tool_formater(tools)
tool_prompt = json.dumps(tools)
tool_text = self.tool_format.format(content=tool_prompt)
tool_tokens = self.tokenizer.encode(tool_text, add_special_tokens=False)
input_ids = input_ids + tool_tokens
target_mask = target_mask + [0] * len(tool_tokens)
Comment on lines +46 to +54
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Verify the behavioral changes in tool formatting.

The tool processing logic has been modified:

  1. Added a nested if tools: check (line 48) - this is good defensive coding to skip processing when tools is null/empty.
  2. Replaced tool_formater(tools) with json.dumps(tools) (line 50) - this changes how tools are serialized.

The second change alters the output format. Ensure that:

  • The new JSON format is compatible with the qwen3 template expectations.
  • Existing datasets or models relying on tool_formater output are not broken.
  • Unit tests cover this behavior change.

Run the following script to check if tool_formater is still used elsewhere:


🏁 Script executed:

#!/bin/bash
# Description: Verify if tool_formater is used in other parts of the codebase

# Search for tool_formater usage
rg -n "tool_formater" --type=py

Length of output: 236


🏁 Script executed:

#!/bin/bash
echo "Occurrences of function_formatter:"
rg -n "function_formatter" --type=py

echo
echo "Listing test files:"
find . -type f -name "*test*.py"

Length of output: 399


Add tests and validate JSON tool serialization

  • Confirm json.dumps(tools) output aligns with qwen3 template requirements and existing pipelines
  • Add unit tests for the tools branch in src/core/dataset.py (cover both JSON serialization and the commented‐out function_formatter path)
  • Remove unused tool_formater/function_formatter imports if no longer needed
🤖 Prompt for AI Agents
In src/core/dataset.py around lines 46 to 54, the current handling of the
"tools" branch simply json.dumps the tools and uses that text which may not
match the qwen3 template and the commented function_formatter path; update the
code and tests as follows: 1) ensure the serialized tool output matches the
qwen3 template (either adapt json.dumps with the required keys/format or
re-enable/use the function_formatter/tool_formater path to produce the exact
template output), 2) add unit tests that cover the "tools" branch for both
JSON-serialized output and the function_formatter path (include validation that
the resulting string tokens and masks match expected shapes and contents), and
3) remove any unused imports (tool_formater/function_formatter) if you choose to
keep the json.dumps approach, or restore and use the formatter and its tests if
you opt for that path.


conversations = data["conversations"]

Expand All @@ -65,7 +67,8 @@ def __getitem__(self, index):
input_buffer += human

elif role == "function_call":
tool_calls = function_formatter(json.loads(content))
# tool_calls = function_formatter(json.loads(content))
tool_calls = content
function = self.function_format.format(content=tool_calls)
input_buffer += function

Expand Down
23 changes: 23 additions & 0 deletions src/core/template.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from dataclasses import dataclass
from typing import Dict
from .constant import MODEL_TEMPLATE_MAP


@dataclass
Expand Down Expand Up @@ -67,6 +68,25 @@ def register_template(
stop_word="<|im_end|>",
)

# Qwen3 chat template: ChatML-style <|im_start|>/<|im_end|> turn markers,
# with tool signatures injected in the system prompt inside <tools> XML tags
# and tool calls/results wrapped in <tool_call>/<tool_response> tags.
# NOTE: the doubled braces in the tool_format example line are literal braces
# escaped for str.format(); only {content} is a substitution placeholder.
register_template(
    template_name="qwen3",
    system_format="<|im_start|>system\n{content}<|im_end|>\n",
    user_format="<|im_start|>user\n{content}<|im_end|>\n<|im_start|>assistant\n",
    assistant_format="{content}<|im_end|>\n",
    tool_format=(
        "# Tools\n\n"
        "You may call one or more functions to assist with the user query.\n\n"
        "You are provided with function signatures within <tools></tools> XML tags:\n"
        "<tools>\n{content}\n</tools>\n\n"
        "For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n"
        '<tool_call>\n{{"name": <function-name>, "arguments": <args-json-object>}}\n</tool_call>'
    ),
    function_format="<tool_call>\n{content}\n</tool_call><|im_end|>\n",
    observation_format="<|im_start|>user\n<tool_response>\n{content}\n</tool_response><|im_end|>\n<|im_start|>assistant\n",
    system="You are a helpful assistant.",
    stop_word="<|im_end|>",
)

register_template(
template_name="yi",
system_format="<|im_start|>system\n{content}<|im_end|>\n",
Expand Down Expand Up @@ -182,3 +202,6 @@ def register_template(
system=None,
stop_word="<|end|>",
)

# Alias each concrete model ID to its family's Template object, so lookups in
# template_dict work by either the template name or the full model repo ID.
# A bad entry in MODEL_TEMPLATE_MAP would otherwise surface as a bare
# KeyError naming only the missing key; re-raise with enough context to
# point straight at the misconfigured mapping.
for model_name, template_name in MODEL_TEMPLATE_MAP.items():
    try:
        template_dict[model_name] = template_dict[template_name]
    except KeyError as err:
        raise KeyError(
            f"MODEL_TEMPLATE_MAP maps {model_name!r} to template "
            f"{template_name!r}, which has not been registered via "
            "register_template()"
        ) from err