From 993c1bd7f603f52db867fb80542732d56709cc90 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Fri, 10 Oct 2025 06:08:37 -0700 Subject: [PATCH 1/6] tokenizer fixes --- README.md | 4 +-- apps/sft_v2/qwen3_8b.yaml | 64 +++++++++++++++++++++++++++++++++++++ src/forge/data/tokenizer.py | 29 ++++++++++++----- 3 files changed, 87 insertions(+), 10 deletions(-) create mode 100644 apps/sft_v2/qwen3_8b.yaml diff --git a/README.md b/README.md index 5142b91a0..70bf563f6 100644 --- a/README.md +++ b/README.md @@ -65,8 +65,8 @@ $ gh release create v0.0.0 assets/wheels/vllm-*.whl --title "Forge Wheels v0.0.0 To run SFT for Llama3 8B, run ```bash -uv run forge download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth" -uv run forge run --nproc_per_node 2 apps/sft/main.py --config apps/sft/llama3_8b.yaml +forge download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth" +python -m apps.sft_v2.main --config apps/sft_v2/llama3_8b.yaml ``` ### Citation diff --git a/apps/sft_v2/qwen3_8b.yaml b/apps/sft_v2/qwen3_8b.yaml new file mode 100644 index 000000000..273ba012a --- /dev/null +++ b/apps/sft_v2/qwen3_8b.yaml @@ -0,0 +1,64 @@ +# >>> python -m apps.sft_v2.main --config apps/sft_v2/qwen3_8b.yaml + + +# TODO: required by torchtitan +# https://github.com/pytorch/torchtitan/blob/2f1c814da071cc8ad165d00be6f9c1a66f8e1cce/torchtitan/distributed/utils.py#L265 + +model_name: "Qwen/Qwen3-8B" + +comm: + trace_buf_size: 0 + +model: + name: qwen3 + flavor: 8B + hf_assets_path: hf://${model_name} + +processes: + procs: 8 + with_gpus: true + +optimizer: + name: AdamW + lr: 1e-5 + eps: 1e-8 + +lr_scheduler: + warmup_steps: 200 + +training: + local_batch_size: 1 + seq_len: 2048 + max_norm: 1.0 + steps: 1000 + compile: false + dataset: "c4" + +parallelism: + data_parallel_replicate_degree: 1 + data_parallel_shard_degree: -1 + tensor_parallel_degree: 1 + pipeline_parallel_degree: 1 + context_parallel_degree: 1 + expert_parallel_degree: 1 + disable_loss_parallel: false + +checkpoint: + enable: true + initial_load_path: hf://${model_name} + initial_load_in_hf: true + last_save_in_hf: true + interval: 500 + async_mode: "disabled" + +activation_checkpoint: + mode: selective + selective_ac_option: op + +# profiling: +# enable_profiling: false + +# metrics: +# log_freq: 10 +# enable_tensorboard: true +# save_tb_folder: "tb" diff --git a/src/forge/data/tokenizer.py b/src/forge/data/tokenizer.py index 3cb90f79c..ce2207cdf 100644 --- a/src/forge/data/tokenizer.py +++ b/src/forge/data/tokenizer.py @@ -8,13 +8,13 @@ from typing import Any, Optional import jinja2 -from jinja2 import StrictUndefined - -from tokenizers import Tokenizer from forge.data.utils import truncate from forge.interfaces import BaseTokenizer, ModelTokenizer from forge.types import Message +from jinja2 import StrictUndefined + +from tokenizers import Tokenizer class HuggingFaceBaseTokenizer(BaseTokenizer): @@ -61,7 +61,7 @@ def __init__( self._infer_bos_eos_tokens() self._infer_should_add_bos_eos() - def _get_token_from_config(self, config: dict[str, Any], key: str) -> str: + def _get_token_from_config(self, config: dict[str, Any], key: str) -> Optional[str]: """ HF BOS/EOS tokens are either stored as e.g. {'bos_token': 5} or {'bos_token': {'content': 5, ...}}. This utility handles both. @@ -72,7 +72,7 @@ def _get_token_from_config(self, config: dict[str, Any], key: str) -> str: raise ValueError(f"Could not parse {key} from config") token = token["content"] else: - if not isinstance(token, str): + if token is not None and not isinstance(token, str): raise ValueError(f"Could not parse {key} from config") return token @@ -137,7 +137,12 @@ def encode( list[int]: The list of token ids. """ token_ids = self.tokenizer.encode(text).ids - if add_bos and not self.hf_adds_bos and self.bos_token not in text: + if ( + add_bos + and not self.hf_adds_bos + and self.bos_token is not None + and self.bos_token not in text + ): token_ids.insert(0, self.bos_id) if add_eos and not self.hf_adds_eos: token_ids.append(self.eos_id) @@ -262,8 +267,13 @@ def extract_top_level_variables(self, config): def render_template( self, messages: list[dict[str, str]], add_eos: bool = True ) -> str: + # Need to set tool_calls to something for qwen chat_template + for message in messages: + if "tool_calls" not in message: + message["tool_calls"] = {} rendered = self.template.render( messages=messages, + tools=None, add_generation_prompt=add_eos, **self.special_tokens_mapping, # We assume that the naming is consistent **self.top_level_variables, @@ -291,10 +301,13 @@ def tokenize_messages( add_eos=add_eos if i == len(messages) - 1 else False, ) - current_tokens = self.base_tokenizer.encode(rendered, add_eos=False) + current_tokens = self.base_tokenizer.encode( + rendered, add_bos=False, add_eos=False + ) if ( - self.base_tokenizer.bos_token in rendered + self.base_tokenizer.bos_token is not None + and self.base_tokenizer.bos_token in rendered and self.base_tokenizer.hf_adds_bos ): del current_tokens[0] From 57db3891bb1d14e5a5fe7a542e3aa154ef80ca78 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Fri, 10 Oct 2025 06:27:47 -0700 Subject: [PATCH 2/6] upd --- src/forge/data/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/forge/data/tokenizer.py b/src/forge/data/tokenizer.py index f218f4f6d..b70880f12 100644 --- a/src/forge/data/tokenizer.py +++ b/src/forge/data/tokenizer.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. import json -from typing import Any +from typing import Any, Optional import jinja2 From 084cb389772d879b2f8e1d87ad808194c5dcc27b Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Mon, 13 Oct 2025 11:07:28 -0700 Subject: [PATCH 3/6] upd --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 70bf563f6..5142b91a0 100644 --- a/README.md +++ b/README.md @@ -65,8 +65,8 @@ $ gh release create v0.0.0 assets/wheels/vllm-*.whl --title "Forge Wheels v0.0.0 To run SFT for Llama3 8B, run ```bash -forge download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth" -python -m apps.sft_v2.main --config apps/sft_v2/llama3_8b.yaml +uv run forge download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth" +uv run forge run --nproc_per_node 2 apps/sft/main.py --config apps/sft/llama3_8b.yaml ``` ### Citation From e29b907d2241d594fd1a8ed50bcf154f14d1ac04 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Mon, 13 Oct 2025 11:49:50 -0700 Subject: [PATCH 4/6] upd --- src/forge/data/tokenizer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/forge/data/tokenizer.py b/src/forge/data/tokenizer.py index b70880f12..f006d38b1 100644 --- a/src/forge/data/tokenizer.py +++ b/src/forge/data/tokenizer.py @@ -268,9 +268,10 @@ def render_template( self, messages: list[dict[str, str]], add_eos: bool = True ) -> str: # Need to set tool_calls to something for qwen chat_template - for message in messages: - if "tool_calls" not in message: - message["tool_calls"] = {} + if self.base_tokenizer.config["tokenizer_class"] == "Qwen2Tokenizer": + for message in messages: + if "tool_calls" not in message: + message["tool_calls"] = {} rendered = self.template.render( messages=messages, tools=None, From 55291c2edc755f99002d8bb589f9cab4574e2803 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Mon, 13 Oct 2025 11:53:03 -0700 Subject: [PATCH 5/6] upd --- src/forge/data/tokenizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/forge/data/tokenizer.py b/src/forge/data/tokenizer.py index f006d38b1..d93c5d4a4 100644 --- a/src/forge/data/tokenizer.py +++ b/src/forge/data/tokenizer.py @@ -8,13 +8,13 @@ from typing import Any, Optional import jinja2 +from jinja2 import StrictUndefined + +from tokenizers import Tokenizer from forge.data.utils import truncate from forge.interfaces import BaseTokenizer, ModelTokenizer from forge.types import Message -from jinja2 import StrictUndefined - -from tokenizers import Tokenizer class HuggingFaceBaseTokenizer(BaseTokenizer): From 927e941e577694bea96342fd0c60a4c356648896 Mon Sep 17 00:00:00 2001 From: Danielle Pintz Date: Mon, 13 Oct 2025 13:39:34 -0700 Subject: [PATCH 6/6] upd --- apps/sft_v2/qwen3_8b.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/apps/sft_v2/qwen3_8b.yaml b/apps/sft_v2/qwen3_8b.yaml index 273ba012a..1a07bfa15 100644 --- a/apps/sft_v2/qwen3_8b.yaml +++ b/apps/sft_v2/qwen3_8b.yaml @@ -3,12 +3,11 @@ # TODO: required by torchtitan # https://github.com/pytorch/torchtitan/blob/2f1c814da071cc8ad165d00be6f9c1a66f8e1cce/torchtitan/distributed/utils.py#L265 - -model_name: "Qwen/Qwen3-8B" - comm: trace_buf_size: 0 +model_name: "Qwen/Qwen3-8B" + model: name: qwen3 flavor: 8B