Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ If you run into any issues, please visit the [VLA Troubleshooting](#vla-troubles

### Converting Prismatic Models to Hugging Face

**NOTE: Converting and deploying MiniVLA models and VQ / multi image is not supported yet!**
**NOTE: Converting and deploying VQ models is not supported yet!**

If you have used the Prismatic VLMs codebase to train your model (e.g., if you did full fine-tuning of OpenVLA on a
new dataset), you will need to convert the final checkpoint to a version that is compatible with Hugging Face
Expand Down
2 changes: 2 additions & 0 deletions prismatic/extern/hf/configuration_prismatic.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
"mistral-v0.1-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.1",

"phi-2-3b": "microsoft/phi-2",
"qwen25-0_5b-extra": "Qwen/Qwen2.5-0.5B",
}
LLM_BACKBONE_TO_HF_METACLASS = {
"llama2-7b-pure": "llama", "llama2-13b-pure": "llama", "llama2-7b-chat": "llama", "llama2-13b-chat": "llama",
Expand All @@ -62,6 +63,7 @@
"mistral-v0.1-7b-pure": "mistral", "mistral-v0.1-7b-instruct": "mistral",

"phi-2-3b": "phi",
"qwen25-0_5b-extra": "qwen2",
}

VALID_VISION_BACKBONES = set(VISION_BACKBONE_TO_RESOLUTION.keys())
Expand Down
53 changes: 45 additions & 8 deletions vla-scripts/extern/convert_openvla_weights_to_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,20 +161,57 @@ def convert_openvla_weights_to_hf(cfg: HFConvertConfig) -> None:

# Instantiate & Add Pad to Tokenizer =>> following `prismatic.models.materialize.get_llm_backbone_and_tokenizer`
# TODO (siddk) :: Implement batched generation -- in which case this should set `padding_side = "left"`!
padding_side = "right"
print("[*] Instantiating and Patching Tokenizer, LLM Config")
tokenizer = AutoTokenizer.from_pretrained(
hf_config.hf_llm_id, model_max_length=hf_config.llm_max_length, token=cfg.hf_token, padding_side="right"
hf_config.hf_llm_id,
model_max_length=hf_config.llm_max_length,
token=cfg.hf_token,
padding_side=padding_side
)

# Handle Extra Tokens (Generic Check)
if str(hf_config.llm_backbone_id).endswith("-extra"):
num_extra_tokens = 256
added = tokenizer.add_tokens([f"<|extra_{i}|>" for i in range(num_extra_tokens)])
print(f"Added {added} extra tokens to tokenizer for {hf_config.llm_backbone_id}.")

# Add PAD Token
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
tokenizer.init_kwargs.pop("add_prefix_space", None) # Pop to prevent unnecessary warning on reload...
tokenizer.init_kwargs.pop("add_prefix_space", None)

# Sync pad_token_id
hf_config.pad_token_id = tokenizer.pad_token_id
hf_config.text_config.pad_token_id = hf_config.pad_token_id
assert tokenizer.pad_token_id == hf_config.pad_token_id, "Incorrect Pad Token ID!"
assert len(tokenizer) > hf_config.text_config.vocab_size, "Tokenizer vocabulary must be larger than LLM vocabulary!"

# Patch LLM Config in `hf_config` with vocab_size (+ `hf_config.pad_to_multiple_of`), pad_token_id + validate
hf_config.text_config.vocab_size += hf_config.pad_to_multiple_of
hf_config.text_config.pad_token_id = hf_config.pad_token_id
hf_config.text_config.torch_dtype = torch.bfloat16
assert hf_config.text_config.use_cache, "LLM config `use_cache` should be True for inference (set default)!"
# Config Loading
from transformers import AutoConfig
base_llm_config = AutoConfig.from_pretrained(hf_config.hf_llm_id, token=cfg.hf_token)

# Update text_config with actual architecture values
hf_config.text_config.hidden_size = base_llm_config.hidden_size
hf_config.text_config.num_attention_heads = base_llm_config.num_attention_heads
hf_config.text_config.num_hidden_layers = base_llm_config.num_hidden_layers
hf_config.text_config.intermediate_size = base_llm_config.intermediate_size

# Qwen2.5 specific: handle GQA params
if hasattr(base_llm_config, 'num_key_value_heads'):
hf_config.text_config.num_key_value_heads = base_llm_config.num_key_value_heads
else:
hf_config.text_config.num_key_value_heads = base_llm_config.num_attention_heads

# Set use_cache for inference
hf_config.text_config.use_cache = True

# Vocab Size Alignment (Crucial for Prismatic compatibility)
vocab_size = len(tokenizer)
pad_to_multiple_of = 64
if vocab_size % pad_to_multiple_of != 0:
vocab_size = ((vocab_size + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of

hf_config.text_config.vocab_size = vocab_size
print(f"Tokenizer vocab_size aligned to: {vocab_size}")

# Create Vision Backbone & Transform =>> following `prismatic.models.materialize.get_vision_backbone_and_transform`
# =>> Deviates a bit from existing code; as such, explicitly tested in `tests/test_image_transforms.py`
Expand Down