This repository was archived by the owner on Sep 10, 2025. It is now read-only.
Merged
8 changes: 7 additions & 1 deletion install/install_requirements.sh
@@ -90,7 +90,7 @@ REQUIREMENTS_TO_INSTALL=(
# Rely on the latest torchtune for flamingo support
(
set -x
-  $PIP_EXECUTABLE install git+https://github.com/pytorch/torchtune.git@18efc81dda1c537bb7c25058ff059b4623ccff58
+  $PIP_EXECUTABLE install git+https://github.com/pytorch/torchtune.git@d002d45e3ec700fa770d9dcc61b02c59e2507bf6
)

if [[ -x "$(command -v nvidia-smi)" ]]; then
@@ -99,3 +99,9 @@ if [[ -x "$(command -v nvidia-smi)" ]]; then
$PYTHON_EXECUTABLE torchchat/utils/scripts/patch_triton.py
)
fi


+(
+  set -x
+  $PIP_EXECUTABLE install lm-eval=="0.4.2"
+)
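
Since the lm-eval pin now lives in this install script rather than in requirements.txt (see the deletion in the next file), a quick post-install check can confirm the wheel landed at the expected version. A minimal sketch, not part of install_requirements.sh, assuming only the standard importlib.metadata API and the lm_eval module name that the lm-eval distribution installs:

# Hedged sanity-check sketch; run it with the same Python that $PIP_EXECUTABLE targets.
from importlib.metadata import version

import lm_eval  # module provided by the lm-eval distribution pinned above

installed = version("lm_eval")
assert installed == "0.4.2", f"expected lm-eval 0.4.2, found {installed}"
print(f"lm-eval {installed} is available for torchchat eval")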
1 change: 0 additions & 1 deletion install/requirements.txt
@@ -14,7 +14,6 @@ snakeviz
sentencepiece
numpy < 2.0
gguf
-lm-eval==0.4.2
blobfile
tomli >= 1.1.0 ; python_version < "3.11"
openai
12 changes: 9 additions & 3 deletions torchchat/model.py
@@ -31,10 +31,13 @@
)
from torch.nn import functional as F

-from torchtune.models.flamingo import flamingo_decoder, flamingo_vision_encoder
+from torchtune.models.clip import clip_vision_encoder
from torchtune.models.llama3_1._component_builders import llama3_1 as llama3_1_builder
+from torchtune.models.llama3_2_vision._component_builders import (
+    llama3_2_vision_decoder,
+    llama3_2_vision_encoder,
+)
from torchtune.modules.model_fusion import DeepFusionModel
-from torchtune.models.clip import clip_vision_encoder

from torchchat.utils.build_utils import find_multiple, get_precision

@@ -213,7 +216,10 @@ def _llama3_1(cls):
def _flamingo(cls):
return cls(
model_type=ModelType.Flamingo,
-        modules={"encoder": flamingo_vision_encoder, "decoder": flamingo_decoder},
+        modules={
+            "encoder": llama3_2_vision_encoder,
+            "decoder": llama3_2_vision_decoder
+        },
fusion_class=DeepFusionModel,
)

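
Both hunks above track torchtune's rename of the Flamingo builders to the llama3_2_vision names. A minimal compatibility sketch of what the rename means for import sites, assuming only that the new _component_builders path exists at the pinned commit (whether the old flamingo path still resolves there is not verified); this shim is illustrative and not part of torchchat:

# Hedged sketch: resolve the vision builders under either the old or the
# new torchtune module path, so call sites see one consistent name.
try:
    # pre-rename torchtune layout
    from torchtune.models.flamingo import (
        flamingo_decoder as vision_decoder_builder,
        flamingo_vision_encoder as vision_encoder_builder,
    )
except ImportError:
    # layout at the torchtune commit pinned in install_requirements.sh
    from torchtune.models.llama3_2_vision._component_builders import (
        llama3_2_vision_decoder as vision_decoder_builder,
        llama3_2_vision_encoder as vision_encoder_builder,
    )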
11 changes: 6 additions & 5 deletions torchchat/usages/openai_api.py
@@ -17,11 +17,10 @@

import torch

-from torchtune.models.llama3_2_vision._convert_weights import padded_collate
-from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform

from PIL import Image
-from torchtune.data import Message

+from torchtune.data import Message, padded_collate

[Contributor review comment on the line above] We need to double-check the padded_collate func here; in generation.py we've replaced it with padded_collate_tiled_images_and_mask. (A hedged sketch of that replacement follows this file's diff.)

+from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform

from torchchat.cli.download import is_model_downloaded, load_model_configs
from torchchat.generate import Generator, GeneratorArgs
@@ -374,7 +373,9 @@ def chunked_completion(self, completion_request: CompletionRequest):
images.append(Image.open(BytesIO(base64_decoded)))
print("images:", len(images), flush=True)
if len(images) > 0:
-            transform = llama3_2_vision_transform(str(self.tokenizer_args.tokenizer_path))
+            transform = llama3_2_vision_transform(
+                str(self.tokenizer_args.tokenizer_path)
+            )
torchtune_messages = self._openai_messages_to_torchtune(
completion_request.messages
)
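
On the review question above: a minimal sketch of the collator swap mentioned for generation.py, assuming padded_collate_tiled_images_and_mask is exported from torchtune.data at the pinned commit and that the pad_direction and pad_max_images keywords exist there; the helper function and its arguments are illustrative, not torchchat's actual code:

# Hedged sketch of collating one image+text sample with the vision-aware
# helper instead of padded_collate; keyword arguments are assumptions.
from torchtune.data import padded_collate_tiled_images_and_mask


def collate_vision_sample(transform, messages):
    """Collate a single transformed sample for generation (illustrative only)."""
    data = transform({"messages": messages}, inference=True)  # assumed transform call convention
    return padded_collate_tiled_images_and_mask(
        [data],                # single-sample batch
        pad_direction="left",  # assumption: left-pad for decoding
        pad_max_images=1,      # assumption: at most one image per request
    )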