diff --git a/install/install_requirements.sh b/install/install_requirements.sh
index d7183ee30..bb0190a29 100755
--- a/install/install_requirements.sh
+++ b/install/install_requirements.sh
@@ -90,7 +90,7 @@ REQUIREMENTS_TO_INSTALL=(
 # Rely on the latest tochtune for flamingo support
 (
   set -x
-  $PIP_EXECUTABLE install git+https://github.com/pytorch/torchtune.git@18efc81dda1c537bb7c25058ff059b4623ccff58
+  $PIP_EXECUTABLE install git+https://github.com/pytorch/torchtune.git@d002d45e3ec700fa770d9dcc61b02c59e2507bf6
 )
 
 if [[ -x "$(command -v nvidia-smi)" ]]; then
@@ -99,3 +99,9 @@ if [[ -x "$(command -v nvidia-smi)" ]]; then
     $PYTHON_EXECUTABLE torchchat/utils/scripts/patch_triton.py
   )
 fi
+
+
+(
+  set -x
+  $PIP_EXECUTABLE install lm-eval=="0.4.2"
+)
diff --git a/install/requirements.txt b/install/requirements.txt
index bbb1d56d1..3329563b4 100644
--- a/install/requirements.txt
+++ b/install/requirements.txt
@@ -14,7 +14,6 @@ snakeviz
 sentencepiece
 numpy < 2.0
 gguf
-lm-eval==0.4.2
 blobfile
 tomli >= 1.1.0 ; python_version < "3.11"
 openai
diff --git a/torchchat/model.py b/torchchat/model.py
index edb0ce3d5..ab0bc7e21 100644
--- a/torchchat/model.py
+++ b/torchchat/model.py
@@ -31,10 +31,13 @@
 )
 from torch.nn import functional as F
 
-from torchtune.models.flamingo import flamingo_decoder, flamingo_vision_encoder
+from torchtune.models.clip import clip_vision_encoder
 from torchtune.models.llama3_1._component_builders import llama3_1 as llama3_1_builder
+from torchtune.models.llama3_2_vision._component_builders import (
+    llama3_2_vision_decoder,
+    llama3_2_vision_encoder,
+)
 from torchtune.modules.model_fusion import DeepFusionModel
-from torchtune.models.clip import clip_vision_encoder
 
 from torchchat.utils.build_utils import find_multiple, get_precision
 
@@ -213,7 +216,10 @@ def _llama3_1(cls):
     def _flamingo(cls):
         return cls(
             model_type=ModelType.Flamingo,
-            modules={"encoder": flamingo_vision_encoder, "decoder": flamingo_decoder},
+            modules={
+                "encoder": llama3_2_vision_encoder,
+                "decoder": llama3_2_vision_decoder
+            },
             fusion_class=DeepFusionModel,
         )
 
diff --git a/torchchat/usages/openai_api.py b/torchchat/usages/openai_api.py
index 9490af2ba..e0e309d5b 100644
--- a/torchchat/usages/openai_api.py
+++ b/torchchat/usages/openai_api.py
@@ -17,11 +17,10 @@
 
 import torch
 
-from torchtune.models.llama3_2_vision._convert_weights import padded_collate
-from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform
-
 from PIL import Image
-from torchtune.data import Message
+
+from torchtune.data import Message, padded_collate
+from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform
 
 from torchchat.cli.download import is_model_downloaded, load_model_configs
 from torchchat.generate import Generator, GeneratorArgs
@@ -374,7 +373,9 @@ def chunked_completion(self, completion_request: CompletionRequest):
                         images.append(Image.open(BytesIO(base64_decoded)))
         print("images:", len(images), flush=True)
         if len(images) > 0:
-            transform = llama3_2_vision_transform(str(self.tokenizer_args.tokenizer_path))
+            transform = llama3_2_vision_transform(
+                str(self.tokenizer_args.tokenizer_path)
+            )
             torchtune_messages = self._openai_messages_to_torchtune(
                 completion_request.messages
            )
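
For context, a minimal sketch of the wiring the torchchat/model.py hunk updates: the torchtune builders renamed upstream (flamingo_vision_encoder/flamingo_decoder -> llama3_2_vision_encoder/llama3_2_vision_decoder) are registered under a DeepFusionModel. `ModelRecipe` below is a hypothetical stand-in for torchchat's internal recipe class; only the two imports, the "encoder"/"decoder" keys, and the fusion class appear in the diff itself, and the imports assume the torchtune commit pinned above (d002d45).

# Hedged sketch, not torchchat's actual internals: shows how the renamed
# builders slot into the Flamingo recipe after this change.
from dataclasses import dataclass
from typing import Callable, Dict

from torchtune.models.llama3_2_vision._component_builders import (
    llama3_2_vision_decoder,
    llama3_2_vision_encoder,
)
from torchtune.modules.model_fusion import DeepFusionModel


@dataclass
class ModelRecipe:  # hypothetical stand-in for torchchat's recipe class
    modules: Dict[str, Callable]
    fusion_class: type


FLAMINGO_RECIPE = ModelRecipe(
    modules={
        "encoder": llama3_2_vision_encoder,  # formerly flamingo_vision_encoder
        "decoder": llama3_2_vision_decoder,  # formerly flamingo_decoder
    },
    fusion_class=DeepFusionModel,
)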