File tree Expand file tree Collapse file tree 5 files changed +38
-14
lines changed
examples/models/llama3_2_vision Expand file tree Collapse file tree 5 files changed +38
-14
lines changed Original file line number Diff line number Diff line change 7272 conda activate "${CONDA_ENV}"
7373
7474 MODEL_NAME=${{ matrix.model }}
75+ # Install requirements for llama vision
76+ if [[ "$MODEL_NAME" == "llama3_2_vision_encoder" ]]; then
77+ bash examples/models/llama3_2_vision/install_requirements.sh
78+ fi
7579 BUILD_TOOL=${{ matrix.build-tool }}
7680 BACKEND=${{ matrix.backend }}
7781 DEMO_BACKEND_DELEGATION=${{ matrix.demo_backend_delegation }}
Original file line number Diff line number Diff line change 5858 bash .ci/scripts/setup-conda.sh
5959 # Setup MacOS dependencies as there is no Docker support on MacOS atm
6060 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
61- # Build and test xecutorch
61+ # Install requirements for llama vision
62+ if [[ "$MODEL_NAME" == "llama3_2_vision_encoder" ]]; then
63+ ${CONDA_RUN} bash examples/models/llama3_2_vision/install_requirements.sh
64+ fi
65+ # Build and test executorch
6266 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_model.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"
6367
6468 test-custom-ops-macos :
Original file line number Diff line number Diff line change 1+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2+ # All rights reserved.
3+ #
4+ # This source code is licensed under the BSD-style license found in the
5+ # LICENSE file in the root directory of this source tree.
6+
7+ from .vision_encoder import FlamingoVisionEncoderModel , VisionEncoderConfig
8+
9+ __all__ = [
10+ "FlamingoVisionEncoderModel" ,
11+ "VisionEncoderConfig" ,
12+ ]
Original file line number Diff line number Diff line change 1616)
1717from torchtune .models .llama3_2_vision ._component_builders import llama3_2_vision_encoder
1818
19- max_seq_len = 8192
20- in_channels = 3
21- tile_size = 560
22- max_num_tiles = 4
23- # how many tokens per image generated by the vision encoder
24- tokens_per_image = 6404
25- # how many images to cache in the kv cache in cross attention
26- kv_cache_image_num = 1
27- # maximum number of tokens generated by encoder and thus stored in the kv cache in cross attention
28- encoder_max_seq_len = tokens_per_image * kv_cache_image_num
29-
3019
3120@dataclass
3221class VisionEncoderConfig :
@@ -42,11 +31,26 @@ class VisionEncoderConfig:
4231 in_channels : int = 3
4332
4433
# Reduced-size vision-encoder configuration used for CI testing only — much
# smaller than the production defaults so the model builds and exports quickly.
# NOTE(review): the original comment said "8 layers", but clip_num_layers is 6
# here (num_heads is 8) — confirm which quantity the note referred to.
demo_config: VisionEncoderConfig = VisionEncoderConfig(
    patch_size=14,
    num_heads=8,
    clip_embed_dim=768,
    clip_num_layers=6,
    clip_hidden_states=[1, 3, 5],
    decoder_embed_dim=1024,
    num_layers_projection=4,
    tile_size=224,
    max_num_tiles=4,
    in_channels=3,
)
48+
4549class FlamingoVisionEncoderModel (EagerModelBase ):
4650 def __init__ (self , config : Optional [VisionEncoderConfig ] = None ):
4751 super ().__init__ ()
4852 if config is None :
49- config = VisionEncoderConfig ()
53+ config = demo_config
5054 self .config = config
5155 self .model = llama3_2_vision_encoder (
5256 patch_size = config .patch_size ,
Original file line number Diff line number Diff line change @@ -17,7 +17,7 @@ addopts =
1717 # examples
1818 examples/models/llama/tests
1919 examples/models/llama3_2_vision/preprocess
20- # examples/models/llama3_2_vision/vision_encoder/test TODO: enable this
20+ examples/models/llama3_2_vision/vision_encoder/test
2121 # examples/models/llava/test TODO: enable this
2222 # exir
2323 exir/_serialize/test
You can’t perform that action at this time.
0 commit comments