Commit d1e7779

Merge branch 'main' into grad_checkpointing

2 parents b0b28e2 + 9d294cd

24 files changed, +5134 −296 lines

README.md

Lines changed: 19 additions & 0 deletions
@@ -12,6 +12,25 @@

 ## What's New

+## June 5, 2025
+* Initial NaFlexVit model code. NaFlexVit is a Vision Transformer with:
+  1. Encapsulated embedding and position encoding in a single module
+  2. Support for nn.Linear patch embedding on pre-patchified (dictionary) inputs
+  3. Support for NaFlex variable aspect, variable resolution (SigLip-2: https://arxiv.org/abs/2502.14786)
+  4. Support for FlexiViT variable patch size (https://arxiv.org/abs/2212.08013)
+  5. Support for NaViT fractional/factorized position embedding (https://arxiv.org/abs/2307.06304)
+* Existing ViT models in `vision_transformer.py` can be loaded into the NaFlexVit model by adding the `use_naflex=True` flag to `create_model` (see the sketch after this diff)
+  * Some native weights coming soon
+* A full NaFlex data pipeline is available that allows training / fine-tuning / evaluating with variable aspect / size images
+  * To enable in `train.py` and `validate.py`, add the `--naflex-loader` arg; it must be used with a NaFlexVit model
+  * To evaluate an existing (classic) ViT loaded into the NaFlexVit model w/ the NaFlex data pipeline:
+    * `python validate.py /imagenet --amp -j 8 --model vit_base_patch16_224 --model-kwargs use_naflex=True --naflex-loader --naflex-max-seq-len 256`
+* The training script has some extra args worth noting
+  * The `--naflex-train-seq-lens` argument specifies which sequence lengths to randomly pick from per batch during training
+  * The `--naflex-max-seq-len` argument sets the target sequence length for validation
+  * Adding `--model-kwargs enable_patch_interpolator=True --naflex-patch-sizes 12 16 24` will enable random patch size selection per batch w/ interpolation
+  * The `--naflex-loss-scale` arg changes the loss scaling mode per batch relative to the batch size; `timm` NaFlex loading changes the batch size for each seq len
+
 ## May 28, 2025
 * Add a number of small/fast models thanks to https://github.com/brianhou0208
   * SwiftFormer - [(ICCV2023) SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://github.com/Amshaker/SwiftFormer)
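
A minimal sketch of the `create_model` usage described in the README entry above; the model name and `pretrained` flag are illustrative choices, not taken from the commit:

```python
import timm

# Any classic ViT from vision_transformer.py can be routed through the
# NaFlexVit implementation via the use_naflex flag from the release notes.
model = timm.create_model(
    'vit_base_patch16_224',  # an existing ViT; its weights load into NaFlexVit
    pretrained=True,
    use_naflex=True,
)
```

For training, the flags would combine along these lines (the sequence-length values and the space-separated list format are assumptions, modeled on the `--naflex-patch-sizes 12 16 24` example): `python train.py /imagenet --model vit_base_patch16_224 --model-kwargs use_naflex=True --naflex-loader --naflex-train-seq-lens 256 576 1024`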

tests/test_models.py

Lines changed: 3 additions & 3 deletions
@@ -56,12 +56,12 @@
     'regnet', 'byobnet', 'byoanet', 'mlp_mixer', 'hiera', 'fastvit', 'hieradet_sam2', 'aimv2*', 'tnt',
     'tiny_vit', 'vovnet', 'tresnet', 'rexnet', 'resnetv2', 'repghost', 'repvit', 'pvt_v2', 'nextvit', 'nest',
     'mambaout', 'inception_next', 'inception_v4', 'hgnet', 'gcvit', 'focalnet', 'efficientformer_v2', 'edgenext',
-    'davit', 'rdnet', 'convnext', 'pit', 'starnet', 'shvit', 'fasternet', 'swiftformer', 'ghostnet',
+    'davit', 'rdnet', 'convnext', 'pit', 'starnet', 'shvit', 'fasternet', 'swiftformer', 'ghostnet', 'naflexvit'
 ]

 # transformer / hybrid models don't support full set of spatial / feature APIs and/or have spatial output.
 NON_STD_FILTERS = [
-    'vit_*', 'tnt_*', 'pit_*', 'coat_*', 'cait_*', '*mixer_*', 'gmlp_*', 'resmlp_*', 'twins_*',
+    'vit_*', 'naflexvit*', 'tnt_*', 'pit_*', 'coat_*', 'cait_*', '*mixer_*', 'gmlp_*', 'resmlp_*', 'twins_*',
     'convit_*', 'levit*', 'visformer*', 'deit*', 'xcit_*', 'crossvit_*', 'beit*', 'aimv2*', 'swiftformer_*',
     'poolformer_*', 'volo_*', 'sequencer2d_*', 'mvitv2*', 'gcvit*', 'efficientformer*', 'sam_hiera*',
     'eva_*', 'flexivit*', 'eva02*', 'samvit_*', 'efficientvit_m*', 'tiny_vit_*', 'hiera_*', 'vitamin*', 'test_vit*',
@@ -81,7 +81,7 @@
 EXCLUDE_FILTERS = ['*enormous*']
 NON_STD_EXCLUDE_FILTERS = ['*gigantic*', '*enormous*', '*_3b_*']

-EXCLUDE_JIT_FILTERS = ['hiera_*']
+EXCLUDE_JIT_FILTERS = ['hiera_*', '*naflex*']

 TARGET_FWD_SIZE = MAX_FWD_SIZE = 384
 TARGET_BWD_SIZE = 128

timm/data/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -8,6 +8,18 @@
 from .imagenet_info import ImageNetInfo, infer_imagenet_subset
 from .loader import create_loader
 from .mixup import Mixup, FastCollateMixup
+from .naflex_dataset import NaFlexMapDatasetWrapper, calculate_naflex_batch_size
+from .naflex_loader import create_naflex_loader
+from .naflex_mixup import NaFlexMixup, pairwise_mixup_target, mix_batch_variable_size
+from .naflex_transforms import (
+    ResizeToSequence,
+    CenterCropToSequence,
+    RandomCropToSequence,
+    RandomResizedCropToSequence,
+    ResizeKeepRatioToSequence,
+    Patchify,
+    patchify_image,
+)
 from .readers import create_reader
 from .readers import get_img_extensions, is_img_extension, set_img_extensions, add_img_extensions, del_img_extensions
 from .real_labels import RealLabelsImagenet
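
These names now resolve at the `timm.data` package level; a quick import check, purely illustrative:

```python
# Every name below comes straight from the re-exports added above.
from timm.data import (
    NaFlexMapDatasetWrapper,
    calculate_naflex_batch_size,
    create_naflex_loader,
    NaFlexMixup,
    Patchify,
    patchify_image,
)
```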

timm/data/loader.py

Lines changed: 5 additions & 1 deletion
@@ -33,6 +33,7 @@ def fast_collate(batch):
     if isinstance(batch[0][0], tuple):
         # This branch 'deinterleaves' and flattens tuples of input tensors into one tensor ordered by position
         # such that all tuple members at position n end up in the nth chunk of torch.split(tensor, batch_size)
+        is_np = isinstance(batch[0][0][0], np.ndarray)
         inner_tuple_size = len(batch[0][0])
         flattened_batch_size = batch_size * inner_tuple_size
         targets = torch.zeros(flattened_batch_size, dtype=torch.int64)
@@ -41,7 +42,10 @@ def fast_collate(batch):
             assert len(batch[i][0]) == inner_tuple_size  # all input tensor tuples must be same length
             for j in range(inner_tuple_size):
                 targets[i + j * batch_size] = batch[i][1]
-                tensor[i + j * batch_size] += torch.from_numpy(batch[i][0][j])
+                if is_np:
+                    tensor[i + j * batch_size] += torch.from_numpy(batch[i][0][j])
+                else:
+                    tensor[i + j * batch_size] += batch[i][0][j]
         return tensor, targets
     elif isinstance(batch[0][0], np.ndarray):
         targets = torch.tensor([b[1] for b in batch], dtype=torch.int64)
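
A small sketch of the tuple branch above with torch.Tensor members, which now take the new non-numpy path; shapes and labels are illustrative:

```python
import torch
from timm.data.loader import fast_collate

def img(v: int) -> torch.Tensor:
    # A dummy uint8 CHW image filled with a constant value.
    return torch.full((3, 8, 8), v, dtype=torch.uint8)

# Two samples, each a 2-tuple of images (e.g. two augmented views) plus a label.
batch = [((img(0), img(1)), 0), ((img(2), img(3)), 1)]

tensor, targets = fast_collate(batch)  # tensor: (4, 3, 8, 8), targets: (4,)
view0, view1 = torch.split(tensor, 2)  # tuple position n lands in the nth chunk
```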

timm/data/mixup.py

Lines changed: 52 additions & 19 deletions
@@ -229,29 +229,41 @@ def _mix_elem_collate(self, output, batch, half=False):
         num_elem = batch_size // 2 if half else batch_size
         assert len(output) == num_elem
         lam_batch, use_cutmix = self._params_per_elem(num_elem)
+        is_np = isinstance(batch[0][0], np.ndarray)
+
         for i in range(num_elem):
             j = batch_size - i - 1
             lam = lam_batch[i]
             mixed = batch[i][0]
             if lam != 1.:
                 if use_cutmix[i]:
                     if not half:
-                        mixed = mixed.copy()
+                        mixed = mixed.copy() if is_np else mixed.clone()
                     (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
-                        output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
+                        output.shape,
+                        lam,
+                        ratio_minmax=self.cutmix_minmax,
+                        correct_lam=self.correct_lam,
+                    )
                     mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh]
                     lam_batch[i] = lam
                 else:
-                    mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
-                    np.rint(mixed, out=mixed)
-            output[i] += torch.from_numpy(mixed.astype(np.uint8))
+                    if is_np:
+                        mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
+                        np.rint(mixed, out=mixed)
+                    else:
+                        mixed = mixed.float() * lam + batch[j][0].float() * (1 - lam)
+                        torch.round(mixed, out=mixed)
+            output[i] += torch.from_numpy(mixed.astype(np.uint8)) if is_np else mixed.byte()
         if half:
             lam_batch = np.concatenate((lam_batch, np.ones(num_elem)))
         return torch.tensor(lam_batch).unsqueeze(1)

     def _mix_pair_collate(self, output, batch):
         batch_size = len(batch)
         lam_batch, use_cutmix = self._params_per_elem(batch_size // 2)
+        is_np = isinstance(batch[0][0], np.ndarray)
+
         for i in range(batch_size // 2):
             j = batch_size - i - 1
             lam = lam_batch[i]
@@ -261,39 +273,60 @@ def _mix_pair_collate(self, output, batch):
             if lam < 1.:
                 if use_cutmix[i]:
                     (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
-                        output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
-                    patch_i = mixed_i[:, yl:yh, xl:xh].copy()
+                        output.shape,
+                        lam,
+                        ratio_minmax=self.cutmix_minmax,
+                        correct_lam=self.correct_lam,
+                    )
+                    patch_i = mixed_i[:, yl:yh, xl:xh].copy() if is_np else mixed_i[:, yl:yh, xl:xh].clone()
                     mixed_i[:, yl:yh, xl:xh] = mixed_j[:, yl:yh, xl:xh]
                     mixed_j[:, yl:yh, xl:xh] = patch_i
                     lam_batch[i] = lam
                 else:
-                    mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam)
-                    mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam)
-                    mixed_i = mixed_temp
-                    np.rint(mixed_j, out=mixed_j)
-                    np.rint(mixed_i, out=mixed_i)
-            output[i] += torch.from_numpy(mixed_i.astype(np.uint8))
-            output[j] += torch.from_numpy(mixed_j.astype(np.uint8))
+                    if is_np:
+                        mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam)
+                        mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam)
+                        mixed_i = mixed_temp
+                        np.rint(mixed_j, out=mixed_j)
+                        np.rint(mixed_i, out=mixed_i)
+                    else:
+                        mixed_temp = mixed_i.float() * lam + mixed_j.float() * (1 - lam)
+                        mixed_j = mixed_j.float() * lam + mixed_i.float() * (1 - lam)
+                        mixed_i = mixed_temp
+                        torch.round(mixed_j, out=mixed_j)
+                        torch.round(mixed_i, out=mixed_i)
+            output[i] += torch.from_numpy(mixed_i.astype(np.uint8)) if is_np else mixed_i.byte()
+            output[j] += torch.from_numpy(mixed_j.astype(np.uint8)) if is_np else mixed_j.byte()
         lam_batch = np.concatenate((lam_batch, lam_batch[::-1]))
         return torch.tensor(lam_batch).unsqueeze(1)

     def _mix_batch_collate(self, output, batch):
         batch_size = len(batch)
         lam, use_cutmix = self._params_per_batch()
+        is_np = isinstance(batch[0][0], np.ndarray)
+
         if use_cutmix:
             (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
-                output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
+                output.shape,
+                lam,
+                ratio_minmax=self.cutmix_minmax,
+                correct_lam=self.correct_lam,
+            )
         for i in range(batch_size):
             j = batch_size - i - 1
             mixed = batch[i][0]
             if lam != 1.:
                 if use_cutmix:
-                    mixed = mixed.copy()  # don't want to modify the original while iterating
+                    mixed = mixed.copy() if is_np else mixed.clone()  # don't modify the original while iterating
                     mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh]
                 else:
-                    mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
-                    np.rint(mixed, out=mixed)
-            output[i] += torch.from_numpy(mixed.astype(np.uint8))
+                    if is_np:
+                        mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
+                        np.rint(mixed, out=mixed)
+                    else:
+                        mixed = mixed.float() * lam + batch[j][0].float() * (1 - lam)
+                        torch.round(mixed, out=mixed)
+            output[i] += torch.from_numpy(mixed.astype(np.uint8)) if is_np else mixed.byte()
         return lam

     def __call__(self, batch, _=None):
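
A brief sketch exercising the updated collate path end to end with torch.Tensor inputs; the hyperparameters and shapes are illustrative:

```python
import torch
from timm.data.mixup import FastCollateMixup

# FastCollateMixup acts as a DataLoader collate_fn. After this change the
# per-sample images may be uint8 torch.Tensors as well as np.uint8 arrays.
collate = FastCollateMixup(mixup_alpha=0.8, cutmix_alpha=1.0, num_classes=10)
batch = [
    (torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8), i % 10)
    for i in range(8)  # batch size must be even
]
inputs, targets = collate(batch)  # inputs: uint8 (8, 3, 32, 32); targets: (8, 10) soft labels
```

In the default 'batch' mode this drives `_mix_batch_collate`, which now clones tensors where it previously copied arrays and rounds with `torch.round` in place of `np.rint`.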
