Commit 56c45d5

Bug fix for drop path decay rate in swin transformer (#34291)

* potential bug fix for drop path
* variable name change
* forgot to rename the variables
* back to original
* modify dpr properly
* check_copies auto fix
* corresponding swin2 changes
* auto fix
* linting
* default value for drop_path_rate as 0.0
* Update src/transformers/models/glm/modeling_glm.py
* maskformer fix
* ruff format
* changes made to tf code as well
* lint

---------

Co-authored-by: abhijit deo <[email protected]>

1 parent 0ab0a42 commit 56c45d5
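
Why the change matters: the Swin-family encoders compute a per-layer stochastic-depth (drop path) schedule, a linear ramp from 0 up to config.drop_path_rate across all layers, and hand one slice of it to each stage. Before this commit, each *Layer ignored that scheduled value and built its DropPath module from config.drop_path_rate directly, so every layer shared the maximum rate. The diffs below add a drop_path_rate argument to each layer and pass drop_path[i] down from the stage. The snippet below is a minimal sketch of that schedule, not the library code; the drop_path_rate and depths values are illustrative (roughly a Swin-T-like config).

import torch

# Illustrative values, not taken from any checkpoint:
drop_path_rate = 0.1   # plays the role of config.drop_path_rate
depths = [2, 2, 6, 2]  # plays the role of config.depths (layers per stage)

# One rate per layer across the whole network, ramping linearly from 0 to drop_path_rate.
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

# Slice the schedule per stage, mirroring how the encoder hands a drop_path list to each stage.
per_stage = [dpr[sum(depths[:i]) : sum(depths[: i + 1])] for i in range(len(depths))]

for i, rates in enumerate(per_stage):
    # After the fix, layer j of stage i is constructed with drop_path_rate=rates[j];
    # before the fix, every layer effectively used drop_path_rate (0.1 here).
    print(f"stage {i}: {[round(r, 3) for r in rates]}")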

File tree: 7 files changed, +32 −17 lines

src/transformers/models/clap/modeling_clap.py
Lines changed: 3 additions & 2 deletions

@@ -575,15 +575,15 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
 # Copied from transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->ClapDropPath, Swin->ClapAudio
 class ClapAudioLayer(nn.Module):
-    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
+    def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
         super().__init__()
         self.chunk_size_feed_forward = config.chunk_size_feed_forward
         self.shift_size = shift_size
         self.window_size = config.window_size
         self.input_resolution = input_resolution
         self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
         self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size)
-        self.drop_path = ClapDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+        self.drop_path = ClapDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
         self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
         self.intermediate = ClapAudioIntermediate(config, dim)
         self.output = ClapAudioOutput(config, dim)
@@ -712,6 +712,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d
                     dim=dim,
                     input_resolution=input_resolution,
                     num_heads=num_heads,
+                    drop_path_rate=drop_path[i],
                     shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                 )
                 for i in range(depth)

src/transformers/models/donut/modeling_donut_swin.py
Lines changed: 3 additions & 2 deletions

@@ -558,15 +558,15 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
 # Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->DonutSwin
 class DonutSwinLayer(nn.Module):
-    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
+    def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
         super().__init__()
         self.chunk_size_feed_forward = config.chunk_size_feed_forward
         self.shift_size = shift_size
         self.window_size = config.window_size
         self.input_resolution = input_resolution
         self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
         self.attention = DonutSwinAttention(config, dim, num_heads, window_size=self.window_size)
-        self.drop_path = DonutSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+        self.drop_path = DonutSwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
         self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
         self.intermediate = DonutSwinIntermediate(config, dim)
         self.output = DonutSwinOutput(config, dim)
@@ -695,6 +695,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d
                     dim=dim,
                     input_resolution=input_resolution,
                     num_heads=num_heads,
+                    drop_path_rate=drop_path[i],
                     shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                 )
                 for i in range(depth)

src/transformers/models/maskformer/modeling_maskformer_swin.py
Lines changed: 3 additions & 4 deletions

@@ -520,16 +520,14 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
 
 class MaskFormerSwinLayer(nn.Module):
-    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
+    def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
         super().__init__()
         self.shift_size = shift_size
         self.window_size = config.window_size
         self.input_resolution = input_resolution
         self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
         self.attention = MaskFormerSwinAttention(config, dim, num_heads, self.window_size)
-        self.drop_path = (
-            MaskFormerSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
-        )
+        self.drop_path = MaskFormerSwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
         self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
         self.intermediate = MaskFormerSwinIntermediate(config, dim)
         self.output = MaskFormerSwinOutput(config, dim)
@@ -644,6 +642,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d
                     dim=dim,
                     input_resolution=input_resolution,
                     num_heads=num_heads,
+                    drop_path_rate=drop_path[i],
                     shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                 )
                 for i in range(depth)

src/transformers/models/swin/modeling_swin.py
Lines changed: 3 additions & 2 deletions

@@ -635,15 +635,15 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
 
 class SwinLayer(nn.Module):
-    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
+    def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
         super().__init__()
         self.chunk_size_feed_forward = config.chunk_size_feed_forward
         self.shift_size = shift_size
         self.window_size = config.window_size
         self.input_resolution = input_resolution
         self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
         self.attention = SwinAttention(config, dim, num_heads, window_size=self.window_size)
-        self.drop_path = SwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+        self.drop_path = SwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
         self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
         self.intermediate = SwinIntermediate(config, dim)
         self.output = SwinOutput(config, dim)
@@ -771,6 +771,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d
                     dim=dim,
                     input_resolution=input_resolution,
                     num_heads=num_heads,
+                    drop_path_rate=drop_path[i],
                     shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                 )
                 for i in range(depth)

src/transformers/models/swin/modeling_tf_swin.py
Lines changed: 11 additions & 3 deletions

@@ -742,7 +742,14 @@ def build(self, input_shape=None):
 
 class TFSwinLayer(keras.layers.Layer):
     def __init__(
-        self, config, dim, input_resolution: Tuple[int, int], num_heads: int, shift_size: int = 0, **kwargs
+        self,
+        config,
+        dim,
+        input_resolution: Tuple[int, int],
+        num_heads: int,
+        drop_path_rate: float = 0.0,
+        shift_size: int = 0,
+        **kwargs,
     ) -> None:
         super().__init__(**kwargs)
         self.chunk_size_feed_forward = config.chunk_size_feed_forward
@@ -754,8 +761,8 @@ def __init__(
         self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
         self.attention = TFSwinAttention(config, dim, num_heads, name="attention")
         self.drop_path = (
-            TFSwinDropPath(config.drop_path_rate, name="drop_path")
-            if config.drop_path_rate > 0.0
+            TFSwinDropPath(drop_path_rate, name="drop_path")
+            if drop_path_rate > 0.0
             else keras.layers.Activation("linear", name="drop_path")
         )
         self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
@@ -913,6 +920,7 @@ def __init__(
                 input_resolution=input_resolution,
                 num_heads=num_heads,
                 shift_size=0 if (i % 2 == 0) else config.window_size // 2,
+                drop_path_rate=drop_path[i],
                 name=f"blocks.{i}",
             )
             for i in range(depth)

src/transformers/models/swin2sr/modeling_swin2sr.py
Lines changed: 4 additions & 2 deletions

@@ -482,7 +482,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
 # Copied from transformers.models.swinv2.modeling_swinv2.Swinv2Layer with Swinv2->Swin2SR
 class Swin2SRLayer(nn.Module):
-    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretrained_window_size=0):
+    def __init__(
+        self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0, pretrained_window_size=0
+    ):
         super().__init__()
         self.input_resolution = input_resolution
         window_size, shift_size = self._compute_window_shift(
@@ -500,7 +502,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretr
             else (pretrained_window_size, pretrained_window_size),
         )
         self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
-        self.drop_path = Swin2SRDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+        self.drop_path = Swin2SRDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
         self.intermediate = Swin2SRIntermediate(config, dim)
         self.output = Swin2SROutput(config, dim)
         self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)

src/transformers/models/swinv2/modeling_swinv2.py
Lines changed: 5 additions & 2 deletions

@@ -683,7 +683,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
 
 class Swinv2Layer(nn.Module):
-    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretrained_window_size=0):
+    def __init__(
+        self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0, pretrained_window_size=0
+    ):
         super().__init__()
         self.input_resolution = input_resolution
         window_size, shift_size = self._compute_window_shift(
@@ -701,7 +703,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretr
             else (pretrained_window_size, pretrained_window_size),
        )
         self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
-        self.drop_path = Swinv2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+        self.drop_path = Swinv2DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
         self.intermediate = Swinv2Intermediate(config, dim)
         self.output = Swinv2Output(config, dim)
         self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
@@ -819,6 +821,7 @@ def __init__(
                     dim=dim,
                     input_resolution=input_resolution,
                     num_heads=num_heads,
+                    drop_path_rate=drop_path[i],
                     shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                     pretrained_window_size=pretrained_window_size,
                 )
