117117 "hunyuan-video" : "txt_in.individual_token_refiner.blocks.0.adaLN_modulation.1.bias" ,
118118 "instruct-pix2pix" : "model.diffusion_model.input_blocks.0.0.weight" ,
119119 "lumina2" : ["model.diffusion_model.cap_embedder.0.weight" , "cap_embedder.0.weight" ],
120- "sana" : [
120+ "sana" : [
121121 "blocks.0.cross_attn.q_linear.weight" ,
122122 "blocks.0.cross_attn.q_linear.bias" ,
123123 "blocks.0.cross_attn.kv_linear.weight" ,
@@ -2877,7 +2877,7 @@ def convert_sana_transformer_to_diffusers(checkpoint, **kwargs):
28772877 checkpoint [k .replace ("model.diffusion_model." , "" )] = checkpoint .pop (k )
28782878
28792879 num_layers = list (set (int (k .split ("." , 2 )[1 ]) for k in checkpoint if "blocks" in k ))[- 1 ] + 1 # noqa: C401
2880-
2880+
28812881
28822882 # Positional and patch embeddings.
28832883 checkpoint .pop ("pos_embed" )
@@ -2891,7 +2891,7 @@ def convert_sana_transformer_to_diffusers(checkpoint, **kwargs):
28912891 converted_state_dict ["time_embed.emb.timestep_embedder.linear_2.bias" ] = checkpoint .pop ("t_embedder.mlp.2.bias" )
28922892 converted_state_dict ["time_embed.linear.weight" ] = checkpoint .pop ("t_block.1.weight" )
28932893 converted_state_dict ["time_embed.linear.bias" ] = checkpoint .pop ("t_block.1.bias" )
2894-
2894+
28952895 # Caption Projection.
28962896 checkpoint .pop ("y_embedder.y_embedding" )
28972897 converted_state_dict ["caption_projection.linear_1.weight" ] = checkpoint .pop ("y_embedder.y_proj.fc1.weight" )
@@ -2935,10 +2935,10 @@ def convert_sana_transformer_to_diffusers(checkpoint, **kwargs):
29352935 converted_state_dict [f"transformer_blocks.{ i } .ff.conv_depth.weight" ] = checkpoint .pop (f"blocks.{ i } .mlp.depth_conv.conv.weight" )
29362936 converted_state_dict [f"transformer_blocks.{ i } .ff.conv_depth.bias" ] = checkpoint .pop (f"blocks.{ i } .mlp.depth_conv.conv.bias" )
29372937 converted_state_dict [f"transformer_blocks.{ i } .ff.conv_point.weight" ] = checkpoint .pop (f"blocks.{ i } .mlp.point_conv.conv.weight" )
2938-
2938+
29392939 # Final layer
29402940 converted_state_dict ["proj_out.weight" ] = checkpoint .pop ("final_layer.linear.weight" )
29412941 converted_state_dict ["proj_out.bias" ] = checkpoint .pop ("final_layer.linear.bias" )
29422942 converted_state_dict ["scale_shift_table" ] = checkpoint .pop ("final_layer.scale_shift_table" )
29432943
2944- return converted_state_dict
2944+ return converted_state_dict
0 commit comments