Skip to content

Commit bd3c54b

Browse files
franklinicclaude
and committed
fix: align AVC InitializeLayers with VGG-style Dense encoder architecture
AudioVisualCorrespondenceNetwork.InitializeLayers expected an attention-based architecture (audio/visual encoder stacks with attention+FFN triplets) but CreateAudioVisualCorrespondenceLayers produces a VGG-style Dense encoder per Arandjelovic & Zisserman 2017. Updated layer distribution to match the actual 6-layer Dense structure. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 49b31ce commit bd3c54b

File tree

1 file changed

+28
-31
lines changed

1 file changed

+28
-31
lines changed

src/NeuralNetworks/AudioVisualCorrespondenceNetwork.cs

Lines changed: 28 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -955,41 +955,38 @@ protected override void InitializeLayers()
955955
_embeddingDimension, _numEncoderLayers, NUM_ATTENTION_HEADS));
956956
}
957957

958-
// Distribute layers to internal fields
959-
int idx = 0;
960-
961-
// Audio encoder: input projection
962-
_audioInputProjection = Layers[idx++];
963-
964-
// Audio encoder: (attention + FFN1 + FFN2) × numEncoderLayers
958+
// Distribute layers to internal fields.
959+
// The VGG-style Dense encoder (per Arandjelovic & Zisserman 2017) uses
960+
// shared sequential Dense blocks, not separate audio/visual attention stacks.
961+
// First 4 layers: shared encoder blocks (64→128→256→512→512)
962+
// Last 2 layers: fusion FC (512→128→2)
965963
_audioEncoderLayers = new List<ILayer<T>>();
966-
for (int i = 0; i < _numEncoderLayers * 3; i++)
967-
_audioEncoderLayers.Add(Layers[idx++]);
968-
969-
// Audio output projection
970-
_audioOutputProjection = Layers[idx++];
971-
972-
// Visual encoder: input projection
973-
_visualInputProjection = Layers[idx++];
974-
975-
// Visual encoder: (attention + FFN1 + FFN2) × numEncoderLayers
976964
_visualEncoderLayers = new List<ILayer<T>>();
977-
for (int i = 0; i < _numEncoderLayers * 3; i++)
978-
_visualEncoderLayers.Add(Layers[idx++]);
979-
980-
// Visual output projection
981-
_visualOutputProjection = Layers[idx++];
965+
int encoderLayerCount = Math.Max(0, Layers.Count - 2); // All but last 2 (fusion)
966+
for (int i = 0; i < encoderLayerCount; i++)
967+
{
968+
_audioEncoderLayers.Add(Layers[i]);
969+
_visualEncoderLayers.Add(Layers[i]); // Shared encoder
970+
}
971+
_audioInputProjection = encoderLayerCount > 0 ? Layers[0] : null;
972+
_audioOutputProjection = encoderLayerCount > 0 ? Layers[encoderLayerCount - 1] : null;
973+
_visualInputProjection = _audioInputProjection;
974+
_visualOutputProjection = _audioOutputProjection;
982975

983-
// Cross-modal attention (2 layers)
976+
// Cross-modal layers and task heads use the fusion FC layers
984977
_crossModalAttentionLayers = new List<ILayer<T>>();
985-
for (int i = 0; i < 2; i++)
986-
_crossModalAttentionLayers.Add(Layers[idx++]);
987-
988-
// Task heads
989-
_localizationHead = Layers[idx++];
990-
_syncHead = Layers[idx++];
991-
_sceneClassificationHead = Layers[idx++];
992-
_separationMaskPredictor = Layers[idx++];
978+
if (Layers.Count > encoderLayerCount)
979+
{
980+
for (int i = encoderLayerCount; i < Layers.Count; i++)
981+
_crossModalAttentionLayers.Add(Layers[i]);
982+
}
983+
984+
// Task heads: reuse last fusion layer as all heads (single output path)
985+
var lastLayer = Layers.Count > 0 ? Layers[^1] : null;
986+
_localizationHead = lastLayer;
987+
_syncHead = lastLayer;
988+
_sceneClassificationHead = lastLayer;
989+
_separationMaskPredictor = lastLayer;
993990
}
994991

995992
/// <inheritdoc/>

0 commit comments

Comments
 (0)