
Commit b18e357

mshabunin authored and vpisarev committed
dnn: fixed GEMM1T AVX2 implementation (#1231)
1 parent 81283e9 commit b18e357

2 files changed (+3, −3 lines)

modules/dnn/src/layers/fully_connected_layer.cpp

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ class FullyConnectedLayerImpl : public InnerProductLayer
 
     for( k = 0; k < vecsize; k += 4 )
     {
-        vfloat32x4 v = v_load_aligned(sptr + k);
+        vfloat32x4 v = v_load(sptr + k);
         vs0 += v*v_load_aligned(wptr + k);
         vs1 += v*v_load_aligned(wptr + wstep + k);
         vs2 += v*v_load_aligned(wptr + wstep*2 + k);
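
The change swaps the aligned load of the input row for an unaligned one: in OpenCV's universal intrinsics, v_load_aligned assumes the pointer is aligned to the vector size (16 bytes for four floats), while v_load accepts any address. The weight loads stay aligned, presumably because the weights buffer and wstep preserve that alignment while the input row pointer does not. The sketch below is an illustration only, not code from the repository; dot4 is a hypothetical helper, and it assumes the fixed-width v_float32x4 API from opencv2/core/hal/intrin.hpp, vecsize a multiple of 4, and a 16-byte-aligned weights pointer.

// Sketch only (hypothetical helper, not repository code): unaligned load of
// the input row, aligned load of the weight row.
#include <opencv2/core/hal/intrin.hpp>

float dot4(const float* sptr, const float* wptr, int vecsize)
{
    // Assumptions: vecsize is a multiple of 4; wptr comes from an aligned
    // allocation; sptr may be an arbitrary (unaligned) row pointer.
    cv::v_float32x4 vs = cv::v_setzero_f32();
    for (int k = 0; k < vecsize; k += 4)
    {
        cv::v_float32x4 v = cv::v_load(sptr + k);               // no alignment requirement
        vs = cv::v_muladd(v, cv::v_load_aligned(wptr + k), vs); // requires 16-byte alignment
    }
    return cv::v_reduce_sum(vs); // horizontal sum of the 4 lanes
}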

modules/dnn/src/layers/layers_common.avx2.cpp

Lines changed: 2 additions & 2 deletions
@@ -204,7 +204,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
 
     for( int k = 0; k < vecsize; k += 8, wptr += 8 )
     {
-        __m256 v = _mm256_load_ps(vec + k);
+        __m256 v = _mm256_loadu_ps(vec + k);
 
         vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
         vs1 = _mm256_fmadd_ps(_mm256_load_ps(wptr + wstep), v, vs1);

@@ -237,7 +237,7 @@ void fastGEMM1T_avx2( const float* vec, const float* weights,
 
     for( int k = 0; k < vecsize; k += 8, wptr += 8 )
     {
-        __m256 v = _mm256_load_ps(vec + k);
+        __m256 v = _mm256_loadu_ps(vec + k);
        vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);
     }
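
The same kind of fix is applied in the AVX2 path: _mm256_load_ps requires a 32-byte-aligned address and can fault otherwise, whereas _mm256_loadu_ps has no alignment requirement. The weight loads are left aligned, mirroring the commit, since only the input vector pointer is apparently not guaranteed to be aligned. The sketch below is illustrative only (dot8_avx2 is a hypothetical helper, not repository code); it assumes vecsize is a multiple of 8, a 32-byte-aligned weights pointer, and compilation with AVX2/FMA enabled (e.g. -mavx2 -mfma).

// Sketch only (hypothetical helper): unaligned load of the input vector,
// aligned loads of the weights, fused multiply-add accumulation.
#include <immintrin.h>

float dot8_avx2(const float* vec, const float* wptr, int vecsize)
{
    // Assumptions: vecsize is a multiple of 8; wptr is 32-byte aligned.
    __m256 vs0 = _mm256_setzero_ps();
    for (int k = 0; k < vecsize; k += 8, wptr += 8)
    {
        __m256 v = _mm256_loadu_ps(vec + k);                  // unaligned-safe load
        vs0 = _mm256_fmadd_ps(_mm256_load_ps(wptr), v, vs0);  // aligned load of weights
    }
    // Horizontal sum of the 8 lanes.
    __m128 lo = _mm256_castps256_ps128(vs0);
    __m128 hi = _mm256_extractf128_ps(vs0, 1);
    __m128 s  = _mm_add_ps(lo, hi);
    s = _mm_hadd_ps(s, s);
    s = _mm_hadd_ps(s, s);
    return _mm_cvtss_f32(s);
}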
