@@ -67,11 +67,6 @@ void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
67
67
68
68
static_assert (view.stride (0 ) == 1 ); // contiguous in kx
69
69
70
- // permutation indices
71
- static_assert (C_WIDTH == 4 ); // idx are hardcoded
72
- VIdx const lower_idx = {0 , 0 , 1 , 1 , 2 , 2 , 3 , 3 };
73
- VIdx const upper_idx = {4 , 4 , 5 , 5 , 6 , 6 , 7 , 7 };
74
-
75
70
for (int ky = 0 ; ky < grid.KY ; ++ky) {
76
71
// avoid std::vector dereference inside loop:
77
72
// broadcast fy value into vector
@@ -93,11 +88,11 @@ void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
93
88
VReal vfx_full{fx_addr};
94
89
95
90
// Permute lower factors, multiply lower input
96
- VReal lower_fx = _mm512_permutex2var_pd (vfx_full, lower_idx, vfx_full);
91
+ VReal lower_fx = duplicateLower ( vfx_full);
97
92
eve::store (input_lower * lower_fx * vfy, view_addr);
98
93
99
94
// Permute upper factors, multiply upper input
100
- VReal upper_fx = _mm512_permutex2var_pd (vfx_full, upper_idx, vfx_full);
95
+ VReal upper_fx = duplicateUpper ( vfx_full);
101
96
eve::store (input_upper * upper_fx * vfy, upper_view_addr);
102
97
}
103
98
@@ -107,4 +102,26 @@ void HouLiFilterCached1DVector::operator()(Grid::View::C_XY view) const {
107
102
}
108
103
}
109
104
}
105
+
106
+ #ifdef AVX512_ENABLED
107
+ HouLiFilterCached1DVector::VReal HouLiFilterCached1DVector::duplicateLower (VReal src) const {
108
+ static_assert (R_WIDTH == 8 );
109
+ const VIdx lower_idx{0 , 0 , 1 , 1 , 2 , 2 , 3 , 3 };
110
+ return _mm512_permutex2var_pd (src, lower_idx, src);
111
+ }
112
+ HouLiFilterCached1DVector::VReal HouLiFilterCached1DVector::duplicateUpper (VReal src) const {
113
+ static_assert (R_WIDTH == 8 );
114
+ const VIdx upper_idx{4 , 4 , 5 , 5 , 6 , 6 , 7 , 7 };
115
+ return _mm512_permutex2var_pd (src, upper_idx, src);
116
+ }
117
+ #else
118
+ HouLiFilterCached1DVector::VReal HouLiFilterCached1DVector::duplicateLower (VReal src) const {
119
+ static_assert (R_WIDTH == 2 );
120
+ return _mm_unpacklo_pd (src, src);
121
+ }
122
+ HouLiFilterCached1DVector::VReal HouLiFilterCached1DVector::duplicateUpper (VReal src) const {
123
+ static_assert (R_WIDTH == 2 );
124
+ return _mm_unpackhi_pd (src, src);
125
+ }
126
+ #endif
110
127
} // namespace ahr
0 commit comments