@@ -1238,153 +1238,162 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
12381238// }
12391239
12401240 for (z = 0 ; z < NX ; z += 4 ) {
1241- * ( uint64_t * ) & out [ iN [ z ]] =
1241+ uint64_t t0 [ 4 ] = {
12421242 ((uint64_t )(t [0 ][z ])<< 0 ) +
12431243 ((uint64_t )(t [1 ][z ])<< 8 ) +
12441244 ((uint64_t )(t [2 ][z ])<<16 ) +
12451245 ((uint64_t )(t [3 ][z ])<<24 ) +
12461246 ((uint64_t )(t [4 ][z ])<<32 ) +
12471247 ((uint64_t )(t [5 ][z ])<<40 ) +
12481248 ((uint64_t )(t [6 ][z ])<<48 ) +
1249- ((uint64_t )(t [7 ][z ])<<56 );
1250- * (uint64_t * )& out [iN [z + 1 ]] =
1251- ((uint64_t )(t [0 ][z + 1 ])<< 0 ) +
1252- ((uint64_t )(t [1 ][z + 1 ])<< 8 ) +
1253- ((uint64_t )(t [2 ][z + 1 ])<<16 ) +
1254- ((uint64_t )(t [3 ][z + 1 ])<<24 ) +
1255- ((uint64_t )(t [4 ][z + 1 ])<<32 ) +
1256- ((uint64_t )(t [5 ][z + 1 ])<<40 ) +
1257- ((uint64_t )(t [6 ][z + 1 ])<<48 ) +
1258- ((uint64_t )(t [7 ][z + 1 ])<<56 );
1259- * (uint64_t * )& out [iN [z + 2 ]] =
1260- ((uint64_t )(t [0 ][z + 2 ])<< 0 ) +
1261- ((uint64_t )(t [1 ][z + 2 ])<< 8 ) +
1262- ((uint64_t )(t [2 ][z + 2 ])<<16 ) +
1263- ((uint64_t )(t [3 ][z + 2 ])<<24 ) +
1264- ((uint64_t )(t [4 ][z + 2 ])<<32 ) +
1265- ((uint64_t )(t [5 ][z + 2 ])<<40 ) +
1266- ((uint64_t )(t [6 ][z + 2 ])<<48 ) +
1267- ((uint64_t )(t [7 ][z + 2 ])<<56 );
1268- * (uint64_t * )& out [iN [z + 3 ]] =
1269- ((uint64_t )(t [0 ][z + 3 ])<< 0 ) +
1270- ((uint64_t )(t [1 ][z + 3 ])<< 8 ) +
1271- ((uint64_t )(t [2 ][z + 3 ])<<16 ) +
1272- ((uint64_t )(t [3 ][z + 3 ])<<24 ) +
1273- ((uint64_t )(t [4 ][z + 3 ])<<32 ) +
1274- ((uint64_t )(t [5 ][z + 3 ])<<40 ) +
1275- ((uint64_t )(t [6 ][z + 3 ])<<48 ) +
1276- ((uint64_t )(t [7 ][z + 3 ])<<56 );
1249+ ((uint64_t )(t [7 ][z ])<<56 ),
12771250
1278- * (uint64_t * )& out [iN [z ]+ 8 ] =
12791251 ((uint64_t )(t [8 + 0 ][z ])<< 0 ) +
12801252 ((uint64_t )(t [8 + 1 ][z ])<< 8 ) +
12811253 ((uint64_t )(t [8 + 2 ][z ])<<16 ) +
12821254 ((uint64_t )(t [8 + 3 ][z ])<<24 ) +
12831255 ((uint64_t )(t [8 + 4 ][z ])<<32 ) +
12841256 ((uint64_t )(t [8 + 5 ][z ])<<40 ) +
12851257 ((uint64_t )(t [8 + 6 ][z ])<<48 ) +
1286- ((uint64_t )(t [8 + 7 ][z ])<<56 );
1287- * (uint64_t * )& out [iN [z + 1 ]+ 8 ] =
1288- ((uint64_t )(t [8 + 0 ][z + 1 ])<< 0 ) +
1289- ((uint64_t )(t [8 + 1 ][z + 1 ])<< 8 ) +
1290- ((uint64_t )(t [8 + 2 ][z + 1 ])<<16 ) +
1291- ((uint64_t )(t [8 + 3 ][z + 1 ])<<24 ) +
1292- ((uint64_t )(t [8 + 4 ][z + 1 ])<<32 ) +
1293- ((uint64_t )(t [8 + 5 ][z + 1 ])<<40 ) +
1294- ((uint64_t )(t [8 + 6 ][z + 1 ])<<48 ) +
1295- ((uint64_t )(t [8 + 7 ][z + 1 ])<<56 );
1296- * (uint64_t * )& out [iN [z + 2 ]+ 8 ] =
1297- ((uint64_t )(t [8 + 0 ][z + 2 ])<< 0 ) +
1298- ((uint64_t )(t [8 + 1 ][z + 2 ])<< 8 ) +
1299- ((uint64_t )(t [8 + 2 ][z + 2 ])<<16 ) +
1300- ((uint64_t )(t [8 + 3 ][z + 2 ])<<24 ) +
1301- ((uint64_t )(t [8 + 4 ][z + 2 ])<<32 ) +
1302- ((uint64_t )(t [8 + 5 ][z + 2 ])<<40 ) +
1303- ((uint64_t )(t [8 + 6 ][z + 2 ])<<48 ) +
1304- ((uint64_t )(t [8 + 7 ][z + 2 ])<<56 );
1305- * (uint64_t * )& out [iN [z + 3 ]+ 8 ] =
1306- ((uint64_t )(t [8 + 0 ][z + 3 ])<< 0 ) +
1307- ((uint64_t )(t [8 + 1 ][z + 3 ])<< 8 ) +
1308- ((uint64_t )(t [8 + 2 ][z + 3 ])<<16 ) +
1309- ((uint64_t )(t [8 + 3 ][z + 3 ])<<24 ) +
1310- ((uint64_t )(t [8 + 4 ][z + 3 ])<<32 ) +
1311- ((uint64_t )(t [8 + 5 ][z + 3 ])<<40 ) +
1312- ((uint64_t )(t [8 + 6 ][z + 3 ])<<48 ) +
1313- ((uint64_t )(t [8 + 7 ][z + 3 ])<<56 );
1258+ ((uint64_t )(t [8 + 7 ][z ])<<56 ),
13141259
1315- * (uint64_t * )& out [iN [z ]+ 16 ] =
13161260 ((uint64_t )(t [16 + 0 ][z ])<< 0 ) +
13171261 ((uint64_t )(t [16 + 1 ][z ])<< 8 ) +
13181262 ((uint64_t )(t [16 + 2 ][z ])<<16 ) +
13191263 ((uint64_t )(t [16 + 3 ][z ])<<24 ) +
13201264 ((uint64_t )(t [16 + 4 ][z ])<<32 ) +
13211265 ((uint64_t )(t [16 + 5 ][z ])<<40 ) +
13221266 ((uint64_t )(t [16 + 6 ][z ])<<48 ) +
1323- ((uint64_t )(t [16 + 7 ][z ])<<56 );
1324- * (uint64_t * )& out [iN [z + 1 ]+ 16 ] =
1325- ((uint64_t )(t [16 + 0 ][z + 1 ])<< 0 ) +
1326- ((uint64_t )(t [16 + 1 ][z + 1 ])<< 8 ) +
1327- ((uint64_t )(t [16 + 2 ][z + 1 ])<<16 ) +
1328- ((uint64_t )(t [16 + 3 ][z + 1 ])<<24 ) +
1329- ((uint64_t )(t [16 + 4 ][z + 1 ])<<32 ) +
1330- ((uint64_t )(t [16 + 5 ][z + 1 ])<<40 ) +
1331- ((uint64_t )(t [16 + 6 ][z + 1 ])<<48 ) +
1332- ((uint64_t )(t [16 + 7 ][z + 1 ])<<56 );
1333- * (uint64_t * )& out [iN [z + 2 ]+ 16 ] =
1334- ((uint64_t )(t [16 + 0 ][z + 2 ])<< 0 ) +
1335- ((uint64_t )(t [16 + 1 ][z + 2 ])<< 8 ) +
1336- ((uint64_t )(t [16 + 2 ][z + 2 ])<<16 ) +
1337- ((uint64_t )(t [16 + 3 ][z + 2 ])<<24 ) +
1338- ((uint64_t )(t [16 + 4 ][z + 2 ])<<32 ) +
1339- ((uint64_t )(t [16 + 5 ][z + 2 ])<<40 ) +
1340- ((uint64_t )(t [16 + 6 ][z + 2 ])<<48 ) +
1341- ((uint64_t )(t [16 + 7 ][z + 2 ])<<56 );
1342- * (uint64_t * )& out [iN [z + 3 ]+ 16 ] =
1343- ((uint64_t )(t [16 + 0 ][z + 3 ])<< 0 ) +
1344- ((uint64_t )(t [16 + 1 ][z + 3 ])<< 8 ) +
1345- ((uint64_t )(t [16 + 2 ][z + 3 ])<<16 ) +
1346- ((uint64_t )(t [16 + 3 ][z + 3 ])<<24 ) +
1347- ((uint64_t )(t [16 + 4 ][z + 3 ])<<32 ) +
1348- ((uint64_t )(t [16 + 5 ][z + 3 ])<<40 ) +
1349- ((uint64_t )(t [16 + 6 ][z + 3 ])<<48 ) +
1350- ((uint64_t )(t [16 + 7 ][z + 3 ])<<56 );
1267+ ((uint64_t )(t [16 + 7 ][z ])<<56 ),
13511268
1352- * (uint64_t * )& out [iN [z ]+ 24 ] =
13531269 ((uint64_t )(t [24 + 0 ][z ])<< 0 ) +
13541270 ((uint64_t )(t [24 + 1 ][z ])<< 8 ) +
13551271 ((uint64_t )(t [24 + 2 ][z ])<<16 ) +
13561272 ((uint64_t )(t [24 + 3 ][z ])<<24 ) +
13571273 ((uint64_t )(t [24 + 4 ][z ])<<32 ) +
13581274 ((uint64_t )(t [24 + 5 ][z ])<<40 ) +
13591275 ((uint64_t )(t [24 + 6 ][z ])<<48 ) +
1360- ((uint64_t )(t [24 + 7 ][z ])<<56 );
1361- * (uint64_t * )& out [iN [z + 1 ]+ 24 ] =
1276+ ((uint64_t )(t [24 + 7 ][z ])<<56 )
1277+ };
1278+ memcpy (& out [iN [z ]], & t0 , 32 );
1279+
1280+ uint64_t t1 [4 ] = {
1281+ ((uint64_t )(t [0 ][z + 1 ])<< 0 ) +
1282+ ((uint64_t )(t [1 ][z + 1 ])<< 8 ) +
1283+ ((uint64_t )(t [2 ][z + 1 ])<<16 ) +
1284+ ((uint64_t )(t [3 ][z + 1 ])<<24 ) +
1285+ ((uint64_t )(t [4 ][z + 1 ])<<32 ) +
1286+ ((uint64_t )(t [5 ][z + 1 ])<<40 ) +
1287+ ((uint64_t )(t [6 ][z + 1 ])<<48 ) +
1288+ ((uint64_t )(t [7 ][z + 1 ])<<56 ),
1289+
1290+ ((uint64_t )(t [8 + 0 ][z + 1 ])<< 0 ) +
1291+ ((uint64_t )(t [8 + 1 ][z + 1 ])<< 8 ) +
1292+ ((uint64_t )(t [8 + 2 ][z + 1 ])<<16 ) +
1293+ ((uint64_t )(t [8 + 3 ][z + 1 ])<<24 ) +
1294+ ((uint64_t )(t [8 + 4 ][z + 1 ])<<32 ) +
1295+ ((uint64_t )(t [8 + 5 ][z + 1 ])<<40 ) +
1296+ ((uint64_t )(t [8 + 6 ][z + 1 ])<<48 ) +
1297+ ((uint64_t )(t [8 + 7 ][z + 1 ])<<56 ),
1298+
1299+ ((uint64_t )(t [16 + 0 ][z + 1 ])<< 0 ) +
1300+ ((uint64_t )(t [16 + 1 ][z + 1 ])<< 8 ) +
1301+ ((uint64_t )(t [16 + 2 ][z + 1 ])<<16 ) +
1302+ ((uint64_t )(t [16 + 3 ][z + 1 ])<<24 ) +
1303+ ((uint64_t )(t [16 + 4 ][z + 1 ])<<32 ) +
1304+ ((uint64_t )(t [16 + 5 ][z + 1 ])<<40 ) +
1305+ ((uint64_t )(t [16 + 6 ][z + 1 ])<<48 ) +
1306+ ((uint64_t )(t [16 + 7 ][z + 1 ])<<56 ),
1307+
13621308 ((uint64_t )(t [24 + 0 ][z + 1 ])<< 0 ) +
13631309 ((uint64_t )(t [24 + 1 ][z + 1 ])<< 8 ) +
13641310 ((uint64_t )(t [24 + 2 ][z + 1 ])<<16 ) +
13651311 ((uint64_t )(t [24 + 3 ][z + 1 ])<<24 ) +
13661312 ((uint64_t )(t [24 + 4 ][z + 1 ])<<32 ) +
13671313 ((uint64_t )(t [24 + 5 ][z + 1 ])<<40 ) +
13681314 ((uint64_t )(t [24 + 6 ][z + 1 ])<<48 ) +
1369- ((uint64_t )(t [24 + 7 ][z + 1 ])<<56 );
1370- * (uint64_t * )& out [iN [z + 2 ]+ 24 ] =
1315+ ((uint64_t )(t [24 + 7 ][z + 1 ])<<56 )
1316+ };
1317+ memcpy (& out [iN [z + 1 ]], & t1 , 32 );
1318+
1319+ uint64_t t2 [4 ] = {
1320+ ((uint64_t )(t [0 ][z + 2 ])<< 0 ) +
1321+ ((uint64_t )(t [1 ][z + 2 ])<< 8 ) +
1322+ ((uint64_t )(t [2 ][z + 2 ])<<16 ) +
1323+ ((uint64_t )(t [3 ][z + 2 ])<<24 ) +
1324+ ((uint64_t )(t [4 ][z + 2 ])<<32 ) +
1325+ ((uint64_t )(t [5 ][z + 2 ])<<40 ) +
1326+ ((uint64_t )(t [6 ][z + 2 ])<<48 ) +
1327+ ((uint64_t )(t [7 ][z + 2 ])<<56 ),
1328+
1329+ ((uint64_t )(t [8 + 0 ][z + 2 ])<< 0 ) +
1330+ ((uint64_t )(t [8 + 1 ][z + 2 ])<< 8 ) +
1331+ ((uint64_t )(t [8 + 2 ][z + 2 ])<<16 ) +
1332+ ((uint64_t )(t [8 + 3 ][z + 2 ])<<24 ) +
1333+ ((uint64_t )(t [8 + 4 ][z + 2 ])<<32 ) +
1334+ ((uint64_t )(t [8 + 5 ][z + 2 ])<<40 ) +
1335+ ((uint64_t )(t [8 + 6 ][z + 2 ])<<48 ) +
1336+ ((uint64_t )(t [8 + 7 ][z + 2 ])<<56 ),
1337+
1338+ ((uint64_t )(t [16 + 0 ][z + 2 ])<< 0 ) +
1339+ ((uint64_t )(t [16 + 1 ][z + 2 ])<< 8 ) +
1340+ ((uint64_t )(t [16 + 2 ][z + 2 ])<<16 ) +
1341+ ((uint64_t )(t [16 + 3 ][z + 2 ])<<24 ) +
1342+ ((uint64_t )(t [16 + 4 ][z + 2 ])<<32 ) +
1343+ ((uint64_t )(t [16 + 5 ][z + 2 ])<<40 ) +
1344+ ((uint64_t )(t [16 + 6 ][z + 2 ])<<48 ) +
1345+ ((uint64_t )(t [16 + 7 ][z + 2 ])<<56 ),
1346+
13711347 ((uint64_t )(t [24 + 0 ][z + 2 ])<< 0 ) +
13721348 ((uint64_t )(t [24 + 1 ][z + 2 ])<< 8 ) +
13731349 ((uint64_t )(t [24 + 2 ][z + 2 ])<<16 ) +
13741350 ((uint64_t )(t [24 + 3 ][z + 2 ])<<24 ) +
13751351 ((uint64_t )(t [24 + 4 ][z + 2 ])<<32 ) +
13761352 ((uint64_t )(t [24 + 5 ][z + 2 ])<<40 ) +
13771353 ((uint64_t )(t [24 + 6 ][z + 2 ])<<48 ) +
1378- ((uint64_t )(t [24 + 7 ][z + 2 ])<<56 );
1379- * (uint64_t * )& out [iN [z + 3 ]+ 24 ] =
1354+ ((uint64_t )(t [24 + 7 ][z + 2 ])<<56 ),
1355+
1356+ };
1357+ memcpy (& out [iN [z + 2 ]], & t2 , 32 );
1358+
1359+ uint64_t t3 [4 ] = {
1360+ ((uint64_t )(t [0 ][z + 3 ])<< 0 ) +
1361+ ((uint64_t )(t [1 ][z + 3 ])<< 8 ) +
1362+ ((uint64_t )(t [2 ][z + 3 ])<<16 ) +
1363+ ((uint64_t )(t [3 ][z + 3 ])<<24 ) +
1364+ ((uint64_t )(t [4 ][z + 3 ])<<32 ) +
1365+ ((uint64_t )(t [5 ][z + 3 ])<<40 ) +
1366+ ((uint64_t )(t [6 ][z + 3 ])<<48 ) +
1367+ ((uint64_t )(t [7 ][z + 3 ])<<56 ),
1368+
1369+ ((uint64_t )(t [8 + 0 ][z + 3 ])<< 0 ) +
1370+ ((uint64_t )(t [8 + 1 ][z + 3 ])<< 8 ) +
1371+ ((uint64_t )(t [8 + 2 ][z + 3 ])<<16 ) +
1372+ ((uint64_t )(t [8 + 3 ][z + 3 ])<<24 ) +
1373+ ((uint64_t )(t [8 + 4 ][z + 3 ])<<32 ) +
1374+ ((uint64_t )(t [8 + 5 ][z + 3 ])<<40 ) +
1375+ ((uint64_t )(t [8 + 6 ][z + 3 ])<<48 ) +
1376+ ((uint64_t )(t [8 + 7 ][z + 3 ])<<56 ),
1377+
1378+ ((uint64_t )(t [16 + 0 ][z + 3 ])<< 0 ) +
1379+ ((uint64_t )(t [16 + 1 ][z + 3 ])<< 8 ) +
1380+ ((uint64_t )(t [16 + 2 ][z + 3 ])<<16 ) +
1381+ ((uint64_t )(t [16 + 3 ][z + 3 ])<<24 ) +
1382+ ((uint64_t )(t [16 + 4 ][z + 3 ])<<32 ) +
1383+ ((uint64_t )(t [16 + 5 ][z + 3 ])<<40 ) +
1384+ ((uint64_t )(t [16 + 6 ][z + 3 ])<<48 ) +
1385+ ((uint64_t )(t [16 + 7 ][z + 3 ])<<56 ),
1386+
13801387 ((uint64_t )(t [24 + 0 ][z + 3 ])<< 0 ) +
13811388 ((uint64_t )(t [24 + 1 ][z + 3 ])<< 8 ) +
13821389 ((uint64_t )(t [24 + 2 ][z + 3 ])<<16 ) +
13831390 ((uint64_t )(t [24 + 3 ][z + 3 ])<<24 ) +
13841391 ((uint64_t )(t [24 + 4 ][z + 3 ])<<32 ) +
13851392 ((uint64_t )(t [24 + 5 ][z + 3 ])<<40 ) +
13861393 ((uint64_t )(t [24 + 6 ][z + 3 ])<<48 ) +
1387- ((uint64_t )(t [24 + 7 ][z + 3 ])<<56 );
1394+ ((uint64_t )(t [24 + 7 ][z + 3 ])<<56 )
1395+ };
1396+ memcpy (& out [iN [z + 3 ]], & t3 , 32 );
13881397
13891398 iN [z + 0 ] += 32 ;
13901399 iN [z + 1 ] += 32 ;
0 commit comments