@@ -807,19 +807,59 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in,
807807static inline void transpose_and_copy (uint8_t * out , int iN [32 ],
808808 uint8_t t [32 ][32 ]) {
809809 int z ;
810- #ifdef UBSAN
811- // Simplified version to avoid undefined behaviour sanitiser warnings.
810+
811+ // Simplified version from below.
812+ // This is pretty good with zig cc, but slow on clang and very slow on
813+ // gcc, even with -O3
814+ /*
812815 for (z = 0; z < NX; z++) {
813816 int k;
814817 for (k = 0; k < 32; k++)
815818 out[iN[z]+k] = t[k][z];
816819 iN[z] += 32;
817820 }
818- #else
819- // Unaligned access. We know we can get away with this as this
820- // code is only ever executed on x86 platforms which permit this.
821- for (z = 0 ; z < NX ; z += 4 ) {
822- * (uint64_t * )& out [iN [z ]] =
821+ */
822+
823+
824+ // A better approach for clang and gcc can be had with some manual
825+ // restructuring to attempt to do the two loops in explcit blocks.
826+ // With gcc -O3 or -O2 -ftree-vectorize this is quite fast, as is clang
827+ // and zig but neither beat the version below (or, for zig, the basic
828+ // code above).
829+ //
830+ // It's left here incase we ever want to move to tidier code and
831+ // to understand what auto-vectorises and what doesn't.
832+ /*
833+ #define NZ 2
834+ #define NK 8
835+ for (z = 0; z < 32; z+=NZ) {
836+ for (int k = 0; k < 32; k+=NK) {
837+ for (int z0 = z; z0 < z+NZ; z0++) {
838+ uint8_t tmp[NK];// __attribute__((aligned(32)));
839+ //uint8_t (*RESTRICT t0)[32] = &t[k];
840+ for (int k0 = 0; k0 < NK; k0++)
841+ //tmp[k0] = t0[k0][z0];
842+ tmp[k0] = t[k+k0][z0];
843+ memcpy(&out[iN[z0]+k], tmp, NK);
844+ }
845+ }
846+ for (int z0 = z; z0 < z+NZ; z0++)
847+ iN[z0] += 32;
848+ }
849+ */
850+
851+ // Manually unrolled code.
852+ // This is fastest on gcc and clang and not far behind with zig cc.
853+ // It also doesn't need aggressive gcc optimisation levels to be
854+ // efficient.
855+ //
856+ // It works by constructing 64-bit ints and copying them with a single
857+ // memory write. The fixed size memcpys just boil down to a memory write,
858+ // but unlike the earlier versions that did this direct, this isn't
859+ // exploiting undefined behaviour.
860+ for (z = 0 ; z < 32 ; z += 4 ) {
861+ uint64_t i64 ;
862+ i64 =
823863 ((uint64_t )(t [0 ][z ])<< 0 ) +
824864 ((uint64_t )(t [1 ][z ])<< 8 ) +
825865 ((uint64_t )(t [2 ][z ])<<16 ) +
@@ -828,7 +868,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
828868 ((uint64_t )(t [5 ][z ])<<40 ) +
829869 ((uint64_t )(t [6 ][z ])<<48 ) +
830870 ((uint64_t )(t [7 ][z ])<<56 );
831- * (uint64_t * )& out [iN [z + 1 ]] =
871+ memcpy (& out [iN [z ]], & i64 , 8 );
872+ i64 =
832873 ((uint64_t )(t [0 ][z + 1 ])<< 0 ) +
833874 ((uint64_t )(t [1 ][z + 1 ])<< 8 ) +
834875 ((uint64_t )(t [2 ][z + 1 ])<<16 ) +
@@ -837,7 +878,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
837878 ((uint64_t )(t [5 ][z + 1 ])<<40 ) +
838879 ((uint64_t )(t [6 ][z + 1 ])<<48 ) +
839880 ((uint64_t )(t [7 ][z + 1 ])<<56 );
840- * (uint64_t * )& out [iN [z + 2 ]] =
881+ memcpy (& out [iN [z + 1 ]], & i64 , 8 );
882+ i64 =
841883 ((uint64_t )(t [0 ][z + 2 ])<< 0 ) +
842884 ((uint64_t )(t [1 ][z + 2 ])<< 8 ) +
843885 ((uint64_t )(t [2 ][z + 2 ])<<16 ) +
@@ -846,7 +888,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
846888 ((uint64_t )(t [5 ][z + 2 ])<<40 ) +
847889 ((uint64_t )(t [6 ][z + 2 ])<<48 ) +
848890 ((uint64_t )(t [7 ][z + 2 ])<<56 );
849- * (uint64_t * )& out [iN [z + 3 ]] =
891+ memcpy (& out [iN [z + 2 ]], & i64 , 8 );
892+ i64 =
850893 ((uint64_t )(t [0 ][z + 3 ])<< 0 ) +
851894 ((uint64_t )(t [1 ][z + 3 ])<< 8 ) +
852895 ((uint64_t )(t [2 ][z + 3 ])<<16 ) +
@@ -855,8 +898,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
855898 ((uint64_t )(t [5 ][z + 3 ])<<40 ) +
856899 ((uint64_t )(t [6 ][z + 3 ])<<48 ) +
857900 ((uint64_t )(t [7 ][z + 3 ])<<56 );
901+ memcpy (& out [iN [z + 3 ]], & i64 , 8 );
858902
859- * ( uint64_t * ) & out [ iN [ z ] + 8 ] =
903+ i64 =
860904 ((uint64_t )(t [8 + 0 ][z ])<< 0 ) +
861905 ((uint64_t )(t [8 + 1 ][z ])<< 8 ) +
862906 ((uint64_t )(t [8 + 2 ][z ])<<16 ) +
@@ -865,7 +909,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
865909 ((uint64_t )(t [8 + 5 ][z ])<<40 ) +
866910 ((uint64_t )(t [8 + 6 ][z ])<<48 ) +
867911 ((uint64_t )(t [8 + 7 ][z ])<<56 );
868- * (uint64_t * )& out [iN [z + 1 ]+ 8 ] =
912+ memcpy (& out [iN [z ]+ 8 ], & i64 , 8 );
913+ i64 =
869914 ((uint64_t )(t [8 + 0 ][z + 1 ])<< 0 ) +
870915 ((uint64_t )(t [8 + 1 ][z + 1 ])<< 8 ) +
871916 ((uint64_t )(t [8 + 2 ][z + 1 ])<<16 ) +
@@ -874,7 +919,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
874919 ((uint64_t )(t [8 + 5 ][z + 1 ])<<40 ) +
875920 ((uint64_t )(t [8 + 6 ][z + 1 ])<<48 ) +
876921 ((uint64_t )(t [8 + 7 ][z + 1 ])<<56 );
877- * (uint64_t * )& out [iN [z + 2 ]+ 8 ] =
922+ memcpy (& out [iN [z + 1 ]+ 8 ], & i64 , 8 );
923+ i64 =
878924 ((uint64_t )(t [8 + 0 ][z + 2 ])<< 0 ) +
879925 ((uint64_t )(t [8 + 1 ][z + 2 ])<< 8 ) +
880926 ((uint64_t )(t [8 + 2 ][z + 2 ])<<16 ) +
@@ -883,7 +929,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
883929 ((uint64_t )(t [8 + 5 ][z + 2 ])<<40 ) +
884930 ((uint64_t )(t [8 + 6 ][z + 2 ])<<48 ) +
885931 ((uint64_t )(t [8 + 7 ][z + 2 ])<<56 );
886- * (uint64_t * )& out [iN [z + 3 ]+ 8 ] =
932+ memcpy (& out [iN [z + 2 ]+ 8 ], & i64 , 8 );
933+ i64 =
887934 ((uint64_t )(t [8 + 0 ][z + 3 ])<< 0 ) +
888935 ((uint64_t )(t [8 + 1 ][z + 3 ])<< 8 ) +
889936 ((uint64_t )(t [8 + 2 ][z + 3 ])<<16 ) +
@@ -892,8 +939,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
892939 ((uint64_t )(t [8 + 5 ][z + 3 ])<<40 ) +
893940 ((uint64_t )(t [8 + 6 ][z + 3 ])<<48 ) +
894941 ((uint64_t )(t [8 + 7 ][z + 3 ])<<56 );
942+ memcpy (& out [iN [z + 3 ]+ 8 ], & i64 , 8 );
895943
896- * ( uint64_t * ) & out [ iN [ z ] + 16 ] =
944+ i64 =
897945 ((uint64_t )(t [16 + 0 ][z ])<< 0 ) +
898946 ((uint64_t )(t [16 + 1 ][z ])<< 8 ) +
899947 ((uint64_t )(t [16 + 2 ][z ])<<16 ) +
@@ -902,7 +950,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
902950 ((uint64_t )(t [16 + 5 ][z ])<<40 ) +
903951 ((uint64_t )(t [16 + 6 ][z ])<<48 ) +
904952 ((uint64_t )(t [16 + 7 ][z ])<<56 );
905- * (uint64_t * )& out [iN [z + 1 ]+ 16 ] =
953+ memcpy (& out [iN [z ]+ 16 ], & i64 , 8 );
954+ i64 =
906955 ((uint64_t )(t [16 + 0 ][z + 1 ])<< 0 ) +
907956 ((uint64_t )(t [16 + 1 ][z + 1 ])<< 8 ) +
908957 ((uint64_t )(t [16 + 2 ][z + 1 ])<<16 ) +
@@ -911,7 +960,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
911960 ((uint64_t )(t [16 + 5 ][z + 1 ])<<40 ) +
912961 ((uint64_t )(t [16 + 6 ][z + 1 ])<<48 ) +
913962 ((uint64_t )(t [16 + 7 ][z + 1 ])<<56 );
914- * (uint64_t * )& out [iN [z + 2 ]+ 16 ] =
963+ memcpy (& out [iN [z + 1 ]+ 16 ], & i64 , 8 );
964+ i64 =
915965 ((uint64_t )(t [16 + 0 ][z + 2 ])<< 0 ) +
916966 ((uint64_t )(t [16 + 1 ][z + 2 ])<< 8 ) +
917967 ((uint64_t )(t [16 + 2 ][z + 2 ])<<16 ) +
@@ -920,7 +970,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
920970 ((uint64_t )(t [16 + 5 ][z + 2 ])<<40 ) +
921971 ((uint64_t )(t [16 + 6 ][z + 2 ])<<48 ) +
922972 ((uint64_t )(t [16 + 7 ][z + 2 ])<<56 );
923- * (uint64_t * )& out [iN [z + 3 ]+ 16 ] =
973+ memcpy (& out [iN [z + 2 ]+ 16 ], & i64 , 8 );
974+ i64 =
924975 ((uint64_t )(t [16 + 0 ][z + 3 ])<< 0 ) +
925976 ((uint64_t )(t [16 + 1 ][z + 3 ])<< 8 ) +
926977 ((uint64_t )(t [16 + 2 ][z + 3 ])<<16 ) +
@@ -929,8 +980,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
929980 ((uint64_t )(t [16 + 5 ][z + 3 ])<<40 ) +
930981 ((uint64_t )(t [16 + 6 ][z + 3 ])<<48 ) +
931982 ((uint64_t )(t [16 + 7 ][z + 3 ])<<56 );
983+ memcpy (& out [iN [z + 3 ]+ 16 ], & i64 , 8 );
932984
933- * ( uint64_t * ) & out [ iN [ z ] + 24 ] =
985+ i64 =
934986 ((uint64_t )(t [24 + 0 ][z ])<< 0 ) +
935987 ((uint64_t )(t [24 + 1 ][z ])<< 8 ) +
936988 ((uint64_t )(t [24 + 2 ][z ])<<16 ) +
@@ -939,7 +991,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
939991 ((uint64_t )(t [24 + 5 ][z ])<<40 ) +
940992 ((uint64_t )(t [24 + 6 ][z ])<<48 ) +
941993 ((uint64_t )(t [24 + 7 ][z ])<<56 );
942- * (uint64_t * )& out [iN [z + 1 ]+ 24 ] =
994+ memcpy (& out [iN [z ]+ 24 ], & i64 , 8 );
995+ i64 =
943996 ((uint64_t )(t [24 + 0 ][z + 1 ])<< 0 ) +
944997 ((uint64_t )(t [24 + 1 ][z + 1 ])<< 8 ) +
945998 ((uint64_t )(t [24 + 2 ][z + 1 ])<<16 ) +
@@ -948,7 +1001,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
9481001 ((uint64_t )(t [24 + 5 ][z + 1 ])<<40 ) +
9491002 ((uint64_t )(t [24 + 6 ][z + 1 ])<<48 ) +
9501003 ((uint64_t )(t [24 + 7 ][z + 1 ])<<56 );
951- * (uint64_t * )& out [iN [z + 2 ]+ 24 ] =
1004+ memcpy (& out [iN [z + 1 ]+ 24 ], & i64 , 8 );
1005+ i64 =
9521006 ((uint64_t )(t [24 + 0 ][z + 2 ])<< 0 ) +
9531007 ((uint64_t )(t [24 + 1 ][z + 2 ])<< 8 ) +
9541008 ((uint64_t )(t [24 + 2 ][z + 2 ])<<16 ) +
@@ -957,7 +1011,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
9571011 ((uint64_t )(t [24 + 5 ][z + 2 ])<<40 ) +
9581012 ((uint64_t )(t [24 + 6 ][z + 2 ])<<48 ) +
9591013 ((uint64_t )(t [24 + 7 ][z + 2 ])<<56 );
960- * (uint64_t * )& out [iN [z + 3 ]+ 24 ] =
1014+ memcpy (& out [iN [z + 2 ]+ 24 ], & i64 , 8 );
1015+ i64 =
9611016 ((uint64_t )(t [24 + 0 ][z + 3 ])<< 0 ) +
9621017 ((uint64_t )(t [24 + 1 ][z + 3 ])<< 8 ) +
9631018 ((uint64_t )(t [24 + 2 ][z + 3 ])<<16 ) +
@@ -966,13 +1021,13 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
9661021 ((uint64_t )(t [24 + 5 ][z + 3 ])<<40 ) +
9671022 ((uint64_t )(t [24 + 6 ][z + 3 ])<<48 ) +
9681023 ((uint64_t )(t [24 + 7 ][z + 3 ])<<56 );
1024+ memcpy (& out [iN [z + 3 ]+ 24 ], & i64 , 8 );
9691025
9701026 iN [z + 0 ] += 32 ;
9711027 iN [z + 1 ] += 32 ;
9721028 iN [z + 2 ] += 32 ;
9731029 iN [z + 3 ] += 32 ;
9741030 }
975- #endif
9761031}
9771032
9781033unsigned char * rans_uncompress_O1_32x16_sse4 (unsigned char * in ,
0 commit comments