Skip to content

Commit cd1f950

Browse files
committed
Remove undefined behaviour in transpose_and_copy.
There was a -DUBSAN guard for use with undefined behaviour sanitizers, but actually the only code change that's necessary is to change the explicit writes to a memcpy. This fixes unoptimised zig-cc builds which use -fsanitize=undefined by default. I've tested gcc7, gcc13, clang10, clang16, zig cc, icc and icl. All seem fine with this change. NB: Zig cc is the only one to vectorise the most naive implementation, and it does so very well, basically creating an SSE4 version of the loop. That is even the fastest implementation for zig cc.
1 parent 2c761e3 commit cd1f950

File tree

1 file changed

+78
-23
lines changed

1 file changed

+78
-23
lines changed

htscodecs/rANS_static32x16pr_sse4.c

Lines changed: 78 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -807,19 +807,59 @@ unsigned char *rans_uncompress_O0_32x16_sse4(unsigned char *in,
807807
static inline void transpose_and_copy(uint8_t *out, int iN[32],
808808
uint8_t t[32][32]) {
809809
int z;
810-
#ifdef UBSAN
811-
// Simplified version to avoid undefined behaviour sanitiser warnings.
810+
811+
// Simplified version from below.
812+
// This is pretty good with zig cc, but slow on clang and very slow on
813+
// gcc, even with -O3
814+
/*
812815
for (z = 0; z < NX; z++) {
813816
int k;
814817
for (k = 0; k < 32; k++)
815818
out[iN[z]+k] = t[k][z];
816819
iN[z] += 32;
817820
}
818-
#else
819-
// Unaligned access. We know we can get away with this as this
820-
// code is only ever executed on x86 platforms which permit this.
821-
for (z = 0; z < NX; z+=4) {
822-
*(uint64_t *)&out[iN[z]] =
821+
*/
822+
823+
824+
// A better approach for clang and gcc can be had with some manual
825+
// restructuring to attempt to do the two loops in explcit blocks.
826+
// With gcc -O3 or -O2 -ftree-vectorize this is quite fast, as is clang
827+
// and zig but neither beat the version below (or, for zig, the basic
828+
// code above).
829+
//
830+
// It's left here incase we ever want to move to tidier code and
831+
// to understand what auto-vectorises and what doesn't.
832+
/*
833+
#define NZ 2
834+
#define NK 8
835+
for (z = 0; z < 32; z+=NZ) {
836+
for (int k = 0; k < 32; k+=NK) {
837+
for (int z0 = z; z0 < z+NZ; z0++) {
838+
uint8_t tmp[NK];// __attribute__((aligned(32)));
839+
//uint8_t (*RESTRICT t0)[32] = &t[k];
840+
for (int k0 = 0; k0 < NK; k0++)
841+
//tmp[k0] = t0[k0][z0];
842+
tmp[k0] = t[k+k0][z0];
843+
memcpy(&out[iN[z0]+k], tmp, NK);
844+
}
845+
}
846+
for (int z0 = z; z0 < z+NZ; z0++)
847+
iN[z0] += 32;
848+
}
849+
*/
850+
851+
// Manually unrolled code.
852+
// This is fastest on gcc and clang and not far behind with zig cc.
853+
// It also doesn't need aggressive gcc optimisation levels to be
854+
// efficient.
855+
//
856+
// It works by constructing 64-bit ints and copying them with a single
857+
// memory write. The fixed size memcpys just boil down to a memory write,
858+
// but unlike the earlier versions that did this direct, this isn't
859+
// exploiting undefined behaviour.
860+
for (z = 0; z < 32; z+=4) {
861+
uint64_t i64;
862+
i64 =
823863
((uint64_t)(t[0][z])<< 0) +
824864
((uint64_t)(t[1][z])<< 8) +
825865
((uint64_t)(t[2][z])<<16) +
@@ -828,7 +868,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
828868
((uint64_t)(t[5][z])<<40) +
829869
((uint64_t)(t[6][z])<<48) +
830870
((uint64_t)(t[7][z])<<56);
831-
*(uint64_t *)&out[iN[z+1]] =
871+
memcpy(&out[iN[z]], &i64, 8);
872+
i64 =
832873
((uint64_t)(t[0][z+1])<< 0) +
833874
((uint64_t)(t[1][z+1])<< 8) +
834875
((uint64_t)(t[2][z+1])<<16) +
@@ -837,7 +878,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
837878
((uint64_t)(t[5][z+1])<<40) +
838879
((uint64_t)(t[6][z+1])<<48) +
839880
((uint64_t)(t[7][z+1])<<56);
840-
*(uint64_t *)&out[iN[z+2]] =
881+
memcpy(&out[iN[z+1]], &i64, 8);
882+
i64 =
841883
((uint64_t)(t[0][z+2])<< 0) +
842884
((uint64_t)(t[1][z+2])<< 8) +
843885
((uint64_t)(t[2][z+2])<<16) +
@@ -846,7 +888,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
846888
((uint64_t)(t[5][z+2])<<40) +
847889
((uint64_t)(t[6][z+2])<<48) +
848890
((uint64_t)(t[7][z+2])<<56);
849-
*(uint64_t *)&out[iN[z+3]] =
891+
memcpy(&out[iN[z+2]], &i64, 8);
892+
i64 =
850893
((uint64_t)(t[0][z+3])<< 0) +
851894
((uint64_t)(t[1][z+3])<< 8) +
852895
((uint64_t)(t[2][z+3])<<16) +
@@ -855,8 +898,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
855898
((uint64_t)(t[5][z+3])<<40) +
856899
((uint64_t)(t[6][z+3])<<48) +
857900
((uint64_t)(t[7][z+3])<<56);
901+
memcpy(&out[iN[z+3]], &i64, 8);
858902

859-
*(uint64_t *)&out[iN[z]+8] =
903+
i64 =
860904
((uint64_t)(t[8+0][z])<< 0) +
861905
((uint64_t)(t[8+1][z])<< 8) +
862906
((uint64_t)(t[8+2][z])<<16) +
@@ -865,7 +909,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
865909
((uint64_t)(t[8+5][z])<<40) +
866910
((uint64_t)(t[8+6][z])<<48) +
867911
((uint64_t)(t[8+7][z])<<56);
868-
*(uint64_t *)&out[iN[z+1]+8] =
912+
memcpy(&out[iN[z]+8], &i64, 8);
913+
i64 =
869914
((uint64_t)(t[8+0][z+1])<< 0) +
870915
((uint64_t)(t[8+1][z+1])<< 8) +
871916
((uint64_t)(t[8+2][z+1])<<16) +
@@ -874,7 +919,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
874919
((uint64_t)(t[8+5][z+1])<<40) +
875920
((uint64_t)(t[8+6][z+1])<<48) +
876921
((uint64_t)(t[8+7][z+1])<<56);
877-
*(uint64_t *)&out[iN[z+2]+8] =
922+
memcpy(&out[iN[z+1]+8], &i64, 8);
923+
i64 =
878924
((uint64_t)(t[8+0][z+2])<< 0) +
879925
((uint64_t)(t[8+1][z+2])<< 8) +
880926
((uint64_t)(t[8+2][z+2])<<16) +
@@ -883,7 +929,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
883929
((uint64_t)(t[8+5][z+2])<<40) +
884930
((uint64_t)(t[8+6][z+2])<<48) +
885931
((uint64_t)(t[8+7][z+2])<<56);
886-
*(uint64_t *)&out[iN[z+3]+8] =
932+
memcpy(&out[iN[z+2]+8], &i64, 8);
933+
i64 =
887934
((uint64_t)(t[8+0][z+3])<< 0) +
888935
((uint64_t)(t[8+1][z+3])<< 8) +
889936
((uint64_t)(t[8+2][z+3])<<16) +
@@ -892,8 +939,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
892939
((uint64_t)(t[8+5][z+3])<<40) +
893940
((uint64_t)(t[8+6][z+3])<<48) +
894941
((uint64_t)(t[8+7][z+3])<<56);
942+
memcpy(&out[iN[z+3]+8], &i64, 8);
895943

896-
*(uint64_t *)&out[iN[z]+16] =
944+
i64 =
897945
((uint64_t)(t[16+0][z])<< 0) +
898946
((uint64_t)(t[16+1][z])<< 8) +
899947
((uint64_t)(t[16+2][z])<<16) +
@@ -902,7 +950,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
902950
((uint64_t)(t[16+5][z])<<40) +
903951
((uint64_t)(t[16+6][z])<<48) +
904952
((uint64_t)(t[16+7][z])<<56);
905-
*(uint64_t *)&out[iN[z+1]+16] =
953+
memcpy(&out[iN[z]+16], &i64, 8);
954+
i64 =
906955
((uint64_t)(t[16+0][z+1])<< 0) +
907956
((uint64_t)(t[16+1][z+1])<< 8) +
908957
((uint64_t)(t[16+2][z+1])<<16) +
@@ -911,7 +960,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
911960
((uint64_t)(t[16+5][z+1])<<40) +
912961
((uint64_t)(t[16+6][z+1])<<48) +
913962
((uint64_t)(t[16+7][z+1])<<56);
914-
*(uint64_t *)&out[iN[z+2]+16] =
963+
memcpy(&out[iN[z+1]+16], &i64, 8);
964+
i64 =
915965
((uint64_t)(t[16+0][z+2])<< 0) +
916966
((uint64_t)(t[16+1][z+2])<< 8) +
917967
((uint64_t)(t[16+2][z+2])<<16) +
@@ -920,7 +970,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
920970
((uint64_t)(t[16+5][z+2])<<40) +
921971
((uint64_t)(t[16+6][z+2])<<48) +
922972
((uint64_t)(t[16+7][z+2])<<56);
923-
*(uint64_t *)&out[iN[z+3]+16] =
973+
memcpy(&out[iN[z+2]+16], &i64, 8);
974+
i64 =
924975
((uint64_t)(t[16+0][z+3])<< 0) +
925976
((uint64_t)(t[16+1][z+3])<< 8) +
926977
((uint64_t)(t[16+2][z+3])<<16) +
@@ -929,8 +980,9 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
929980
((uint64_t)(t[16+5][z+3])<<40) +
930981
((uint64_t)(t[16+6][z+3])<<48) +
931982
((uint64_t)(t[16+7][z+3])<<56);
983+
memcpy(&out[iN[z+3]+16], &i64, 8);
932984

933-
*(uint64_t *)&out[iN[z]+24] =
985+
i64 =
934986
((uint64_t)(t[24+0][z])<< 0) +
935987
((uint64_t)(t[24+1][z])<< 8) +
936988
((uint64_t)(t[24+2][z])<<16) +
@@ -939,7 +991,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
939991
((uint64_t)(t[24+5][z])<<40) +
940992
((uint64_t)(t[24+6][z])<<48) +
941993
((uint64_t)(t[24+7][z])<<56);
942-
*(uint64_t *)&out[iN[z+1]+24] =
994+
memcpy(&out[iN[z]+24], &i64, 8);
995+
i64 =
943996
((uint64_t)(t[24+0][z+1])<< 0) +
944997
((uint64_t)(t[24+1][z+1])<< 8) +
945998
((uint64_t)(t[24+2][z+1])<<16) +
@@ -948,7 +1001,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
9481001
((uint64_t)(t[24+5][z+1])<<40) +
9491002
((uint64_t)(t[24+6][z+1])<<48) +
9501003
((uint64_t)(t[24+7][z+1])<<56);
951-
*(uint64_t *)&out[iN[z+2]+24] =
1004+
memcpy(&out[iN[z+1]+24], &i64, 8);
1005+
i64 =
9521006
((uint64_t)(t[24+0][z+2])<< 0) +
9531007
((uint64_t)(t[24+1][z+2])<< 8) +
9541008
((uint64_t)(t[24+2][z+2])<<16) +
@@ -957,7 +1011,8 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
9571011
((uint64_t)(t[24+5][z+2])<<40) +
9581012
((uint64_t)(t[24+6][z+2])<<48) +
9591013
((uint64_t)(t[24+7][z+2])<<56);
960-
*(uint64_t *)&out[iN[z+3]+24] =
1014+
memcpy(&out[iN[z+2]+24], &i64, 8);
1015+
i64 =
9611016
((uint64_t)(t[24+0][z+3])<< 0) +
9621017
((uint64_t)(t[24+1][z+3])<< 8) +
9631018
((uint64_t)(t[24+2][z+3])<<16) +
@@ -966,13 +1021,13 @@ static inline void transpose_and_copy(uint8_t *out, int iN[32],
9661021
((uint64_t)(t[24+5][z+3])<<40) +
9671022
((uint64_t)(t[24+6][z+3])<<48) +
9681023
((uint64_t)(t[24+7][z+3])<<56);
1024+
memcpy(&out[iN[z+3]+24], &i64, 8);
9691025

9701026
iN[z+0] += 32;
9711027
iN[z+1] += 32;
9721028
iN[z+2] += 32;
9731029
iN[z+3] += 32;
9741030
}
975-
#endif
9761031
}
9771032

9781033
unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,

0 commit comments

Comments
 (0)