Skip to content

Commit a729f91

Browse files
Enable vectorisation for ZIP reconstruct stage on Windows (#2043)
* Enable vectorisation for ZIP reconstruct stage on Windows. The __SSE4_1__ macro that is used to control the use of the vectorised version of the reconstruct() function is only defined by GCC and Clang. As a result, Windows (MSVC) builds were using the scalar version, which is less performant. This commit fixes the above issue by adding a check for MSVC, similar to the one already used for the SSE2 macro check (and in other places in the code base). Signed-off-by: Nikolaos Koutsikos <nikolaos.koutsikos@foundry.com> * Applying simple const correctness to internal_zip Signed-off-by: Nikolaos Koutsikos <nikolaos.koutsikos@foundry.com> --------- Signed-off-by: Nikolaos Koutsikos <nikolaos.koutsikos@foundry.com> Co-authored-by: Cary Phillips <cary@ilm.com>
1 parent 80b2c94 commit a729f91

File tree

1 file changed

+15
-15
lines changed

1 file changed

+15
-15
lines changed

src/lib/OpenEXRCore/internal_zip.c

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
# include <emmintrin.h>
2121
# include <mmintrin.h>
2222
#endif
23-
#if defined __SSE4_1__
23+
#if defined __SSE4_1__ || (_MSC_VER >= 1300 && (_M_IX86 || _M_X64))
2424
# define IMF_HAVE_SSE4_1 1
2525
# include <smmintrin.h>
2626
#endif
@@ -33,7 +33,7 @@
3333

3434
#ifdef IMF_HAVE_SSE4_1
3535
static void
36-
reconstruct (uint8_t* buf, uint64_t outSize)
36+
reconstruct (uint8_t* buf, const uint64_t outSize)
3737
{
3838
static const uint64_t bytesPerChunk = sizeof (__m128i);
3939
const uint64_t vOutSize = outSize / bytesPerChunk;
@@ -79,7 +79,7 @@ reconstruct (uint8_t* buf, uint64_t outSize)
7979
}
8080
#elif defined(IMF_HAVE_NEON_AARCH64)
8181
static void
82-
reconstruct (uint8_t* buf, uint64_t outSize)
82+
reconstruct (uint8_t* buf, const uint64_t outSize)
8383
{
8484
static const uint64_t bytesPerChunk = sizeof (uint8x16_t);
8585
const uint64_t vOutSize = outSize / bytesPerChunk;
@@ -128,7 +128,7 @@ reconstruct (uint8_t* buf, uint64_t outSize)
128128
}
129129
#else
130130
static void
131-
reconstruct (uint8_t* buf, uint64_t sz)
131+
reconstruct (uint8_t* buf, const uint64_t sz)
132132
{
133133
uint8_t* t = buf + 1;
134134
uint8_t* stop = buf + sz;
@@ -145,7 +145,7 @@ reconstruct (uint8_t* buf, uint64_t sz)
145145

146146
#ifdef IMF_HAVE_SSE2
147147
static void
148-
interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
148+
interleave (uint8_t* out, const uint8_t* const source, const uint64_t outSize)
149149
{
150150
static const uint64_t bytesPerChunk = 2 * sizeof (__m128i);
151151
const uint64_t vOutSize = outSize / bytesPerChunk;
@@ -176,7 +176,7 @@ interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
176176

177177
#elif defined(IMF_HAVE_NEON_AARCH64)
178178
static void
179-
interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
179+
interleave (uint8_t* out, const uint8_t* const source, const uint64_t outSize)
180180
{
181181
static const uint64_t bytesPerChunk = 2 * sizeof (uint8x16_t);
182182
const uint64_t vOutSize = outSize / bytesPerChunk;
@@ -205,7 +205,7 @@ interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
205205
#else
206206

207207
static void
208-
interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
208+
interleave (uint8_t* out, const uint8_t* const source, const uint64_t outSize)
209209
{
210210
const uint8_t* t1 = source;
211211
const uint8_t* t2 = source + (outSize + 1) / 2;
@@ -231,7 +231,7 @@ interleave (uint8_t* out, const uint8_t* source, uint64_t outSize)
231231
/**************************************/
232232

233233
void
234-
internal_zip_reconstruct_bytes (uint8_t* out, uint8_t* source, uint64_t count)
234+
internal_zip_reconstruct_bytes (uint8_t* out, uint8_t* source, const uint64_t count)
235235
{
236236
reconstruct (source, count);
237237
interleave (out, source, count);
@@ -241,13 +241,13 @@ internal_zip_reconstruct_bytes (uint8_t* out, uint8_t* source, uint64_t count)
241241

242242
void
243243
internal_zip_deconstruct_bytes (
244-
uint8_t* scratch, const uint8_t* source, uint64_t count)
244+
uint8_t* scratch, const uint8_t* source, const uint64_t count)
245245
{
246246
int p;
247247
uint8_t* t1 = scratch;
248248
uint8_t* t2 = t1 + (count + 1) / 2;
249249
const uint8_t* raw = source;
250-
const uint8_t* stop = raw + count;
250+
const uint8_t* const stop = raw + count;
251251

252252
/* reorder */
253253
while (raw < stop)
@@ -276,11 +276,11 @@ static exr_result_t
276276
undo_zip_impl (
277277
exr_decode_pipeline_t* decode,
278278
const void* compressed_data,
279-
uint64_t comp_buf_size,
279+
const uint64_t comp_buf_size,
280280
void* uncompressed_data,
281-
uint64_t uncompressed_size,
281+
const uint64_t uncompressed_size,
282282
void* scratch_data,
283-
uint64_t scratch_size)
283+
const uint64_t scratch_size)
284284
{
285285
size_t actual_out_bytes;
286286
exr_result_t res;
@@ -314,9 +314,9 @@ exr_result_t
314314
internal_exr_undo_zip (
315315
exr_decode_pipeline_t* decode,
316316
const void* compressed_data,
317-
uint64_t comp_buf_size,
317+
const uint64_t comp_buf_size,
318318
void* uncompressed_data,
319-
uint64_t uncompressed_size)
319+
const uint64_t uncompressed_size)
320320
{
321321
exr_result_t rv;
322322
uint64_t scratchbufsz = uncompressed_size;

0 commit comments

Comments
 (0)