|
| 1 | +#ifndef _pyth_oracle_util_avg_h_ |
| 2 | +#define _pyth_oracle_util_avg_h_ |
| 3 | + |
/* Portable robust integer averaging. (No intermediate overflow, no
   assumptions about platform wide types, no assumptions about integer
   signed right shift, no signed integer division, no implicit
   conversions, etc.) */
| 8 | + |
| 9 | +#include "sar.h" |
| 10 | + |
| 11 | +#ifdef __cplusplus |
| 12 | +extern "C" { |
| 13 | +#endif |
| 14 | + |
| 15 | +/* uint8_t a = avg_2_uint8 ( x, y ); int8_t a = avg_2_int8 ( x, y ); |
| 16 | + uint16_t a = avg_2_uint16( x, y ); int16_t a = avg_2_int16( x, y ); |
| 17 | + uint32_t a = avg_2_uint32( x, y ); int32_t a = avg_2_int32( x, y ); |
| 18 | + uint64_t a = avg_2_uint64( x, y ); int64_t a = avg_2_int64( x, y ); |
| 19 | +
|
| 20 | + compute the average (a) of x and y of the corresponding type with <1 |
| 21 | + ulp error (floor / round toward negative infinity rounding). That |
| 22 | + is, these compute this exactly: |
| 23 | + floor( (x+y)/2 ) |
| 24 | + On systems where signed right shifts are sign extending, this is |
| 25 | + identical to: |
| 26 | + (x+y) >> 1 |
| 27 | + but the below will be safe against intermediate overflow. */ |
| 28 | + |
static inline uint8_t
avg_2_uint8( uint8_t x,
             uint8_t y ) {
  /* Widen to 16 bits so the sum cannot overflow, then halve (floor). */
  uint16_t s = (uint16_t)( (uint16_t)x + (uint16_t)y );
  return (uint8_t)( s >> 1 );
}
static inline uint16_t
avg_2_uint16( uint16_t x,
              uint16_t y ) {
  /* Widen to 32 bits so the sum cannot overflow, then halve (floor). */
  uint32_t s = (uint32_t)x + (uint32_t)y;
  return (uint16_t)( s >> 1 );
}
static inline uint32_t
avg_2_uint32( uint32_t x,
              uint32_t y ) {
  /* Widen to 64 bits so the sum cannot overflow, then halve (floor). */
  uint64_t s = (uint64_t)x + (uint64_t)y;
  return (uint32_t)( s >> 1 );
}
| 32 | + |
static inline uint64_t
avg_2_uint64( uint64_t x,
              uint64_t y ) {

  /* x+y can overflow 64 bits and no portable wider type is assumed, so
     emulate an adc (add with carry): the wrapped sum s and the carry
     bit c together represent the exact 65-bit value
       x+y == s + 2^64 c
     from which
       floor( (x+y)/2 ) == (s>>1) + 2^63 c */

  uint64_t s = x + y;             /* (x+y) mod 2^64 */
  uint64_t c = (uint64_t)( s<x ); /* carry out of bit 63 (s<y works equally) */
  return (s>>1) + (c<<63);
}
| 57 | + |
static inline int8_t
avg_2_int8( int8_t x,
            int8_t y ) {
  /* Widen to 16 bits (no overflow possible), then divide by 2 with
     floor rounding via the portable arithmetic right shift. */
  int16_t s = (int16_t)( (int16_t)x + (int16_t)y );
  return (int8_t)sar_int16( s, 1 );
}
static inline int16_t
avg_2_int16( int16_t x,
             int16_t y ) {
  /* Widen to 32 bits (no overflow possible), then divide by 2 with
     floor rounding via the portable arithmetic right shift. */
  int32_t s = (int32_t)x + (int32_t)y;
  return (int16_t)sar_int32( s, 1 );
}
static inline int32_t
avg_2_int32( int32_t x,
             int32_t y ) {
  /* Widen to 64 bits (no overflow possible), then divide by 2 with
     floor rounding via the portable arithmetic right shift. */
  int64_t s = (int64_t)x + (int64_t)y;
  return (int32_t)sar_int64( s, 1 );
}
| 61 | + |
static inline int64_t
avg_2_int64( int64_t x,
             int64_t y ) {

  /* As with avg_2_uint64, x+y can overflow, so do the exact 65-bit sum
     in the unsigned domain.  Let ux = x+2^63 and uy = y+2^63 (both
     exactly representable as 64-bit uints).  Then:
       x+y == ux + uy - 2^64            ... exact
           == s + 2^64 c - 2^64         ... s = (ux+uy) mod 2^64, carry c in [0,1]
           == s - 2^64 b                ... b = 1-c = (s >= ux)
     In twos complement, signed and unsigned adds are the same binary
     operation, so (ux+uy) mod 2^64 == (x+y) mod 2^64, and ux itself is
     computable as the unsigned wrapped sum x + 2^63.  Finally:
       floor( (x+y)/2 ) == (s>>1) - 2^63 b */

  uint64_t ux = (uint64_t)x + (((uint64_t)1)<<63); /* x+2^63, in [0,2^64) */
  uint64_t s  = (uint64_t)x + (uint64_t)y;         /* (x+y) mod 2^64 */
  uint64_t b  = (uint64_t)( s>=ux );               /* 1 iff no 65th bit borrow */
  return (int64_t)( (s>>1) - (b<<63) );
}
| 101 | + |
| 102 | +/* uint8_t a = avg_uint8 ( x, n ); int8_t a = avg_int8 ( x, n ); |
| 103 | + uint16_t a = avg_uint16( x, n ); int16_t a = avg_int16( x, n ); |
| 104 | + uint32_t a = avg_uint32( x, n ); int32_t a = avg_int32( x, n ); |
| 105 | +
|
   compute the average (a) of the n element array of the corresponding
| 107 | + type pointed to by x with <1 ulp round error (truncate / round toward |
| 108 | + zero rounding). Supports arrays with up to 2^32-1 elements. Returns |
| 109 | + 0 for arrays with 0 elements. Elements of the array are unchanged by |
| 110 | + this function. No 64-bit equivalents are provided here as that |
| 111 | + requires a different and slower implementation (at least if |
| 112 | + portability is required). */ |
| 113 | + |
#define PYTH_ORACLE_UTIL_AVG_DECL(func,T) /* Written for uint32_t but applies to uint8_t / uint16_t too */ \
static inline T          /* == (1/n) sum_i x[i] with one rounding error, round toward zero */              \
func( T const * x,       /* Indexed [0,n) */                                                               \
      uint32_t n ) {     /* Returns 0 on n==0 */                                                           \
  if( !n ) return (T)0;  /* Empty array -> 0 by contract */                                                \
  /* At this point n is in [1,2^32-1].  Accumulate into a 64-bit      */                                   \
  /* unsigned: the total is at most (2^32-1)(2^32-1) < 2^64, so the   */                                   \
  /* sum below cannot overflow the accumulator.                       */                                   \
  uint64_t acc = (uint64_t)0;                                                                              \
  for( uint32_t i=(uint32_t)0; i<n; i++ ) acc += (uint64_t)x[i];                                           \
  /* Unsigned division truncates, which for non-negative values is    */                                   \
  /* exactly the round-toward-zero rounding advertised above.         */                                   \
  return (T)( acc / (uint64_t)n );                                                                         \
}

PYTH_ORACLE_UTIL_AVG_DECL( avg_uint8,  uint8_t  )
PYTH_ORACLE_UTIL_AVG_DECL( avg_uint16, uint16_t )
PYTH_ORACLE_UTIL_AVG_DECL( avg_uint32, uint32_t )
/* See note above about 64-bit variants */

#undef PYTH_ORACLE_UTIL_AVG_DECL
| 134 | + |
#define PYTH_ORACLE_UTIL_AVG_DECL(func,T) /* Written for int32_t but applies to int8_t / int16_t too */ \
static inline T          /* == (1/n) sum_i x[i] with one rounding error, round toward zero */           \
func( T const * x,       /* Indexed [0,n) */                                                            \
      uint32_t n ) {     /* Returns 0 on n==0 */                                                        \
  if( !n ) return (T)0;  /* Empty array -> 0 by contract */                                             \
  /* At this point n is in [1,2^32-1].  Accumulate into a 64-bit       */                               \
  /* signed accumulator.                                               */                               \
  int64_t acc = (int64_t)0;                                                                             \
  for( uint32_t i=(uint32_t)0; i<n; i++ ) acc += (int64_t)x[i];                                         \
  /* acc is in [ -2^31 (2^32-1), (2^31-1)(2^32-1) ] so |acc| < 2^63:   */                               \
  /* no intermediate overflow occurred and |acc| fits in a uint64_t.   */                               \
  /* Divide the magnitude with unsigned division (avoids any signed    */                               \
  /* division) and restore the sign; this truncates toward zero.  The  */                               \
  /* quotient magnitude is at most 2^31 when acc<0 (all x[i]==-2^31)   */                               \
  /* and at most 2^31-1 otherwise, so the cast back to T is in range.  */                               \
  int neg = acc < (int64_t)0;                                                                           \
  uint64_t mag = (uint64_t)( neg ? -acc : acc );                                                        \
  mag /= (uint64_t)n;                                                                                   \
  int64_t q = (int64_t)mag;                                                                             \
  return (T)( neg ? -q : q );                                                                           \
}

PYTH_ORACLE_UTIL_AVG_DECL( avg_int8,  int8_t  )
PYTH_ORACLE_UTIL_AVG_DECL( avg_int16, int16_t )
PYTH_ORACLE_UTIL_AVG_DECL( avg_int32, int32_t )
/* See note above about 64-bit variants */

#undef PYTH_ORACLE_UTIL_AVG_DECL
| 162 | + |
| 163 | +#ifdef __cplusplus |
| 164 | +} |
| 165 | +#endif |
| 166 | + |
| 167 | +#endif /* _pyth_oracle_util_avg_h_ */ |
| 168 | + |
0 commit comments