New pricing model usable on and off chain with off-chain standalone tests

kbowers-jump · Reisen · commit 42c9edbe2584 · 2022-04-01T11:20:08.000Z
diff --git a/program/src/oracle/model/clean b/program/src/oracle/model/clean
@@ -0,0 +1,3 @@
+#!/bin/sh
+rm -rfv bin # Recursively delete the bin output directory, listing what is removed; -f keeps this quiet if bin is absent
+
diff --git a/program/src/oracle/model/model.h b/program/src/oracle/model/model.h
@@ -0,0 +1,101 @@
+#ifndef _pyth_oracle_model_model_h_
+#define _pyth_oracle_model_model_h_
+
+#include <stdint.h>
+#include <stdalign.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Returns the minimum and maximum number of quotes the implementation
+   can handle */
+
+static inline uint64_t /* Returns the smallest quote count the implementation handles */
+price_model_quote_min( void ) {
+  return (uint64_t)1; /* A single quote is the minimum */
+}
+
+static inline uint64_t /* Returns the largest cnt for which cnt*sizeof(int64_t)+alignof(int64_t)-1 still fits in a uint64_t */
+price_model_quote_max( void ) {
+  return (UINT64_MAX-(uint64_t)alignof(int64_t)+(uint64_t)1) / (uint64_t)sizeof(int64_t); /* Integer division rounds down */
+}
+
+/* price_model_cnt_valid returns non-zero if cnt is a valid value or
+   zero if not. */
+
+static inline int /* Returns 1 if cnt is in [price_model_quote_min(),price_model_quote_max()] and 0 otherwise */
+price_model_cnt_valid( uint64_t cnt ) {
+  return price_model_quote_min()<=cnt && cnt<=price_model_quote_max(); /* Inclusive at both ends */
+}
+
+/* price_model_scratch_footprint returns the number of bytes of scratch
+   space needed for an arbitrarily aligned scratch region required by
+   price_model to handle price_model_quote_min() to cnt quotes
+   inclusive. */
+
+static inline uint64_t /* Returns the scratch size in bytes for up to cnt quotes */
+price_model_scratch_footprint( uint64_t cnt ) { /* Assumes price_model_cnt_valid( cnt ) is true */
+  /* cnt int64_t's plus worst case alignment padding (alignof(int64_t)-1
+     bytes); no overflow possible as cnt is valid at this point */
+  return cnt*(uint64_t)sizeof(int64_t)+(uint64_t)alignof(int64_t)-(uint64_t)1;
+}
+
+/* price_model_core minimizes (to quote precision in a floor / round
+   toward negative infinity sense) the loss model of the given quotes.
+   Assumes valid inputs (e.g. cnt is at least 1 and not unreasonably
+   large ... typically a multiple of 3 but this is not required,
+   quote[i] for i in [0,cnt) are the quotes of interest on input, p25,
+   p50, p75 point to where to write model outputs, scratch points to a
+   suitable footprint scratch region).
+
+   Returns a pointer to the quotes sorted in ascending order.  As such,
+   the min and max and any other rank statistic can be extracted easily
+   on return.  This location will either be quote itself or to a
+   location in scratch.  Use price_model below for a variant that always
+   replaces quote with the sorted quotes (potentially has extra ops for
+   copying).  Further, on return, *_p25, *_p50, *_p75 will hold the loss
+   model minimizing values for the input quotes and the scratch region
+   was clobbered.
+
+   Scratch points to a memory region of arbitrary alignment with at
+   least price_model_scratch_footprint( cnt ) bytes and it will be
+   clobbered on output.  It is sufficient to use a normally aligned /
+   normally allocated / normally declared array of cnt int64_t's.
+
+   The cost of this function is a fast and low variance (but not
+   completely data oblivious) O(cnt lg cnt) in the best / average /
+   worst cases.  This function uses no heap / dynamic memory allocation.
+   It is thread safe provided it is passed non-conflicting quote, output
+   and scratch arrays.  It has a bounded call depth ~lg cnt <= ~64 (this
+   could be reduced to O(1) by using a non-recursive sort/select
+   implementation under the hood if desired). */
+
+int64_t *                                       /* Returns pointer to sorted quotes (either quote or ALIGN_UP(scratch,int64_t)) */
+price_model_core( uint64_t           cnt,       /* Assumes price_model_cnt_valid( cnt ) is true */
+                  int64_t * restrict quote,     /* Assumes quote[i] for i in [0,cnt) is the i-th quote on input */
+                  int64_t * restrict _p25,      /* Assumes *_p25 is safe to write to the p25 model output */
+                  int64_t * restrict _p50,      /* Assumes *_p50 is safe to write to the p50 model output */
+                  int64_t * restrict _p75,      /* Assumes *_p75 is safe to write to the p75 model output */
+                  void    *          scratch ); /* Assumes at least price_model_scratch_footprint( cnt ) bytes, any alignment; clobbered */
+
+/* Same as the above but always returns quote and quote always holds the
+   sorted quotes on return. */
+
+static inline int64_t *                /* Returns quote, which holds the sorted quotes on return */
+price_model( uint64_t           cnt,   /* Number of quotes, forwarded unchanged to price_model_core */
+             int64_t * restrict quote, /* Quotes on input, sorted ascending on return */
+             int64_t * restrict _p25,  /* Where the p25 model output is written */
+             int64_t * restrict _p50,  /* Where the p50 model output is written */
+             int64_t * restrict _p75,  /* Where the p75 model output is written */
+             void    *          scratch ) { /* Scratch region, forwarded unchanged to price_model_core */
+  int64_t * restrict tmp = price_model_core( cnt, quote, _p25, _p50, _p75, scratch ); /* Sorted quotes; may or may not be quote itself */
+  if( tmp!=quote ) for( uint64_t idx=(uint64_t)0; idx<cnt; idx++ ) quote[ idx ] = tmp[ idx ]; /* Copy back only if the sort landed elsewhere */
+  return quote;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _pyth_oracle_model_model_h_ */
diff --git a/program/src/oracle/model/price_model.c b/program/src/oracle/model/price_model.c
@@ -0,0 +1,113 @@
+#include "model.h"
+#include "../util/avg.h" /* For avg_2_int64 */
+
+#define SORT_NAME  int64_sort_ascending /* Name prefix of the sort API used below (int64_sort_ascending_stable) */
+#define SORT_KEY_T int64_t              /* Element type of the arrays being sorted */
+#include "../sort/tmpl/sort_stable.c"   /* NOTE(review): presumably this template consumes the two macros above -- confirm */
+
+int64_t * /* Returns the sorted quotes as handed back by the sort; full contract is in model.h */
+price_model_core( uint64_t           cnt,
+                  int64_t * restrict quote,
+                  int64_t * restrict _p25,
+                  int64_t * restrict _p50,
+                  int64_t * restrict _p75,
+                  void    *          scratch ) {
+
+  /* Sort the quotes.  The sorting implementation used here is a highly
+     optimized mergesort (merge with an unrolled insertion sorting
+     network for the small n base cases).  The best case is ~0.5 n lg n
+     compares and the average and worst cases are ~n lg n compares.
+     
+     While not completely data oblivious, this has quite low variance in
+     operation count practically and this is _better_ than quicksort's
+     average case and quicksort's worst case is a computational
+     denial-of-service and timing attack vulnerable O(n^2).  Unlike
+     quicksort, this is also stable (but this stability does not
+     currently matter ... it might be a factor in future models).
+     
+     A data oblivious sorting network approach might be viable here
+     and would have a completely deterministic operations count.  It
+     currently isn't used as the best known practical approaches for
+     general n have a worse algorithmic cost (O( n (lg n)^2 )) and,
+     while the application probably doesn't need perfect obliviousness,
+     mergesort is still moderately oblivious and the application can
+     benefit from mergesort's lower operations cost.  (The main drawback
+     of mergesort over quicksort is that it isn't in place, but memory
+     footprint isn't an issue here.)
+
+     Given the operations cost model (e.g. cache friendliness is not
+     incorporated), a radix sort might be viable here (O(n) in best /
+     average / worst).  It currently isn't used as we expect invocations
+     with small-ish n to be common and radix sort would have large
+     coefficients on the O(n) and additional fixed overheads that would
+     make it more expensive than mergesort in this regime.
+
+     Note: price_model_cnt_valid( cnt ) implies
+     int64_sort_ascending_cnt_valid( cnt ) currently.
+
+     Note: consider filtering out "NaN" quotes (i.e. INT64_MIN)? */
+
+  int64_t * sort_quote = int64_sort_ascending_stable( quote, cnt, scratch ); /* All reads below go through this pointer */
+
+  /* Extract the p25
+
+     There are many variants with subtle tradeoffs here.  One option is
+     to interpolate when the ideal p25 is bracketed by two samples (akin
+     to the p50 interpolation below when the number of quotes is even).
+     That is, for p25, interpolate between quotes floor((cnt-2)/4) and
+     ceil((cnt-2)/4) with the weights determined by cnt mod 4.  The
+     current preference is to not do that as it is slightly more
+     complex, doesn't exactly always minimize the current loss function
+     and is more exposed to the confidence intervals getting skewed by
+     bum quotes when the number of quotes is small.
+
+     Another option is to use the inside quote of the above pair.  That
+     is, for p25, use quote ceil((cnt-2)/4) == floor((cnt+1)/4) ==
+     (cnt+1)>>2.  The current preference is not to do this as, though
+     this has stronger bum quote robustness, it results in p25==p50==p75
+     when cnt==3.  (In this case, the above wants to do an interpolation
+     between quotes 0 and 1 for the p25 and between quotes 1 and 2
+     for the p75.  But limiting to just the inside quote results in
+     p25/p50/p75 all using the median quote.)
+
+     A tweak to this option, for p25, is to use floor(cnt/4) == cnt>>2.
+     This is simple, has the same asymptotic behavior for large cnt, has
+     good behavior in the cnt==3 case and practically as good bum quote
+     rejection in the moderate cnt case. */
+
+  uint64_t p25_idx = cnt >> 2; /* ==floor(cnt/4), the variant chosen above */
+
+  *_p25 = sort_quote[p25_idx];
+
+  /* Extract the p50 */
+
+  if( (cnt & (uint64_t)1) ) { /* Odd number of quotes */
+
+    uint64_t p50_idx = cnt >> 1; /* ==ceil((cnt-1)/2) */
+
+    *_p50 = sort_quote[p50_idx]; /* The single middle quote */
+
+  } else { /* Even number of quotes (at least 2) */
+
+    uint64_t p50_idx_right = cnt >> 1;                    /* == ceil((cnt-1)/2)> 0 */
+    uint64_t p50_idx_left  = p50_idx_right - (uint64_t)1; /* ==floor((cnt-1)/2)>=0 (no overflow/underflow) */
+
+    int64_t vl = sort_quote[p50_idx_left ];
+    int64_t vr = sort_quote[p50_idx_right];
+
+    /* Compute the average of vl and vr (with floor / round toward
+       negative infinity rounding and without possibility of
+       intermediate overflow). */
+
+    *_p50 = avg_2_int64( vl, vr );
+  }
+  
+  /* Extract the p75 (this is the mirror image of the p25 case) */
+
+  uint64_t p75_idx = cnt - ((uint64_t)1) - p25_idx; /* Same distance from the top as p25_idx is from the bottom */
+
+  *_p75 = sort_quote[p75_idx];
+
+  return sort_quote;
+}
+
diff --git a/program/src/oracle/model/run_tests b/program/src/oracle/model/run_tests
@@ -0,0 +1,18 @@
+#!/bin/sh
+# Builds and runs the standalone price model test; every step aborts the script with status 1 on failure.
+module purge          || exit 1 # NOTE(review): presumably the Environment Modules command, unloading all modules -- confirm
+module load gcc-9.3.0 || exit 1 # Select the gcc-9.3.0 module for the build
+
+./clean       || exit 1 # Remove any previous bin directory
+mkdir -pv bin || exit 1 # Recreate the output directory
+
+CC="gcc -g -Wall -Werror -Wextra -Wconversion -Wstrict-aliasing=2 -Wimplicit-fallthrough=2 -pedantic -D_XOPEN_SOURCE=600 -O2 -march=native -std=c17" # Warnings are errors; C17; optimized for the build host
+
+set -x # Echo each command from here on
+
+$CC test_price_model.c price_model.c -o bin/test_price_model || exit 1 # Compile the test driver together with the model
+
+bin/test_price_model || exit 1 # Run the test; a non-zero status stops the script here
+
+echo all tests passed
+
diff --git a/program/src/oracle/model/test_price_model.c b/program/src/oracle/model/test_price_model.c
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../util/util.h"
+#include "model.h"
+
+int /* qsort comparator: returns -1, 0 or 1 so that int64_t values sort ascending */
+qcmp( void const * _p,   /* Points to the first int64_t */
+      void const * _q ) { /* Points to the second int64_t */
+  int64_t p = *(int64_t const *)_p;
+  int64_t q = *(int64_t const *)_q;
+  if( p < q ) return -1; /* Explicit compares instead of p-q, which could overflow */
+  if( p > q ) return  1;
+  return 0;
+}
+
+int /* Returns 0 if every randomized trial passes and 1 on the first failure */
+main( int     argc,
+      char ** argv ) {
+  (void)argc; (void)argv; /* Command line is unused */
+
+  prng_t _prng[1];
+  prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); /* Fixed arguments; presumably gives a reproducible sequence -- confirm in ../util */
+
+# define N 96 /* Largest quote count exercised by a trial */
+
+  int64_t quote0 [N]; /* Reference copy of the random quotes, sorted with qsort for comparison */
+  int64_t quote  [N]; /* Working copy handed to price_model */
+  int64_t val    [3]; /* Model outputs: val[0]=p25, val[1]=p50, val[2]=p75 */
+  int64_t scratch[N]; /* Scratch region for the model */
+
+  int ctr = 0; /* Counts down to the next progress message */
+  for( int iter=0; iter<10000000; iter++ ) {
+    if( !ctr ) { printf( "Completed %d iterations\n", iter ); ctr = 100000; } /* iter is an int, so it is printed with %d */
+    ctr--;
+
+    /* Generate a random test */
+
+    uint64_t cnt = (uint64_t)1 + (uint64_t)(prng_uint32( prng ) % (uint32_t)N); /* In [1,N], approx uniform IID */
+    for( uint64_t idx=(uint64_t)0; idx<cnt; idx++ ) quote0[ idx ] = (int64_t)prng_uint64( prng );
+
+    /* Apply the model */
+
+    memcpy( quote, quote0, sizeof(int64_t)*(size_t)cnt );
+    if( price_model( cnt, quote, val+0, val+1, val+2, scratch )!=quote ) { printf( "FAIL (compose)\n" ); return 1; }
+
+    /* Validate the results: quote must match an independent qsort of the same input */
+
+    qsort( quote0, (size_t)cnt, sizeof(int64_t), qcmp );
+    if( memcmp( quote, quote0, sizeof(int64_t)*(size_t)cnt ) ) { printf( "FAIL (sort)\n" ); return 1; }
+
+    uint64_t p25_idx = cnt>>2;
+    uint64_t p50_idx = cnt>>1;
+    uint64_t p75_idx = cnt - (uint64_t)1 - p25_idx;
+    uint64_t is_even = (uint64_t)!(cnt & (uint64_t)1); /* 1 when cnt is even, 0 when odd */
+
+    if( val[0]!=quote[ p25_idx ] ) { printf( "FAIL (p25)\n" ); return 1; }
+    if( val[1]!=avg_2_int64( quote[ p50_idx-is_even ], quote[ p50_idx ] ) ) { printf( "FAIL (p50)\n" ); return 1; } /* Odd cnt averages the middle quote with itself */
+    if( val[2]!=quote[ p75_idx ] ) { printf( "FAIL (p75)\n" ); return 1; }
+  }
+
+# undef N
+
+  prng_delete( prng_leave( prng ) );
+
+  printf( "pass\n" );
+  return 0;
+}
+