@@ -14,6 +14,7 @@ import (
1414 "bytes"
1515 "context"
1616 "fmt"
17+ "math"
1718 "math/bits"
1819 "math/rand"
1920 "testing"
@@ -32,9 +33,7 @@ import (
3233 "github.com/cockroachdb/cockroach/pkg/util/duration"
3334 "github.com/cockroachdb/cockroach/pkg/util/leaktest"
3435 "github.com/cockroachdb/cockroach/pkg/util/log"
35- "github.com/cockroachdb/cockroach/pkg/util/retry"
3636 "github.com/cockroachdb/cockroach/pkg/util/timeutil"
37- "github.com/cockroachdb/errors"
3837 "github.com/lib/pq"
3938 "github.com/stretchr/testify/assert"
4039 "github.com/stretchr/testify/require"
@@ -97,8 +96,8 @@ func TestMapToUniqueUnorderedID(t *testing.T) {
9796}
9897
9998// TestSerialNormalizationWithUniqueUnorderedID makes sure that serial
100- // normalization can use unique_unordered_id() and a split in a table followed
101- // by insertions guarantees a (somewhat) uniform distribution of the data.
99+ // normalization can use the unordered_rowid mode and a large number of
100+ // insertions (80k) results in a (somewhat) uniform distribution of the data.
102101func TestSerialNormalizationWithUniqueUnorderedID (t * testing.T ) {
103102 defer leaktest .AfterTest (t )()
104103 defer log .Scope (t ).Close (t )
@@ -107,25 +106,33 @@ func TestSerialNormalizationWithUniqueUnorderedID(t *testing.T) {
107106 "assumes large N" )
108107
109108 ctx := context .Background ()
110- const attempts = 2
111- // This test can flake when the random data is not distributed evenly.
112- err := retry .WithMaxAttempts (ctx , retry.Options {}, attempts , func () error {
113-
114- params := base.TestServerArgs {
115- Knobs : base.TestingKnobs {
116- SQLEvalContext : & eval.TestingKnobs {
117- // We disable the randomization of some batch sizes because
118- // with some low values the test takes much longer.
119- ForceProductionValues : true ,
109+
110+ // Since this is a statistical test, an occasional observation of non-uniformity
111+ // is not in and of itself a huge concern. As such, we'll only fail the test if
112+ // at least half of our observations show statistically significant amounts of
113+ // non-uniformity.
114+ const numObservations = 10
115+ numNonUniformObservations := 0
116+ for i := 0 ; i < numObservations ; i ++ {
117+ // We use an anonymous function because of the defer in each iteration.
118+ func () {
119+ t .Logf ("starting observation %d" , i )
120+
121+ params := base.TestServerArgs {
122+ Knobs : base.TestingKnobs {
123+ SQLEvalContext : & eval.TestingKnobs {
124+ // We disable the randomization of some batch sizes because
125+ // with some low values the test takes much longer.
126+ ForceProductionValues : true ,
127+ },
120128 },
121- },
122- }
123- s , db , _ := serverutils .StartServer (t , params )
124- defer s .Stopper ().Stop (ctx )
129+ }
130+ s , db , _ := serverutils .StartServer (t , params )
131+ defer s .Stopper ().Stop (ctx )
125132
126- tdb := sqlutils .MakeSQLRunner (db )
127- // Create a new table with serial primary key i (unordered_rowid) and int j (index).
128- tdb .Exec (t , `
133+ tdb := sqlutils .MakeSQLRunner (db )
134+ // Create a new table with serial primary key i (unordered_rowid) and int j (index).
135+ tdb .Exec (t , `
129136SET serial_normalization TO 'unordered_rowid';
130137CREATE DATABASE t;
131138USE t;
@@ -134,30 +141,29 @@ CREATE TABLE t (
134141 j INT
135142)` )
136143
137- numberOfRows := 100000
138-
139- // Insert rows.
140- tdb .Exec (t , fmt .Sprintf (`
144+ // Insert rows.
145+ numberOfRows := 80000
146+ tdb .Exec (t , fmt .Sprintf (`
141147INSERT INTO t(j) SELECT * FROM generate_series(1, %d);
142148` , numberOfRows ))
143149
144- // Build an equi-width histogram over the key range. The below query will
145- // generate the key bounds for each high-order bit pattern as defined by
146- // prefixBits. For example, if this were 3, we'd get the following groups:
147- //
148- // low | high
149- // ----------------------+----------------------
150- // 0 | 2305843009213693952
151- // 2305843009213693952 | 4611686018427387904
152- // 4611686018427387904 | 6917529027641081856
153- // 6917529027641081856 | 9223372036854775807
154- //
155- const prefixBits = 4
156- var keyCounts pq.Int64Array
157- tdb .QueryRow (t , `
150+ // Build an equi-width histogram over the key range. The below query will
151+ // generate the key bounds for each high-order bit pattern as defined by
152+ // prefixBits. For example, if this were 3, we'd get the following groups:
153+ //
154+ // low | high
155+ // ----------------------+----------------------
156+ // 0 | 2305843009213693952
157+ // 2305843009213693952 | 4611686018427387904
158+ // 4611686018427387904 | 6917529027641081856
159+ // 6917529027641081856 | 9223372036854775807
160+ //
161+ const prefixBits = 4
162+ var keyCounts pq.Int64Array
163+ tdb .QueryRow (t , `
158164 WITH boundaries AS (
159- SELECT i << (64 - $1) AS p FROM ROWS FROM (generate_series(0, (1<< ($1- 1)) -1)) AS t (i)
160- UNION ALL SELECT (((1 << 62) - 1) << 1)+ 1 -- int63 max value
165+ SELECT i << (64 - $1) AS p FROM ROWS FROM (generate_series(0, (1 << ($1 - 1)) - 1)) AS t (i)
166+ UNION ALL SELECT (((1 << 62) - 1) << 1) + 1 -- int63 max value
161167 ),
162168 groups AS (
163169 SELECT *
@@ -167,55 +173,61 @@ INSERT INTO t(j) SELECT * FROM generate_series(1, %d);
167173 counts AS (
168174 SELECT count(i) AS c
169175 FROM t, groups
170- WHERE low <= i AND high > i
176+ WHERE low < i AND i <= high
171177 GROUP BY (low, high)
172178 )
173179SELECT array_agg(c)
174180 FROM counts;` , prefixBits ).Scan (& keyCounts )
175181
176- t .Log ("Key counts in each split range" )
177- for i , keyCount := range keyCounts {
178- t .Logf ("range %d: %d\n " , i , keyCount )
179- }
180- require .Len (t , keyCounts , 1 << (prefixBits - 1 ))
181-
182- // To check that the distribution over ranges is not uniform, we use a
183- // chi-square goodness of fit statistic. We'll set our null hypothesis as
184- // 'each range in the distribution should have the same probability of getting
185- // a row inserted' and we'll check if we can reject the null hypothesis if
186- // chi-square is greater than the critical value we currently set as 35.2585,
187- // a deliberate choice that gives us a p-value of 0.00001 according to
188- // https://www.fourmilab.ch/rpkp/experiments/analysis/chiCalc.html. If we are
189- // able to reject the null hypothesis, then the distribution is not uniform,
190- // and we raise an error. This test has 7 degrees of freedom (categories - 1).
191- chiSquared := discreteUniformChiSquared (keyCounts )
192- criticalValue := 35.2585
193- if chiSquared >= criticalValue {
194- return errors .Newf ("chiSquared value of %f must be" +
195- " less than criticalVal %f to guarantee distribution is relatively uniform" ,
196- chiSquared , criticalValue )
197- }
198- return nil
199- })
200- require .NoError (t , err )
182+ t .Log ("Key counts in each split range" )
183+ for i , keyCount := range keyCounts {
184+ t .Logf ("range %d: %d\n " , i , keyCount )
185+ }
186+ require .Len (t , keyCounts , 1 << (prefixBits - 1 ))
187+
188+ // To check whether the distribution over ranges is uniform, we use a
189+ // chi-square goodness of fit statistic. We'll set our null hypothesis as
190+ // 'each range in the distribution should have the same probability of getting
191+ // a row inserted' and we'll check if we can reject the null hypothesis if
192+ // chi-squared is greater than the critical value we currently set as 35.2585,
193+ // a deliberate choice that gives us a p-value of 0.00001 according to
194+ // https://www.fourmilab.ch/rpkp/experiments/analysis/chiCalc.html. If we are
195+ // able to reject the null hypothesis, then the distribution is not uniform,
196+ // and we raise an error. This test has 7 degrees of freedom (categories - 1).
197+ chiSquared := discreteUniformChiSquared (keyCounts )
198+ criticalValue := 35.2585
199+ if chiSquared > criticalValue {
200+ t .Logf ("chiSquared value of %f > criticalVal %f indicates that the distribution " +
201+ "is non-uniform (statistically significant, p < 0.00001)" ,
202+ chiSquared , criticalValue )
203+ numNonUniformObservations += 1
204+ } else {
205+ t .Logf ("chiSquared value of %f <= criticalVal %f indicates that the distribution " +
206+ "is relatively uniform" ,
207+ chiSquared , criticalValue )
208+ }
209+ }()
210+ }
211+
212+ if numNonUniformObservations >= numObservations / 2 {
213+ t .Fatalf ("a majority of our observations indicate the distribution is non-uniform" )
214+ }
201215}
202216
203217// discreteUniformChiSquared calculates the chi-squared statistic (ref:
204218// https://www.itl.nist.gov/div898/handbook/eda/section3/eda35f.htm) to be used
205219// in our hypothesis testing for the distribution of rows among ranges.
206- func discreteUniformChiSquared (counts []int64 ) float64 {
220+ func discreteUniformChiSquared (buckets []int64 ) float64 {
207221 var n int64
208- for _ , c := range counts {
222+ for _ , c := range buckets {
209223 n += c
210224 }
211- p := float64 (1 ) / float64 (len (counts ))
212- var stat float64
213- for _ , c := range counts {
214- oSubE := float64 (c )/ float64 (n ) - p
215- stat += (oSubE * oSubE ) / p
225+ expectedPerBucket := float64 (n ) / float64 (len (buckets ))
226+ var chiSquared float64
227+ for _ , c := range buckets {
228+ chiSquared += math .Pow (float64 (c )- expectedPerBucket , 2 ) / expectedPerBucket
216229 }
217- stat *= float64 (n )
218- return stat
230+ return chiSquared
219231}
220232
221233func TestStringToArrayAndBack (t * testing.T ) {
0 commit comments