@@ -14,6 +14,7 @@ import (
 	"bytes"
 	"context"
 	"fmt"
+	"math"
 	"math/bits"
 	"math/rand"
 	"testing"
@@ -32,9 +33,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/duration"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
-	"github.com/cockroachdb/cockroach/pkg/util/retry"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
-	"github.com/cockroachdb/errors"
 	"github.com/lib/pq"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -131,8 +130,8 @@ func TestMapToUnorderedUniqueInt(t *testing.T) {
 }
 
 // TestSerialNormalizationWithUniqueUnorderedID makes sure that serial
-// normalization can use unique_unordered_id() and a split in a table followed
-// by insertions guarantees a (somewhat) uniform distribution of the data.
+// normalization can use the unordered_rowid mode and that a large number of
+// insertions (80k) results in a (somewhat) uniform distribution of the data.
 func TestSerialNormalizationWithUniqueUnorderedID(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
@@ -141,25 +140,33 @@ func TestSerialNormalizationWithUniqueUnorderedID(t *testing.T) {
 		"assumes large N")
 
 	ctx := context.Background()
-	const attempts = 2
-	// This test can flake when the random data is not distributed evenly.
-	err := retry.WithMaxAttempts(ctx, retry.Options{}, attempts, func() error {
-
-		params := base.TestServerArgs{
-			Knobs: base.TestingKnobs{
-				SQLEvalContext: &eval.TestingKnobs{
-					// We disable the randomization of some batch sizes because
-					// with some low values the test takes much longer.
-					ForceProductionValues: true,
+
+	// Since this is a statistical test, an occasional observation of non-uniformity
+	// is not in and of itself a huge concern. As such, we'll only fail the test if
+	// a majority of our observations show statistically significant amounts of
+	// non-uniformity.
+	const numObservations = 10
+	numNonUniformObservations := 0
+	for i := 0; i < numObservations; i++ {
+		// We use an anonymous function because of the defer in each iteration.
+		func() {
+			t.Logf("starting observation %d", i)
+
+			params := base.TestServerArgs{
+				Knobs: base.TestingKnobs{
+					SQLEvalContext: &eval.TestingKnobs{
+						// We disable the randomization of some batch sizes because
+						// with some low values the test takes much longer.
+						ForceProductionValues: true,
+					},
 				},
-			},
-		}
-		s, db, _ := serverutils.StartServer(t, params)
-		defer s.Stopper().Stop(ctx)
+			}
+			s, db, _ := serverutils.StartServer(t, params)
+			defer s.Stopper().Stop(ctx)
 
-		tdb := sqlutils.MakeSQLRunner(db)
-		// Create a new table with serial primary key i (unordered_rowid) and int j (index).
-		tdb.Exec(t, `
+			tdb := sqlutils.MakeSQLRunner(db)
+			// Create a new table with serial primary key i (unordered_rowid) and int j (index).
+			tdb.Exec(t, `
 SET serial_normalization TO 'unordered_rowid';
 CREATE DATABASE t;
 USE t;
@@ -168,30 +175,29 @@ CREATE TABLE t (
   j INT
 )`)
 
-		numberOfRows := 100000
-
-		// Insert rows.
-		tdb.Exec(t, fmt.Sprintf(`
+			// Insert rows.
+			numberOfRows := 80000
+			tdb.Exec(t, fmt.Sprintf(`
 INSERT INTO t(j) SELECT * FROM generate_series(1, %d);
 `, numberOfRows))
 
-		// Build an equi-width histogram over the key range. The below query will
-		// generate the key bounds for each high-order bit pattern as defined by
-		// prefixBits. For example, if this were 3, we'd get the following groups:
-		//
-		//   low                  | high
-		//  ----------------------+----------------------
-		//   0                    | 2305843009213693952
-		//   2305843009213693952  | 4611686018427387904
-		//   4611686018427387904  | 6917529027641081856
-		//   6917529027641081856  | 9223372036854775807
-		//
-		const prefixBits = 4
-		var keyCounts pq.Int64Array
-		tdb.QueryRow(t, `
+			// Build an equi-width histogram over the key range. The below query will
+			// generate the key bounds for each high-order bit pattern as defined by
+			// prefixBits. For example, if this were 3, we'd get the following groups:
+			//
+			//   low                  | high
+			//  ----------------------+----------------------
+			//   0                    | 2305843009213693952
+			//   2305843009213693952  | 4611686018427387904
+			//   4611686018427387904  | 6917529027641081856
+			//   6917529027641081856  | 9223372036854775807
+			//
+			const prefixBits = 4
+			var keyCounts pq.Int64Array
+			tdb.QueryRow(t, `
   WITH boundaries AS (
-    SELECT i << (64 - $1) AS p FROM ROWS FROM (generate_series(0, (1<< ($1- 1)) -1)) AS t (i)
-    UNION ALL SELECT (((1 << 62) - 1) << 1)+ 1 -- int63 max value
+    SELECT i << (64 - $1) AS p FROM ROWS FROM (generate_series(0, (1 << ($1 - 1)) - 1)) AS t (i)
+    UNION ALL SELECT (((1 << 62) - 1) << 1) + 1 -- int63 max value
   ),
   groups AS (
     SELECT *
@@ -201,55 +207,61 @@ INSERT INTO t(j) SELECT * FROM generate_series(1, %d);
   counts AS (
     SELECT count(i) AS c
     FROM t, groups
-    WHERE low <= i AND high > i
+    WHERE low < i AND i <= high
     GROUP BY (low, high)
   )
 SELECT array_agg(c)
   FROM counts;`, prefixBits).Scan(&keyCounts)
 
-		t.Log("Key counts in each split range")
-		for i, keyCount := range keyCounts {
-			t.Logf("range %d: %d\n", i, keyCount)
-		}
-		require.Len(t, keyCounts, 1<<(prefixBits-1))
-
-		// To check that the distribution over ranges is not uniform, we use a
-		// chi-square goodness of fit statistic. We'll set our null hypothesis as
-		// 'each range in the distribution should have the same probability of getting
-		// a row inserted' and we'll check if we can reject the null hypothesis if
-		// chi-square is greater than the critical value we currently set as 35.2585,
-		// a deliberate choice that gives us a p-value of 0.00001 according to
-		// https://www.fourmilab.ch/rpkp/experiments/analysis/chiCalc.html. If we are
-		// able to reject the null hypothesis, then the distribution is not uniform,
-		// and we raise an error. This test has 7 degrees of freedom (categories - 1).
-		chiSquared := discreteUniformChiSquared(keyCounts)
-		criticalValue := 35.2585
-		if chiSquared >= criticalValue {
-			return errors.Newf("chiSquared value of %f must be"+
-				" less than criticalVal %f to guarantee distribution is relatively uniform",
-				chiSquared, criticalValue)
-		}
-		return nil
-	})
-	require.NoError(t, err)
+			t.Log("Key counts in each split range")
+			for i, keyCount := range keyCounts {
+				t.Logf("range %d: %d\n", i, keyCount)
+			}
+			require.Len(t, keyCounts, 1<<(prefixBits-1))
+
+			// To check that the distribution over ranges is not uniform, we use a
+			// chi-square goodness of fit statistic. We'll set our null hypothesis as
+			// 'each range in the distribution should have the same probability of getting
+			// a row inserted' and we'll check if we can reject the null hypothesis if
+			// chi-squared is greater than the critical value we currently set as 35.2585,
+			// a deliberate choice that gives us a p-value of 0.00001 according to
+			// https://www.fourmilab.ch/rpkp/experiments/analysis/chiCalc.html. If we are
+			// able to reject the null hypothesis, then the distribution is not uniform,
+			// and we mark the observation as non-uniform. This test has 7 degrees of freedom (categories - 1).
+			chiSquared := discreteUniformChiSquared(keyCounts)
+			criticalValue := 35.2585
+			if chiSquared > criticalValue {
+				t.Logf("chiSquared value of %f > criticalVal %f indicates that the distribution "+
+					"is non-uniform (statistically significant, p < 0.00001)",
+					chiSquared, criticalValue)
+				numNonUniformObservations += 1
+			} else {
+				t.Logf("chiSquared value of %f <= criticalVal %f indicates that the distribution "+
+					"is relatively uniform",
+					chiSquared, criticalValue)
+			}
+		}()
+	}
+
+	if numNonUniformObservations >= numObservations/2 {
+		t.Fatalf("a majority of our observations indicate the distribution is non-uniform")
+	}
 }
 
 // discreteUniformChiSquared calculates the chi-squared statistic (ref:
 // https://www.itl.nist.gov/div898/handbook/eda/section3/eda35f.htm) to be used
 // in our hypothesis testing for the distribution of rows among ranges.
-func discreteUniformChiSquared(counts []int64) float64 {
+func discreteUniformChiSquared(buckets []int64) float64 {
 	var n int64
-	for _, c := range counts {
+	for _, c := range buckets {
 		n += c
 	}
-	p := float64(1) / float64(len(counts))
-	var stat float64
-	for _, c := range counts {
-		oSubE := float64(c)/float64(n) - p
-		stat += (oSubE * oSubE) / p
+	expectedPerBucket := float64(n) / float64(len(buckets))
+	var chiSquared float64
+	for _, c := range buckets {
+		chiSquared += math.Pow(float64(c)-expectedPerBucket, 2) / expectedPerBucket
 	}
-	stat *= float64(n)
-	return stat
+	return chiSquared
 }
 
 func TestStringToArrayAndBack(t *testing.T) {
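
Not part of the diff above: a small, self-contained sketch of the same chi-squared uniformity check the test performs, useful for seeing the statistic in isolation. Only the bucket layout (8 equal-width buckets over the int63 keyspace, i.e. prefixBits = 4), the 80k sample size, and the critical value 35.2585 (p of roughly 0.00001 at 7 degrees of freedom) come from the test; everything else (the package layout, the chiSquaredUniform helper, the seeded math/rand generator standing in for CockroachDB's unordered_rowid keys) is illustrative only.

package main

import (
	"fmt"
	"math"
	"math/rand"
)

// chiSquaredUniform computes the chi-squared goodness-of-fit statistic for the
// hypothesis that every bucket is equally likely, using the same formula as
// discreteUniformChiSquared in the diff above.
func chiSquaredUniform(buckets []int64) float64 {
	var n int64
	for _, c := range buckets {
		n += c
	}
	expected := float64(n) / float64(len(buckets))
	var stat float64
	for _, c := range buckets {
		stat += math.Pow(float64(c)-expected, 2) / expected
	}
	return stat
}

func main() {
	const prefixBits = 4
	numBuckets := 1 << (prefixBits - 1)               // 8 equal-width buckets over [0, 2^63)
	bucketWidth := uint64(1) << (63 - (prefixBits - 1)) // 2^60 keys per bucket

	// Simulate 80k uniformly distributed int63 keys and histogram them by
	// their high-order bits, mirroring what the test's SQL query does.
	counts := make([]int64, numBuckets)
	rng := rand.New(rand.NewSource(42))
	for i := 0; i < 80000; i++ {
		k := uint64(rng.Int63())
		counts[k/bucketWidth]++
	}

	// Critical value for 7 degrees of freedom at p = 0.00001, as in the test.
	const criticalValue = 35.2585
	stat := chiSquaredUniform(counts)
	fmt.Printf("chi-squared = %.3f (critical value %.4f)\n", stat, criticalValue)
	if stat > criticalValue {
		fmt.Println("reject uniformity (statistically significant non-uniformity)")
	} else {
		fmt.Println("cannot reject uniformity")
	}
}

With a uniform generator the printed statistic should sit well below the critical value; feeding it keys drawn from, say, only the bottom half of the keyspace pushes it far above, which is exactly the signal the test tallies across its ten observations.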