Extended docs regarding ValueAtQuantile() and added ValuesAreEquivalent() (#39)

filipecosta90 · web-flow · commit 1dc8842b4cde · 2020-11-24T07:52:56.000-08:00
* [add] Extended docs regarding ValueAtQuantile() and added ValuesAreEquivalent()

* [add] made New() documentation clearer

* [fix] Fixes per PR review on New()

* [fix] Fixed New() not to panic on numberOfSignificantValueDigits < 1 || numberOfSignificantValueDigits > 5. Adding linter check to CI

* [add] Added whitebox testing for hdr.go ( specifically for New() numberOfSignificantValueDigits limits ).
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -15,4 +15,15 @@ jobs:
     - name: Checkout code
       uses: actions/checkout@v2
     - name: Test
-      run: make test
+      run: make test
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Install Go
+      uses: actions/setup-go@v2
+      with:
+        go-version: 1.15.x
+    - name: Checkout code
+      uses: actions/checkout@v2
+    - name: Lint
+      run: make lint
diff --git a/example_hdr_test.go b/example_hdr_test.go
@@ -7,7 +7,7 @@ import (
 )
 
 // This latency Histogram could be used to track and analyze the counts of
-// observed integer values between 0 us and 30000000 us ( 30 secs )
+// observed integer values between 1 us and 30000000 us ( 30 secs )
 // while maintaining a value precision of 4 significant digits across that range,
 // translating to a value resolution of :
 //   - 1 microsecond up to 10 milliseconds,
diff --git a/go.mod b/go.mod
@@ -3,7 +3,11 @@ module github.com/HdrHistogram/hdrhistogram-go
 go 1.14
 
 require (
-	github.com/golangci/golangci-lint v1.31.0 // indirect
+	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/google/go-cmp v0.5.2
+	github.com/kr/text v0.2.0 // indirect
+	github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e // indirect
 	github.com/stretchr/testify v1.6.1
+	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
+	gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f // indirect
 )
diff --git a/go.sum b/go.sum
diff --git a/hdr.go b/hdr.go
@@ -28,7 +28,7 @@ type Snapshot struct {
 // non-normally distributed data (like latency) with a high degree of accuracy
 // and a bounded degree of precision.
 type Histogram struct {
-	lowestTrackableValue        int64
+	lowestDiscernibleValue      int64
 	highestTrackableValue       int64
 	unitMagnitude               int64
 	significantFigures          int64
@@ -69,23 +69,41 @@ func (h *Histogram) SetStartTimeMs(startTimeMs int64) {
 	h.startTimeMs = startTimeMs
 }
 
-// New returns a new Histogram instance capable of tracking values in the given
-// range and with the given amount of precision.
-func New(minValue, maxValue int64, sigfigs int) *Histogram {
-	if sigfigs < 1 || 5 < sigfigs {
-		panic(fmt.Errorf("sigfigs must be [1,5] (was %d)", sigfigs))
-	}
-
-	largestValueWithSingleUnitResolution := 2 * math.Pow10(sigfigs)
+// New constructs a Histogram given the lowest and highest values to be tracked and a number of significant decimal digits.
+//
+// Providing a lowestDiscernibleValue is useful in situations where the units used for the histogram's values are
+// much smaller than the minimal accuracy required.
+// E.g. when tracking time values stated in nanosecond units, where the minimal accuracy required is a microsecond,
+// the proper value for lowestDiscernibleValue would be 1000.
+//
+// Note: numberOfSignificantValueDigits must be in the range [1,5]. If lower than 1 it will be
+// forced to 1, and if higher than 5 it will be forced to 5.
+func New(lowestDiscernibleValue, highestTrackableValue int64, numberOfSignificantValueDigits int) *Histogram {
+	if numberOfSignificantValueDigits < 1 {
+		numberOfSignificantValueDigits = 1
+	} else if numberOfSignificantValueDigits > 5 {
+		numberOfSignificantValueDigits = 5
+	}
+	if lowestDiscernibleValue < 1 {
+		lowestDiscernibleValue = 1
+	}
+
+	// Given a 3 decimal point accuracy, the expectation is obviously for "+/- 1 unit at 1000". It also means that
+	// it's "ok to be +/- 2 units at 2000". The "tricky" thing is that it is NOT ok to be +/- 2 units at 1999. Only
+	// starting at 2000. So internally, we need to maintain single unit resolution to 2x 10^decimalPoints.
+	largestValueWithSingleUnitResolution := 2 * math.Pow10(numberOfSignificantValueDigits)
+
+	// We need to maintain power-of-two subBucketCount (for clean direct indexing) that is large enough to
+	// provide unit resolution to at least largestValueWithSingleUnitResolution. So figure out
+	// largestValueWithSingleUnitResolution's nearest power-of-two (rounded up), and use that:
 	subBucketCountMagnitude := int32(math.Ceil(math.Log2(float64(largestValueWithSingleUnitResolution))))
-
 	subBucketHalfCountMagnitude := subBucketCountMagnitude
 	if subBucketHalfCountMagnitude < 1 {
 		subBucketHalfCountMagnitude = 1
 	}
 	subBucketHalfCountMagnitude--
 
-	unitMagnitude := int32(math.Floor(math.Log2(float64(minValue))))
+	unitMagnitude := int32(math.Floor(math.Log2(float64(lowestDiscernibleValue))))
 	if unitMagnitude < 0 {
 		unitMagnitude = 0
 	}
@@ -98,20 +116,16 @@ func New(minValue, maxValue int64, sigfigs int) *Histogram {
 	// determine exponent range needed to support the trackable value with no
 	// overflow:
 	smallestUntrackableValue := int64(subBucketCount) << uint(unitMagnitude)
-	bucketsNeeded := int32(1)
-	for smallestUntrackableValue < maxValue {
-		smallestUntrackableValue <<= 1
-		bucketsNeeded++
-	}
+	bucketsNeeded := getBucketsNeededToCoverValue(smallestUntrackableValue, highestTrackableValue)
 
 	bucketCount := bucketsNeeded
 	countsLen := (bucketCount + 1) * (subBucketCount / 2)
 
 	return &Histogram{
-		lowestTrackableValue:        minValue,
-		highestTrackableValue:       maxValue,
+		lowestDiscernibleValue:      lowestDiscernibleValue,
+		highestTrackableValue:       highestTrackableValue,
 		unitMagnitude:               int64(unitMagnitude),
-		significantFigures:          int64(sigfigs),
+		significantFigures:          int64(numberOfSignificantValueDigits),
 		subBucketHalfCountMagnitude: subBucketHalfCountMagnitude,
 		subBucketHalfCount:          subBucketHalfCount,
 		subBucketMask:               subBucketMask,
@@ -126,6 +140,21 @@ func New(minValue, maxValue int64, sigfigs int) *Histogram {
 	}
 }
 
+func getBucketsNeededToCoverValue(smallestUntrackableValue int64, maxValue int64) int32 {
+	// always have at least 1 bucket
+	bucketsNeeded := int32(1)
+	for smallestUntrackableValue < maxValue {
+		if smallestUntrackableValue > (math.MaxInt64 / 2) {
+			// next shift will overflow, meaning that bucket could represent values up to ones greater than
+			// math.MaxInt64, so it's the last bucket
+			return bucketsNeeded + 1
+		}
+		smallestUntrackableValue <<= 1
+		bucketsNeeded++
+	}
+	return bucketsNeeded
+}
+
 // ByteSize returns an estimate of the amount of memory allocated to the
 // histogram in bytes.
 //
@@ -277,7 +306,12 @@ func (h *Histogram) setCountAtIndex(idx int, n int64) {
 	h.totalCount += n
 }
 
-// ValueAtQuantile returns the recorded value at the given quantile (0..100).
+// ValueAtQuantile returns the largest value that (100% - q) of the overall recorded value entries
+// in the histogram are either larger than or equivalent to, where q is the requested quantile (0..100).
+//
+// Note that two values are "equivalent" if `ValuesAreEquivalent(value1,value2)` would return true.
+//
+// Returns 0 if no recorded values exist.
 func (h *Histogram) ValueAtQuantile(q float64) int64 {
 	if q > 100 {
 		q = 100
@@ -290,13 +324,24 @@ func (h *Histogram) ValueAtQuantile(q float64) int64 {
 	for i.next() {
 		total += i.countAtIdx
 		if total >= countAtPercentile {
+			if q == 0.0 {
+				return h.lowestEquivalentValue(i.valueFromIdx)
+			}
 			return h.highestEquivalentValue(i.valueFromIdx)
 		}
 	}
 
 	return 0
 }
 
+// ValuesAreEquivalent determines whether two values are equivalent at the histogram's resolution,
+// where "equivalent" means that value samples recorded for any two
+// equivalent values are counted in a common total count.
+func (h *Histogram) ValuesAreEquivalent(value1, value2 int64) (result bool) {
+	result = h.lowestEquivalentValue(value1) == h.lowestEquivalentValue(value2)
+	return
+}
+
 // CumulativeDistribution returns an ordered list of brackets of the
 // distribution of recorded values.
 func (h *Histogram) CumulativeDistribution() []Bracket {
@@ -323,7 +368,7 @@ func (h *Histogram) SignificantFigures() int64 {
 // LowestTrackableValue returns the lower bound on values that will be added
 // to the histogram
 func (h *Histogram) LowestTrackableValue() int64 {
-	return h.lowestTrackableValue
+	return h.lowestDiscernibleValue
 }
 
 // HighestTrackableValue returns the upper bound on values that will be added
@@ -361,7 +406,7 @@ func (h *Histogram) Distribution() (result []Bar) {
 func (h *Histogram) Equals(other *Histogram) bool {
 	switch {
 	case
-		h.lowestTrackableValue != other.lowestTrackableValue,
+		h.lowestDiscernibleValue != other.lowestDiscernibleValue,
 		h.highestTrackableValue != other.highestTrackableValue,
 		h.unitMagnitude != other.unitMagnitude,
 		h.significantFigures != other.significantFigures,
@@ -387,7 +432,7 @@ func (h *Histogram) Equals(other *Histogram) bool {
 // Import to construct a new Histogram with the same state.
 func (h *Histogram) Export() *Snapshot {
 	return &Snapshot{
-		LowestTrackableValue:  h.lowestTrackableValue,
+		LowestTrackableValue:  h.lowestDiscernibleValue,
 		HighestTrackableValue: h.highestTrackableValue,
 		SignificantFigures:    h.significantFigures,
 		Counts:                append([]int64(nil), h.counts...), // copy
@@ -478,12 +523,21 @@ func (h *Histogram) countsIndex(bucketIdx, subBucketIdx int32) int32 {
 	return bucketBaseIdx + offsetInBucket
 }
 
+// getBucketIndex returns the lowest (and therefore highest precision) bucket index that can represent the value.
+// It calculates the number of powers of two by which the value is greater than the biggest value that fits in
+// bucket 0. This is the bucket index since each successive bucket can hold a value 2x greater.
 func (h *Histogram) getBucketIndex(v int64) int32 {
 	pow2Ceiling := bitLen(v | h.subBucketMask)
 	return int32(pow2Ceiling - int64(h.unitMagnitude) -
 		int64(h.subBucketHalfCountMagnitude+1))
 }
 
+// For bucketIndex 0, this is just value, so it may be anywhere in 0 to subBucketCount.
+// For other bucketIndex, this will always end up in the top half of subBucketCount: assume that for some bucket
+// k > 0, this calculation will yield a value in the bottom half of 0 to subBucketCount. Then, because of how
+// buckets overlap, it would have also been in the top half of bucket k-1, and therefore would have
+// returned k-1 in getBucketIndex(). Since we would then shift it one fewer bits here, it would be twice as big,
+// and therefore in the top half of subBucketCount.
 func (h *Histogram) getSubBucketIdx(v int64, idx int32) int32 {
 	return int32(v >> uint(int64(idx)+int64(h.unitMagnitude)))
 }
@@ -505,11 +559,11 @@ type iterator struct {
 	highestEquivalentValue               int64
 }
 
+// next reports false once countToIdx has reached the histogram's totalCount; otherwise it advances to the next sub-bucket.
 func (i *iterator) next() bool {
 	if i.countToIdx >= i.h.totalCount {
 		return false
 	}
-
 	// increment bucket
 	i.subBucketIdx++
 	if i.subBucketIdx >= i.h.subBucketCount {
diff --git a/hdr_encoding.go b/hdr_encoding.go
@@ -127,7 +127,7 @@ func (h *Histogram) encodeIntoByteBuffer() (*bytes.Buffer, error) {
 	if err != nil {
 		return nil, err
 	}
-	err = binary.Write(toCompress, binary.BigEndian, h.lowestTrackableValue) // 16-23
+	err = binary.Write(toCompress, binary.BigEndian, h.lowestDiscernibleValue) // 16-23
 	if err != nil {
 		return nil, err
 	}
diff --git a/hdr_test.go b/hdr_test.go
@@ -54,7 +54,6 @@ func TestValueAtQuantile(t *testing.T) {
 	}
 }
 
-
 func TestMean(t *testing.T) {
 	h := hdrhistogram.New(1, 10000000, 3)
 	for i := 0; i < 1000000; i++ {
@@ -386,3 +385,18 @@ func TestEquals(t *testing.T) {
 		t.Error("Expected Histograms to be equivalent")
 	}
 }
+
+// nolint
+func TestHistogram_ValuesAreEquivalent(t *testing.T) {
+	hist := hdrhistogram.New(1476573605, 1476593605, 3)
+	assert.True(t, hist.ValuesAreEquivalent(1476583605, 2147483647))
+
+	// test large histograms
+	hist = hdrhistogram.New(20000000, 100000000, 5)
+	hist.RecordValue(100000000)
+	hist.RecordValue(20000000)
+	hist.RecordValue(30000000)
+	assert.True(t, hist.ValuesAreEquivalent(20000000, hist.ValueAtQuantile(50.0)))
+	assert.True(t, hist.ValuesAreEquivalent(100000000, hist.ValueAtQuantile(83.34)))
+	assert.True(t, hist.ValuesAreEquivalent(100000000, hist.ValueAtQuantile(99.0)))
+}
diff --git a/hdr_whitebox_test.go b/hdr_whitebox_test.go
@@ -0,0 +1,15 @@
+package hdrhistogram
+
+import (
+	"github.com/stretchr/testify/assert"
+	"testing"
+)
+
+func TestHistogram_New_internals(t *testing.T) {
+	// numberOfSignificantValueDigits higher than 5 must be forced to 5
+	hist := New(1, 9007199254740991, 6)
+	assert.Equal(t, int64(5), hist.significantFigures)
+	// numberOfSignificantValueDigits lower than 1 must be forced to 1
+	hist = New(1, 9007199254740991, 0)
+	assert.Equal(t, int64(1), hist.significantFigures)
+}

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ import (`
`7`	`7`	`)`
`8`	`8`
`9`	`9`	`// This latency Histogram could be used to track and analyze the counts of`
`10`		`-// observed integer values between 0 us and 30000000 us ( 30 secs )`
	`10`	`+// observed integer values between 1 us and 30000000 us ( 30 secs )`
`11`	`11`	`// while maintaining a value precision of 4 significant digits across that range,`
`12`	`12`	`// translating to a value resolution of :`
`13`	`13`	`// - 1 microsecond up to 10 milliseconds,`
Original file line number	Diff line number	Diff line change
`@@ -127,7 +127,7 @@ func (h *Histogram) encodeIntoByteBuffer() (*bytes.Buffer, error) {`
`127`	`127`	`if err != nil {`
`128`	`128`	`return nil, err`
`129`	`129`	`}`
`130`		`- err = binary.Write(toCompress, binary.BigEndian, h.lowestTrackableValue) // 16-23`
	`130`	`+ err = binary.Write(toCompress, binary.BigEndian, h.lowestDiscernibleValue) // 16-23`
`131`	`131`	`if err != nil {`
`132`	`132`	`return nil, err`
`133`	`133`	`}`