diff --git a/pkg/aws/cloudwatch/histograms/README.md b/pkg/aws/cloudwatch/histograms/README.md
new file mode 100644
index 0000000000000..f1f40dd309c1a
--- /dev/null
+++ b/pkg/aws/cloudwatch/histograms/README.md
@@ -0,0 +1,14 @@
+# Histograms
+
+This package holds common CloudWatch histogram functionality for AWS-owned OpenTelemetry components.
+
+## Visualize histogram mappings
+
+1. Remove `t.Skip(...)` from `TestWriteInputHistograms` and run the test to generate JSON files for the input histograms.
+1. Remove `t.Skip(...)` from `TestWriteConvertedHistograms` and run the test to generate JSON files for the converted histograms.
+1. Run `histogram_mappings.py` to generate the visualizations:
+
+```bash
+pip install matplotlib numpy
+python histogram_mappings.py
+```
diff --git a/pkg/aws/cloudwatch/histograms/conversion.go b/pkg/aws/cloudwatch/histograms/conversion.go
new file mode 100644
index 0000000000000..c2652e1eb16bb
--- /dev/null
+++ b/pkg/aws/cloudwatch/histograms/conversion.go
@@ -0,0 +1,317 @@
+// Copyright The OpenTelemetry Authors
+// SPDX-License-Identifier: Apache-2.0
+
+package histograms // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/aws/cloudwatch/histograms"
+
+import (
+    "math"
+
+    "go.opentelemetry.io/collector/pdata/pmetric"
+
+    "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/aws/cloudwatch"
+)
+
+type ExponentialMapping struct {
+    maximum     float64
+    minimum     float64
+    sampleCount float64
+    sum         float64
+    values      []float64
+    counts      []float64
+}
+
+var _ (cloudwatch.HistogramDataPoint) = (*ExponentialMapping)(nil)
+
+// ConvertOTelToCloudWatch converts an OpenTelemetry histogram datapoint to a CloudWatch histogram datapoint using
+// exponential mapping.
+func ConvertOTelToCloudWatch(dp pmetric.HistogramDataPoint) cloudwatch.HistogramDataPoint {
+    // maximumInnerBucketCount is the maximum number of inner buckets used to represent each outer bucket.
+    //
+    // A larger value increases the resolution at which the data is sub-sampled, but it also increases memory
+    // allocation, processing time, and the maximum number of value/count pairs sent to CloudWatch, which could
+    // cause a CloudWatch PutMetricData / PutLogEvent request to be split into multiple requests due to the 100/150
+    // metric datapoint limit.
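+    //
+    // For example, with a value of 10, an outer bucket containing 1,000 samples is still represented by at most 10
+    // value/count pairs, while an outer bucket containing only 3 samples uses at most 3.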
+    const maximumInnerBucketCount = 10
+
+    // No validations - assuming valid input histogram
+
+    em := &ExponentialMapping{
+        maximum:     dp.Max(),
+        minimum:     dp.Min(),
+        sampleCount: float64(dp.Count()),
+        sum:         dp.Sum(),
+    }
+
+    // bounds specifies the boundaries between buckets
+    // bucketCounts specifies the number of datapoints in each bucket
+    // there is always one more bucket count than there are boundaries:
+    // len(bucketCounts) = len(bounds) + 1
+    bounds := dp.ExplicitBounds()
+    lenBounds := bounds.Len()
+    bucketCounts := dp.BucketCounts()
+    lenBucketCounts := bucketCounts.Len()
+
+    // Special case: no boundaries implies a single bucket
+    if lenBounds == 0 {
+        em.counts = append(em.counts, float64(bucketCounts.At(0))) // recall that len(bucketCounts) = len(bounds)+1
+        switch {
+        case dp.HasMax() && dp.HasMin():
+            em.values = append(em.values, em.minimum/2.0+em.maximum/2.0)
+        case dp.HasMax():
+            em.values = append(em.values, em.maximum) // only data point we have is the maximum
+        case dp.HasMin():
+            em.values = append(em.values, em.minimum) // only data point we have is the minimum
+        default:
+            em.values = append(em.values, 0) // arbitrary value
+        }
+        return em
+    }
+
+    // To create inner buckets, all outer buckets need to have defined boundaries. The first and last buckets use the
+    // min and max as their lower and upper bounds, respectively. The min and max are optional on the OTel datapoint.
+    // When min and max are not defined, make a reasonable assumption about what the min/max could be.
+    if !dp.HasMin() {
+        // Find the first bucket which contains some data points. The min must be in that bucket.
+        minBucketIdx := 0
+        for i := 0; i < lenBucketCounts; i++ {
+            if bucketCounts.At(i) > 0 {
+                minBucketIdx = i
+                break
+            }
+        }
+
+        // Take the lower bound of the bucket. The lower bound of bucket index n is boundary index n-1.
+        if minBucketIdx != 0 {
+            em.minimum = bounds.At(minBucketIdx - 1)
+        } else {
+            bucketWidth := 0.001 // arbitrary width - with only a single boundary there is no bucket width to infer from
+            if lenBounds > 1 {
+                bucketWidth = bounds.At(1) - bounds.At(0)
+            }
+            em.minimum = bounds.At(0) - bucketWidth
+
+            // If all boundaries are positive, assume all data is positive. This covers use cases where Prometheus
+            // histogram metrics for non-zero values like request durations have their first bucket start at 0. For
+            // these metrics, a negative minimum would cause percentile metrics to be unavailable.
+            if bounds.At(0) >= 0 {
+                em.minimum = max(em.minimum, 0.0)
+            }
+        }
+    }
+
+    if !dp.HasMax() {
+        // Find the last bucket with some data in it. The max must be in that bucket.
+        maxBucketIdx := lenBounds - 1
+        for i := lenBucketCounts - 1; i >= 0; i-- {
+            if bucketCounts.At(i) > 0 {
+                maxBucketIdx = i
+                break
+            }
+        }
+
+        // We want the upper bound of the bucket. The upper bound of bucket index n is boundary index n.
+        if maxBucketIdx <= lenBounds-1 {
+            em.maximum = bounds.At(maxBucketIdx)
+        } else {
+            bucketWidth := 0.01 // arbitrary width - with only a single boundary there is no bucket width to infer from
+            if lenBounds > 1 {
+                bucketWidth = bounds.At(lenBounds-1) - bounds.At(lenBounds-2)
+            }
+            em.maximum = bounds.At(lenBounds-1) + bucketWidth
+        }
+    }
+
+    // Pre-calculate total output size to avoid dynamic growth
+    totalOutputSize := 0
+    for i := 0; i < lenBucketCounts; i++ {
+        sampleCount := bucketCounts.At(i)
+        if sampleCount > 0 {
+            totalOutputSize += int(min(sampleCount, maximumInnerBucketCount))
+        }
+    }
+    if totalOutputSize == 0 {
+        // No samples in any bucket
+        return em
+    }
+
+    em.values = make([]float64, 0, totalOutputSize)
+    em.counts = make([]float64, 0, totalOutputSize)
+
+    for i := 0; i < lenBucketCounts; i++ {
+        sampleCount := int(bucketCounts.At(i))
+        if sampleCount == 0 {
+            // No need to operate on a bucket with no samples
+            continue
+        }
+
+        lowerBound := em.minimum
+        if i > 0 {
+            lowerBound = bounds.At(i - 1)
+        }
+        upperBound := em.maximum
+        if i < lenBucketCounts-1 {
+            upperBound = bounds.At(i)
+        }
+
+        // This algorithm creates "inner buckets" within each user-defined bucket based on its sample count, up to a
+        // maximum. A logarithmic ratio (named "magnitude") compares the density of the current bucket against the
+        // next bucket. This logarithmic ratio is used to decide how to spread samples amongst the inner buckets.
+        //
+        // case 1: magnitude < 0
+        //   * What this means: The current bucket is denser than the next bucket -> density is decreasing.
+        //   * What we do: Use an inverse quadratic distribution to spread the samples. This allocates more samples
+        //     towards the lower bound of the bucket.
+        // case 2: 0 <= magnitude < 1
+        //   * What this means: The current bucket and the next bucket have similar densities -> density is not
+        //     changing much.
+        //   * What we do: Use a uniform distribution to spread the samples. Extra samples that can't be spread evenly
+        //     are (arbitrarily) allocated towards the start of the bucket.
+        // case 3: 1 <= magnitude
+        //   * What this means: The current bucket is less dense than the next bucket -> density is increasing.
+        //   * What we do: Use a quadratic distribution to spread the samples. This allocates more samples towards the
+        //     end of the bucket.
+        //
+        // As a small optimization, we omit the logarithm invocation and compare the raw ratio against adjusted
+        // thresholds instead.
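+        //
+        // For example, an outer bucket spanning [0, 10) that holds 25 samples is split into
+        // min(25, maximumInnerBucketCount) = 10 inner buckets of width 1, and the 25 samples are spread across those
+        // inner buckets according to whichever distribution is selected below.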
+        ratio := 0.0
+        if i < lenBucketCounts-1 {
+            nextSampleCount := bucketCounts.At(i + 1)
+            // If the next bucket is empty, then the density is surely decreasing
+            if nextSampleCount == 0 {
+                ratio = 0.0
+            } else {
+                var nextUpperBound float64
+                if i+1 == lenBucketCounts-1 {
+                    nextUpperBound = em.maximum
+                } else {
+                    nextUpperBound = bounds.At(i + 1)
+                }
+
+                // original calculations for reference:
+                // currentBucketDensity := float64(sampleCount) / (upperBound - lowerBound)
+                // nextBucketDensity := float64(nextSampleCount) / (nextUpperBound - upperBound)
+                // ratio = nextBucketDensity / currentBucketDensity
+                //
+                // the following calculations are equivalent but improve speed by ~1% in benchmark tests
+                denom := (nextUpperBound - upperBound) * float64(sampleCount)
+                numerator := (upperBound - lowerBound) * float64(nextSampleCount)
+                ratio = numerator / denom
+            }
+        }
+
+        // innerBucketCount is how many "inner buckets" to spread the sample count amongst
+        innerBucketCount := min(sampleCount, maximumInnerBucketCount)
+        delta := (upperBound - lowerBound) / float64(innerBucketCount)
+
+        switch {
+        case ratio < 1.0/math.E: // magnitude < 0: Use -yx^2 (inverse quadratic)
+            sigma := float64(sumOfSquares(innerBucketCount))
+            epsilon := float64(sampleCount) / sigma
+            entryStart := len(em.counts)
+
+            runningSum := 0
+            for j := 0; j < innerBucketCount; j++ {
+                innerBucketSampleCount := epsilon * float64((j-innerBucketCount)*(j-innerBucketCount))
+                innerBucketSampleCountAdjusted := int(math.Floor(innerBucketSampleCount))
+                if innerBucketSampleCountAdjusted > 0 {
+                    runningSum += innerBucketSampleCountAdjusted
+                    em.values = append(em.values, lowerBound+delta*float64(j+1))
+                    em.counts = append(em.counts, float64(innerBucketSampleCountAdjusted))
+                }
+            }
+
+            // distribute the remainder towards the front
+            remainder := sampleCount - runningSum
+            // make sure there's room for the remainder; the flooring above may have produced fewer entries than needed
+            if len(em.counts) < entryStart+remainder {
+                em.counts = append(em.counts, make([]float64, remainder)...)
+                em.values = append(em.values, make([]float64, remainder)...)
+            }
+            for j := 0; j < remainder; j++ {
+                em.counts[entryStart]++
+                entryStart++
+            }
+        case ratio < math.E: // 0 <= magnitude < 1: Use x
+            // Distribute samples evenly with integer counts
+            baseCount := sampleCount / innerBucketCount
+            remainder := sampleCount % innerBucketCount
+            for j := 1; j <= innerBucketCount; j++ {
+                count := baseCount
+
+                // Distribute remainder to first few buckets
+                if j <= remainder {
+                    count++
+                }
+                em.values = append(em.values, lowerBound+delta*float64(j))
+                em.counts = append(em.counts, float64(count))
+            }
+        default: // magnitude >= 1: Use yx^2 (quadratic)
+            sigma := float64(sumOfSquares(innerBucketCount))
+            epsilon := float64(sampleCount) / sigma
+            entryStart := len(em.counts)
+
+            runningSum := 0
+            for j := 1; j <= innerBucketCount; j++ {
+                innerBucketSampleCount := epsilon * float64(j*j)
+                innerBucketSampleCountAdjusted := int(math.Floor(innerBucketSampleCount))
+                if innerBucketSampleCountAdjusted > 0 {
+                    runningSum += innerBucketSampleCountAdjusted
+                    em.values = append(em.values, lowerBound+delta*float64(j))
+                    em.counts = append(em.counts, float64(innerBucketSampleCountAdjusted))
+                }
+            }
+
+            // distribute the remainder towards the end
+            remainder := sampleCount - runningSum
+            // make sure there's room for the remainder; the flooring above may have produced fewer entries than needed
+            if len(em.counts) < entryStart+remainder {
+                em.counts = append(em.counts, make([]float64, remainder)...)
+                em.values = append(em.values, make([]float64, remainder)...)
+ } + entryStart = len(em.counts) - 1 + for j := 0; j < remainder; j++ { + em.counts[entryStart]++ + entryStart-- + } + } + } + + // Move last entry to maximum if needed + if dp.HasMax() && len(em.values) > 0 { + lastIdx := len(em.values) - 1 + for i := lastIdx; i >= 0; i-- { + if em.counts[i] > 0 { + lastIdx = i + break + } + } + em.values[lastIdx] = em.maximum + em.values = em.values[:lastIdx+1] + em.counts = em.counts[:lastIdx+1] + } + + return em +} + +func (em *ExponentialMapping) ValuesAndCounts() ([]float64, []float64) { + return em.values, em.counts +} + +func (em *ExponentialMapping) Minimum() float64 { + return em.minimum +} + +func (em *ExponentialMapping) Maximum() float64 { + return em.maximum +} + +func (em *ExponentialMapping) SampleCount() float64 { + return em.sampleCount +} + +func (em *ExponentialMapping) Sum() float64 { + return em.sum +} + +// sumOfSquares is a closed form calculation of Σx^2, for 1 to n +func sumOfSquares(n int) int { + return n * (n + 1) * (2*n + 1) / 6 +} diff --git a/pkg/aws/cloudwatch/histograms/conversion_test.go b/pkg/aws/cloudwatch/histograms/conversion_test.go new file mode 100644 index 0000000000000..2a43594a2ed82 --- /dev/null +++ b/pkg/aws/cloudwatch/histograms/conversion_test.go @@ -0,0 +1,304 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package histograms // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/aws/cloudwatch/histograms" + +import ( + "encoding/csv" + "encoding/json" + "fmt" + "math" + "os" + "sort" + "strconv" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/pdata/pmetric" + + "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/aws/cloudwatch" +) + +var filenameReplacer = strings.NewReplacer( + " ", "_", + "/", "_", +) + +func TestWriteInputHistograms(t *testing.T) { + t.Skip("only used to create test data for visualization") + for _, tc := range TestCases() { + jsonData, err := json.MarshalIndent(tc.Input, "", " ") + require.NoError(t, err) + _ = os.Mkdir("testdata/input", os.ModePerm) + require.NoError(t, os.WriteFile("testdata/input/"+filenameReplacer.Replace(tc.Name)+".json", jsonData, 0o600)) + } +} + +func TestWriteConvertedHistograms(t *testing.T) { + t.Skip("only used to create test data for visualization") + for _, tc := range TestCases() { + t.Run(tc.Name, func(t *testing.T) { + dp := setupDatapoint(tc.Input) + dist := ConvertOTelToCloudWatch(dp) + _ = os.Mkdir("testdata/exponential", os.ModePerm) + assert.NoError(t, writeValuesAndCountsToJSON(dist, "testdata/exponential/"+filenameReplacer.Replace(tc.Name+".json"))) + }) + } +} + +func TestConvertOTelToCloudWatch(t *testing.T) { + for _, tc := range TestCases() { + t.Run(tc.Name, func(t *testing.T) { + dp := setupDatapoint(tc.Input) + dist := ConvertOTelToCloudWatch(dp) + verifyDist(t, dist, tc.Expected) + }) + } + + t.Run("accuracy test - lognormal", func(t *testing.T) { + verifyDistAccuracy(t, ConvertOTelToCloudWatch, "testdata/lognormal_10000.csv") + }) + + t.Run("accuracy test - weibull", func(t *testing.T) { + verifyDistAccuracy(t, ConvertOTelToCloudWatch, "testdata/weibull_10000.csv") + }) +} + +func BenchmarkLogNormal(b *testing.B) { + // arrange + boundaries := []float64{ + 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, + 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018, 0.019, 0.02, + 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, 0.028, 0.029, 0.03, + 0.031, 0.032, 
0.033, 0.034, 0.035, 0.036, 0.037, 0.038, 0.039, 0.04, + 0.041, 0.042, 0.043, 0.044, 0.045, 0.046, 0.047, 0.048, 0.049, 0.05, + 0.1, 0.2, + } + + data, err := loadCsvData("testdata/lognormal_10000.csv") + require.NoError(b, err) + require.Len(b, data, 10000) + + dp := createHistogramDatapointFromData(data, boundaries) + require.Equal(b, 10000, int(dp.Count())) + + b.Run("NewExponentialMappingCWFromOtel", func(b *testing.B) { + for i := 0; i < b.N; i++ { + dist := ConvertOTelToCloudWatch(dp) + values, counts := dist.ValuesAndCounts() + assert.NotNil(b, values) + assert.NotNil(b, counts) + } + }) +} + +func BenchmarkWeibull(b *testing.B) { + // arrange + boundaries := []float64{ + 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, + 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018, 0.019, 0.02, + 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, 0.028, 0.029, 0.03, + 0.031, 0.032, 0.033, 0.034, 0.035, 0.036, 0.037, 0.038, 0.039, 0.04, + 0.041, 0.042, 0.043, 0.044, 0.045, 0.046, 0.047, 0.048, 0.049, 0.05, + 0.1, 0.2, + } + + data, err := loadCsvData("testdata/weibull_10000.csv") + require.NoError(b, err) + require.Len(b, data, 10000) + + dp := createHistogramDatapointFromData(data, boundaries) + require.Equal(b, 10000, int(dp.Count())) + + b.Run("NewExponentialMappingCWFromOtel", func(b *testing.B) { + for i := 0; i < b.N; i++ { + dist := ConvertOTelToCloudWatch(dp) + values, counts := dist.ValuesAndCounts() + assert.NotNil(b, values) + assert.NotNil(b, counts) + } + }) +} + +func setupDatapoint(input HistogramInput) pmetric.HistogramDataPoint { + dp := pmetric.NewHistogramDataPoint() + dp.SetCount(input.Count) + dp.SetSum(input.Sum) + if input.Min != nil { + dp.SetMin(*input.Min) + } + if input.Max != nil { + dp.SetMax(*input.Max) + } + dp.ExplicitBounds().FromRaw(input.Boundaries) + dp.BucketCounts().FromRaw(input.Counts) + return dp +} + +func verifyDist(t *testing.T, dist cloudwatch.HistogramDataPoint, expected ExpectedMetrics) { + if expected.Min != nil { + assert.Equal(t, *expected.Min, dist.Minimum(), "min does not match expected") + } + if expected.Max != nil { + assert.Equal(t, *expected.Max, dist.Maximum(), "max does not match expected") + } + assert.Equal(t, int(expected.Count), int(dist.SampleCount()), "samplecount does not match expected") + assert.Equal(t, expected.Sum, dist.Sum(), "sum does not match expected") + + values, counts := dist.ValuesAndCounts() + + calculatedCount := 0.0 + for _, count := range counts { + calculatedCount += count + // fmt.Printf("%7.2f = %4d (%d)\n", values[i], int(counts[i]), calculatedCount) + } + assert.InDelta(t, float64(expected.Count), calculatedCount, 1e-6, "calculated count does not match expected") + + for p, r := range expected.PercentileRanges { + x := int(math.Round(float64(dist.SampleCount()) * p)) + + soFar := 0 + for i, count := range counts { + soFar += int(count) + if soFar >= x { + // fmt.Printf("Found p%.f at bucket %0.2f. 
Expected range: %+v\n", p*100, values[i], r) + assert.GreaterOrEqual(t, values[i], r.Low, "percentile %0.2f", p) + assert.LessOrEqual(t, values[i], r.High, "percentile %0.2f", p) + break + } + } + } +} + +func loadCsvData(filename string) ([]float64, error) { + file, err := os.Open(filename) + if err != nil { + return nil, err + } + defer file.Close() + + reader := csv.NewReader(file) + records, err := reader.ReadAll() + if err != nil { + return nil, err + } + + var data []float64 + for _, value := range records[0] { + f, err := strconv.ParseFloat(strings.TrimSpace(value), 64) + if err != nil { + return nil, err + } + data = append(data, f) + } + return data, nil +} + +func createHistogramDatapointFromData(data []float64, boundaries []float64) pmetric.HistogramDataPoint { + dp := pmetric.NewHistogramDataPoint() + + // Calculate basic stats + var sum float64 + minimum := math.Inf(1) + maximum := math.Inf(-1) + + for _, v := range data { + sum += v + if v < minimum { + minimum = v + } + if v > maximum { + maximum = v + } + } + + dp.SetCount(uint64(len(data))) + dp.SetSum(sum) + dp.SetMin(minimum) + dp.SetMax(maximum) + + // Create bucket counts + bucketCounts := make([]uint64, len(boundaries)+1) + + for _, v := range data { + bucket := sort.SearchFloat64s(boundaries, v) + bucketCounts[bucket]++ + } + + dp.ExplicitBounds().FromRaw(boundaries) + dp.BucketCounts().FromRaw(bucketCounts) + + return dp +} + +func verifyDistAccuracy(t *testing.T, newDistFunc func(pmetric.HistogramDataPoint) cloudwatch.HistogramDataPoint, filename string) { + // arrange + percentiles := []float64{0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 0.999} + boundaries := []float64{ + 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, + 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018, 0.019, 0.02, + 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, 0.028, 0.029, 0.03, + 0.031, 0.032, 0.033, 0.034, 0.035, 0.036, 0.037, 0.038, 0.039, 0.04, + 0.041, 0.042, 0.043, 0.044, 0.045, 0.046, 0.047, 0.048, 0.049, 0.05, + 0.1, 0.2, + } + + data, err := loadCsvData(filename) + require.NoError(t, err) + assert.Len(t, data, 10000) + + dp := createHistogramDatapointFromData(data, boundaries) + assert.Equal(t, 10000, int(dp.Count())) + calculatedTotal := 0 + for _, count := range dp.BucketCounts().All() { + calculatedTotal += int(count) + } + assert.Equal(t, 10000, calculatedTotal) + + // act + dist := newDistFunc(dp) + values, counts := dist.ValuesAndCounts() + + // assert + calculatedCount := 0.0 + for _, count := range counts { + calculatedCount += count + } + assert.InDelta(t, 10000, calculatedCount, 1e-6, "calculated count does not match expected") + + for _, p := range percentiles { + x1 := int(math.Round(float64(dp.Count()) * p)) + x2 := int(math.Round(calculatedCount * p)) + + exactPercentileValue := data[x1] + + soFar := 0 + for i, count := range counts { + soFar += int(count) + if soFar >= x2 { + calculatedPercentileValue := values[i] + errorPercent := (exactPercentileValue - calculatedPercentileValue) / exactPercentileValue * 100 + fmt.Printf("P%.1f: exact=%.6f, calculated=%.6f, error=%.2f%%\n", p*100, exactPercentileValue, calculatedPercentileValue, errorPercent) + break + } + } + } +} + +func writeValuesAndCountsToJSON(dist cloudwatch.HistogramDataPoint, filename string) error { + values, counts := dist.ValuesAndCounts() + + data := make(map[string]any) + data["values"] = values + data["counts"] = counts + data["sum"] = dist.Sum() + + jsonData, err := json.MarshalIndent(data, "", " ") + if err != nil { + return err + 
} + + return os.WriteFile(filename, jsonData, 0o600) +} diff --git a/pkg/aws/cloudwatch/histograms/testdata/.gitignore b/pkg/aws/cloudwatch/histograms/testdata/.gitignore new file mode 100644 index 0000000000000..0f0d6bdaf849c --- /dev/null +++ b/pkg/aws/cloudwatch/histograms/testdata/.gitignore @@ -0,0 +1,3 @@ +comparisons/ +exponential/ +input/ \ No newline at end of file diff --git a/pkg/aws/cloudwatch/histograms/testdata/histogram_mappings.py b/pkg/aws/cloudwatch/histograms/testdata/histogram_mappings.py new file mode 100644 index 0000000000000..6fa92abf6be25 --- /dev/null +++ b/pkg/aws/cloudwatch/histograms/testdata/histogram_mappings.py @@ -0,0 +1,169 @@ +import argparse +import json +import math +import matplotlib.pyplot as plt +import numpy as np +import os +import pdb + +from pathlib import Path +from typing import Dict, List, Tuple + +def plot_input_histogram(data, ax, title: str, color: str): + """Plot input histogram using exact bucket boundaries.""" + boundaries = data.get('Boundaries', []) + counts = data['Counts'] + min_val = data.get('Min') + max_val = data.get('Max') + summ = data.get('Sum') + total_count = sum(counts) + + # Handle case with no boundaries (single bucket) + if not boundaries or len(boundaries) == 0: + if min_val is not None and max_val is not None: + left_edges = [min_val] + widths = [max_val - min_val] + else: + # Use arbitrary range if no min/max + left_edges = [-10] + widths = [20] + else: + # Calculate exact bucket edges and widths + left_edges = [] + widths = [] + + for i in range(len(counts)): + if i == 0: + # First bucket: from min to first boundary + left = min_val if min_val is not None else boundaries[0] - (boundaries[1] - boundaries[0]) if len(boundaries) > 1 else boundaries[0] - 10 + right = boundaries[0] + elif i == len(counts) - 1: + # Last bucket: from last boundary to max + left = boundaries[i-1] + right = max_val if max_val is not None else boundaries[i-1] + (boundaries[i-1] - boundaries[i-2]) if len(boundaries) > 1 else boundaries[i-1] + 10 + else: + # Middle buckets: between boundaries + left = boundaries[i-1] + right = boundaries[i] + + left_edges.append(left) + widths.append(right - left) + + ax.bar(left_edges, counts, width=widths, alpha=0.7, edgecolor='black', linewidth=0.8, color=color, align='edge') + ax.set_title(f'{title} (Count: {total_count}, Sum: {summ})') + ax.set_ylabel('Counts') + ax.grid(True, alpha=0.3) + +def plot_cw_histogram_bars(histogram: Dict[float, float], histogram_min: float, histogram_max: float, histogram_sum: float, ax, title: str, color: str): + """Plot histogram bars on given axes.""" + values = sorted(histogram.keys()) + counts = [histogram[v] for v in values] + total_count = sum(counts) + + if len(values) == 1: + # Single bar case + width = (histogram_max - histogram_min) * 0.8 + ax.bar(values, counts, width=width, alpha=0.7, edgecolor='black', linewidth=1.5, color=color) + else: + # Calculate minimum gap to prevent overlaps + gaps = [values[i+1] - values[i] for i in range(len(values)-1)] + min_gap = min(gaps) + max_width = min_gap * 0.8 # Use 80% of minimum gap + + widths = [] + for i in range(len(values)): + if i == 0: + # First bar: extend to histogram_min or use half-gap to next + left_space = values[0] - histogram_min + right_space = (values[1] - values[0]) / 2 if len(values) > 1 else (histogram_max - values[0]) + width = min(left_space + right_space, max_width) + elif i == len(values) - 1: + # Last bar: extend to histogram_max or use half-gap from previous + left_space = (values[i] - values[i-1]) / 2 
+ right_space = histogram_max - values[i] + width = min(left_space + right_space, max_width) + else: + # Middle bars: use half-gaps on both sides + left_space = (values[i] - values[i-1]) / 2 + right_space = (values[i+1] - values[i]) / 2 + width = min(left_space + right_space, max_width) + + widths.append(width) + + ax.bar(values, counts, width=widths, alpha=0.7, edgecolor='black', linewidth=0.8, color=color) + + ax.scatter(values, counts, color='red', s=50, zorder=5) + ax.set_title(f'{title} (Count: {total_count}, Sum: {histogram_sum})') + ax.set_ylabel('Counts') + ax.grid(True, alpha=0.3) + +def load_json_data(filepath): + """Load histogram data from JSON file.""" + with open(filepath, 'r') as f: + data = json.load(f) + return data['values'], data['counts'], data['sum'] + +def load_input_histogram(filepath): + """Load input histogram format.""" + with open(filepath, 'r') as f: + data = json.load(f) + return data + +def plot_all_folders_comparison(json_filename): + """Plot the same JSON file from all folders for comparison.""" + base_path = Path('.') + folders = ['input', 'exponential'] + colors = ['black', 'green'] + + fig, ax = plt.subplots(len(folders), 1, figsize=(12, 20)) + + i = -1 + for folder, color in zip(folders, colors): + i += 1 + filepath = base_path / folder / (json_filename+".json") + if filepath.exists(): + try: + if folder == 'input': + data = load_input_histogram(filepath) + plot_input_histogram(data, ax[i], f'{folder.capitalize()} Mapping', color) + else: + values, counts, summ = load_json_data(filepath) + if not values: # Skip if no values + continue + hist = {values[j]: counts[j] for j in range(len(values))} + plot_cw_histogram_bars(hist, min(values), max(values), summ, ax[i], f'{folder.capitalize()} Mapping', color) + except Exception as e: + print(f"Error processing {filepath}: {e}") + + plt.tight_layout() + plt.savefig(f"comparisons/{json_filename}.png", dpi=300, bbox_inches='tight') + plt.show() + +# Example usage +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Process histogram mappings') + parser.add_argument('dataset', nargs='?', help='Optional dataset name to process') + args = parser.parse_args() + + os.makedirs('comparisons', exist_ok=True) + + input_path = Path('./input') + if input_path.exists(): + if args.dataset: + # Process specific dataset if provided + dataset_file = input_path / f"{args.dataset}.json" + if dataset_file.exists(): + print(f"Processing {args.dataset}...") + plot_all_folders_comparison(args.dataset) + else: + print(f"Dataset '{args.dataset}' not found in input folder.") + else: + # Process all datasets if no specific dataset provided + json_files = [f.stem for f in input_path.iterdir() if f.suffix == '.json'] + for json_file in json_files: + print(f"Processing {json_file}...") + plot_all_folders_comparison(json_file) + else: + print("Input folder not found.") + +