From 9d20b8c00be19d8176ae83930bc081cb33ea78eb Mon Sep 17 00:00:00 2001 From: Nate Armstrong <109548806+elv-nate@users.noreply.github.com> Date: Tue, 19 Mar 2024 11:59:52 -0700 Subject: [PATCH 1/2] Move duration histogram from content-fabric (#39) * Add histogram * Make bins public * Make marshal totals public * Add ability to clear histogram * Marshal nicer * Fix bins with 0 count * Add average function * Update comment grammar Co-authored-by: elv-gilles <40674218+elv-gilles@users.noreply.github.com> * Convert histogram to use mutex, add concurrency test --------- Co-authored-by: elv-gilles <40674218+elv-gilles@users.noreply.github.com> --- util/histogram/histogram.go | 343 +++++++++++++++++++++++++++++++ util/histogram/histogram_test.go | 205 ++++++++++++++++++ 2 files changed, 548 insertions(+) create mode 100644 util/histogram/histogram.go create mode 100644 util/histogram/histogram_test.go diff --git a/util/histogram/histogram.go b/util/histogram/histogram.go new file mode 100644 index 0000000..839fe7d --- /dev/null +++ b/util/histogram/histogram.go @@ -0,0 +1,343 @@ +package histogram + +import ( + "encoding/json" + "math" + "sync" + "time" + + "github.com/eluv-io/common-go/util/jsonutil" + "github.com/eluv-io/errors-go" +) + +// ----- durationHistogram ----- +type DurationHistogramType uint8 + +const ( + DefaultDurationHistogram = iota + ConnectionResponseDurationHistogram + SegmentLatencyHistogram +) + +func NewDefaultDurationHistogram() *DurationHistogram { + return NewDurationHistogram(DefaultDurationHistogram) +} + +// NewDurationHistogram creates a new duration histogram with predefined +// labeled duration bins. +// note: label are provided since computing them produces string with useless +// suffixes like: 1m => 1m0s +func NewDurationHistogram(t DurationHistogramType) *DurationHistogram { + var bins []*DurationBin + switch t { + case ConnectionResponseDurationHistogram: + bins = []*DurationBin{ + {Label: "0-10ms", Max: time.Millisecond * 10}, + {Label: "10ms-20ms", Max: time.Millisecond * 20}, + {Label: "20ms-50ms", Max: time.Millisecond * 50}, + {Label: "50ms-100ms", Max: time.Millisecond * 100}, + {Label: "100ms-200ms", Max: time.Millisecond * 200}, + {Label: "200ms-500ms", Max: time.Millisecond * 500}, + {Label: "500ms-1s", Max: time.Second}, + {Label: "1s-2s", Max: time.Second * 2}, + {Label: "2s-5s", Max: time.Second * 5}, + {Label: "5s-10s", Max: time.Second * 10}, + {Label: "10s-20s", Max: time.Second * 20}, + {Label: "20s-30s", Max: time.Second * 30}, + {Label: "30s-", Max: time.Hour * 10000}, + } + case SegmentLatencyHistogram: + bins = []*DurationBin{ + {Label: "0-100ms", Max: time.Millisecond * 100}, + {Label: "100ms-200ms", Max: time.Millisecond * 200}, + {Label: "200ms-300ms", Max: time.Millisecond * 300}, + {Label: "300ms-400ms", Max: time.Millisecond * 400}, + {Label: "400ms-500ms", Max: time.Millisecond * 500}, + {Label: "500ms-750ms", Max: time.Millisecond * 750}, + {Label: "750ms-1s", Max: time.Second}, + {Label: "1s-2s", Max: time.Second * 2}, + {Label: "2s-4s", Max: time.Second * 4}, + {Label: "4s-", Max: time.Second * 30}, + } + case DefaultDurationHistogram: + fallthrough + default: + bins = []*DurationBin{ + {Label: "0-10ms", Max: time.Millisecond * 10}, + {Label: "10ms-20ms", Max: time.Millisecond * 20}, + {Label: "20ms-50ms", Max: time.Millisecond * 50}, + {Label: "50ms-100ms", Max: time.Millisecond * 100}, + {Label: "100ms-200ms", Max: time.Millisecond * 200}, + {Label: "200ms-500ms", Max: time.Millisecond * 500}, + {Label: "500ms-1s", Max: time.Second}, + {Label: "1s-2s", Max: time.Second * 2}, + {Label: "2s-5s", Max: time.Second * 5}, + {Label: "5s-10s", Max: time.Second * 10}, + {Label: "10s-20s", Max: time.Second * 20}, + {Label: "20s-30s", Max: time.Second * 30}, + {Label: "30s-1m", Max: time.Minute}, + {Label: "1m-2m", Max: time.Minute * 2}, + {Label: "2m-5m", Max: time.Minute * 5}, + {Label: "5m-10m", Max: time.Minute * 10}, + {Label: "10m-20m", Max: time.Minute * 20}, + {Label: "20m-30m", Max: time.Minute * 30}, + {Label: "30m-2h", Max: time.Hour * 2}, + {Label: "2h-", Max: time.Hour * 10000}, + } + } + return NewDurationHistogramBins(bins) +} + +func (h *DurationHistogram) LoadValues(values []*SerializedDurationBin) error { + h.mu.Lock() + defer h.mu.Unlock() + + for _, v := range values { + seen := false + for _, b := range h.bins { + if v.Label == b.Label { + b.Count = v.Count + b.DSum = v.DSum + seen = true + break + } + } + if !seen { + return errors.E("DurationHistogramFromValues", "reason", "mismatched histogram types", "label", v.Label) + } + } + return nil +} + +type DurationBin struct { + Label string + Max time.Duration // immutable upper-bound of bin (lower bound defined by previous bin) + Count int64 // the number of durations falling in this bin + DSum int64 // sum of durations of this bin +} + +type SerializedDurationBin struct { + Label string `json:"label"` + Count int64 `json:"count"` + DSum int64 `json:"dsum"` +} + +func NewDurationHistogramBins(bins []*DurationBin) *DurationHistogram { + return &DurationHistogram{ + bins: bins, + } +} + +// DurationHistogram uses predefined bins. +type DurationHistogram struct { + bins []*DurationBin + mu sync.Mutex + MarshalTotals bool +} + +func (h *DurationHistogram) TotalCount() int64 { + h.mu.Lock() + defer h.mu.Unlock() + + tot := int64(0) + for _, b := range h.bins { + tot += b.Count + } + return tot +} + +func (h *DurationHistogram) TotalDSum() int64 { + h.mu.Lock() + defer h.mu.Unlock() + + tot := int64(0) + for _, b := range h.bins { + tot += b.DSum + } + return tot +} + +func (h *DurationHistogram) Clear() { + h.mu.Lock() + defer h.mu.Unlock() + for _, b := range h.bins { + b.Count = 0 + b.DSum = 0 + } +} + +func (h *DurationHistogram) Observe(n time.Duration) { + for i := range h.bins { + if n <= h.bins[i].Max || i == len(h.bins)-1 { + h.mu.Lock() + defer h.mu.Unlock() + + h.bins[i].Count += 1 + h.bins[i].DSum += int64(n) + return + } + } +} + +// loadCounts preloads counts for consistent use within a calculation, along with the total +func (h *DurationHistogram) loadCounts() ([]int64, int64) { + data := make([]int64, len(h.bins)) + tot := int64(0) + for i := range h.bins { + data[i] = h.bins[i].Count + tot += data[i] + } + + return data, tot +} + +// loadDSums preloads durations for consistent use within a calculation, along with the total +func (h *DurationHistogram) loadDSums() ([]time.Duration, time.Duration) { + data := make([]time.Duration, len(h.bins)) + tot := time.Duration(0) + for i := range h.bins { + data[i] = time.Duration(h.bins[i].DSum) + tot += data[i] + } + + return data, tot +} + +func (h *DurationHistogram) Average() time.Duration { + h.mu.Lock() + defer h.mu.Unlock() + _, totCount := h.loadCounts() + _, totDur := h.loadDSums() + trueAvg := float64(totDur) / float64(totCount) + return time.Duration(trueAvg) +} + +// Quantile returns an approximation of the value at the qth quantile of the histogram, where q is +// in the range [0, 1]. It makes use of the assumption that data within each histogram bin is +// uniformly distributed in order to estimate within bins. Depending on the distribution, this +// assumption may be more or less accurate. +func (h *DurationHistogram) Quantile(q float64) time.Duration { + h.mu.Lock() + defer h.mu.Unlock() + + if q < 0 || q > 1 { + return -1 + } + // Pre-load data to ensure consistency within function + data, tot := h.loadCounts() + + count := q * float64(tot) + for i := range h.bins { + fData := float64(data[i]) + if count <= fData { + binProportion := 1 - ((fData - count) / fData) + binStart := time.Duration(0) + if i > 0 { + binStart = h.bins[i-1].Max + } + binSpan := h.bins[i].Max - binStart + return binStart + time.Duration(int64(binProportion*float64(binSpan))) + } + count -= fData + } + return -1 +} + +// StandardDeviation estimates the standard deviation using the average of each histogram box. It +// will consistently slightly underestimate the standard deviation as a consequence, because +// variation within each box is not captured in the standard deviation. +// +// A worked example is below to provide intuition: +// +// Our histogram bins are `[[0, 10], [10, 50], [50, 90], [90, 100]]`. Our data is `[3, 7, 12, 13, +// 14, 15, 15, 15, 16, 17, 25, 30, 60, 70, 80, 85, 93]`. +// +// The true standard deviation of that data is 30.6. The data is concentrated on a peak at 15, and +// has another more spread out area around 80. Calculating the standard deviation with just the bin +// endpoints or midpoints would be quite inaccurate, because the bins are not close to uniformly +// filled with values. +// +// The bin averages are `[5, 17, 74, 93]`, with counts `[2, 10, 4, 1]`. We thus use the formula for +// standard deviation, assuming that 2 points are at 5, 10 points are at 17, 4 points are at 74, and +// 1 point at 93. However, because we have the true average of all points, we can use that as well. +// +// We then calculate an estimated standard deviation of 29, which is very close to the actual +// standard deviation of 30.6. +func (h *DurationHistogram) StandardDeviation() time.Duration { + h.mu.Lock() + defer h.mu.Unlock() + + counts, totCount := h.loadCounts() + durs, totDur := h.loadDSums() + trueAvg := float64(totDur) / float64(totCount) + binAvgs := []float64{} + for i := range h.bins { + if counts[i] == 0 { + binAvgs = append(binAvgs, 0) + continue + } + binAvg := float64(durs[i]) / float64(counts[i]) + binAvgs = append(binAvgs, binAvg) + } + + sumOfSquares := float64(0) + for i := range h.bins { + sumOfSquares += float64(counts[i]) * math.Pow(binAvgs[i]-trueAvg, 2) + } + + stdDev := math.Pow(sumOfSquares/float64(totCount), 0.5) + return time.Duration(int64(stdDev)) +} + +func (h *DurationHistogram) MarshalGeneric() interface{} { + h.mu.Lock() + defer h.mu.Unlock() + + totalCount := int64(0) + totalDur := int64(0) + + v := make([]SerializedDurationBin, len(h.bins)) + for i, b := range h.bins { + v[i].Label = b.Label + v[i].Count = b.Count + v[i].DSum = b.DSum + + totalCount += v[i].Count + totalDur += v[i].DSum + } + if !h.MarshalTotals { + return v + } + + return map[string]any{ + "count": totalCount, + "dsum": totalDur, + "hist": v, + } +} + +func (h *DurationHistogram) MarshalJSON() ([]byte, error) { + return json.Marshal(h.MarshalGeneric()) +} + +func (h *DurationHistogram) MarshalArray() []SerializedDurationBin { + h.mu.Lock() + defer h.mu.Unlock() + + v := make([]SerializedDurationBin, len(h.bins)) + for i, b := range h.bins { + v[i].Label = b.Label + v[i].Count = b.Count + v[i].DSum = b.DSum + } + return v +} + +// String returns a string representation of the histogram. +func (h *DurationHistogram) String() string { + bb, err := json.Marshal(h) + if err != nil { + return jsonutil.MarshallingError("duration_histogram", err) + } + return string(bb) +} diff --git a/util/histogram/histogram_test.go b/util/histogram/histogram_test.go new file mode 100644 index 0000000..ab2216f --- /dev/null +++ b/util/histogram/histogram_test.go @@ -0,0 +1,205 @@ +package histogram + +import ( + "encoding/json" + "math" + "strings" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestDurationHistogramBounds(t *testing.T) { + h := NewDefaultDurationHistogram() + for i, b := range h.bins { + //fmt.Println(b.label) + lims := strings.Split(b.Label, "-") + require.Equal(t, 2, len(lims)) + d0, err := time.ParseDuration(lims[0]) + require.NoError(t, err) + d1, err := time.ParseDuration(lims[1]) + if i < len(h.bins)-1 { + require.NoError(t, err) + } else if err != nil { + d1 = time.Duration(math.MaxInt64) + } + require.True(t, d0 < d1, "For %s: %s >= %s", b.Label, d0.String(), d1.String()) + if i > 0 { + require.Equal(t, h.bins[i-1].Max, d0) + } + if i < len(h.bins)-1 { + require.Equal(t, b.Max, d1) + } + } +} + +func TestDurationHistogram(t *testing.T) { + h := NewDefaultDurationHistogram() + + for i, b := range h.bins { + min := time.Duration(0) + if i > 0 { + min = h.bins[i-1].Max + } + h.Observe(min) + h.Observe(b.Max) + h.Observe(b.Max + 1) + } + for i, b := range h.bins { + require.Equal(t, int64(3), b.Count) + switch i { + case 0: + require.EqualValues(t, b.Max*2, b.DSum) + case len(h.bins) - 1: + require.EqualValues(t, b.Max*2+h.bins[i-1].Max+2, b.DSum) + default: + require.EqualValues(t, b.Max*2+h.bins[i-1].Max+1, b.DSum) + } + } +} + +func TestStandardDeviation(t *testing.T) { + bins := []*DurationBin{ + {Label: "0-10", Max: 10}, + {Label: "10-50", Max: 50}, + {Label: "50-90", Max: 90}, + {Label: "90-100", Max: 100}, + } + h := NewDurationHistogramBins(bins) + + data := []time.Duration{ + 3, 7, 12, 13, 14, 15, 15, 15, 16, 17, 25, 30, 60, 70, 80, 85, 93, + } + + for _, d := range data { + h.Observe(d) + } + + require.Equal(t, time.Duration(29), h.StandardDeviation()) +} + +func TestStandardDeviation2(t *testing.T) { + values := []*SerializedDurationBin{ + {Label: "0-100ms", Count: 11917, DSum: 750548874146}, + {Label: "100ms-200ms", Count: 11174, DSum: 1632240504452}, + {Label: "200ms-300ms", Count: 6794, DSum: 1668223282733}, + {Label: "300ms-400ms", Count: 3911, DSum: 1350261250114}, + {Label: "400ms-500ms", Count: 2449, DSum: 1091202961948}, + {Label: "500ms-750ms", Count: 2644, DSum: 1585670182188}, + {Label: "750ms-1s", Count: 781, DSum: 665865316657}, + {Label: "1s-2s", Count: 328, DSum: 389033374409}, + {Label: "2s-4s", Count: 2, DSum: 4031693641}, + {Label: "4s-", Count: 0, DSum: 0}, + } + h := NewDurationHistogram(SegmentLatencyHistogram) + h.LoadValues(values) + + require.Equal(t, int64(196), h.StandardDeviation().Milliseconds()) +} +func TestQuantileEstimation(t *testing.T) { + bins := []*DurationBin{ + {Label: "0-10", Max: 10}, + {Label: "10-20", Max: 20}, + {Label: "20-30", Max: 30}, + {Label: "30-40", Max: 40}, + {Label: "40-50", Max: 50}, + } + h := NewDurationHistogramBins(bins) + + // Uniformly distributed data from 0 to 50 + data := []time.Duration{ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + } + + for _, d := range data { + h.Observe(d) + } + + // This test is a bit brittle to details, don't worry if it breaks with changes as long as the + // new values are close. + require.Equal(t, int64(50), h.TotalCount()) + require.Equal(t, time.Duration(10), h.Quantile(0.2)) + require.Equal(t, time.Duration(49), h.Quantile(0.99)) + require.Equal(t, time.Duration(25), h.Quantile(0.5)) + require.Equal(t, time.Duration(45), h.Quantile(0.9)) +} + +func TestMarshalUnmarshal(t *testing.T) { + h := NewDurationHistogram(DefaultDurationHistogram) + + data := []time.Duration{ + 10 * time.Millisecond, + 10 * time.Millisecond, + 100 * time.Millisecond, + 100 * time.Millisecond, + 300 * time.Millisecond, + 300 * time.Millisecond, + } + + for _, d := range data { + h.Observe(d) + } + + s, err := h.MarshalJSON() + require.NoError(t, err) + + var vals []*SerializedDurationBin + json.Unmarshal(s, &vals) + h2 := NewDurationHistogram(DefaultDurationHistogram) + err = h2.LoadValues(vals) + require.NoError(t, err) + + s2, err := h2.MarshalJSON() + require.NoError(t, err) + + require.Equal(t, string(s), string(s2)) +} + +func TestHistogramConcurrency(t *testing.T) { + bins := []*DurationBin{ + {Label: "0-10", Max: 10}, + {Label: "10-20", Max: 20}, + {Label: "20-30", Max: 30}, + {Label: "30-40", Max: 40}, + {Label: "40-50", Max: 50}, + } + h := NewDurationHistogramBins(bins) + + // Uniformly distributed data from 0 to 50 + data := []time.Duration{ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + } + + wg := sync.WaitGroup{} + + for _, d := range data { + wg.Add(1) + go func(d time.Duration) { + h.Observe(d) + wg.Done() + }(d) + } + + wg.Wait() + + // This test is a bit brittle to details, don't worry if it breaks with changes as long as the + // new values are close. + + // Summary statistics are a simple way to ensure it looks the way it should + require.Equal(t, int64(50), h.TotalCount()) + require.Equal(t, int64(1275), h.TotalDSum()) + require.Equal(t, time.Duration(10), h.Quantile(0.2)) + require.Equal(t, time.Duration(49), h.Quantile(0.99)) + require.Equal(t, time.Duration(25), h.Quantile(0.5)) + require.Equal(t, time.Duration(45), h.Quantile(0.9)) +} From 6443330dd8d5ba5c25cb711c1e9f48651ffc85c6 Mon Sep 17 00:00:00 2001 From: Nate Armstrong <109548806+elv-nate@users.noreply.github.com> Date: Wed, 27 Mar 2024 09:30:21 -0700 Subject: [PATCH 2/2] Histogram ignores values above upper bound of bin (#40) - Add histogram input validation - Improve histogram documentation and testing - Histogram keeps track of outliers if not tracked by bins naturally - Bins with max 0 represent unbounded bins --- util/histogram/histogram.go | 114 ++++++++++++++++++++--- util/histogram/histogram_test.go | 155 +++++++++++++++++++++++++++++-- 2 files changed, 250 insertions(+), 19 deletions(-) diff --git a/util/histogram/histogram.go b/util/histogram/histogram.go index 839fe7d..1d29e8a 100644 --- a/util/histogram/histogram.go +++ b/util/histogram/histogram.go @@ -19,6 +19,8 @@ const ( SegmentLatencyHistogram ) +const OutlierLabel = "outliers" + func NewDefaultDurationHistogram() *DurationHistogram { return NewDurationHistogram(DefaultDurationHistogram) } @@ -44,7 +46,7 @@ func NewDurationHistogram(t DurationHistogramType) *DurationHistogram { {Label: "5s-10s", Max: time.Second * 10}, {Label: "10s-20s", Max: time.Second * 20}, {Label: "20s-30s", Max: time.Second * 30}, - {Label: "30s-", Max: time.Hour * 10000}, + {Label: "30s-"}, } case SegmentLatencyHistogram: bins = []*DurationBin{ @@ -57,7 +59,7 @@ func NewDurationHistogram(t DurationHistogramType) *DurationHistogram { {Label: "750ms-1s", Max: time.Second}, {Label: "1s-2s", Max: time.Second * 2}, {Label: "2s-4s", Max: time.Second * 4}, - {Label: "4s-", Max: time.Second * 30}, + {Label: "4s-10s", Max: time.Second * 10}, } case DefaultDurationHistogram: fallthrough @@ -82,10 +84,11 @@ func NewDurationHistogram(t DurationHistogramType) *DurationHistogram { {Label: "10m-20m", Max: time.Minute * 20}, {Label: "20m-30m", Max: time.Minute * 30}, {Label: "30m-2h", Max: time.Hour * 2}, - {Label: "2h-", Max: time.Hour * 10000}, + {Label: "2h-"}, } } - return NewDurationHistogramBins(bins) + h, _ := NewDurationHistogramBins(bins) + return h } func (h *DurationHistogram) LoadValues(values []*SerializedDurationBin) error { @@ -111,9 +114,11 @@ func (h *DurationHistogram) LoadValues(values []*SerializedDurationBin) error { type DurationBin struct { Label string - Max time.Duration // immutable upper-bound of bin (lower bound defined by previous bin) - Count int64 // the number of durations falling in this bin - DSum int64 // sum of durations of this bin + // Max is the immutable upper-bound of the bin (lower bound defined by previous bin). Setting + // this to 0 represents that the bin has no upper bound + Max time.Duration + Count int64 // the number of durations falling in this bin + DSum int64 // sum of durations of this bin } type SerializedDurationBin struct { @@ -122,10 +127,48 @@ type SerializedDurationBin struct { DSum int64 `json:"dsum"` } -func NewDurationHistogramBins(bins []*DurationBin) *DurationHistogram { +// NewDurationHistogramBins creates a histogram from custom duration bins. The bins must be empty +// (Count and DSum equal to 0), and provided in strictly increasing order of bin maximums. +// Optionally, the final bin may have a max of 0 to represent an unbounded bin. +// By convention, the provided labels are usually PREV_MAX-CUR_MAX, where each is formatted in a +// concise readable format. +// An 'outliers' bin is automatically added if there is not an unbounded bin at the end of the +// histogram. It is ignored for summary statistic purposes. +func NewDurationHistogramBins(bins []*DurationBin) (*DurationHistogram, error) { + e := errors.Template("NewDurationHistogramBins", errors.K.Invalid) + + if len(bins) == 0 { + return nil, e("reason", "no bins") + } + + for i, b := range bins { + // Unbounded bins can only be the final bin + if b.Max == 0 && i != len(bins)-1 { + return nil, e("reason", "unbounded bin not final bin", "label", b.Label, "index", i) + } + + // The outlier bin can only be the final bin + if b.Label == OutlierLabel && (i != len(bins)-1 || b.Max != 0) { + return nil, e("reason", "outlier bin not correct", "label", b.Label, "index", i) + } + + if b.Max != 0 && i > 0 && b.Max <= bins[i-1].Max { + return nil, e("reason", "bins not strictly increasing", "bin_label", b.Label, + "bin_max", b.Max, "prev_label", bins[i-1].Label, "prev_max", bins[i-1].Max) + } + + if b.Count != 0 || b.DSum != 0 { + return nil, e("reason", "bin for construction not empty", "label", b.Label) + } + } + + if bins[len(bins)-1].Max != 0 { + bins = append(bins, &DurationBin{Label: OutlierLabel, Max: 0}) + } + return &DurationHistogram{ bins: bins, - } + }, nil } // DurationHistogram uses predefined bins. @@ -141,6 +184,9 @@ func (h *DurationHistogram) TotalCount() int64 { tot := int64(0) for _, b := range h.bins { + if b.Label == OutlierLabel { + continue + } tot += b.Count } return tot @@ -152,11 +198,33 @@ func (h *DurationHistogram) TotalDSum() int64 { tot := int64(0) for _, b := range h.bins { + if b.Label == OutlierLabel { + continue + } tot += b.DSum } return tot } +// OutlierProportion returns the proportion of histogram observations that fall outside the given +// bins. This returns 0 if the histogram includes an unbounded bin at the upper end. +func (h *DurationHistogram) OutlierProportion() float64 { + h.mu.Lock() + defer h.mu.Unlock() + + _, totalCount := h.loadCounts() + + if h.bins[len(h.bins)-1].Label != OutlierLabel { + return 0 + } + + outCount := h.bins[len(h.bins)-1].Count + if outCount == 0 && totalCount == 0 { + return 0 + } + return float64(outCount) / (float64(totalCount + outCount)) +} + func (h *DurationHistogram) Clear() { h.mu.Lock() defer h.mu.Unlock() @@ -168,7 +236,7 @@ func (h *DurationHistogram) Clear() { func (h *DurationHistogram) Observe(n time.Duration) { for i := range h.bins { - if n <= h.bins[i].Max || i == len(h.bins)-1 { + if n <= h.bins[i].Max || (i == len(h.bins)-1 && h.bins[i].Max == 0) { h.mu.Lock() defer h.mu.Unlock() @@ -184,6 +252,9 @@ func (h *DurationHistogram) loadCounts() ([]int64, int64) { data := make([]int64, len(h.bins)) tot := int64(0) for i := range h.bins { + if h.bins[i].Label == OutlierLabel { + continue + } data[i] = h.bins[i].Count tot += data[i] } @@ -196,6 +267,9 @@ func (h *DurationHistogram) loadDSums() ([]time.Duration, time.Duration) { data := make([]time.Duration, len(h.bins)) tot := time.Duration(0) for i := range h.bins { + if h.bins[i].Label == OutlierLabel { + continue + } data[i] = time.Duration(h.bins[i].DSum) tot += data[i] } @@ -215,7 +289,8 @@ func (h *DurationHistogram) Average() time.Duration { // Quantile returns an approximation of the value at the qth quantile of the histogram, where q is // in the range [0, 1]. It makes use of the assumption that data within each histogram bin is // uniformly distributed in order to estimate within bins. Depending on the distribution, this -// assumption may be more or less accurate. +// assumption may be more or less accurate. For the topmost bin that is unbounded, it assumes the +// intra-bin distribution is uniform over the range [bin_min, 2 * bin_average]. func (h *DurationHistogram) Quantile(q float64) time.Duration { h.mu.Lock() defer h.mu.Unlock() @@ -228,6 +303,9 @@ func (h *DurationHistogram) Quantile(q float64) time.Duration { count := q * float64(tot) for i := range h.bins { + if h.bins[i].Label == OutlierLabel { + continue + } fData := float64(data[i]) if count <= fData { binProportion := 1 - ((fData - count) / fData) @@ -235,7 +313,13 @@ func (h *DurationHistogram) Quantile(q float64) time.Duration { if i > 0 { binStart = h.bins[i-1].Max } - binSpan := h.bins[i].Max - binStart + var binSpan time.Duration + if h.bins[i].Max != 0 { + binSpan = h.bins[i].Max - binStart + } else { + binAvg := float64(h.bins[i].DSum) / float64(h.bins[i].Count) + binSpan = (time.Duration(binAvg) - binStart) * 2 + } return binStart + time.Duration(int64(binProportion*float64(binSpan))) } count -= fData @@ -272,6 +356,9 @@ func (h *DurationHistogram) StandardDeviation() time.Duration { trueAvg := float64(totDur) / float64(totCount) binAvgs := []float64{} for i := range h.bins { + if h.bins[i].Label == OutlierLabel { + continue + } if counts[i] == 0 { binAvgs = append(binAvgs, 0) continue @@ -282,6 +369,9 @@ func (h *DurationHistogram) StandardDeviation() time.Duration { sumOfSquares := float64(0) for i := range h.bins { + if h.bins[i].Label == OutlierLabel { + continue + } sumOfSquares += float64(counts[i]) * math.Pow(binAvgs[i]-trueAvg, 2) } diff --git a/util/histogram/histogram_test.go b/util/histogram/histogram_test.go index ab2216f..66c2bff 100644 --- a/util/histogram/histogram_test.go +++ b/util/histogram/histogram_test.go @@ -35,6 +35,35 @@ func TestDurationHistogramBounds(t *testing.T) { } } +func TestBinsValidation(t *testing.T) { + emptyBins := []*DurationBin{} + unboundedBeforeEnd := []*DurationBin{ + {Label: "0-10", Max: 10}, + {Label: "10-", Max: 0}, + {Label: "10-20", Max: 20}, + } + duplicateBins := []*DurationBin{ + {Label: "0-10", Max: 10}, + {Label: "10-20", Max: 20}, + {Label: "10-20", Max: 20}, + } + decreasingBins := []*DurationBin{ + {Label: "0-10", Max: 10}, + {Label: "15-20", Max: 20}, + {Label: "10-15", Max: 15}, + } + notEmptyBin := []*DurationBin{ + {Label: "0-10", Max: 10}, + {Label: "10-20", Max: 20, Count: 15}, + } + + testCases := [][]*DurationBin{emptyBins, unboundedBeforeEnd, duplicateBins, decreasingBins, notEmptyBin} + for _, testCase := range testCases { + _, err := NewDurationHistogramBins(testCase) + require.Error(t, err) + } +} + func TestDurationHistogram(t *testing.T) { h := NewDefaultDurationHistogram() @@ -44,22 +73,58 @@ func TestDurationHistogram(t *testing.T) { min = h.bins[i-1].Max } h.Observe(min) - h.Observe(b.Max) - h.Observe(b.Max + 1) + if b.Max != 0 { + h.Observe(b.Max) + h.Observe(b.Max + 1) + } } for i, b := range h.bins { - require.Equal(t, int64(3), b.Count) + if i == len(h.bins)-1 { + require.Equal(t, int64(1), b.Count) + } else { + require.Equal(t, int64(3), b.Count) + } switch i { case 0: require.EqualValues(t, b.Max*2, b.DSum) case len(h.bins) - 1: - require.EqualValues(t, b.Max*2+h.bins[i-1].Max+2, b.DSum) + require.EqualValues(t, b.Max*2+h.bins[i-1].Max+1, b.DSum) default: require.EqualValues(t, b.Max*2+h.bins[i-1].Max+1, b.DSum) } } } +func TestDurationMaxBehavior(t *testing.T) { + bins := []*DurationBin{ + {Label: "0-10", Max: 10}, + {Label: "10-"}, + } + + h, err := NewDurationHistogramBins(bins) + require.NoError(t, err) + + h.Observe(10) + h.Observe(11) + h.Observe(100) + require.Equal(t, int64(2), h.bins[1].Count) + require.Equal(t, int64(111), h.bins[1].DSum) + + bins2 := []*DurationBin{ + {Label: "0-10", Max: 10}, + {Label: "10-20", Max: 20}, + } + + h2, err := NewDurationHistogramBins(bins2) + require.NoError(t, err) + + h2.Observe(10) + h2.Observe(11) + h2.Observe(100) + require.Equal(t, int64(1), h2.bins[1].Count) + require.Equal(t, int64(11), h2.bins[1].DSum) +} + func TestStandardDeviation(t *testing.T) { bins := []*DurationBin{ {Label: "0-10", Max: 10}, @@ -67,7 +132,8 @@ func TestStandardDeviation(t *testing.T) { {Label: "50-90", Max: 90}, {Label: "90-100", Max: 100}, } - h := NewDurationHistogramBins(bins) + h, err := NewDurationHistogramBins(bins) + require.NoError(t, err) data := []time.Duration{ 3, 7, 12, 13, 14, 15, 15, 15, 16, 17, 25, 30, 60, 70, 80, 85, 93, @@ -80,6 +146,56 @@ func TestStandardDeviation(t *testing.T) { require.Equal(t, time.Duration(29), h.StandardDeviation()) } +func TestSummaryStatsIgnoreOutliers(t *testing.T) { + bins := []*DurationBin{ + {Label: "0-10", Max: 10}, + {Label: "10-20", Max: 20}, + {Label: "20-30", Max: 30}, + {Label: "30-40", Max: 40}, + {Label: "40-50", Max: 50}, + } + bins2 := append([]*DurationBin{}, bins...) + + h, err := NewDurationHistogramBins(bins) + require.NoError(t, err) + + hWithOutliers, err := NewDurationHistogramBins(bins2) + require.NoError(t, err) + + // Uniformly distributed data from 0 to 50 + data := []time.Duration{ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + } + + outliers := []time.Duration{ + 55, 80, 100, 200, 500, + } + + for _, d := range data { + h.Observe(d) + hWithOutliers.Observe(d) + } + + for _, d := range outliers { + hWithOutliers.Observe(d) + } + + require.Equal(t, h.TotalCount(), hWithOutliers.TotalCount()) + require.Equal(t, h.TotalDSum(), hWithOutliers.TotalDSum()) + require.Equal(t, h.Quantile(0.1), hWithOutliers.Quantile(0.1)) + require.Equal(t, h.Quantile(0.5), hWithOutliers.Quantile(0.5)) + require.Equal(t, h.Quantile(0.9), hWithOutliers.Quantile(0.9)) + require.Equal(t, h.Quantile(1.0), hWithOutliers.Quantile(1.0)) + require.Equal(t, h.Average(), hWithOutliers.Average()) + require.Equal(t, h.StandardDeviation(), hWithOutliers.StandardDeviation()) + require.InDeltaf(t, float64(0.047), hWithOutliers.OutlierProportion(), float64(0.01), "bad outliers") + require.Equal(t, float64(0), h.OutlierProportion()) +} + func TestStandardDeviation2(t *testing.T) { values := []*SerializedDurationBin{ {Label: "0-100ms", Count: 11917, DSum: 750548874146}, @@ -106,7 +222,8 @@ func TestQuantileEstimation(t *testing.T) { {Label: "30-40", Max: 40}, {Label: "40-50", Max: 50}, } - h := NewDurationHistogramBins(bins) + h, err := NewDurationHistogramBins(bins) + require.NoError(t, err) // Uniformly distributed data from 0 to 50 data := []time.Duration{ @@ -130,6 +247,29 @@ func TestQuantileEstimation(t *testing.T) { require.Equal(t, time.Duration(45), h.Quantile(0.9)) } +func TestUnboundedQuantileEstimation(t *testing.T) { + bins := []*DurationBin{ + {Label: "0-10", Max: 10}, + {Label: "10-"}, + } + + h, err := NewDurationHistogramBins(bins) + require.NoError(t, err) + h.Observe(5) + // The 10- bin has an average of 30 + // Therefore we estimate that the bin is evenly distributed across 10-50 (twice the difference + // between the previous max and the average) + h.Observe(20) + h.Observe(30) + h.Observe(30) + h.Observe(40) + + // The upper 80% of this histogram is estimated as the linear distribution from 10-50 + require.Equal(t, time.Duration(10), h.Quantile(0.20)) + require.Equal(t, time.Duration(30), h.Quantile(0.60)) + require.Equal(t, time.Duration(50), h.Quantile(1.0)) +} + func TestMarshalUnmarshal(t *testing.T) { h := NewDurationHistogram(DefaultDurationHistogram) @@ -169,7 +309,8 @@ func TestHistogramConcurrency(t *testing.T) { {Label: "30-40", Max: 40}, {Label: "40-50", Max: 50}, } - h := NewDurationHistogramBins(bins) + h, err := NewDurationHistogramBins(bins) + require.NoError(t, err) // Uniformly distributed data from 0 to 50 data := []time.Duration{