From 9d20b8c00be19d8176ae83930bc081cb33ea78eb Mon Sep 17 00:00:00 2001
From: Nate Armstrong <109548806+elv-nate@users.noreply.github.com>
Date: Tue, 19 Mar 2024 11:59:52 -0700
Subject: [PATCH 1/2] Move duration histogram from content-fabric (#39)

* Add histogram

* Make bins public

* Make marshal totals public

* Add ability to clear histogram

* Marshal nicer

* Fix bins with 0 count

* Add average function

* Update comment grammar

Co-authored-by: elv-gilles <40674218+elv-gilles@users.noreply.github.com>

* Convert histogram to use mutex, add concurrency test

---------

Co-authored-by: elv-gilles <40674218+elv-gilles@users.noreply.github.com>
---
 util/histogram/histogram.go      | 343 +++++++++++++++++++++++++++++++
 util/histogram/histogram_test.go | 205 ++++++++++++++++++
 2 files changed, 548 insertions(+)
 create mode 100644 util/histogram/histogram.go
 create mode 100644 util/histogram/histogram_test.go

diff --git a/util/histogram/histogram.go b/util/histogram/histogram.go
new file mode 100644
index 0000000..839fe7d
--- /dev/null
+++ b/util/histogram/histogram.go
@@ -0,0 +1,343 @@
+package histogram
+
+import (
+	"encoding/json"
+	"math"
+	"sync"
+	"time"
+
+	"github.com/eluv-io/common-go/util/jsonutil"
+	"github.com/eluv-io/errors-go"
+)
+
+// ----- durationHistogram -----
+type DurationHistogramType uint8
+
+const (
+	DefaultDurationHistogram = iota
+	ConnectionResponseDurationHistogram
+	SegmentLatencyHistogram
+)
+
+func NewDefaultDurationHistogram() *DurationHistogram {
+	return NewDurationHistogram(DefaultDurationHistogram)
+}
+
+// NewDurationHistogram creates a new duration histogram with predefined
+// labeled duration bins.
+// note: label are provided since computing them produces string with useless
+// suffixes like: 1m => 1m0s
+func NewDurationHistogram(t DurationHistogramType) *DurationHistogram {
+	var bins []*DurationBin
+	switch t {
+	case ConnectionResponseDurationHistogram:
+		bins = []*DurationBin{
+			{Label: "0-10ms", Max: time.Millisecond * 10},
+			{Label: "10ms-20ms", Max: time.Millisecond * 20},
+			{Label: "20ms-50ms", Max: time.Millisecond * 50},
+			{Label: "50ms-100ms", Max: time.Millisecond * 100},
+			{Label: "100ms-200ms", Max: time.Millisecond * 200},
+			{Label: "200ms-500ms", Max: time.Millisecond * 500},
+			{Label: "500ms-1s", Max: time.Second},
+			{Label: "1s-2s", Max: time.Second * 2},
+			{Label: "2s-5s", Max: time.Second * 5},
+			{Label: "5s-10s", Max: time.Second * 10},
+			{Label: "10s-20s", Max: time.Second * 20},
+			{Label: "20s-30s", Max: time.Second * 30},
+			{Label: "30s-", Max: time.Hour * 10000},
+		}
+	case SegmentLatencyHistogram:
+		bins = []*DurationBin{
+			{Label: "0-100ms", Max: time.Millisecond * 100},
+			{Label: "100ms-200ms", Max: time.Millisecond * 200},
+			{Label: "200ms-300ms", Max: time.Millisecond * 300},
+			{Label: "300ms-400ms", Max: time.Millisecond * 400},
+			{Label: "400ms-500ms", Max: time.Millisecond * 500},
+			{Label: "500ms-750ms", Max: time.Millisecond * 750},
+			{Label: "750ms-1s", Max: time.Second},
+			{Label: "1s-2s", Max: time.Second * 2},
+			{Label: "2s-4s", Max: time.Second * 4},
+			{Label: "4s-", Max: time.Second * 30},
+		}
+	case DefaultDurationHistogram:
+		fallthrough
+	default:
+		bins = []*DurationBin{
+			{Label: "0-10ms", Max: time.Millisecond * 10},
+			{Label: "10ms-20ms", Max: time.Millisecond * 20},
+			{Label: "20ms-50ms", Max: time.Millisecond * 50},
+			{Label: "50ms-100ms", Max: time.Millisecond * 100},
+			{Label: "100ms-200ms", Max: time.Millisecond * 200},
+			{Label: "200ms-500ms", Max: time.Millisecond * 500},
+			{Label: "500ms-1s", Max: time.Second},
+			{Label: "1s-2s", Max: time.Second * 2},
+			{Label: "2s-5s", Max: time.Second * 5},
+			{Label: "5s-10s", Max: time.Second * 10},
+			{Label: "10s-20s", Max: time.Second * 20},
+			{Label: "20s-30s", Max: time.Second * 30},
+			{Label: "30s-1m", Max: time.Minute},
+			{Label: "1m-2m", Max: time.Minute * 2},
+			{Label: "2m-5m", Max: time.Minute * 5},
+			{Label: "5m-10m", Max: time.Minute * 10},
+			{Label: "10m-20m", Max: time.Minute * 20},
+			{Label: "20m-30m", Max: time.Minute * 30},
+			{Label: "30m-2h", Max: time.Hour * 2},
+			{Label: "2h-", Max: time.Hour * 10000},
+		}
+	}
+	return NewDurationHistogramBins(bins)
+}
+
+func (h *DurationHistogram) LoadValues(values []*SerializedDurationBin) error {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	for _, v := range values {
+		seen := false
+		for _, b := range h.bins {
+			if v.Label == b.Label {
+				b.Count = v.Count
+				b.DSum = v.DSum
+				seen = true
+				break
+			}
+		}
+		if !seen {
+			return errors.E("DurationHistogramFromValues", "reason", "mismatched histogram types", "label", v.Label)
+		}
+	}
+	return nil
+}
+
+type DurationBin struct {
+	Label string
+	Max   time.Duration // immutable upper-bound of bin (lower bound defined by previous bin)
+	Count int64         // the number of durations falling in this bin
+	DSum  int64         // sum of durations of this bin
+}
+
+type SerializedDurationBin struct {
+	Label string `json:"label"`
+	Count int64  `json:"count"`
+	DSum  int64  `json:"dsum"`
+}
+
+func NewDurationHistogramBins(bins []*DurationBin) *DurationHistogram {
+	return &DurationHistogram{
+		bins: bins,
+	}
+}
+
+// DurationHistogram uses predefined bins.
+type DurationHistogram struct {
+	bins          []*DurationBin
+	mu            sync.Mutex
+	MarshalTotals bool
+}
+
+func (h *DurationHistogram) TotalCount() int64 {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	tot := int64(0)
+	for _, b := range h.bins {
+		tot += b.Count
+	}
+	return tot
+}
+
+func (h *DurationHistogram) TotalDSum() int64 {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	tot := int64(0)
+	for _, b := range h.bins {
+		tot += b.DSum
+	}
+	return tot
+}
+
+func (h *DurationHistogram) Clear() {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	for _, b := range h.bins {
+		b.Count = 0
+		b.DSum = 0
+	}
+}
+
+func (h *DurationHistogram) Observe(n time.Duration) {
+	for i := range h.bins {
+		if n <= h.bins[i].Max || i == len(h.bins)-1 {
+			h.mu.Lock()
+			defer h.mu.Unlock()
+
+			h.bins[i].Count += 1
+			h.bins[i].DSum += int64(n)
+			return
+		}
+	}
+}
+
+// loadCounts preloads counts for consistent use within a calculation, along with the total
+func (h *DurationHistogram) loadCounts() ([]int64, int64) {
+	data := make([]int64, len(h.bins))
+	tot := int64(0)
+	for i := range h.bins {
+		data[i] = h.bins[i].Count
+		tot += data[i]
+	}
+
+	return data, tot
+}
+
+// loadDSums preloads durations for consistent use within a calculation, along with the total
+func (h *DurationHistogram) loadDSums() ([]time.Duration, time.Duration) {
+	data := make([]time.Duration, len(h.bins))
+	tot := time.Duration(0)
+	for i := range h.bins {
+		data[i] = time.Duration(h.bins[i].DSum)
+		tot += data[i]
+	}
+
+	return data, tot
+}
+
+func (h *DurationHistogram) Average() time.Duration {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	_, totCount := h.loadCounts()
+	_, totDur := h.loadDSums()
+	trueAvg := float64(totDur) / float64(totCount)
+	return time.Duration(trueAvg)
+}
+
+// Quantile returns an approximation of the value at the qth quantile of the histogram, where q is
+// in the range [0, 1]. It makes use of the assumption that data within each histogram bin is
+// uniformly distributed in order to estimate within bins. Depending on the distribution, this
+// assumption may be more or less accurate.
+func (h *DurationHistogram) Quantile(q float64) time.Duration {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	if q < 0 || q > 1 {
+		return -1
+	}
+	// Pre-load data to ensure consistency within function
+	data, tot := h.loadCounts()
+
+	count := q * float64(tot)
+	for i := range h.bins {
+		fData := float64(data[i])
+		if count <= fData {
+			binProportion := 1 - ((fData - count) / fData)
+			binStart := time.Duration(0)
+			if i > 0 {
+				binStart = h.bins[i-1].Max
+			}
+			binSpan := h.bins[i].Max - binStart
+			return binStart + time.Duration(int64(binProportion*float64(binSpan)))
+		}
+		count -= fData
+	}
+	return -1
+}
+
+// StandardDeviation estimates the standard deviation using the average of each histogram box. It
+// will consistently slightly underestimate the standard deviation as a consequence, because
+// variation within each box is not captured in the standard deviation.
+//
+// A worked example is below to provide intuition:
+//
+// Our histogram bins are `[[0, 10], [10, 50], [50, 90], [90, 100]]`.  Our data is `[3, 7, 12, 13,
+// 14, 15, 15, 15, 16, 17, 25, 30, 60, 70, 80, 85, 93]`.
+//
+// The true standard deviation of that data is 30.6. The data is concentrated on a peak at 15, and
+// has another more spread out area around 80. Calculating the standard deviation with just the bin
+// endpoints or midpoints would be quite inaccurate, because the bins are not close to uniformly
+// filled with values.
+//
+// The bin averages are `[5, 17, 74, 93]`, with counts `[2, 10, 4, 1]`. We thus use the formula for
+// standard deviation, assuming that 2 points are at 5, 10 points are at 17, 4 points are at 74, and
+// 1 point at 93. However, because we have the true average of all points, we can use that as well.
+//
+// We then calculate an estimated standard deviation of 29, which is very close to the actual
+// standard deviation of 30.6.
+func (h *DurationHistogram) StandardDeviation() time.Duration {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	counts, totCount := h.loadCounts()
+	durs, totDur := h.loadDSums()
+	trueAvg := float64(totDur) / float64(totCount)
+	binAvgs := []float64{}
+	for i := range h.bins {
+		if counts[i] == 0 {
+			binAvgs = append(binAvgs, 0)
+			continue
+		}
+		binAvg := float64(durs[i]) / float64(counts[i])
+		binAvgs = append(binAvgs, binAvg)
+	}
+
+	sumOfSquares := float64(0)
+	for i := range h.bins {
+		sumOfSquares += float64(counts[i]) * math.Pow(binAvgs[i]-trueAvg, 2)
+	}
+
+	stdDev := math.Pow(sumOfSquares/float64(totCount), 0.5)
+	return time.Duration(int64(stdDev))
+}
+
+func (h *DurationHistogram) MarshalGeneric() interface{} {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	totalCount := int64(0)
+	totalDur := int64(0)
+
+	v := make([]SerializedDurationBin, len(h.bins))
+	for i, b := range h.bins {
+		v[i].Label = b.Label
+		v[i].Count = b.Count
+		v[i].DSum = b.DSum
+
+		totalCount += v[i].Count
+		totalDur += v[i].DSum
+	}
+	if !h.MarshalTotals {
+		return v
+	}
+
+	return map[string]any{
+		"count": totalCount,
+		"dsum":  totalDur,
+		"hist":  v,
+	}
+}
+
+func (h *DurationHistogram) MarshalJSON() ([]byte, error) {
+	return json.Marshal(h.MarshalGeneric())
+}
+
+func (h *DurationHistogram) MarshalArray() []SerializedDurationBin {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	v := make([]SerializedDurationBin, len(h.bins))
+	for i, b := range h.bins {
+		v[i].Label = b.Label
+		v[i].Count = b.Count
+		v[i].DSum = b.DSum
+	}
+	return v
+}
+
+// String returns a string representation of the histogram.
+func (h *DurationHistogram) String() string {
+	bb, err := json.Marshal(h)
+	if err != nil {
+		return jsonutil.MarshallingError("duration_histogram", err)
+	}
+	return string(bb)
+}
diff --git a/util/histogram/histogram_test.go b/util/histogram/histogram_test.go
new file mode 100644
index 0000000..ab2216f
--- /dev/null
+++ b/util/histogram/histogram_test.go
@@ -0,0 +1,205 @@
+package histogram
+
+import (
+	"encoding/json"
+	"math"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestDurationHistogramBounds(t *testing.T) {
+	h := NewDefaultDurationHistogram()
+	for i, b := range h.bins {
+		//fmt.Println(b.label)
+		lims := strings.Split(b.Label, "-")
+		require.Equal(t, 2, len(lims))
+		d0, err := time.ParseDuration(lims[0])
+		require.NoError(t, err)
+		d1, err := time.ParseDuration(lims[1])
+		if i < len(h.bins)-1 {
+			require.NoError(t, err)
+		} else if err != nil {
+			d1 = time.Duration(math.MaxInt64)
+		}
+		require.True(t, d0 < d1, "For %s: %s >= %s", b.Label, d0.String(), d1.String())
+		if i > 0 {
+			require.Equal(t, h.bins[i-1].Max, d0)
+		}
+		if i < len(h.bins)-1 {
+			require.Equal(t, b.Max, d1)
+		}
+	}
+}
+
+func TestDurationHistogram(t *testing.T) {
+	h := NewDefaultDurationHistogram()
+
+	for i, b := range h.bins {
+		min := time.Duration(0)
+		if i > 0 {
+			min = h.bins[i-1].Max
+		}
+		h.Observe(min)
+		h.Observe(b.Max)
+		h.Observe(b.Max + 1)
+	}
+	for i, b := range h.bins {
+		require.Equal(t, int64(3), b.Count)
+		switch i {
+		case 0:
+			require.EqualValues(t, b.Max*2, b.DSum)
+		case len(h.bins) - 1:
+			require.EqualValues(t, b.Max*2+h.bins[i-1].Max+2, b.DSum)
+		default:
+			require.EqualValues(t, b.Max*2+h.bins[i-1].Max+1, b.DSum)
+		}
+	}
+}
+
+func TestStandardDeviation(t *testing.T) {
+	bins := []*DurationBin{
+		{Label: "0-10", Max: 10},
+		{Label: "10-50", Max: 50},
+		{Label: "50-90", Max: 90},
+		{Label: "90-100", Max: 100},
+	}
+	h := NewDurationHistogramBins(bins)
+
+	data := []time.Duration{
+		3, 7, 12, 13, 14, 15, 15, 15, 16, 17, 25, 30, 60, 70, 80, 85, 93,
+	}
+
+	for _, d := range data {
+		h.Observe(d)
+	}
+
+	require.Equal(t, time.Duration(29), h.StandardDeviation())
+}
+
+func TestStandardDeviation2(t *testing.T) {
+	values := []*SerializedDurationBin{
+		{Label: "0-100ms", Count: 11917, DSum: 750548874146},
+		{Label: "100ms-200ms", Count: 11174, DSum: 1632240504452},
+		{Label: "200ms-300ms", Count: 6794, DSum: 1668223282733},
+		{Label: "300ms-400ms", Count: 3911, DSum: 1350261250114},
+		{Label: "400ms-500ms", Count: 2449, DSum: 1091202961948},
+		{Label: "500ms-750ms", Count: 2644, DSum: 1585670182188},
+		{Label: "750ms-1s", Count: 781, DSum: 665865316657},
+		{Label: "1s-2s", Count: 328, DSum: 389033374409},
+		{Label: "2s-4s", Count: 2, DSum: 4031693641},
+		{Label: "4s-", Count: 0, DSum: 0},
+	}
+	h := NewDurationHistogram(SegmentLatencyHistogram)
+	h.LoadValues(values)
+
+	require.Equal(t, int64(196), h.StandardDeviation().Milliseconds())
+}
+func TestQuantileEstimation(t *testing.T) {
+	bins := []*DurationBin{
+		{Label: "0-10", Max: 10},
+		{Label: "10-20", Max: 20},
+		{Label: "20-30", Max: 30},
+		{Label: "30-40", Max: 40},
+		{Label: "40-50", Max: 50},
+	}
+	h := NewDurationHistogramBins(bins)
+
+	// Uniformly distributed data from 0 to 50
+	data := []time.Duration{
+		1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+		11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+		21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+		31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+		41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+	}
+
+	for _, d := range data {
+		h.Observe(d)
+	}
+
+	// This test is a bit brittle to details, don't worry if it breaks with changes as long as the
+	// new values are close.
+	require.Equal(t, int64(50), h.TotalCount())
+	require.Equal(t, time.Duration(10), h.Quantile(0.2))
+	require.Equal(t, time.Duration(49), h.Quantile(0.99))
+	require.Equal(t, time.Duration(25), h.Quantile(0.5))
+	require.Equal(t, time.Duration(45), h.Quantile(0.9))
+}
+
+func TestMarshalUnmarshal(t *testing.T) {
+	h := NewDurationHistogram(DefaultDurationHistogram)
+
+	data := []time.Duration{
+		10 * time.Millisecond,
+		10 * time.Millisecond,
+		100 * time.Millisecond,
+		100 * time.Millisecond,
+		300 * time.Millisecond,
+		300 * time.Millisecond,
+	}
+
+	for _, d := range data {
+		h.Observe(d)
+	}
+
+	s, err := h.MarshalJSON()
+	require.NoError(t, err)
+
+	var vals []*SerializedDurationBin
+	json.Unmarshal(s, &vals)
+	h2 := NewDurationHistogram(DefaultDurationHistogram)
+	err = h2.LoadValues(vals)
+	require.NoError(t, err)
+
+	s2, err := h2.MarshalJSON()
+	require.NoError(t, err)
+
+	require.Equal(t, string(s), string(s2))
+}
+
+func TestHistogramConcurrency(t *testing.T) {
+	bins := []*DurationBin{
+		{Label: "0-10", Max: 10},
+		{Label: "10-20", Max: 20},
+		{Label: "20-30", Max: 30},
+		{Label: "30-40", Max: 40},
+		{Label: "40-50", Max: 50},
+	}
+	h := NewDurationHistogramBins(bins)
+
+	// Uniformly distributed data from 0 to 50
+	data := []time.Duration{
+		1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+		11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+		21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+		31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+		41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+	}
+
+	wg := sync.WaitGroup{}
+
+	for _, d := range data {
+		wg.Add(1)
+		go func(d time.Duration) {
+			h.Observe(d)
+			wg.Done()
+		}(d)
+	}
+
+	wg.Wait()
+
+	// This test is a bit brittle to details, don't worry if it breaks with changes as long as the
+	// new values are close.
+
+	// Summary statistics are a simple way to ensure it looks the way it should
+	require.Equal(t, int64(50), h.TotalCount())
+	require.Equal(t, int64(1275), h.TotalDSum())
+	require.Equal(t, time.Duration(10), h.Quantile(0.2))
+	require.Equal(t, time.Duration(49), h.Quantile(0.99))
+	require.Equal(t, time.Duration(25), h.Quantile(0.5))
+	require.Equal(t, time.Duration(45), h.Quantile(0.9))
+}

From 6443330dd8d5ba5c25cb711c1e9f48651ffc85c6 Mon Sep 17 00:00:00 2001
From: Nate Armstrong <109548806+elv-nate@users.noreply.github.com>
Date: Wed, 27 Mar 2024 09:30:21 -0700
Subject: [PATCH 2/2] Histogram ignores values above upper bound of bin (#40)

- Add histogram input validation
- Improve histogram documentation and testing
- Histogram keeps track of outliers if not tracked by bins naturally
- Bins with max 0 represent unbounded bins
---
 util/histogram/histogram.go      | 114 ++++++++++++++++++++---
 util/histogram/histogram_test.go | 155 +++++++++++++++++++++++++++++--
 2 files changed, 250 insertions(+), 19 deletions(-)

diff --git a/util/histogram/histogram.go b/util/histogram/histogram.go
index 839fe7d..1d29e8a 100644
--- a/util/histogram/histogram.go
+++ b/util/histogram/histogram.go
@@ -19,6 +19,8 @@ const (
 	SegmentLatencyHistogram
 )
 
+const OutlierLabel = "outliers"
+
 func NewDefaultDurationHistogram() *DurationHistogram {
 	return NewDurationHistogram(DefaultDurationHistogram)
 }
@@ -44,7 +46,7 @@ func NewDurationHistogram(t DurationHistogramType) *DurationHistogram {
 			{Label: "5s-10s", Max: time.Second * 10},
 			{Label: "10s-20s", Max: time.Second * 20},
 			{Label: "20s-30s", Max: time.Second * 30},
-			{Label: "30s-", Max: time.Hour * 10000},
+			{Label: "30s-"},
 		}
 	case SegmentLatencyHistogram:
 		bins = []*DurationBin{
@@ -57,7 +59,7 @@ func NewDurationHistogram(t DurationHistogramType) *DurationHistogram {
 			{Label: "750ms-1s", Max: time.Second},
 			{Label: "1s-2s", Max: time.Second * 2},
 			{Label: "2s-4s", Max: time.Second * 4},
-			{Label: "4s-", Max: time.Second * 30},
+			{Label: "4s-10s", Max: time.Second * 10},
 		}
 	case DefaultDurationHistogram:
 		fallthrough
@@ -82,10 +84,11 @@ func NewDurationHistogram(t DurationHistogramType) *DurationHistogram {
 			{Label: "10m-20m", Max: time.Minute * 20},
 			{Label: "20m-30m", Max: time.Minute * 30},
 			{Label: "30m-2h", Max: time.Hour * 2},
-			{Label: "2h-", Max: time.Hour * 10000},
+			{Label: "2h-"},
 		}
 	}
-	return NewDurationHistogramBins(bins)
+	h, _ := NewDurationHistogramBins(bins)
+	return h
 }
 
 func (h *DurationHistogram) LoadValues(values []*SerializedDurationBin) error {
@@ -111,9 +114,11 @@ func (h *DurationHistogram) LoadValues(values []*SerializedDurationBin) error {
 
 type DurationBin struct {
 	Label string
-	Max   time.Duration // immutable upper-bound of bin (lower bound defined by previous bin)
-	Count int64         // the number of durations falling in this bin
-	DSum  int64         // sum of durations of this bin
+	// Max is the immutable upper-bound of the bin (lower bound defined by previous bin). Setting
+	// this to 0 represents that the bin has no upper bound
+	Max   time.Duration
+	Count int64 // the number of durations falling in this bin
+	DSum  int64 // sum of durations of this bin
 }
 
 type SerializedDurationBin struct {
@@ -122,10 +127,48 @@ type SerializedDurationBin struct {
 	DSum  int64  `json:"dsum"`
 }
 
-func NewDurationHistogramBins(bins []*DurationBin) *DurationHistogram {
+// NewDurationHistogramBins creates a histogram from custom duration bins. The bins must be empty
+// (Count and DSum equal to 0), and provided in strictly increasing order of bin maximums.
+// Optionally, the final bin may have a max of 0 to represent an unbounded bin.
+// By convention, the provided labels are usually PREV_MAX-CUR_MAX, where each is formatted in a
+// concise readable format.
+// An 'outliers' bin is automatically added if there is not an unbounded bin at the end of the
+// histogram. It is ignored for summary statistic purposes.
+func NewDurationHistogramBins(bins []*DurationBin) (*DurationHistogram, error) {
+	e := errors.Template("NewDurationHistogramBins", errors.K.Invalid)
+
+	if len(bins) == 0 {
+		return nil, e("reason", "no bins")
+	}
+
+	for i, b := range bins {
+		// Unbounded bins can only be the final bin
+		if b.Max == 0 && i != len(bins)-1 {
+			return nil, e("reason", "unbounded bin not final bin", "label", b.Label, "index", i)
+		}
+
+		// The outlier bin can only be the final bin
+		if b.Label == OutlierLabel && (i != len(bins)-1 || b.Max != 0) {
+			return nil, e("reason", "outlier bin not correct", "label", b.Label, "index", i)
+		}
+
+		if b.Max != 0 && i > 0 && b.Max <= bins[i-1].Max {
+			return nil, e("reason", "bins not strictly increasing", "bin_label", b.Label,
+				"bin_max", b.Max, "prev_label", bins[i-1].Label, "prev_max", bins[i-1].Max)
+		}
+
+		if b.Count != 0 || b.DSum != 0 {
+			return nil, e("reason", "bin for construction not empty", "label", b.Label)
+		}
+	}
+
+	if bins[len(bins)-1].Max != 0 {
+		bins = append(bins, &DurationBin{Label: OutlierLabel, Max: 0})
+	}
+
 	return &DurationHistogram{
 		bins: bins,
-	}
+	}, nil
 }
 
 // DurationHistogram uses predefined bins.
@@ -141,6 +184,9 @@ func (h *DurationHistogram) TotalCount() int64 {
 
 	tot := int64(0)
 	for _, b := range h.bins {
+		if b.Label == OutlierLabel {
+			continue
+		}
 		tot += b.Count
 	}
 	return tot
@@ -152,11 +198,33 @@ func (h *DurationHistogram) TotalDSum() int64 {
 
 	tot := int64(0)
 	for _, b := range h.bins {
+		if b.Label == OutlierLabel {
+			continue
+		}
 		tot += b.DSum
 	}
 	return tot
 }
 
+// OutlierProportion returns the proportion of histogram observations that fall outside the given
+// bins. This returns 0 if the histogram includes an unbounded bin at the upper end.
+func (h *DurationHistogram) OutlierProportion() float64 {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	_, totalCount := h.loadCounts()
+
+	if h.bins[len(h.bins)-1].Label != OutlierLabel {
+		return 0
+	}
+
+	outCount := h.bins[len(h.bins)-1].Count
+	if outCount == 0 && totalCount == 0 {
+		return 0
+	}
+	return float64(outCount) / (float64(totalCount + outCount))
+}
+
 func (h *DurationHistogram) Clear() {
 	h.mu.Lock()
 	defer h.mu.Unlock()
@@ -168,7 +236,7 @@ func (h *DurationHistogram) Clear() {
 
 func (h *DurationHistogram) Observe(n time.Duration) {
 	for i := range h.bins {
-		if n <= h.bins[i].Max || i == len(h.bins)-1 {
+		if n <= h.bins[i].Max || (i == len(h.bins)-1 && h.bins[i].Max == 0) {
 			h.mu.Lock()
 			defer h.mu.Unlock()
 
@@ -184,6 +252,9 @@ func (h *DurationHistogram) loadCounts() ([]int64, int64) {
 	data := make([]int64, len(h.bins))
 	tot := int64(0)
 	for i := range h.bins {
+		if h.bins[i].Label == OutlierLabel {
+			continue
+		}
 		data[i] = h.bins[i].Count
 		tot += data[i]
 	}
@@ -196,6 +267,9 @@ func (h *DurationHistogram) loadDSums() ([]time.Duration, time.Duration) {
 	data := make([]time.Duration, len(h.bins))
 	tot := time.Duration(0)
 	for i := range h.bins {
+		if h.bins[i].Label == OutlierLabel {
+			continue
+		}
 		data[i] = time.Duration(h.bins[i].DSum)
 		tot += data[i]
 	}
@@ -215,7 +289,8 @@ func (h *DurationHistogram) Average() time.Duration {
 // Quantile returns an approximation of the value at the qth quantile of the histogram, where q is
 // in the range [0, 1]. It makes use of the assumption that data within each histogram bin is
 // uniformly distributed in order to estimate within bins. Depending on the distribution, this
-// assumption may be more or less accurate.
+// assumption may be more or less accurate. For the topmost bin that is unbounded, it assumes the
+// intra-bin distribution is uniform over the range [bin_min, 2 * bin_average].
 func (h *DurationHistogram) Quantile(q float64) time.Duration {
 	h.mu.Lock()
 	defer h.mu.Unlock()
@@ -228,6 +303,9 @@ func (h *DurationHistogram) Quantile(q float64) time.Duration {
 
 	count := q * float64(tot)
 	for i := range h.bins {
+		if h.bins[i].Label == OutlierLabel {
+			continue
+		}
 		fData := float64(data[i])
 		if count <= fData {
 			binProportion := 1 - ((fData - count) / fData)
@@ -235,7 +313,13 @@ func (h *DurationHistogram) Quantile(q float64) time.Duration {
 			if i > 0 {
 				binStart = h.bins[i-1].Max
 			}
-			binSpan := h.bins[i].Max - binStart
+			var binSpan time.Duration
+			if h.bins[i].Max != 0 {
+				binSpan = h.bins[i].Max - binStart
+			} else {
+				binAvg := float64(h.bins[i].DSum) / float64(h.bins[i].Count)
+				binSpan = (time.Duration(binAvg) - binStart) * 2
+			}
 			return binStart + time.Duration(int64(binProportion*float64(binSpan)))
 		}
 		count -= fData
@@ -272,6 +356,9 @@ func (h *DurationHistogram) StandardDeviation() time.Duration {
 	trueAvg := float64(totDur) / float64(totCount)
 	binAvgs := []float64{}
 	for i := range h.bins {
+		if h.bins[i].Label == OutlierLabel {
+			continue
+		}
 		if counts[i] == 0 {
 			binAvgs = append(binAvgs, 0)
 			continue
@@ -282,6 +369,9 @@ func (h *DurationHistogram) StandardDeviation() time.Duration {
 
 	sumOfSquares := float64(0)
 	for i := range h.bins {
+		if h.bins[i].Label == OutlierLabel {
+			continue
+		}
 		sumOfSquares += float64(counts[i]) * math.Pow(binAvgs[i]-trueAvg, 2)
 	}
 
diff --git a/util/histogram/histogram_test.go b/util/histogram/histogram_test.go
index ab2216f..66c2bff 100644
--- a/util/histogram/histogram_test.go
+++ b/util/histogram/histogram_test.go
@@ -35,6 +35,35 @@ func TestDurationHistogramBounds(t *testing.T) {
 	}
 }
 
+func TestBinsValidation(t *testing.T) {
+	emptyBins := []*DurationBin{}
+	unboundedBeforeEnd := []*DurationBin{
+		{Label: "0-10", Max: 10},
+		{Label: "10-", Max: 0},
+		{Label: "10-20", Max: 20},
+	}
+	duplicateBins := []*DurationBin{
+		{Label: "0-10", Max: 10},
+		{Label: "10-20", Max: 20},
+		{Label: "10-20", Max: 20},
+	}
+	decreasingBins := []*DurationBin{
+		{Label: "0-10", Max: 10},
+		{Label: "15-20", Max: 20},
+		{Label: "10-15", Max: 15},
+	}
+	notEmptyBin := []*DurationBin{
+		{Label: "0-10", Max: 10},
+		{Label: "10-20", Max: 20, Count: 15},
+	}
+
+	testCases := [][]*DurationBin{emptyBins, unboundedBeforeEnd, duplicateBins, decreasingBins, notEmptyBin}
+	for _, testCase := range testCases {
+		_, err := NewDurationHistogramBins(testCase)
+		require.Error(t, err)
+	}
+}
+
 func TestDurationHistogram(t *testing.T) {
 	h := NewDefaultDurationHistogram()
 
@@ -44,22 +73,58 @@ func TestDurationHistogram(t *testing.T) {
 			min = h.bins[i-1].Max
 		}
 		h.Observe(min)
-		h.Observe(b.Max)
-		h.Observe(b.Max + 1)
+		if b.Max != 0 {
+			h.Observe(b.Max)
+			h.Observe(b.Max + 1)
+		}
 	}
 	for i, b := range h.bins {
-		require.Equal(t, int64(3), b.Count)
+		if i == len(h.bins)-1 {
+			require.Equal(t, int64(1), b.Count)
+		} else {
+			require.Equal(t, int64(3), b.Count)
+		}
 		switch i {
 		case 0:
 			require.EqualValues(t, b.Max*2, b.DSum)
 		case len(h.bins) - 1:
-			require.EqualValues(t, b.Max*2+h.bins[i-1].Max+2, b.DSum)
+			require.EqualValues(t, b.Max*2+h.bins[i-1].Max+1, b.DSum)
 		default:
 			require.EqualValues(t, b.Max*2+h.bins[i-1].Max+1, b.DSum)
 		}
 	}
 }
 
+func TestDurationMaxBehavior(t *testing.T) {
+	bins := []*DurationBin{
+		{Label: "0-10", Max: 10},
+		{Label: "10-"},
+	}
+
+	h, err := NewDurationHistogramBins(bins)
+	require.NoError(t, err)
+
+	h.Observe(10)
+	h.Observe(11)
+	h.Observe(100)
+	require.Equal(t, int64(2), h.bins[1].Count)
+	require.Equal(t, int64(111), h.bins[1].DSum)
+
+	bins2 := []*DurationBin{
+		{Label: "0-10", Max: 10},
+		{Label: "10-20", Max: 20},
+	}
+
+	h2, err := NewDurationHistogramBins(bins2)
+	require.NoError(t, err)
+
+	h2.Observe(10)
+	h2.Observe(11)
+	h2.Observe(100)
+	require.Equal(t, int64(1), h2.bins[1].Count)
+	require.Equal(t, int64(11), h2.bins[1].DSum)
+}
+
 func TestStandardDeviation(t *testing.T) {
 	bins := []*DurationBin{
 		{Label: "0-10", Max: 10},
@@ -67,7 +132,8 @@ func TestStandardDeviation(t *testing.T) {
 		{Label: "50-90", Max: 90},
 		{Label: "90-100", Max: 100},
 	}
-	h := NewDurationHistogramBins(bins)
+	h, err := NewDurationHistogramBins(bins)
+	require.NoError(t, err)
 
 	data := []time.Duration{
 		3, 7, 12, 13, 14, 15, 15, 15, 16, 17, 25, 30, 60, 70, 80, 85, 93,
@@ -80,6 +146,56 @@ func TestStandardDeviation(t *testing.T) {
 	require.Equal(t, time.Duration(29), h.StandardDeviation())
 }
 
+func TestSummaryStatsIgnoreOutliers(t *testing.T) {
+	bins := []*DurationBin{
+		{Label: "0-10", Max: 10},
+		{Label: "10-20", Max: 20},
+		{Label: "20-30", Max: 30},
+		{Label: "30-40", Max: 40},
+		{Label: "40-50", Max: 50},
+	}
+	bins2 := append([]*DurationBin{}, bins...)
+
+	h, err := NewDurationHistogramBins(bins)
+	require.NoError(t, err)
+
+	hWithOutliers, err := NewDurationHistogramBins(bins2)
+	require.NoError(t, err)
+
+	// Uniformly distributed data from 0 to 50
+	data := []time.Duration{
+		1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+		11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+		21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+		31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+		41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+	}
+
+	outliers := []time.Duration{
+		55, 80, 100, 200, 500,
+	}
+
+	for _, d := range data {
+		h.Observe(d)
+		hWithOutliers.Observe(d)
+	}
+
+	for _, d := range outliers {
+		hWithOutliers.Observe(d)
+	}
+
+	require.Equal(t, h.TotalCount(), hWithOutliers.TotalCount())
+	require.Equal(t, h.TotalDSum(), hWithOutliers.TotalDSum())
+	require.Equal(t, h.Quantile(0.1), hWithOutliers.Quantile(0.1))
+	require.Equal(t, h.Quantile(0.5), hWithOutliers.Quantile(0.5))
+	require.Equal(t, h.Quantile(0.9), hWithOutliers.Quantile(0.9))
+	require.Equal(t, h.Quantile(1.0), hWithOutliers.Quantile(1.0))
+	require.Equal(t, h.Average(), hWithOutliers.Average())
+	require.Equal(t, h.StandardDeviation(), hWithOutliers.StandardDeviation())
+	require.InDeltaf(t, float64(0.047), hWithOutliers.OutlierProportion(), float64(0.01), "bad outliers")
+	require.Equal(t, float64(0), h.OutlierProportion())
+}
+
 func TestStandardDeviation2(t *testing.T) {
 	values := []*SerializedDurationBin{
 		{Label: "0-100ms", Count: 11917, DSum: 750548874146},
@@ -106,7 +222,8 @@ func TestQuantileEstimation(t *testing.T) {
 		{Label: "30-40", Max: 40},
 		{Label: "40-50", Max: 50},
 	}
-	h := NewDurationHistogramBins(bins)
+	h, err := NewDurationHistogramBins(bins)
+	require.NoError(t, err)
 
 	// Uniformly distributed data from 0 to 50
 	data := []time.Duration{
@@ -130,6 +247,29 @@ func TestQuantileEstimation(t *testing.T) {
 	require.Equal(t, time.Duration(45), h.Quantile(0.9))
 }
 
+func TestUnboundedQuantileEstimation(t *testing.T) {
+	bins := []*DurationBin{
+		{Label: "0-10", Max: 10},
+		{Label: "10-"},
+	}
+
+	h, err := NewDurationHistogramBins(bins)
+	require.NoError(t, err)
+	h.Observe(5)
+	// The 10- bin has an average of 30
+	// Therefore we estimate that the bin is evenly distributed across 10-50 (twice the difference
+	// between the previous max and the average)
+	h.Observe(20)
+	h.Observe(30)
+	h.Observe(30)
+	h.Observe(40)
+
+	// The upper 80% of this histogram is estimated as the linear distribution from 10-50
+	require.Equal(t, time.Duration(10), h.Quantile(0.20))
+	require.Equal(t, time.Duration(30), h.Quantile(0.60))
+	require.Equal(t, time.Duration(50), h.Quantile(1.0))
+}
+
 func TestMarshalUnmarshal(t *testing.T) {
 	h := NewDurationHistogram(DefaultDurationHistogram)
 
@@ -169,7 +309,8 @@ func TestHistogramConcurrency(t *testing.T) {
 		{Label: "30-40", Max: 40},
 		{Label: "40-50", Max: 50},
 	}
-	h := NewDurationHistogramBins(bins)
+	h, err := NewDurationHistogramBins(bins)
+	require.NoError(t, err)
 
 	// Uniformly distributed data from 0 to 50
 	data := []time.Duration{