API usability improvements in sampling package (#31940)

**Description:** Several usability issues were ironed out while working on #31894. This PR is the pkg/sampling changes from that PR. Highlights: - Adds `NeverSampleThreshold` value, which is the exclusive upper-boundary of threshold values. This makes negative sampling decisions easier to manage, as shown in #31894. - Adds `AllProbabilitiesRandomness` value, which is the inclusive upper-boundary of Randomness values. This makes error handling more natural as shown in #31894. All thresholds except `NeverSampleThreshold` will be sampled at `AllProbabilitiesRandomness`. - Adds `UnsignedToThreshold` constructor for explicit threshold construction. This is useful in the case of #31894 because it constructs a 14-bit threshold value. - Adds `UnsignedToRandomness` constructor for explicit randomness construction. This is useful in the case of #31894 because it constructs randomness values from log records w/o use of TraceState. - Removes a parameter from `UpdateTValueWithSampling` to avoid the potential for an inconsistent update through mis-use (as identified in #31894, there is less optimization potential because sampling.threshold modifies thresholds in all modes). - Eliminates the `ErrPrecisionUnderflow` error condition and automatically corrects the problem by extending precision near 0.0 and 1.0 where there are obligatory leading `f` or `0` digits. **Link to tracking Issue:** #31918 **Testing:** New tests added for coverage. **Documentation:** New comments to explain. --------- Co-authored-by: Sean Marciniak <30928402+MovieStoreGuy@users.noreply.github.com>
open-telemetry · Apr 9, 2024 · beef35e · beef35e
1 parent 78d1718
commit beef35e
Show file tree

Hide file tree

Showing 11 changed files with 258 additions and 115 deletions.
diff --git a/.chloggen/sampling-improvements.yaml b/.chloggen/sampling-improvements.yaml
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: enhancement
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: pkg/sampling
+
+# A brief description of the change.  Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Usability improvements in the sampling API.
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [31918]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [api]
diff --git a/pkg/sampling/doc.go b/pkg/sampling/doc.go
@@ -37,53 +37,58 @@
 // conditioned on how much sampling has already taken place, use the
 // following pseudocode.
 //
-//	func Setup() {
-//		// Get a fixed probability value from the configuration, in
-//		// the range (0, 1].
-//		probability := *FLAG_probability
+// func Setup() {
+// 	// Get a fixed probability value from the configuration, in
+// 	// the range (0, 1].
+// 	probability := *FLAG_probability
 //
-//		// Calculate the sampling threshold from probability using 3
-//		// hex digits of precision.
-//		fixedThreshold, err = ProbabilityToThresholdWithPrecision(probability, 3)
-//		if err != nil {
-//			// error case: Probability is not valid.
-//		}
-//	}
+// 	// Calculate the sampling threshold from probability using 3
+// 	// hex digits of precision.
+// 	fixedThreshold, err = ProbabilityToThresholdWithPrecision(probability, 3)
+// 	if err != nil {
+// 		// error case: Probability is not valid.
+// 	}
+// }
 //
-//	func MakeDecision(tracestate string, tid TraceID) bool {
-//		// Parse the incoming tracestate
-//		ts, err := NewW3CTraceState(tracestate)
-//		if err != nil {
-//			// error case: Tracestate is ill-formed.
-//		}
-//		// For an absolute probability sample, we check the incoming
-//		// tracestate to see whether it was already sampled enough.
-//		if len(ts.OTelValue().TValue()) != 0 {
-//			// If the incoming tracestate was already sampled at
-//			// least as much as our threshold implies, then its
-//			// (rejection) threshold is higher.  If so, then no
-//			// further sampling is called for.
-//			if ThresholdGreater(ts.OTelValue().TValueThreshold(), fixedThreshold) {
-//				return true
-//			}
-//		}
-//		var rnd Randomness
-//		// If the R-value is present, use it.  If not, rely on TraceID
-//		// randomness.  Note that OTLP v1.1.0 introduces a new Span flag
-//		// to convey trace randomness correctly, and if the context has
-//		// neither the randomness bit set or the R-value set, we need a
-//		// fallback, which can be to synthesize an R-value or to assume
-//		// the TraceID has sufficient randomness.  This detail is left
-//		// out of scope.
-//		if rval, hasRval := ts.OTelValue().RValueRandomness(); hasRv {
-//			rnd = rval
-//		} else {
-//			rnd = TraceIDToRandomness(tid)
-//		}
-//
-//		return fixedThreshold.ShouldSample(rnd)
-//	}
+// func MakeDecision(tracestate string, tid TraceID) bool {
+// 	// Parse the incoming tracestate
+// 	ts, err := NewW3CTraceState(tracestate)
+// 	if err != nil {
+// 		// error case: Tracestate is ill-formed.
+// 	}
+// 	// For an absolute probability sample, we check the incoming
+// 	// tracestate to see whether it was already sampled enough.
+// 	if threshold, hasThreshold := ts.OTelValue().TValueThreshold(); hasThreshold {
+// 		// If the incoming tracestate was already sampled at
+// 		// least as much as our threshold implies, then its
+// 		// (rejection) threshold is higher.  If so, then no
+// 		// further sampling is called for.
+// 		if ThresholdGreater(threshold, fixedThreshold) {
+// 			// Do not update.
+// 			return true
+// 		}
+//		// The error below is ignored because we've tested
+//              // the equivalent condition above.  This lowers the sampling
+//              // probability expressed in the tracestate T-value.
+// 		_ = ts.OTelValue().UpdateThresholdWithSampling(fixedThreshold)
+// 	}
+// 	var rnd Randomness
+// 	// If the R-value is present, use it.  If not, rely on TraceID
+// 	// randomness.  Note that OTLP v1.1.0 introduces a new Span flag
+// 	// to convey trace randomness correctly, and if the context has
+// 	// neither the randomness bit set or the R-value set, we need a
+// 	// fallback, which can be to synthesize an R-value or to assume
+// 	// the TraceID has sufficient randomness.  This detail is left
+// 	// out of scope.
+// 	if rv, hasRand := ts.OTelValue().RValueRandomness(); hasRand {
+// 		rnd = rv
+// 	} else {
+// 		rnd = TraceIDToRandomness(tid)
+// 	}
+// 	return fixedThreshold.ShouldSample(rnd)
+// }
 //
 // [W3C tracecontext specification]: https://www.w3.org/TR/trace-context/#tracestate-header
 // [Tracestate handling]: https://opentelemetry.io/docs/specs/otel/trace/tracestate-handling/
+
 package sampling // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/sampling"
diff --git a/pkg/sampling/oteltracestate.go b/pkg/sampling/oteltracestate.go
@@ -97,16 +97,17 @@ func NewOpenTelemetryTraceState(input string) (OpenTelemetryTraceState, error) {
 			if otts.rnd, err = RValueToRandomness(value); err == nil {
 				otts.rvalue = value
 			} else {
-				// The zero-value for randomness implies always-sample;
-				// the threshold test is R < T, but T is not meaningful
-				// at zero, and this value implies zero adjusted count.
+				// RValueRandomness() will return false, the error
+				// accumulates and is returned below.
 				otts.rvalue = ""
 				otts.rnd = Randomness{}
 			}
 		case tValueFieldName:
 			if otts.threshold, err = TValueToThreshold(value); err == nil {
 				otts.tvalue = value
 			} else {
+				// TValueThreshold() will return false, the error
+				// accumulates and is returned below.
 				otts.tvalue = ""
 				otts.threshold = AlwaysSampleThreshold
 			}
@@ -148,29 +149,47 @@ func (otts *OpenTelemetryTraceState) TValueThreshold() (Threshold, bool) {
 }
 
 // UpdateTValueWithSampling modifies the TValue of this object, which
-// changes its adjusted count.  If the change of TValue leads to
-// inconsistency (i.e., raising sampling probability), an error is
-// returned.
-func (otts *OpenTelemetryTraceState) UpdateTValueWithSampling(sampledThreshold Threshold, encodedTValue string) error {
+// changes its adjusted count.  It is not logical to modify a sampling
+// probability in the direction of larger probability.  This prevents
+// accidental loss of adjusted count.
+//
+// If the change of TValue leads to inconsistency, an error is returned.
+func (otts *OpenTelemetryTraceState) UpdateTValueWithSampling(sampledThreshold Threshold) error {
+	// Note: there was once a code path here that optimized for
+	// cases where a static threshold is used, in which case the
+	// call to TValue() causes an unnecessary allocation per data
+	// item (w/ a constant result).  We have eliminated that
+	// parameter, due to the significant potential for mis-use.
+	// Therefore, this method always recomputes TValue() of the
+	// sampledThreshold (on success).  A future method such as
+	// UpdateTValueWithSamplingFixedTValue() could extend this
+	// API to address this allocation, although it is probably
+	// not significant.
 	if len(otts.TValue()) != 0 && ThresholdGreater(otts.threshold, sampledThreshold) {
 		return ErrInconsistentSampling
 	}
+	// Note NeverSampleThreshold is the (exclusive) upper boundary
+	// of valid thresholds, so the test above permits never-
+	// sampled updates, in which case the TValue() here is empty.
 	otts.threshold = sampledThreshold
-	otts.tvalue = encodedTValue
+	otts.tvalue = sampledThreshold.TValue()
 	return nil
 }
 
-// AdjustedCount returns the adjusted count implied by this TValue.
-// This term is defined here:
-// https://opentelemetry.io/docs/specs/otel/trace/tracestate-probability-sampling/
+// AdjustedCount returns the adjusted count for this item.  If the
+// TValue string is empty, this returns 0, otherwise returns
+// Threshold.AdjustedCount().
 func (otts *OpenTelemetryTraceState) AdjustedCount() float64 {
-	if len(otts.TValue()) == 0 {
+	if len(otts.tvalue) == 0 {
+		// Note: this case covers the zero state, where
+		// len(tvalue) == 0 and threshold == AlwaysSampleThreshold.
+		// We return 0 to indicate that no information is available.
 		return 0
 	}
-	return 1.0 / otts.threshold.Probability()
+	return otts.threshold.AdjustedCount()
 }
 
-// ClearTValue is used to unset TValue, in cases where it is
+// ClearTValue is used to unset TValue, for use in cases where it is
 // inconsistent on arrival.
 func (otts *OpenTelemetryTraceState) ClearTValue() {
 	otts.tvalue = ""

diff --git a/pkg/sampling/oteltracestate_test.go b/pkg/sampling/oteltracestate_test.go
@@ -105,7 +105,7 @@ func TestOpenTelemetryTraceStateTValueUpdate(t *testing.T) {
 	require.NotEqual(t, "", otts.RValue())
 
 	th, _ := TValueToThreshold("3")
-	require.NoError(t, otts.UpdateTValueWithSampling(th, "3"))
+	require.NoError(t, otts.UpdateTValueWithSampling(th))
 
 	require.Equal(t, "3", otts.TValue())
 	tv, hasTv := otts.TValueThreshold()
@@ -126,7 +126,7 @@ func TestOpenTelemetryTraceStateRTUpdate(t *testing.T) {
 	require.True(t, otts.HasAnyValue())
 
 	th, _ := TValueToThreshold("3")
-	require.NoError(t, otts.UpdateTValueWithSampling(th, "3"))
+	require.NoError(t, otts.UpdateTValueWithSampling(th))
 	otts.SetRValue(must(RValueToRandomness("00000000000003")))
 
 	const updated = "rv:00000000000003;th:3;a:b"
@@ -329,7 +329,7 @@ func TestUpdateTValueWithSampling(t *testing.T) {
 			newTh, err := ProbabilityToThreshold(test.prob)
 			require.NoError(t, err)
 
-			upErr := otts.UpdateTValueWithSampling(newTh, newTh.TValue())
+			upErr := otts.UpdateTValueWithSampling(newTh)
 
 			require.Equal(t, test.updateErr, upErr)
 

diff --git a/pkg/sampling/probability.go b/pkg/sampling/probability.go
@@ -11,12 +11,9 @@ import (
 // ErrProbabilityRange is returned when a value should be in the range [1/MaxAdjustedCount, 1].
 var ErrProbabilityRange = errors.New("sampling probability out of the range [1/MaxAdjustedCount, 1]")
 
-// ErrPrecisionUnderflow is returned when a precision is too great for the range.
-var ErrPrecisionUnderflow = errors.New("sampling precision is too great for the range")
-
 // MinSamplingProbability is the smallest representable probability
 // and is the inverse of MaxAdjustedCount.
-const MinSamplingProbability = 1.0 / MaxAdjustedCount
+const MinSamplingProbability = 1.0 / float64(MaxAdjustedCount)
 
 // probabilityInRange tests MinSamplingProb <= prob <= 1.
 func probabilityInRange(prob float64) bool {
@@ -26,67 +23,60 @@ func probabilityInRange(prob float64) bool {
 // ProbabilityToThreshold converts a probability to a Threshold.  It
 // returns an error when the probability is out-of-range.
 func ProbabilityToThreshold(prob float64) (Threshold, error) {
-	// Probability cases
-	if !probabilityInRange(prob) {
-		return AlwaysSampleThreshold, ErrProbabilityRange
-	}
-
-	scaled := uint64(math.Round(prob * MaxAdjustedCount))
-
-	return Threshold{
-		unsigned: MaxAdjustedCount - scaled,
-	}, nil
+	return ProbabilityToThresholdWithPrecision(prob, NumHexDigits)
 }
 
 // ProbabilityToThresholdWithPrecision is like ProbabilityToThreshold
-// with support for reduced precision.  The `prec` argument determines
+// with support for reduced precision.  The `precision` argument determines
 // how many significant hex digits will be used to encode the exact
 // probability.
-func ProbabilityToThresholdWithPrecision(prob float64, prec uint8) (Threshold, error) {
+func ProbabilityToThresholdWithPrecision(fraction float64, precision int) (Threshold, error) {
 	// Assume full precision at 0.
-	if prec == 0 {
-		return ProbabilityToThreshold(prob)
+	if precision == 0 {
+		precision = NumHexDigits
 	}
-	if !probabilityInRange(prob) {
+	if !probabilityInRange(fraction) {
 		return AlwaysSampleThreshold, ErrProbabilityRange
 	}
-	// Special case for prob == 1.  The logic for revising precision
-	// that follows requires 0 < 1 - prob < 1.
-	if prob == 1 {
+	// Special case for prob == 1.
+	if fraction == 1 {
 		return AlwaysSampleThreshold, nil
 	}
 
-	// Adjust precision considering the significance of leading
-	// zeros.  If we can multiply the rejection probability by 16
-	// and still be less than 1, then there is a leading zero of
-	// obligatory precision.
-	for reject := 1 - prob; reject*16 < 1; {
-		reject *= 16
-		prec++
-	}
-
-	// Check if leading zeros plus precision is above the maximum.
-	// This is called underflow because the requested precision
-	// leads to complete no significant figures.
-	if prec > NumHexDigits {
-		return AlwaysSampleThreshold, ErrPrecisionUnderflow
+	// Calculate the amount of precision needed to encode the
+	// threshold with reasonable precision.  Here, we count the
+	// number of leading `0` or `f` characters and automatically
+	// add precision to preserve relative error near the extremes.
+	//
+	// Frexp() normalizes both the fraction and one-minus the
+	// fraction, because more digits of precision are needed if
+	// either value is near zero.  Frexp returns an exponent <= 0.
+	//
+	// If `exp <= -4`, there will be a leading hex `0` or `f`.
+	// For every multiple of -4, another leading `0` or `f`
+	// appears, so this raises precision accordingly.
+	_, expF := math.Frexp(fraction)
+	_, expR := math.Frexp(1 - fraction)
+	precision = min(NumHexDigits, max(precision+expF/-hexBits, precision+expR/-hexBits))
+
+	// Compute the threshold
+	scaled := uint64(math.Round(fraction * float64(MaxAdjustedCount)))
+	threshold := MaxAdjustedCount - scaled
+
+	// Round to the specified precision, if less than the maximum.
+	if shift := hexBits * (NumHexDigits - precision); shift != 0 {
+		half := uint64(1) << (shift - 1)
+		threshold += half
+		threshold >>= shift
+		threshold <<= shift
 	}
 
-	scaled := uint64(math.Round(prob * MaxAdjustedCount))
-	rscaled := MaxAdjustedCount - scaled
-	shift := 4 * (14 - prec)
-	half := uint64(1) << (shift - 1)
-
-	rscaled += half
-	rscaled >>= shift
-	rscaled <<= shift
-
 	return Threshold{
-		unsigned: rscaled,
+		unsigned: threshold,
 	}, nil
 }
 
 // Probability is the sampling ratio in the range [MinSamplingProb, 1].
 func (t Threshold) Probability() float64 {
-	return float64(MaxAdjustedCount-t.unsigned) / MaxAdjustedCount
+	return float64(MaxAdjustedCount-t.unsigned) / float64(MaxAdjustedCount)
 }
diff --git a/pkg/sampling/probability_test.go b/pkg/sampling/probability_test.go
@@ -171,7 +171,10 @@ func ExampleProbabilityToThreshold_verysmall() {
 		0x8p-56,                // Skip 8 out of 2**56
 		0x10p-56,               // Skip 0x10 out of 2**56
 	} {
-		tval, _ := ProbabilityToThreshold(prob)
+		// Note that precision is automatically raised for
+		// such small probabilities, because leading 'f' and
+		// '0' digits are discounted.
+		tval, _ := ProbabilityToThresholdWithPrecision(prob, 3)
 		fmt.Println(tval.TValue())
 	}
 
@@ -279,7 +282,7 @@ func TestProbabilityToThresholdWithPrecision(t *testing.T) {
 					for len(strip) > 0 && strip[0] == '0' {
 						strip = strip[1:]
 					}
-					rth, err := ProbabilityToThresholdWithPrecision(test.prob, uint8(len(strip)))
+					rth, err := ProbabilityToThresholdWithPrecision(test.prob, len(strip))
 					require.NoError(t, err)
 					require.Equal(t, round, rth.TValue())
 				})