-
Notifications
You must be signed in to change notification settings - Fork 3.4k
/
metrics.go
332 lines (310 loc) · 14 KB
/
metrics.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
package ingester
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/grafana/loki/v3/pkg/analytics"
"github.com/grafana/loki/v3/pkg/util/constants"
"github.com/grafana/loki/v3/pkg/validation"
)
// ingesterMetrics bundles every metric the ingester records: WAL
// checkpointing and replay, chunk lifecycle and flushing, recovery
// progress, and a set of analytics (usage-reporting) statistics.
type ingesterMetrics struct {
	// Checkpoint lifecycle: deletion/creation attempts and failures,
	// time taken to write a checkpoint, and bytes logged while doing so.
	checkpointDeleteFail       prometheus.Counter
	checkpointDeleteTotal      prometheus.Counter
	checkpointCreationFail     prometheus.Counter
	checkpointCreationTotal    prometheus.Counter
	checkpointDuration         prometheus.Summary
	checkpointLoggedBytesTotal prometheus.Counter

	// WAL write path and replay state.
	walDiskFullFailures     prometheus.Counter
	walReplayActive         prometheus.Gauge
	walReplayDuration       prometheus.Gauge
	walReplaySamplesDropped *prometheus.CounterVec
	walReplayBytesDropped   *prometheus.CounterVec
	walCorruptionsTotal     *prometheus.CounterVec
	walLoggedBytesTotal     prometheus.Counter
	walRecordsLogged        prometheus.Counter

	// WAL recovery progress and resource usage.
	recoveredStreamsTotal prometheus.Counter
	recoveredChunksTotal  prometheus.Counter
	recoveredEntriesTotal prometheus.Counter
	duplicateEntriesTotal prometheus.Counter
	recoveredBytesTotal   prometheus.Counter
	recoveryBytesInUse    prometheus.Gauge
	recoveryIsFlushing    prometheus.Gauge

	limiterEnabled                    prometheus.Gauge
	autoForgetUnhealthyIngestersTotal prometheus.Counter

	// In-memory chunk lifecycle: size/utilization distributions,
	// per-tenant stored totals, and flush/encode outcomes.
	chunkUtilization       prometheus.Histogram
	memoryChunks           prometheus.Gauge
	chunkEntries           prometheus.Histogram
	chunkSize              prometheus.Histogram
	chunkCompressionRatio  prometheus.Histogram
	chunksPerTenant        *prometheus.CounterVec
	chunkSizePerTenant     *prometheus.CounterVec
	chunkAge               prometheus.Histogram
	chunkEncodeTime        prometheus.Histogram
	chunksFlushFailures    prometheus.Counter
	chunksFlushedPerReason *prometheus.CounterVec
	chunkLifespan          prometheus.Histogram
	chunksEncoded          *prometheus.CounterVec
	chunkDecodeFailures    *prometheus.CounterVec

	// Usage-reporting (analytics) statistics about flushed chunks.
	flushedChunksStats            *analytics.Counter
	flushedChunksBytesStats       *analytics.Statistics
	flushedChunksLinesStats       *analytics.Statistics
	flushedChunksAgeStats         *analytics.Statistics
	flushedChunksLifespanStats    *analytics.Statistics
	flushedChunksUtilizationStats *analytics.Statistics

	chunksCreatedTotal prometheus.Counter
	samplesPerChunk    prometheus.Histogram
	blocksPerChunk     prometheus.Histogram
	chunkCreatedStats  *analytics.Counter

	// Shutdown marker for ingester scale down
	shutdownMarker prometheus.Gauge

	flushQueueLength       prometheus.Gauge
	duplicateLogBytesTotal *prometheus.CounterVec
	streamsOwnershipCheck  prometheus.Histogram
}
// setRecoveryBytesInUse records the number of bytes in use by WAL
// recovery, clamping negative inputs to zero so the gauge never
// reports a value below 0.
// TODO(owen-d): we can gain some efficiency by having the flusher never update this after recovery ends.
func (m *ingesterMetrics) setRecoveryBytesInUse(v int64) {
	bytesInUse := v
	if bytesInUse < 0 {
		// Bound the reported value to >= 0.
		bytesInUse = 0
	}
	m.recoveryBytesInUse.Set(float64(bytesInUse))
}
const (
	// WAL file kinds; presumably used as the "type" label value on
	// walCorruptionsTotal — usage sites are outside this view, confirm.
	walTypeCheckpoint = "checkpoint"
	walTypeSegment    = "segment"

	// Reason label value for entries discarded as duplicates.
	duplicateReason = "duplicate"
)
// newIngesterMetrics registers all ingester metrics with r and returns the
// populated ingesterMetrics struct. metricsNamespace is used only for the
// namespace-configurable metrics (flushQueueLength, duplicateLogBytesTotal);
// the remainder use either a hardcoded "loki_" prefix or constants.Loki —
// NOTE(review): the mixed naming schemes look intentional (metric-name
// stability for existing dashboards), confirm before unifying.
func newIngesterMetrics(r prometheus.Registerer, metricsNamespace string) *ingesterMetrics {
	return &ingesterMetrics{
		// --- WAL write path and replay ---
		walDiskFullFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_disk_full_failures_total",
			Help: "Total number of wal write failures due to full disk.",
		}),
		walReplayActive: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_ingester_wal_replay_active",
			Help: "Whether the WAL is replaying",
		}),
		walReplayDuration: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_ingester_wal_replay_duration_seconds",
			Help: "Time taken to replay the checkpoint and the WAL.",
		}),
		// Dropped-during-replay counters are partitioned by validation reason.
		walReplaySamplesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Name: "loki_ingester_wal_discarded_samples_total",
			Help: "WAL segment entries discarded during replay",
		}, []string{validation.ReasonLabel}),
		walReplayBytesDropped: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Name: "loki_ingester_wal_discarded_bytes_total",
			Help: "WAL segment bytes discarded during replay",
		}, []string{validation.ReasonLabel}),
		walCorruptionsTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Name: "loki_ingester_wal_corruptions_total",
			Help: "Total number of WAL corruptions encountered.",
		}, []string{"type"}),
		// --- Checkpoint lifecycle ---
		checkpointDeleteFail: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_checkpoint_deletions_failed_total",
			Help: "Total number of checkpoint deletions that failed.",
		}),
		checkpointDeleteTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_checkpoint_deletions_total",
			Help: "Total number of checkpoint deletions attempted.",
		}),
		checkpointCreationFail: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_checkpoint_creations_failed_total",
			Help: "Total number of checkpoint creations that failed.",
		}),
		checkpointCreationTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_checkpoint_creations_total",
			Help: "Total number of checkpoint creations attempted.",
		}),
		checkpointDuration: promauto.With(r).NewSummary(prometheus.SummaryOpts{
			Name:       "loki_ingester_checkpoint_duration_seconds",
			Help:       "Time taken to create a checkpoint.",
			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
		}),
		walRecordsLogged: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_records_logged_total",
			Help: "Total number of WAL records logged.",
		}),
		checkpointLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_checkpoint_logged_bytes_total",
			Help: "Total number of bytes written to disk for checkpointing.",
		}),
		walLoggedBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_logged_bytes_total",
			Help: "Total number of bytes written to disk for WAL records.",
		}),
		// --- WAL recovery progress ---
		recoveredStreamsTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_recovered_streams_total",
			Help: "Total number of streams recovered from the WAL.",
		}),
		recoveredChunksTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_recovered_chunks_total",
			Help: "Total number of chunks recovered from the WAL checkpoints.",
		}),
		recoveredEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_recovered_entries_total",
			Help: "Total number of entries recovered from the WAL.",
		}),
		duplicateEntriesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_duplicate_entries_total",
			Help: "Entries discarded during WAL replay due to existing in checkpoints.",
		}),
		recoveredBytesTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_wal_recovered_bytes_total",
			Help: "Total number of bytes recovered from the WAL.",
		}),
		recoveryBytesInUse: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_ingester_wal_bytes_in_use",
			Help: "Total number of bytes in use by the WAL recovery process.",
		}),
		recoveryIsFlushing: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_ingester_wal_replay_flushing",
			Help: "Whether the wal replay is in a flushing phase due to backpressure",
		}),
		limiterEnabled: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Name: "loki_ingester_limiter_enabled",
			Help: "Whether the ingester's limiter is enabled",
		}),
		autoForgetUnhealthyIngestersTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Name: "loki_ingester_autoforget_unhealthy_ingesters_total",
			Help: "Total number of ingesters automatically forgotten",
		}),
		// --- Chunk lifecycle (Namespace: constants.Loki from here on) ---
		chunkUtilization: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_utilization",
			Help:      "Distribution of stored chunk utilization (when stored).",
			Buckets:   prometheus.LinearBuckets(0, 0.2, 6),
		}),
		memoryChunks: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Namespace: constants.Loki,
			Name:      "ingester_memory_chunks",
			Help:      "The total number of chunks in memory.",
		}),
		chunkEntries: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_entries",
			Help:      "Distribution of stored lines per chunk (when stored).",
			Buckets:   prometheus.ExponentialBuckets(200, 2, 9), // biggest bucket is 200*2^(9-1) = 51200
		}),
		chunkSize: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_size_bytes",
			Help:      "Distribution of stored chunk sizes (when stored).",
			Buckets:   prometheus.ExponentialBuckets(20000, 2, 10), // biggest bucket is 20000*2^(10-1) = 10,240,000 (~10.2MB)
		}),
		chunkCompressionRatio: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_compression_ratio",
			Help:      "Compression ratio of chunks (when stored).",
			Buckets:   prometheus.LinearBuckets(.75, 2, 10),
		}),
		chunksPerTenant: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunks_stored_total",
			Help:      "Total stored chunks per tenant.",
		}, []string{"tenant"}),
		chunkSizePerTenant: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_stored_bytes_total",
			Help:      "Total bytes stored in chunks per tenant.",
		}, []string{"tenant"}),
		chunkAge: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_age_seconds",
			Help:      "Distribution of chunk ages (when stored).",
			// with default settings chunks should flush between 5 min and 12 hours
			// so buckets at 1min, 5min, 10min, 30min, 1hr, 2hr, 4hr, 10hr, 12hr, 16hr
			Buckets: []float64{60, 300, 600, 1800, 3600, 7200, 14400, 36000, 43200, 57600},
		}),
		chunkEncodeTime: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_encode_time_seconds",
			Help:      "Distribution of chunk encode times.",
			// 10ms to 10s.
			Buckets: prometheus.ExponentialBuckets(0.01, 4, 6),
		}),
		chunksFlushFailures: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunks_flush_failures_total",
			Help:      "Total number of flush failures.",
		}),
		chunksFlushedPerReason: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunks_flushed_total",
			Help:      "Total flushed chunks per reason.",
		}, []string{"reason"}),
		chunkLifespan: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_bounds_hours",
			Help:      "Distribution of chunk end-start durations.",
			// 1h -> 8hr
			Buckets: prometheus.LinearBuckets(1, 1, 8),
		}),
		chunksEncoded: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunks_encoded_total",
			Help:      "The total number of chunks encoded in the ingester.",
		}, []string{"user"}),
		chunkDecodeFailures: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunk_decode_failures_total",
			Help:      "The number of freshly encoded chunks that failed to decode.",
		}, []string{"user"}),
		// --- Usage-reporting (analytics) statistics; not Prometheus metrics ---
		flushedChunksStats:      analytics.NewCounter("ingester_flushed_chunks"),
		flushedChunksBytesStats: analytics.NewStatistics("ingester_flushed_chunks_bytes"),
		flushedChunksLinesStats: analytics.NewStatistics("ingester_flushed_chunks_lines"),
		flushedChunksAgeStats: analytics.NewStatistics(
			"ingester_flushed_chunks_age_seconds",
		),
		flushedChunksLifespanStats: analytics.NewStatistics(
			"ingester_flushed_chunks_lifespan_seconds",
		),
		flushedChunksUtilizationStats: analytics.NewStatistics(
			"ingester_flushed_chunks_utilization",
		),
		chunksCreatedTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
			Namespace: constants.Loki,
			Name:      "ingester_chunks_created_total",
			Help:      "The total number of chunks created in the ingester.",
		}),
		samplesPerChunk: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Subsystem: "ingester",
			Name:      "samples_per_chunk",
			Help:      "The number of samples in a chunk.",
			Buckets:   prometheus.LinearBuckets(4096, 2048, 6),
		}),
		blocksPerChunk: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Subsystem: "ingester",
			Name:      "blocks_per_chunk",
			Help:      "The number of blocks in a chunk.",
			Buckets:   prometheus.ExponentialBuckets(5, 2, 6),
		}),
		chunkCreatedStats: analytics.NewCounter("ingester_chunk_created"),
		shutdownMarker: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Namespace: constants.Loki,
			Subsystem: "ingester",
			Name:      "shutdown_marker",
			Help:      "1 if prepare shutdown has been called, 0 otherwise",
		}),
		// flushQueueLength and duplicateLogBytesTotal are the only metrics
		// here using the caller-provided, configurable metricsNamespace.
		flushQueueLength: promauto.With(r).NewGauge(prometheus.GaugeOpts{
			Namespace: metricsNamespace,
			Subsystem: "ingester",
			Name:      "flush_queue_length",
			Help:      "The total number of series pending in the flush queue.",
		}),
		streamsOwnershipCheck: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Name:      "ingester_streams_ownership_check_duration_ms",
			Help:      "Distribution of streams ownership check durations in milliseconds.",
			// 100ms to 5s.
			Buckets: []float64{100, 250, 350, 500, 750, 1000, 1500, 2000, 5000},
		}),
		duplicateLogBytesTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: "ingester",
			Name:      "duplicate_log_bytes_total",
			Help:      "The total number of bytes that were discarded for duplicate log lines.",
		}, []string{"tenant"}),
	}
}
}