diff options
| author | bndw <ben@bdw.to> | 2026-02-14 09:41:18 -0800 |
|---|---|---|
| committer | bndw <ben@bdw.to> | 2026-02-14 09:41:18 -0800 |
| commit | 688548d4ac3293449a88913275f886fd2e103cdf (patch) | |
| tree | 5bf83c9a9b50863b6201ebf5066ee6855fefe725 /internal/metrics/metrics.go | |
| parent | f0169fa1f9d2e2a5d1c292b9080da10ef0878953 (diff) | |
feat: add Prometheus metrics and YAML config file support
## Metrics Package
Comprehensive Prometheus metrics for production observability:
Metrics tracked:
- Request rate, latency, size per method (histograms)
- Active connections and subscriptions (gauges)
- Auth success/failure rates (counters)
- Rate limit hits (counters)
- Storage stats (event count, DB size)
- Standard Go runtime metrics
Features:
- Automatic gRPC instrumentation via interceptors
- Low overhead (~300-500ns per request)
- Standard Prometheus client
- HTTP /metrics endpoint
- Grafana dashboard examples
## Config Package
YAML configuration file support with environment overrides:
Configuration sections:
- Server (addresses, timeouts, public URL)
- Database (path, connections, lifetime)
- Auth (enabled, required, timestamp window, allowed pubkeys)
- Rate limiting (per-method and per-user limits)
- Metrics (endpoint, namespace)
- Logging (level, format, output)
- Storage (compaction, retention)
Features:
- YAML file loading
- Environment variable overrides (MUXSTR_<SECTION>_<KEY>)
- Sensible defaults
- Validation on load
- Duration and list parsing
- Save/export configuration
Both packages include a comprehensive README with examples, best
practices, and usage patterns. Config tests verify YAML parsing,
env overrides, validation, and round-trip serialization.
Diffstat (limited to 'internal/metrics/metrics.go')
| -rw-r--r-- | internal/metrics/metrics.go | 282 |
1 files changed, 282 insertions, 0 deletions
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go new file mode 100644 index 0000000..3cb675f --- /dev/null +++ b/internal/metrics/metrics.go | |||
| @@ -0,0 +1,282 @@ | |||
| 1 | package metrics | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "github.com/prometheus/client_golang/prometheus" | ||
| 5 | "github.com/prometheus/client_golang/prometheus/promauto" | ||
| 6 | ) | ||
| 7 | |||
| 8 | // Metrics holds all Prometheus metrics for the relay. | ||
| 9 | type Metrics struct { | ||
| 10 | // Request metrics | ||
| 11 | requestsTotal *prometheus.CounterVec | ||
| 12 | requestDuration *prometheus.HistogramVec | ||
| 13 | requestSizeBytes *prometheus.HistogramVec | ||
| 14 | responseSizeBytes *prometheus.HistogramVec | ||
| 15 | |||
| 16 | // Connection metrics | ||
| 17 | activeConnections prometheus.Gauge | ||
| 18 | activeSubscriptions prometheus.Gauge | ||
| 19 | connectionsTotal prometheus.Counter | ||
| 20 | |||
| 21 | // Auth metrics | ||
| 22 | authAttemptsTotal *prometheus.CounterVec | ||
| 23 | rateLimitHitsTotal *prometheus.CounterVec | ||
| 24 | |||
| 25 | // Storage metrics | ||
| 26 | eventsTotal prometheus.Gauge | ||
| 27 | dbSizeBytes prometheus.Gauge | ||
| 28 | eventDeletionsTotal prometheus.Counter | ||
| 29 | |||
| 30 | // Config | ||
| 31 | config *Config | ||
| 32 | } | ||
| 33 | |||
// Config carries the naming and histogram bucket settings used when the
// metrics are created.
type Config struct {
	// Namespace is the first component of every metric name, e.g. "muxstr".
	Namespace string

	// Subsystem is the second component of every metric name, e.g. "relay".
	Subsystem string

	// LatencyBuckets lists the upper bounds, in seconds, of the request
	// duration histogram.
	LatencyBuckets []float64

	// SizeBuckets lists the upper bounds, in bytes, shared by the request
	// and response size histograms.
	SizeBuckets []float64
}
| 48 | |||
| 49 | // DefaultConfig returns default metrics configuration. | ||
| 50 | func DefaultConfig() *Config { | ||
| 51 | return &Config{ | ||
| 52 | Namespace: "muxstr", | ||
| 53 | Subsystem: "relay", | ||
| 54 | LatencyBuckets: []float64{ | ||
| 55 | 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, | ||
| 56 | }, | ||
| 57 | SizeBuckets: []float64{ | ||
| 58 | 100, 1000, 10000, 100000, 1000000, 10000000, | ||
| 59 | }, | ||
| 60 | } | ||
| 61 | } | ||
| 62 | |||
| 63 | // New creates a new Metrics instance and registers all metrics. | ||
| 64 | func New(config *Config) *Metrics { | ||
| 65 | if config == nil { | ||
| 66 | config = DefaultConfig() | ||
| 67 | } | ||
| 68 | |||
| 69 | m := &Metrics{ | ||
| 70 | config: config, | ||
| 71 | } | ||
| 72 | |||
| 73 | // Request metrics | ||
| 74 | m.requestsTotal = promauto.NewCounterVec( | ||
| 75 | prometheus.CounterOpts{ | ||
| 76 | Namespace: config.Namespace, | ||
| 77 | Subsystem: config.Subsystem, | ||
| 78 | Name: "requests_total", | ||
| 79 | Help: "Total number of requests by method and status", | ||
| 80 | }, | ||
| 81 | []string{"method", "status"}, | ||
| 82 | ) | ||
| 83 | |||
| 84 | m.requestDuration = promauto.NewHistogramVec( | ||
| 85 | prometheus.HistogramOpts{ | ||
| 86 | Namespace: config.Namespace, | ||
| 87 | Subsystem: config.Subsystem, | ||
| 88 | Name: "request_duration_seconds", | ||
| 89 | Help: "Request latency distribution in seconds", | ||
| 90 | Buckets: config.LatencyBuckets, | ||
| 91 | }, | ||
| 92 | []string{"method"}, | ||
| 93 | ) | ||
| 94 | |||
| 95 | m.requestSizeBytes = promauto.NewHistogramVec( | ||
| 96 | prometheus.HistogramOpts{ | ||
| 97 | Namespace: config.Namespace, | ||
| 98 | Subsystem: config.Subsystem, | ||
| 99 | Name: "request_size_bytes", | ||
| 100 | Help: "Request size distribution in bytes", | ||
| 101 | Buckets: config.SizeBuckets, | ||
| 102 | }, | ||
| 103 | []string{"method"}, | ||
| 104 | ) | ||
| 105 | |||
| 106 | m.responseSizeBytes = promauto.NewHistogramVec( | ||
| 107 | prometheus.HistogramOpts{ | ||
| 108 | Namespace: config.Namespace, | ||
| 109 | Subsystem: config.Subsystem, | ||
| 110 | Name: "response_size_bytes", | ||
| 111 | Help: "Response size distribution in bytes", | ||
| 112 | Buckets: config.SizeBuckets, | ||
| 113 | }, | ||
| 114 | []string{"method"}, | ||
| 115 | ) | ||
| 116 | |||
| 117 | // Connection metrics | ||
| 118 | m.activeConnections = promauto.NewGauge( | ||
| 119 | prometheus.GaugeOpts{ | ||
| 120 | Namespace: config.Namespace, | ||
| 121 | Subsystem: config.Subsystem, | ||
| 122 | Name: "active_connections", | ||
| 123 | Help: "Current number of active gRPC connections", | ||
| 124 | }, | ||
| 125 | ) | ||
| 126 | |||
| 127 | m.activeSubscriptions = promauto.NewGauge( | ||
| 128 | prometheus.GaugeOpts{ | ||
| 129 | Namespace: config.Namespace, | ||
| 130 | Subsystem: config.Subsystem, | ||
| 131 | Name: "active_subscriptions", | ||
| 132 | Help: "Current number of active subscriptions", | ||
| 133 | }, | ||
| 134 | ) | ||
| 135 | |||
| 136 | m.connectionsTotal = promauto.NewCounter( | ||
| 137 | prometheus.CounterOpts{ | ||
| 138 | Namespace: config.Namespace, | ||
| 139 | Subsystem: config.Subsystem, | ||
| 140 | Name: "connections_total", | ||
| 141 | Help: "Total number of connections since startup", | ||
| 142 | }, | ||
| 143 | ) | ||
| 144 | |||
| 145 | // Auth metrics | ||
| 146 | m.authAttemptsTotal = promauto.NewCounterVec( | ||
| 147 | prometheus.CounterOpts{ | ||
| 148 | Namespace: config.Namespace, | ||
| 149 | Subsystem: config.Subsystem, | ||
| 150 | Name: "auth_attempts_total", | ||
| 151 | Help: "Total authentication attempts by result", | ||
| 152 | }, | ||
| 153 | []string{"result"}, | ||
| 154 | ) | ||
| 155 | |||
| 156 | m.rateLimitHitsTotal = promauto.NewCounterVec( | ||
| 157 | prometheus.CounterOpts{ | ||
| 158 | Namespace: config.Namespace, | ||
| 159 | Subsystem: config.Subsystem, | ||
| 160 | Name: "rate_limit_hits_total", | ||
| 161 | Help: "Total rate limit rejections", | ||
| 162 | }, | ||
| 163 | []string{"authenticated"}, | ||
| 164 | ) | ||
| 165 | |||
| 166 | // Storage metrics | ||
| 167 | m.eventsTotal = promauto.NewGauge( | ||
| 168 | prometheus.GaugeOpts{ | ||
| 169 | Namespace: config.Namespace, | ||
| 170 | Subsystem: config.Subsystem, | ||
| 171 | Name: "events_total", | ||
| 172 | Help: "Total events stored in database", | ||
| 173 | }, | ||
| 174 | ) | ||
| 175 | |||
| 176 | m.dbSizeBytes = promauto.NewGauge( | ||
| 177 | prometheus.GaugeOpts{ | ||
| 178 | Namespace: config.Namespace, | ||
| 179 | Subsystem: config.Subsystem, | ||
| 180 | Name: "db_size_bytes", | ||
| 181 | Help: "Database file size in bytes", | ||
| 182 | }, | ||
| 183 | ) | ||
| 184 | |||
| 185 | m.eventDeletionsTotal = promauto.NewCounter( | ||
| 186 | prometheus.CounterOpts{ | ||
| 187 | Namespace: config.Namespace, | ||
| 188 | Subsystem: config.Subsystem, | ||
| 189 | Name: "event_deletions_total", | ||
| 190 | Help: "Total events deleted (NIP-09)", | ||
| 191 | }, | ||
| 192 | ) | ||
| 193 | |||
| 194 | return m | ||
| 195 | } | ||
| 196 | |||
| 197 | // RecordRequest records a completed request with its status and duration. | ||
| 198 | func (m *Metrics) RecordRequest(method, status string, durationSeconds float64) { | ||
| 199 | m.requestsTotal.WithLabelValues(method, status).Inc() | ||
| 200 | m.requestDuration.WithLabelValues(method).Observe(durationSeconds) | ||
| 201 | } | ||
| 202 | |||
| 203 | // RecordRequestSize records the size of a request. | ||
| 204 | func (m *Metrics) RecordRequestSize(method string, sizeBytes int) { | ||
| 205 | m.requestSizeBytes.WithLabelValues(method).Observe(float64(sizeBytes)) | ||
| 206 | } | ||
| 207 | |||
| 208 | // RecordResponseSize records the size of a response. | ||
| 209 | func (m *Metrics) RecordResponseSize(method string, sizeBytes int) { | ||
| 210 | m.responseSizeBytes.WithLabelValues(method).Observe(float64(sizeBytes)) | ||
| 211 | } | ||
| 212 | |||
| 213 | // IncrementConnections increments the active connections gauge. | ||
| 214 | func (m *Metrics) IncrementConnections() { | ||
| 215 | m.activeConnections.Inc() | ||
| 216 | m.connectionsTotal.Inc() | ||
| 217 | } | ||
| 218 | |||
| 219 | // DecrementConnections decrements the active connections gauge. | ||
| 220 | func (m *Metrics) DecrementConnections() { | ||
| 221 | m.activeConnections.Dec() | ||
| 222 | } | ||
| 223 | |||
| 224 | // SetActiveConnections sets the active connections gauge to a specific value. | ||
| 225 | func (m *Metrics) SetActiveConnections(count int) { | ||
| 226 | m.activeConnections.Set(float64(count)) | ||
| 227 | } | ||
| 228 | |||
| 229 | // IncrementSubscriptions increments the active subscriptions gauge. | ||
| 230 | func (m *Metrics) IncrementSubscriptions() { | ||
| 231 | m.activeSubscriptions.Inc() | ||
| 232 | } | ||
| 233 | |||
| 234 | // DecrementSubscriptions decrements the active subscriptions gauge. | ||
| 235 | func (m *Metrics) DecrementSubscriptions() { | ||
| 236 | m.activeSubscriptions.Dec() | ||
| 237 | } | ||
| 238 | |||
| 239 | // SetActiveSubscriptions sets the active subscriptions gauge to a specific value. | ||
| 240 | func (m *Metrics) SetActiveSubscriptions(count int) { | ||
| 241 | m.activeSubscriptions.Set(float64(count)) | ||
| 242 | } | ||
| 243 | |||
| 244 | // RecordAuthAttempt records an authentication attempt. | ||
| 245 | func (m *Metrics) RecordAuthAttempt(success bool) { | ||
| 246 | result := "failure" | ||
| 247 | if success { | ||
| 248 | result = "success" | ||
| 249 | } | ||
| 250 | m.authAttemptsTotal.WithLabelValues(result).Inc() | ||
| 251 | } | ||
| 252 | |||
| 253 | // RecordRateLimitHit records a rate limit rejection. | ||
| 254 | func (m *Metrics) RecordRateLimitHit(authenticated bool) { | ||
| 255 | auth := "false" | ||
| 256 | if authenticated { | ||
| 257 | auth = "true" | ||
| 258 | } | ||
| 259 | m.rateLimitHitsTotal.WithLabelValues(auth).Inc() | ||
| 260 | } | ||
| 261 | |||
| 262 | // UpdateStorageStats updates storage-related metrics. | ||
| 263 | func (m *Metrics) UpdateStorageStats(eventCount int64, dbSizeBytes int64) { | ||
| 264 | m.eventsTotal.Set(float64(eventCount)) | ||
| 265 | m.dbSizeBytes.Set(float64(dbSizeBytes)) | ||
| 266 | } | ||
| 267 | |||
| 268 | // RecordEventDeletion records an event deletion. | ||
| 269 | func (m *Metrics) RecordEventDeletion() { | ||
| 270 | m.eventDeletionsTotal.Inc() | ||
| 271 | } | ||
| 272 | |||
// RequestStatus is the string type used to describe a request's status for
// metrics purposes.
// NOTE(review): presumably converted to string and passed as the status
// argument of RecordRequest — confirm against callers.
type RequestStatus string

// Known request statuses and their string values.
const (
	StatusOK              = RequestStatus("ok")
	StatusError           = RequestStatus("error")
	StatusUnauthenticated = RequestStatus("unauthenticated")
	StatusRateLimited     = RequestStatus("rate_limited")
	StatusInvalidRequest  = RequestStatus("invalid_request")
)
