initial commit
This commit is contained in:
283
pkg/metrics/metrics.go
Normal file
283
pkg/metrics/metrics.go
Normal file
@@ -0,0 +1,283 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
)
|
||||
|
||||
// Metrics holds all metrics for the base service
|
||||
type Metrics struct {
|
||||
// HTTP metrics
|
||||
HTTPRequest *prometheus.HistogramVec
|
||||
|
||||
// Database metrics
|
||||
DatabaseQuery *prometheus.HistogramVec
|
||||
|
||||
// RabbitMQ metrics
|
||||
RabbitMQMessages *prometheus.HistogramVec
|
||||
|
||||
// Business metrics
|
||||
BusinessOperations *prometheus.HistogramVec
|
||||
|
||||
// Cache metrics
|
||||
Cache *prometheus.HistogramVec
|
||||
|
||||
// External service metrics
|
||||
ExternalServiceCall *prometheus.HistogramVec
|
||||
|
||||
// Configuration
|
||||
namespace string
|
||||
subsystem string
|
||||
serviceName string
|
||||
}
|
||||
|
||||
var (
|
||||
metricsInstance *Metrics
|
||||
metricsOnce = &sync.Once{}
|
||||
startTime = time.Now()
|
||||
)
|
||||
|
||||
// GetMetrics returns a singleton instance of Metrics
|
||||
func GetMetrics(namespace, subsystem, serviceName string) *Metrics {
|
||||
metricsOnce.Do(func() {
|
||||
metricsInstance = newMetrics(namespace, subsystem, serviceName)
|
||||
})
|
||||
return metricsInstance
|
||||
}
|
||||
|
||||
// newMetrics creates a new instance of Metrics
|
||||
func newMetrics(namespace, subsystem, serviceName string) *Metrics {
|
||||
return &Metrics{
|
||||
namespace: namespace,
|
||||
subsystem: subsystem,
|
||||
serviceName: serviceName,
|
||||
|
||||
HTTPRequest: promauto.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "http_request_duration_seconds",
|
||||
Help: "HTTP request duration in seconds",
|
||||
Buckets: prometheus.DefBuckets,
|
||||
ConstLabels: prometheus.Labels{"service": serviceName},
|
||||
},
|
||||
[]string{"method", "endpoint", "status_code"},
|
||||
),
|
||||
|
||||
DatabaseQuery: promauto.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "database_query_duration_seconds",
|
||||
Help: "Database query duration in seconds",
|
||||
Buckets: prometheus.DefBuckets,
|
||||
ConstLabels: prometheus.Labels{"service": serviceName},
|
||||
},
|
||||
[]string{"operation", "table", "error"},
|
||||
),
|
||||
|
||||
// RabbitMQ metrics
|
||||
RabbitMQMessages: promauto.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "rabbitmq_messages_duration_seconds",
|
||||
Help: "Duration of RabbitMQ message operations (publish/consume) in seconds",
|
||||
Buckets: prometheus.DefBuckets,
|
||||
ConstLabels: prometheus.Labels{"service": serviceName},
|
||||
},
|
||||
[]string{"exchange", "routing_key", "action", "error"},
|
||||
),
|
||||
|
||||
// Business metrics
|
||||
BusinessOperations: promauto.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "business_operations_duration_seconds",
|
||||
Help: "Duration of business operations in seconds",
|
||||
Buckets: prometheus.DefBuckets,
|
||||
ConstLabels: prometheus.Labels{"service": serviceName},
|
||||
},
|
||||
[]string{"operation_type", "error"},
|
||||
),
|
||||
|
||||
// Cache metrics
|
||||
Cache: promauto.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "cache_operations_duration_seconds",
|
||||
Help: "Duration of store operations in seconds",
|
||||
Buckets: prometheus.DefBuckets,
|
||||
ConstLabels: prometheus.Labels{"service": serviceName},
|
||||
},
|
||||
[]string{"cache_type", "key_pattern", "action", "hit", "error"},
|
||||
),
|
||||
|
||||
ExternalServiceCall: promauto.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "external_service_duration_seconds",
|
||||
Help: "External service call duration in seconds",
|
||||
Buckets: prometheus.DefBuckets,
|
||||
ConstLabels: prometheus.Labels{"service": serviceName},
|
||||
},
|
||||
[]string{"service_name", "endpoint", "error"},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
// GetNamespace returns the metrics namespace
|
||||
func (m *Metrics) GetNamespace() string {
|
||||
return m.namespace
|
||||
}
|
||||
|
||||
// GetSubsystem returns the metrics subsystem
|
||||
func (m *Metrics) GetSubsystem() string {
|
||||
return m.subsystem
|
||||
}
|
||||
|
||||
// GetServiceName returns the service name
|
||||
func (m *Metrics) GetServiceName() string {
|
||||
return m.serviceName
|
||||
}
|
||||
|
||||
// GetFullMetricName returns the full metric name with namespace and subsystem
|
||||
func (m *Metrics) GetFullMetricName(metricName string) string {
|
||||
return fmt.Sprintf("%s_%s_%s", m.namespace, m.subsystem, metricName)
|
||||
}
|
||||
|
||||
// RecordHTTPRequest HTTP Metrics Functions
|
||||
func (m *Metrics) RecordHTTPRequest(method, endpoint, statusCode string, duration time.Duration) {
|
||||
m.HTTPRequest.WithLabelValues(method, endpoint, statusCode).Observe(duration.Seconds())
|
||||
}
|
||||
|
||||
// NormalizePath normalizes HTTP paths by replacing numeric IDs and parameters with placeholders
|
||||
// This prevents metric cardinality explosion while maintaining meaningful endpoint grouping
|
||||
func (m *Metrics) NormalizePath(path string) string {
|
||||
// Replace numeric IDs with :id placeholder
|
||||
path = regexp.MustCompile(`/\d+`).ReplaceAllString(path, "/:id")
|
||||
|
||||
// Replace UUIDs with :uuid placeholder
|
||||
path = regexp.MustCompile(`/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`).ReplaceAllString(path, "/:uuid")
|
||||
|
||||
// Replace other common parameter patterns
|
||||
path = regexp.MustCompile(`/[a-zA-Z0-9]{20,}`).ReplaceAllString(path, "/:hash") // Long hashes
|
||||
path = regexp.MustCompile(`/\d{10,}`).ReplaceAllString(path, "/:long_id") // Very long numbers
|
||||
return path
|
||||
}
|
||||
|
||||
// NormalizeExternalServiceEndpoint normalizes external service endpoint names
|
||||
// Use this when you have dynamic endpoint names that could cause cardinality issues
|
||||
func (m *Metrics) NormalizeExternalServiceEndpoint(endpoint string) string {
|
||||
// Replace numeric IDs with :id placeholder
|
||||
endpoint = regexp.MustCompile(`\d+`).ReplaceAllString(endpoint, ":id")
|
||||
|
||||
// Replace UUIDs with :uuid placeholder
|
||||
endpoint = regexp.MustCompile(`[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`).ReplaceAllString(endpoint, ":uuid")
|
||||
|
||||
// Replace other common parameter patterns
|
||||
endpoint = regexp.MustCompile(`[a-zA-Z0-9]{20,}`).ReplaceAllString(endpoint, ":hash") // Long hashes
|
||||
endpoint = regexp.MustCompile(`\d{10,}`).ReplaceAllString(endpoint, ":long_id") // Very long numbers
|
||||
|
||||
return endpoint
|
||||
}
|
||||
|
||||
// RecordDatabaseQuery Database Metrics Functions
|
||||
func (m *Metrics) RecordDatabaseQuery(operation, table string, duration time.Duration, err error) {
|
||||
m.DatabaseQuery.WithLabelValues(operation, table, m.classifyError(err)).Observe(duration.Seconds())
|
||||
}
|
||||
|
||||
// RecordRabbitMQMessage RabbitMQ Metrics Functions
|
||||
func (m *Metrics) RecordRabbitMQMessage(exchange, routingKey, action string, duration time.Duration, err error) {
|
||||
m.RabbitMQMessages.WithLabelValues(exchange, routingKey, action, m.classifyError(err)).Observe(duration.Seconds())
|
||||
}
|
||||
|
||||
// RecordBusinessOperation Business Metrics Functions
|
||||
func (m *Metrics) RecordBusinessOperation(operationType string, err error, duration time.Duration) {
|
||||
m.BusinessOperations.WithLabelValues(operationType, m.classifyError(err)).Observe(duration.Seconds())
|
||||
}
|
||||
|
||||
// RecordCacheHit Cache Metrics Functions
|
||||
func (m *Metrics) RecordCacheHit(cacheType, keyPattern, action string, hit bool, err error, duration time.Duration) {
|
||||
m.Cache.WithLabelValues(cacheType, keyPattern, action, strconv.FormatBool(hit), m.classifyError(err)).Observe(duration.Seconds())
|
||||
}
|
||||
|
||||
// RecordExternalServiceCall External Service Metrics Functions
|
||||
func (m *Metrics) RecordExternalServiceCall(serviceName, endpoint string, err error, duration time.Duration) {
|
||||
m.ExternalServiceCall.WithLabelValues(serviceName, endpoint, m.classifyError(err)).Observe(duration.Seconds())
|
||||
}
|
||||
|
||||
// Utility Functions
|
||||
func (m *Metrics) classifyError(err error) string {
|
||||
if err == nil {
|
||||
return "none"
|
||||
}
|
||||
|
||||
errStr := err.Error()
|
||||
switch {
|
||||
case strings.Contains(errStr, "connection"):
|
||||
return "connection_error"
|
||||
case strings.Contains(errStr, "connection lost"):
|
||||
return "connection_lost"
|
||||
case strings.Contains(errStr, "connection reset by peer"):
|
||||
return "connection_reset_by_peer"
|
||||
case strings.Contains(errStr, "timeout"):
|
||||
return "timeout_error"
|
||||
case strings.Contains(strings.ToLower(errStr), "deadlock"):
|
||||
return "deadlock_error"
|
||||
case strings.Contains(errStr, "not found") || strings.Contains(errStr, "NotFound"):
|
||||
return "not_found_error"
|
||||
case strings.Contains(errStr, "Duplicate"):
|
||||
return "duplicate_error"
|
||||
case strings.Contains(errStr, "permission"):
|
||||
return "permission_error"
|
||||
case strings.Contains(errStr, "validation"):
|
||||
return "validation_error"
|
||||
case strings.Contains(errStr, "failed to publish") || strings.Contains(errStr, "publish error"):
|
||||
return "publish_error"
|
||||
case strings.Contains(errStr, "failed to marshal"):
|
||||
return "marshal_error"
|
||||
case strings.Contains(errStr, "failed to save"):
|
||||
return "save_error"
|
||||
case strings.Contains(errStr, "too many open files"):
|
||||
return "too_many_open_files"
|
||||
case strings.Contains(errStr, "no such file or directory"):
|
||||
return "no_such_file"
|
||||
case strings.Contains(errStr, "failed to parse CSV"):
|
||||
return "parse_csv_error"
|
||||
case strings.Contains(errStr, "Internal Server Error"):
|
||||
return "internal_server_error"
|
||||
default:
|
||||
return "unknown_error"
|
||||
}
|
||||
}
|
||||
|
||||
// RecordCacheMetrics records comprehensive store metrics
|
||||
func (m *Metrics) RecordCacheMetrics(cacheType, keyPattern, action string, hit bool, err error, duration time.Duration) {
|
||||
m.RecordCacheHit(cacheType, keyPattern, action, hit, err, duration)
|
||||
}
|
||||
|
||||
// RecordDatabaseOperation records comprehensive database operation metrics
|
||||
func (m *Metrics) RecordDatabaseOperation(operation, table string, duration time.Duration, err error) {
|
||||
m.RecordDatabaseQuery(operation, table, duration, err)
|
||||
}
|
||||
|
||||
// GetMetricsSummary returns a summary of current metrics
|
||||
func (m *Metrics) GetMetricsSummary() map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
"uptime_seconds": time.Since(startTime).Seconds(),
|
||||
"goroutines": runtime.NumGoroutine(),
|
||||
"start_time": startTime.Format(time.RFC3339),
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user