284 lines
9.7 KiB
Go
284 lines
9.7 KiB
Go
package metrics
|
|
|
|
import (
|
|
"fmt"
|
|
"regexp"
|
|
"runtime"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
)
|
|
|
|
// Metrics holds all metrics for the base service
|
|
type Metrics struct {
|
|
// HTTP metrics
|
|
HTTPRequest *prometheus.HistogramVec
|
|
|
|
// Database metrics
|
|
DatabaseQuery *prometheus.HistogramVec
|
|
|
|
// RabbitMQ metrics
|
|
RabbitMQMessages *prometheus.HistogramVec
|
|
|
|
// Business metrics
|
|
BusinessOperations *prometheus.HistogramVec
|
|
|
|
// Cache metrics
|
|
Cache *prometheus.HistogramVec
|
|
|
|
// External service metrics
|
|
ExternalServiceCall *prometheus.HistogramVec
|
|
|
|
// Configuration
|
|
namespace string
|
|
subsystem string
|
|
serviceName string
|
|
}
|
|
|
|
var (
|
|
metricsInstance *Metrics
|
|
metricsOnce = &sync.Once{}
|
|
startTime = time.Now()
|
|
)
|
|
|
|
// GetMetrics returns a singleton instance of Metrics
|
|
func GetMetrics(namespace, subsystem, serviceName string) *Metrics {
|
|
metricsOnce.Do(func() {
|
|
metricsInstance = newMetrics(namespace, subsystem, serviceName)
|
|
})
|
|
return metricsInstance
|
|
}
|
|
|
|
// newMetrics creates a new instance of Metrics
|
|
func newMetrics(namespace, subsystem, serviceName string) *Metrics {
|
|
return &Metrics{
|
|
namespace: namespace,
|
|
subsystem: subsystem,
|
|
serviceName: serviceName,
|
|
|
|
HTTPRequest: promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "http_request_duration_seconds",
|
|
Help: "HTTP request duration in seconds",
|
|
Buckets: prometheus.DefBuckets,
|
|
ConstLabels: prometheus.Labels{"service": serviceName},
|
|
},
|
|
[]string{"method", "endpoint", "status_code"},
|
|
),
|
|
|
|
DatabaseQuery: promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "database_query_duration_seconds",
|
|
Help: "Database query duration in seconds",
|
|
Buckets: prometheus.DefBuckets,
|
|
ConstLabels: prometheus.Labels{"service": serviceName},
|
|
},
|
|
[]string{"operation", "table", "error"},
|
|
),
|
|
|
|
// RabbitMQ metrics
|
|
RabbitMQMessages: promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "rabbitmq_messages_duration_seconds",
|
|
Help: "Duration of RabbitMQ message operations (publish/consume) in seconds",
|
|
Buckets: prometheus.DefBuckets,
|
|
ConstLabels: prometheus.Labels{"service": serviceName},
|
|
},
|
|
[]string{"exchange", "routing_key", "action", "error"},
|
|
),
|
|
|
|
// Business metrics
|
|
BusinessOperations: promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "business_operations_duration_seconds",
|
|
Help: "Duration of business operations in seconds",
|
|
Buckets: prometheus.DefBuckets,
|
|
ConstLabels: prometheus.Labels{"service": serviceName},
|
|
},
|
|
[]string{"operation_type", "error"},
|
|
),
|
|
|
|
// Cache metrics
|
|
Cache: promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "cache_operations_duration_seconds",
|
|
Help: "Duration of store operations in seconds",
|
|
Buckets: prometheus.DefBuckets,
|
|
ConstLabels: prometheus.Labels{"service": serviceName},
|
|
},
|
|
[]string{"cache_type", "key_pattern", "action", "hit", "error"},
|
|
),
|
|
|
|
ExternalServiceCall: promauto.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "external_service_duration_seconds",
|
|
Help: "External service call duration in seconds",
|
|
Buckets: prometheus.DefBuckets,
|
|
ConstLabels: prometheus.Labels{"service": serviceName},
|
|
},
|
|
[]string{"service_name", "endpoint", "error"},
|
|
),
|
|
}
|
|
}
|
|
|
|
// GetNamespace returns the metrics namespace
|
|
func (m *Metrics) GetNamespace() string {
|
|
return m.namespace
|
|
}
|
|
|
|
// GetSubsystem returns the metrics subsystem
|
|
func (m *Metrics) GetSubsystem() string {
|
|
return m.subsystem
|
|
}
|
|
|
|
// GetServiceName returns the service name
|
|
func (m *Metrics) GetServiceName() string {
|
|
return m.serviceName
|
|
}
|
|
|
|
// GetFullMetricName returns the full metric name with namespace and subsystem
|
|
func (m *Metrics) GetFullMetricName(metricName string) string {
|
|
return fmt.Sprintf("%s_%s_%s", m.namespace, m.subsystem, metricName)
|
|
}
|
|
|
|
// RecordHTTPRequest HTTP Metrics Functions
|
|
func (m *Metrics) RecordHTTPRequest(method, endpoint, statusCode string, duration time.Duration) {
|
|
m.HTTPRequest.WithLabelValues(method, endpoint, statusCode).Observe(duration.Seconds())
|
|
}
|
|
|
|
// NormalizePath normalizes HTTP paths by replacing numeric IDs and parameters with placeholders
|
|
// This prevents metric cardinality explosion while maintaining meaningful endpoint grouping
|
|
func (m *Metrics) NormalizePath(path string) string {
|
|
// Replace numeric IDs with :id placeholder
|
|
path = regexp.MustCompile(`/\d+`).ReplaceAllString(path, "/:id")
|
|
|
|
// Replace UUIDs with :uuid placeholder
|
|
path = regexp.MustCompile(`/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`).ReplaceAllString(path, "/:uuid")
|
|
|
|
// Replace other common parameter patterns
|
|
path = regexp.MustCompile(`/[a-zA-Z0-9]{20,}`).ReplaceAllString(path, "/:hash") // Long hashes
|
|
path = regexp.MustCompile(`/\d{10,}`).ReplaceAllString(path, "/:long_id") // Very long numbers
|
|
return path
|
|
}
|
|
|
|
// NormalizeExternalServiceEndpoint normalizes external service endpoint names
|
|
// Use this when you have dynamic endpoint names that could cause cardinality issues
|
|
func (m *Metrics) NormalizeExternalServiceEndpoint(endpoint string) string {
|
|
// Replace numeric IDs with :id placeholder
|
|
endpoint = regexp.MustCompile(`\d+`).ReplaceAllString(endpoint, ":id")
|
|
|
|
// Replace UUIDs with :uuid placeholder
|
|
endpoint = regexp.MustCompile(`[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`).ReplaceAllString(endpoint, ":uuid")
|
|
|
|
// Replace other common parameter patterns
|
|
endpoint = regexp.MustCompile(`[a-zA-Z0-9]{20,}`).ReplaceAllString(endpoint, ":hash") // Long hashes
|
|
endpoint = regexp.MustCompile(`\d{10,}`).ReplaceAllString(endpoint, ":long_id") // Very long numbers
|
|
|
|
return endpoint
|
|
}
|
|
|
|
// RecordDatabaseQuery Database Metrics Functions
|
|
func (m *Metrics) RecordDatabaseQuery(operation, table string, duration time.Duration, err error) {
|
|
m.DatabaseQuery.WithLabelValues(operation, table, m.classifyError(err)).Observe(duration.Seconds())
|
|
}
|
|
|
|
// RecordRabbitMQMessage RabbitMQ Metrics Functions
|
|
func (m *Metrics) RecordRabbitMQMessage(exchange, routingKey, action string, duration time.Duration, err error) {
|
|
m.RabbitMQMessages.WithLabelValues(exchange, routingKey, action, m.classifyError(err)).Observe(duration.Seconds())
|
|
}
|
|
|
|
// RecordBusinessOperation Business Metrics Functions
|
|
func (m *Metrics) RecordBusinessOperation(operationType string, err error, duration time.Duration) {
|
|
m.BusinessOperations.WithLabelValues(operationType, m.classifyError(err)).Observe(duration.Seconds())
|
|
}
|
|
|
|
// RecordCacheHit Cache Metrics Functions
|
|
func (m *Metrics) RecordCacheHit(cacheType, keyPattern, action string, hit bool, err error, duration time.Duration) {
|
|
m.Cache.WithLabelValues(cacheType, keyPattern, action, strconv.FormatBool(hit), m.classifyError(err)).Observe(duration.Seconds())
|
|
}
|
|
|
|
// RecordExternalServiceCall External Service Metrics Functions
|
|
func (m *Metrics) RecordExternalServiceCall(serviceName, endpoint string, err error, duration time.Duration) {
|
|
m.ExternalServiceCall.WithLabelValues(serviceName, endpoint, m.classifyError(err)).Observe(duration.Seconds())
|
|
}
|
|
|
|
// Utility Functions
|
|
func (m *Metrics) classifyError(err error) string {
|
|
if err == nil {
|
|
return "none"
|
|
}
|
|
|
|
errStr := err.Error()
|
|
switch {
|
|
case strings.Contains(errStr, "connection"):
|
|
return "connection_error"
|
|
case strings.Contains(errStr, "connection lost"):
|
|
return "connection_lost"
|
|
case strings.Contains(errStr, "connection reset by peer"):
|
|
return "connection_reset_by_peer"
|
|
case strings.Contains(errStr, "timeout"):
|
|
return "timeout_error"
|
|
case strings.Contains(strings.ToLower(errStr), "deadlock"):
|
|
return "deadlock_error"
|
|
case strings.Contains(errStr, "not found") || strings.Contains(errStr, "NotFound"):
|
|
return "not_found_error"
|
|
case strings.Contains(errStr, "Duplicate"):
|
|
return "duplicate_error"
|
|
case strings.Contains(errStr, "permission"):
|
|
return "permission_error"
|
|
case strings.Contains(errStr, "validation"):
|
|
return "validation_error"
|
|
case strings.Contains(errStr, "failed to publish") || strings.Contains(errStr, "publish error"):
|
|
return "publish_error"
|
|
case strings.Contains(errStr, "failed to marshal"):
|
|
return "marshal_error"
|
|
case strings.Contains(errStr, "failed to save"):
|
|
return "save_error"
|
|
case strings.Contains(errStr, "too many open files"):
|
|
return "too_many_open_files"
|
|
case strings.Contains(errStr, "no such file or directory"):
|
|
return "no_such_file"
|
|
case strings.Contains(errStr, "failed to parse CSV"):
|
|
return "parse_csv_error"
|
|
case strings.Contains(errStr, "Internal Server Error"):
|
|
return "internal_server_error"
|
|
default:
|
|
return "unknown_error"
|
|
}
|
|
}
|
|
|
|
// RecordCacheMetrics records comprehensive store metrics
|
|
func (m *Metrics) RecordCacheMetrics(cacheType, keyPattern, action string, hit bool, err error, duration time.Duration) {
|
|
m.RecordCacheHit(cacheType, keyPattern, action, hit, err, duration)
|
|
}
|
|
|
|
// RecordDatabaseOperation records comprehensive database operation metrics
|
|
func (m *Metrics) RecordDatabaseOperation(operation, table string, duration time.Duration, err error) {
|
|
m.RecordDatabaseQuery(operation, table, duration, err)
|
|
}
|
|
|
|
// GetMetricsSummary returns a summary of current metrics
|
|
func (m *Metrics) GetMetricsSummary() map[string]interface{} {
|
|
return map[string]interface{}{
|
|
"uptime_seconds": time.Since(startTime).Seconds(),
|
|
"goroutines": runtime.NumGoroutine(),
|
|
"start_time": startTime.Format(time.RFC3339),
|
|
}
|
|
}
|