Skip to content

Go微服务监控实战指南

引言

微服务架构的复杂性使得监控变得尤为重要。本文将详细介绍如何使用现代化工具对Go微服务进行全方位监控。

监控指标

基础指标

go
package metrics

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

// RequestCounter counts HTTP requests, partitioned by the "method",
// "endpoint" and "status" labels.
var RequestCounter = promauto.NewCounterVec(
    prometheus.CounterOpts{
        Name: "http_requests_total",
        Help: "Total number of HTTP requests",
    },
    []string{"method", "endpoint", "status"},
)

// RequestDuration is a histogram of HTTP request durations in seconds,
// partitioned by the "method" and "endpoint" labels. Its bucket upper bounds
// are 0.1, 0.5, 1, 2 and 5 seconds.
var RequestDuration = promauto.NewHistogramVec(
    prometheus.HistogramOpts{
        Name:    "http_request_duration_seconds",
        Help:    "HTTP request duration in seconds",
        Buckets: []float64{0.1, 0.5, 1, 2, 5},
    },
    []string{"method", "endpoint"},
)

// ActiveConnections is an unlabelled gauge for the number of active
// connections.
var ActiveConnections = promauto.NewGauge(prometheus.GaugeOpts{
    Name: "active_connections",
    Help: "Number of active connections",
})

中间件实现

go
// MetricsMiddleware wraps next so that every request passing through it is
// counted in RequestCounter and timed in RequestDuration.
//
// The response writer is wrapped with NewResponseWriter so the status code
// written by next can be read back once the handler returns.
//
// NOTE(review): the raw r.URL.Path is used as the "endpoint" label value, so
// every distinct path creates its own time series — confirm that the set of
// paths served behind this middleware is bounded.
func MetricsMiddleware(next http.Handler) http.Handler {
    instrumented := func(w http.ResponseWriter, r *http.Request) {
        began := time.Now()

        // Wrap the ResponseWriter to capture the status code.
        recorder := NewResponseWriter(w)

        // Serve the request.
        next.ServeHTTP(recorder, r)

        // Take the elapsed time before touching any metric.
        elapsed := time.Since(began).Seconds()

        method, endpoint := r.Method, r.URL.Path
        status := strconv.Itoa(recorder.statusCode)

        RequestCounter.WithLabelValues(method, endpoint, status).Inc()
        RequestDuration.WithLabelValues(method, endpoint).Observe(elapsed)
    }

    return http.HandlerFunc(instrumented)
}

链路追踪

Jaeger集成

go
package tracing

import (
    "github.com/opentracing/opentracing-go"
    "github.com/uber/jaeger-client-go"
    "github.com/uber/jaeger-client-go/config"
)

// InitTracer builds a Jaeger tracer for the given service name, installs it
// as the global OpenTracing tracer and returns it.
//
// The configuration uses a "const" sampler with Param 1 and a reporter that
// logs spans and targets the agent at "jaeger:6831". If the tracer cannot be
// created, the error is returned and the global tracer is left untouched.
//
// NOTE(review): the second value returned by NewTracer is discarded —
// presumably the tracer's closer; confirm whether callers need it on shutdown.
func InitTracer(service string) (opentracing.Tracer, error) {
    sampler := &config.SamplerConfig{
        Type:  "const",
        Param: 1,
    }
    reporter := &config.ReporterConfig{
        LogSpans:           true,
        LocalAgentHostPort: "jaeger:6831",
    }
    cfg := &config.Configuration{
        ServiceName: service,
        Sampler:     sampler,
        Reporter:    reporter,
    }

    tracer, _, err := cfg.NewTracer(config.Logger(jaeger.StdLogger))
    if err != nil {
        return nil, err
    }

    opentracing.SetGlobalTracer(tracer)
    return tracer, nil
}

请求追踪

go
// TracingMiddleware starts an "http_request" span for every request and makes
// it available to next through the request context.
//
// A parent span context is extracted from the incoming HTTP headers; the
// extraction error is ignored and whatever context was returned is passed to
// ChildOf. The span is tagged with the request method and URL path and is
// finished when the handler returns.
func TracingMiddleware(next http.Handler) http.Handler {
    traced := func(w http.ResponseWriter, r *http.Request) {
        tracer := opentracing.GlobalTracer()

        carrier := opentracing.HTTPHeadersCarrier(r.Header)
        parent, _ := tracer.Extract(opentracing.HTTPHeaders, carrier)

        span := tracer.StartSpan("http_request", opentracing.ChildOf(parent))
        defer span.Finish()

        // Attach request details to the span.
        span.SetTag("http.method", r.Method)
        span.SetTag("http.url", r.URL.Path)

        // Hand the span to downstream handlers via the request context.
        req := r.WithContext(opentracing.ContextWithSpan(r.Context(), span))
        next.ServeHTTP(w, req)
    }

    return http.HandlerFunc(traced)
}

日志管理

结构化日志

go
package logging

import (
    "github.com/sirupsen/logrus"
    "gopkg.in/natefinch/lumberjack.v2"
)

// InitLogger returns a new logrus logger that emits JSON-formatted entries
// with "2006-01-02 15:04:05" timestamps and writes them to a rotating log
// file at /var/log/app.log.
func InitLogger() *logrus.Logger {
    // Output format.
    formatter := &logrus.JSONFormatter{
        TimestampFormat: "2006-01-02 15:04:05",
    }

    // Log rotation.
    sink := &lumberjack.Logger{
        Filename:   "/var/log/app.log",
        MaxSize:    100, // MB
        MaxBackups: 3,
        MaxAge:     28, // days
        Compress:   true,
    }

    log := logrus.New()
    log.SetFormatter(formatter)
    log.SetOutput(sink)
    return log
}

上下文日志

go
// ContextLogger is a log entry pre-populated with per-request fields. It
// embeds *logrus.Entry, so the Entry's methods are available directly on it.
type ContextLogger struct {
    *logrus.Entry
}

// NewContextLogger returns a ContextLogger whose entries carry the request ID
// and, when available, the Jaeger trace ID found in ctx.
//
// The "request_id" field is always set; it is the empty string when ctx holds
// no string value under the "request_id" key. The "trace_id" field is added
// only when ctx carries a span whose context is a jaeger.SpanContext.
func NewContextLogger(ctx context.Context, logger *logrus.Logger) *ContextLogger {
    // Comma-ok assertion: a missing or non-string request ID must not panic
    // the caller just because it wanted to log something.
    requestID, _ := ctx.Value("request_id").(string)

    fields := logrus.Fields{
        "request_id": requestID,
    }

    // Pull tracing information from the context. The span may come from a
    // tracer other than Jaeger, so the concrete type is checked, not assumed.
    if span := opentracing.SpanFromContext(ctx); span != nil {
        if spanCtx, ok := span.Context().(jaeger.SpanContext); ok {
            fields["trace_id"] = spanCtx.TraceID()
        }
    }

    return &ContextLogger{
        Entry: logger.WithFields(fields),
    }
}

健康检查

健康检查实现

go
package health

import (
    "database/sql"
    "github.com/go-redis/redis/v8"
)

// HealthChecker holds the dependencies whose connectivity is probed by Check.
type HealthChecker struct {
    // db is the SQL database handle that Check pings.
    db    *sql.DB
    // redis is the Redis client that Check pings.
    redis *redis.Client
}

// Check pings the database and then Redis and reports the outcome of each.
//
// The returned map always has the keys "database" and "redis"; each value is
// "healthy" when the corresponding ping returned no error and "unhealthy"
// otherwise.
func (h *HealthChecker) Check() map[string]string {
    // verdict turns a ping error into the status string reported to callers.
    verdict := func(err error) string {
        if err != nil {
            return "unhealthy"
        }
        return "healthy"
    }

    // Probe the database connection first, then the Redis connection.
    dbErr := h.db.Ping()
    redisErr := h.redis.Ping(context.Background()).Err()

    return map[string]string{
        "database": verdict(dbErr),
        "redis":    verdict(redisErr),
    }
}

健康检查接口

go
// HealthCheckHandler returns an HTTP handler that runs checker.Check and
// writes the resulting component-status map as JSON.
//
// If any component is not "healthy" the response status is 503 Service
// Unavailable; otherwise no status is set explicitly before the body is
// written.
func HealthCheckHandler(checker *HealthChecker) http.HandlerFunc {
    return func(w http.ResponseWriter, r *http.Request) {
        report := checker.Check()

        // A single unhealthy component marks the whole service as degraded.
        degraded := false
        for _, state := range report {
            if state != "healthy" {
                degraded = true
                break
            }
        }

        w.Header().Set("Content-Type", "application/json")
        if degraded {
            w.WriteHeader(http.StatusServiceUnavailable)
        }

        json.NewEncoder(w).Encode(report)
    }
}

告警配置

Prometheus告警规则

yaml
# Alerting rules built on the http_requests_total counter and the
# http_request_duration_seconds histogram.
groups:
- name: service_alerts
  rules:
  # Critical: the share of requests with a 5xx status (5m rate) has stayed
  # above 0.1 for 5 minutes.
  - alert: HighErrorRate
    expr: |
      sum(rate(http_requests_total{status=~"5.."}[5m])) 
      / 
      sum(rate(http_requests_total[5m])) > 0.1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: High error rate detected
      description: Error rate is above 10% for the last 5 minutes
      
  # Warning: the 0.95 quantile of request duration, aggregated over all
  # series by bucket (le), has stayed above 2 for 5 minutes.
  - alert: SlowResponses
    expr: |
      histogram_quantile(0.95, 
        sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
      ) > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Slow response times detected
      description: 95th percentile of response times is above 2 seconds

告警通知

go
package alerting

import (
    "bytes"
    "encoding/json"
    "fmt"
    "net/http"
)

// AlertNotifier delivers alerts to a webhook endpoint.
type AlertNotifier struct {
    // webhookURL is the URL that Notify posts the JSON payload to.
    webhookURL string
}

// Notify posts alert to the configured webhook as a JSON document.
//
// The payload carries the alert message as "text" and one attachment holding
// the title, the severity (as "color") and two fields: the service name and
// the alert time formatted as "2006-01-02 15:04:05".
//
// It returns an error if the payload cannot be encoded, if the HTTP request
// fails, or if the webhook answers with a non-2xx status.
//
// NOTE(review): http.Post uses http.DefaultClient, which sets no timeout —
// consider a client with an explicit Timeout if the webhook may hang.
func (n *AlertNotifier) Notify(alert Alert) error {
    payload := map[string]interface{}{
        "text": alert.Message,
        "attachments": []map[string]interface{}{
            {
                "title": alert.Title,
                "color": alert.Severity,
                "fields": []map[string]string{
                    {
                        "title": "Service",
                        "value": alert.Service,
                    },
                    {
                        "title": "Time",
                        "value": alert.Time.Format("2006-01-02 15:04:05"),
                    },
                },
            },
        },
    }

    body, err := json.Marshal(payload)
    if err != nil {
        return fmt.Errorf("encoding alert payload: %w", err)
    }

    resp, err := http.Post(n.webhookURL, "application/json", bytes.NewBuffer(body))
    if err != nil {
        // Returned unwrapped so callers inspecting the transport error keep working.
        return err
    }
    // The body must be closed even though it is not read, otherwise the
    // underlying connection is leaked.
    defer resp.Body.Close()

    // A delivered request is not a delivered alert: treat any non-2xx
    // response from the webhook as a failure.
    if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
        return fmt.Errorf("alert webhook returned status %s", resp.Status)
    }
    return nil
}

监控面板

Grafana仪表板

json
{
  "dashboard": {
    "title": "Service Monitoring",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total[5m])) by (endpoint)",
            "legendFormat": "{{endpoint}}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (endpoint) / sum(rate(http_requests_total[5m])) by (endpoint)",
            "legendFormat": "{{endpoint}}"
          }
        ]
      },
      {
        "title": "Response Time",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, endpoint))",
            "legendFormat": "{{endpoint}}"
          }
        ]
      }
    ]
  }
}

最佳实践

  1. 合理设置监控指标
  2. 实现分布式追踪
  3. 配置合适的告警阈值
  4. 保持日志结构化
  5. 定期检查监控系统

总结

完善的监控系统是保障微服务稳定运行的关键。通过合理配置Prometheus、Jaeger等工具,可以构建全面的监控体系。

参考资料

  1. Prometheus文档
  2. Jaeger文档
  3. Google SRE手册

幸运的人用童年治愈一生,不幸的人用一生治愈童年 —— 强爸