Go微服务监控实战指南
引言
微服务架构的复杂性使得监控变得尤为重要。本文将详细介绍如何使用Prometheus、Jaeger、logrus和Grafana等工具,从指标采集、链路追踪、日志管理、健康检查和告警五个方面对Go微服务进行全方位监控。
监控指标
基础指标
go
package metrics
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// HTTP-level Prometheus collectors shared by the service. All three are
// created through promauto at package initialization.
var (
	// RequestCounter counts HTTP requests under the metric name
	// "http_requests_total", labelled by method, endpoint and status.
	RequestCounter = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "http_requests_total",
			Help: "Total number of HTTP requests",
		},
		[]string{"method", "endpoint", "status"},
	)

	// RequestDuration is a histogram of request latency in seconds under the
	// metric name "http_request_duration_seconds", labelled by method and
	// endpoint. Bucket upper bounds are 0.1, 0.5, 1, 2 and 5 seconds.
	RequestDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "http_request_duration_seconds",
			Help:    "HTTP request duration in seconds",
			Buckets: []float64{0.1, 0.5, 1, 2, 5},
		},
		[]string{"method", "endpoint"},
	)

	// ActiveConnections is an unlabelled gauge exposed as
	// "active_connections".
	ActiveConnections = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "active_connections",
		Help: "Number of active connections",
	})
)
中间件实现
go
// MetricsMiddleware returns a handler that delegates to next and, once next
// has returned, records one RequestCounter increment and one RequestDuration
// observation for the request.
//
// The counter is labelled with the request method, the URL path and the
// status code captured by the wrapping response writer; the histogram is
// labelled with method and path and observes the elapsed time in seconds.
//
// NOTE(review): the raw r.URL.Path is used as the "endpoint" label value, so
// every distinct path produces its own time series — confirm that the set of
// paths served is bounded.
func MetricsMiddleware(next http.Handler) http.Handler {
	instrumented := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()

		// Wrap the writer so the status code written by next can be read
		// back after the call.
		recorder := NewResponseWriter(w)
		next.ServeHTTP(recorder, r)

		elapsed := time.Since(began).Seconds()
		method, path := r.Method, r.URL.Path
		status := strconv.Itoa(recorder.statusCode)

		RequestCounter.WithLabelValues(method, path, status).Inc()
		RequestDuration.WithLabelValues(method, path).Observe(elapsed)
	}
	return http.HandlerFunc(instrumented)
}
链路追踪
Jaeger集成
go
package tracing
import (
"github.com/opentracing/opentracing-go"
"github.com/uber/jaeger-client-go"
"github.com/uber/jaeger-client-go/config"
)
// InitTracer builds a Jaeger tracer for the given service name, installs it
// as the opentracing global tracer and returns it.
//
// The configuration uses a "const" sampler with Param 1, turns on span
// logging in the reporter and reports to the agent at "jaeger:6831".
// If the tracer cannot be constructed, InitTracer returns a nil tracer and
// the error, and the global tracer is left untouched.
//
// NOTE(review): the second value returned by NewTracer is discarded, so
// callers receive no handle for shutting the tracer down — confirm whether
// that is intended.
func InitTracer(service string) (opentracing.Tracer, error) {
	sampler := &config.SamplerConfig{
		Type:  "const",
		Param: 1,
	}
	reporter := &config.ReporterConfig{
		LogSpans:           true,
		LocalAgentHostPort: "jaeger:6831",
	}
	settings := &config.Configuration{
		ServiceName: service,
		Sampler:     sampler,
		Reporter:    reporter,
	}

	tracer, _, err := settings.NewTracer(config.Logger(jaeger.StdLogger))
	if err != nil {
		return nil, err
	}

	opentracing.SetGlobalTracer(tracer)
	return tracer, nil
}
请求追踪
go
// TracingMiddleware returns a handler that starts an "http_request" span for
// each request, attaches it to the request context and then calls next with
// the updated request. The span is finished when the handler returns.
//
// A parent span context is extracted from the incoming HTTP headers using the
// global tracer. The extraction error is ignored; when extraction fails the
// parent passed to ChildOf is whatever Extract returned alongside the error
// (presumably nil, yielding a root span — confirm against the tracer in use).
func TracingMiddleware(next http.Handler) http.Handler {
	traced := func(w http.ResponseWriter, r *http.Request) {
		tracer := opentracing.GlobalTracer()

		carrier := opentracing.HTTPHeadersCarrier(r.Header)
		parent, _ := tracer.Extract(opentracing.HTTPHeaders, carrier)

		span := tracer.StartSpan("http_request", opentracing.ChildOf(parent))
		defer span.Finish()

		// Record basic request information on the span.
		span.SetTag("http.method", r.Method)
		span.SetTag("http.url", r.URL.Path)

		// Make the span reachable from the request context for downstream code.
		withSpan := opentracing.ContextWithSpan(r.Context(), span)
		next.ServeHTTP(w, r.WithContext(withSpan))
	}
	return http.HandlerFunc(traced)
}
日志管理
结构化日志
go
package logging
import (
"github.com/sirupsen/logrus"
"gopkg.in/natefinch/lumberjack.v2"
)
// InitLogger creates a new logrus logger that emits JSON records with
// timestamps formatted as "2006-01-02 15:04:05" and writes them through a
// lumberjack rotating writer targeting /var/log/app.log.
//
// The rotation settings are MaxSize 100 (MB), MaxBackups 3, MaxAge 28 (days),
// with compression enabled.
func InitLogger() *logrus.Logger {
	jsonFormat := &logrus.JSONFormatter{
		TimestampFormat: "2006-01-02 15:04:05",
	}
	rotatingFile := &lumberjack.Logger{
		Filename:   "/var/log/app.log",
		MaxSize:    100, // MB
		MaxBackups: 3,
		MaxAge:     28, // days
		Compress:   true,
	}

	lg := logrus.New()
	lg.SetFormatter(jsonFormat)
	lg.SetOutput(rotatingFile)
	return lg
}
上下文日志
go
// ContextLogger wraps a logrus entry. Because *logrus.Entry is embedded, its
// methods are promoted and can be called directly on a ContextLogger.
type ContextLogger struct {
*logrus.Entry
}
// NewContextLogger returns a ContextLogger whose entry carries the
// correlation fields that can be found in ctx.
//
// Fields added:
//   - "request_id": when ctx holds a string value under the key "request_id".
//   - "trace_id": when ctx carries an opentracing span whose span context is
//     a jaeger.SpanContext.
//
// A missing or differently typed value is skipped instead of panicking, so
// the function is safe to call on contexts that never passed through the
// request-ID or tracing middleware, or when a non-Jaeger tracer is installed.
func NewContextLogger(ctx context.Context, logger *logrus.Logger) *ContextLogger {
	fields := logrus.Fields{}

	// Comma-ok assertion: ctx.Value returns nil when the key is absent, and a
	// bare .(string) on nil would panic.
	if requestID, ok := ctx.Value("request_id").(string); ok {
		fields["request_id"] = requestID
	}

	if span := opentracing.SpanFromContext(ctx); span != nil {
		// The span context is only a jaeger.SpanContext when the Jaeger
		// tracer produced it; other tracers must not crash logging.
		if sc, ok := span.Context().(jaeger.SpanContext); ok {
			fields["trace_id"] = sc.TraceID()
		}
	}

	return &ContextLogger{Entry: logger.WithFields(fields)}
}
健康检查
健康检查实现
go
package health
import (
	"context"
	"database/sql"
	"time"

	"github.com/go-redis/redis/v8"
)
// HealthChecker holds the dependencies whose connectivity is probed by Check:
// a SQL database handle and a Redis client.
type HealthChecker struct {
db *sql.DB
redis *redis.Client
}
// Check pings the database and Redis and reports the result for each one.
//
// The returned map always has the keys "database" and "redis"; each value is
// "healthy" when the ping succeeded and "unhealthy" otherwise.
//
// Each ping runs under its own deadline, so a dependency that hangs is
// reported as unhealthy instead of blocking the caller indefinitely, and a
// slow database cannot consume the time budget of the Redis probe.
func (h *HealthChecker) Check() map[string]string {
	const probeTimeout = 2 * time.Second

	// probe runs one ping under a fresh deadline and maps its outcome to the
	// status string.
	probe := func(ping func(ctx context.Context) error) string {
		ctx, cancel := context.WithTimeout(context.Background(), probeTimeout)
		defer cancel()

		if err := ping(ctx); err != nil {
			return "unhealthy"
		}
		return "healthy"
	}

	return map[string]string{
		"database": probe(h.db.PingContext),
		"redis": probe(func(ctx context.Context) error {
			return h.redis.Ping(ctx).Err()
		}),
	}
}
健康检查接口
go
// HealthCheckHandler returns an HTTP handler that runs checker.Check and
// writes the resulting status map as JSON with Content-Type
// "application/json".
//
// When any component's status is not "healthy" the response status is
// 503 Service Unavailable; otherwise no explicit status is written. The
// error returned by the JSON encoder is not inspected.
func HealthCheckHandler(checker *HealthChecker) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		report := checker.Check()

		// A single non-healthy component marks the whole service degraded.
		degraded := false
		for _, state := range report {
			if state != "healthy" {
				degraded = true
				break
			}
		}

		w.Header().Set("Content-Type", "application/json")
		if degraded {
			w.WriteHeader(http.StatusServiceUnavailable)
		}
		json.NewEncoder(w).Encode(report)
	}
}
告警配置
Prometheus告警规则
yaml
# Prometheus alerting rules for the HTTP metrics http_requests_total and
# http_request_duration_seconds.
groups:
  - name: service_alerts
    rules:
      # Fires when the share of 5xx responses over all requests, measured on
      # 5-minute rates, stays above 0.1 for 5 minutes.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High error rate detected
          description: Error rate is above 10% for the last 5 minutes
      # Fires when the 0.95 quantile of request duration, aggregated across
      # all label values, stays above 2 seconds for 5 minutes.
      - alert: SlowResponses
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Slow response times detected
          description: 95th percentile of response times is above 2 seconds
告警通知
go
package alerting
import (
"bytes"
"encoding/json"
"net/http"
)
// AlertNotifier delivers alerts by POSTing them to a webhook.
type AlertNotifier struct {
// webhookURL is the endpoint that Notify posts the JSON payload to.
webhookURL string
}
// webhookStatusError reports that the webhook answered with a non-2xx status.
type webhookStatusError struct {
	status string // the Status line of the HTTP response
}

// Error implements the error interface.
func (e *webhookStatusError) Error() string {
	return "alerting: webhook returned " + e.status
}

// Notify serializes alert into a JSON payload and POSTs it to the notifier's
// webhook URL with Content-Type "application/json".
//
// The payload has a top-level "text" (alert.Message) and a single attachment
// with "title" (alert.Title), "color" (alert.Severity) and two fields,
// "Service" and "Time" (alert.Time formatted as "2006-01-02 15:04:05").
//
// It returns an error when the payload cannot be marshalled, when the request
// fails, or when the webhook responds with a status outside the 2xx range.
// The response body is always closed so the underlying connection is released.
func (n *AlertNotifier) Notify(alert Alert) error {
	payload := map[string]interface{}{
		"text": alert.Message,
		"attachments": []map[string]interface{}{
			{
				"title": alert.Title,
				"color": alert.Severity,
				"fields": []map[string]string{
					{
						"title": "Service",
						"value": alert.Service,
					},
					{
						"title": "Time",
						"value": alert.Time.Format("2006-01-02 15:04:05"),
					},
				},
			},
		},
	}

	body, err := json.Marshal(payload)
	if err != nil {
		return err
	}

	resp, err := http.Post(n.webhookURL, "application/json", bytes.NewReader(body))
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// A delivered request is not a delivered alert: surface rejections from
	// the webhook instead of reporting success.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return &webhookStatusError{status: resp.Status}
	}
	return nil
}
监控面板
Grafana仪表板
json
{
"dashboard": {
"title": "Service Monitoring",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total[5m])) by (endpoint)",
"legendFormat": "{{endpoint}}"
}
]
},
{
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (endpoint) / sum(rate(http_requests_total[5m])) by (endpoint)",
"legendFormat": "{{endpoint}}"
}
]
},
{
"title": "Response Time",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, endpoint))",
"legendFormat": "{{endpoint}}"
}
]
}
]
}
}
最佳实践
- 合理设置监控指标
- 实现分布式追踪
- 配置合适的告警阈值
- 保持日志结构化
- 定期检查监控系统
总结
完善的监控系统是保障微服务稳定运行的关键。通过合理配置Prometheus、Jaeger等工具,可以构建全面的监控体系。
参考资料
- Prometheus文档
- Jaeger文档
- Google SRE手册