Grafana, Prometheus and Alertmanager
Deploy with Docker
Create a docker-compose.yml file with the following content:
networks:
  monitoring:
    driver: bridge

volumes:
  prometheus_data: {}

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./nginx.yml:/etc/prometheus/rules/nginx.yml
      - prometheus_data:/prometheus   # use the named volume declared above
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=2400h'
      - '--query.max-concurrency=200'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    ports:
      - '9090:9090'
    networks:
      - monitoring
    extra_hosts:
      - "host.docker.internal:host-gateway"

  grafana:
    image: grafana/grafana-oss
    container_name: grafana
    restart: unless-stopped
    volumes:
      - ./grafana_data:/var/lib/grafana
    ports:
      - '3000:3000'
    networks:
      - monitoring

  alertmanager:
    image: prom/alertmanager:v0.27.0
    restart: unless-stopped
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - ./alertmanager_data:/alertmanager
    ports:
      - '9093:9093'
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--log.level=info'
      - '--storage.path=/alertmanager'
      - '--data.retention=240h'
    networks:
      - monitoring
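
The compose file above bind-mounts ./grafana_data and ./alertmanager_data from the host, so these directories have to exist and be writable by the container users. A minimal preparation sketch, assuming the official images' default UIDs (Grafana runs as 472, Alertmanager as nobody/65534); adjust if your images differ:

# Create the host directories used as bind mounts.
mkdir -p grafana_data alertmanager_data
# Hand ownership to the in-container users (assumed default UIDs).
sudo chown -R 472:472 grafana_data
sudo chown -R 65534:65534 alertmanager_data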

Contents of prometheus.yml:

global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

rule_files:
  - /etc/prometheus/rules/*.yml

scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
        labels:
          cluster: "dev"

  - job_name: "nginx_stats"
    metrics_path: "/stats/format/prometheus"
    static_configs:
      - targets:
          - '10.0.0.10:8088'
          - '10.1.0.11:8088'
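
Because Prometheus is started with --web.enable-lifecycle, the configuration can be validated and reloaded without restarting the container. A sketch, assuming the service name, mount paths, and ports from the compose file above:

# Validate prometheus.yml (and the rule files it references) inside the container.
docker compose exec prometheus promtool check config /etc/prometheus/prometheus.yml
# Ask Prometheus to reload its configuration.
curl -X POST http://localhost:9090/-/reload
# Check that the scrape targets are healthy.
curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"[^"]*"'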

Contents of alertmanager.yml:

global:
  resolve_timeout: 5m
  # Placeholder SMTP settings so the email receivers below validate;
  # point these at a real smarthost and sender address.
  smtp_smarthost: 'localhost:25'
  smtp_from: '[email protected]'

# https://github.com/prometheus/alertmanager/blob/main/doc/examples/simple.yml
route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h
  receiver: team-X-mails
  routes:
    - matchers:
        - service=~"foo1|foo2|baz"
      receiver: team-X-mails
      routes:
        - matchers:
            - severity="critical"
          receiver: team-X-pager
    - matchers:
        - service="files"
      receiver: team-Y-mails
      routes:
        - matchers:
            - severity="critical"
          receiver: team-Y-pager
    - matchers:
        - service="database"
      receiver: team-DB-pager
      # Also group alerts by affected database.
      group_by: [alertname, cluster, database]
      routes:
        - matchers:
            - owner="team-X"
          receiver: team-X-pager
          continue: true
        - matchers:
            - owner="team-Y"
          receiver: team-Y-pager

inhibit_rules:
  - source_matchers: [severity="critical"]
    target_matchers: [severity="warning"]
    # Apply inhibition if the alertname is the same.
    # CAUTION:
    #   If all label names listed in `equal` are missing
    #   from both the source and target alerts,
    #   the inhibition rule will apply!
    equal: [alertname, cluster, service]

receivers:
  - name: 'webhook'
    webhook_configs:
      - url: 'http://alert.example.com/api/'
        http_config:
          basic_auth:
            username: 'user'
            password: 'pass'
        send_resolved: true

  - name: 'team-X-mails'
    email_configs:
      - to: '[email protected]'

  # Every receiver referenced in the routing tree above must be defined,
  # otherwise Alertmanager refuses to load the configuration.
  - name: 'team-X-pager'
    pagerduty_configs:
      - service_key: <team-X-key>

  - name: 'team-Y-mails'
    email_configs:
      - to: '[email protected]'

  - name: 'team-Y-pager'
    pagerduty_configs:
      - service_key: <team-Y-key>

  - name: 'team-DB-pager'
    pagerduty_configs:
      - service_key: <team-DB-key>
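
The prom/alertmanager image ships with amtool, which can lint this configuration and fire a synthetic alert to exercise the routing tree. A sketch, assuming the service name and port defined in the compose file:

# Validate alertmanager.yml inside the container.
docker compose exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml
# Send a test alert that should match the service="files" route.
docker compose exec alertmanager amtool alert add --alertmanager.url=http://localhost:9093 alertname=TestAlert service=files severity=critical
# List the alerts Alertmanager currently knows about.
curl -s http://localhost:9093/api/v2/alerts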

Contents of nginx.yml (the alerting rules mounted at /etc/prometheus/rules/nginx.yml):

# https://github.com/prometheus/prometheus/blob/release-3.2/config/testdata/first.rules
groups:
  - name: nginx
    rules:
      - alert: NginxHigh4xxRate
        expr: rate(nginx_vts_server_requests_total{code="4xx"}[5m]) > 100
        for: 5m
        labels:
          severity: warning
          group: nginx
        annotations:
          summary: "High rate of 4xx errors on {{ $labels.host }}"
          description: "The Nginx server at {{ $labels.host }} has been serving 4xx errors at a high rate ({{ $value }} requests per second) for the last 5 minutes."
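
The rule file can be linted on its own, and once loaded its state (inactive, pending, firing) can be inspected through the Prometheus HTTP API. A sketch, assuming the mount path and port from the compose file:

# Lint the alerting rules.
docker compose exec prometheus promtool check rules /etc/prometheus/rules/nginx.yml
# Inspect loaded rules and any currently firing alerts.
curl -s http://localhost:9090/api/v1/rules
curl -s http://localhost:9090/api/v1/alerts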
- Start the stack with the following command (a quick smoke test is sketched below):
docker compose up -d
- Stop it with:
docker compose down
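
Once the containers are up, each component exposes a simple health endpoint. A minimal smoke test, assuming the ports published in the compose file above:

# Show container status for the stack.
docker compose ps
# Each endpoint should return a "healthy"/OK style response.
curl -s http://localhost:9090/-/healthy    # Prometheus
curl -s http://localhost:9093/-/healthy    # Alertmanager
curl -s http://localhost:3000/api/health   # Grafana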