Grafana, Prometheus and Alertmanager
Deploy with Docker
Create a docker-compose.yml file with the following content:
networks:
  monitoring:
    driver: bridge

volumes:
  prometheus_data: {}

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    restart: unless-stopped
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./nginx.yml:/etc/prometheus/rules/nginx.yml
      - prometheus_data:/prometheus   # use the named volume declared above
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=2400h'
      - '--query.max-concurrency=200'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    ports:
      - '9090:9090'
    networks:
      - monitoring
    extra_hosts:
      - "host.docker.internal:host-gateway"

  grafana:
    image: grafana/grafana-oss
    container_name: grafana
    restart: unless-stopped
    volumes:
      - ./grafana_data:/var/lib/grafana
    ports:
      - '3000:3000'
    networks:
      - monitoring

  alertmanager:
    image: prom/alertmanager:v0.27.0
    restart: unless-stopped
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - ./alertmanager_data:/alertmanager
    ports:
      - '9093:9093'
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--log.level=info'
      - '--storage.path=/alertmanager'
      - '--data.retention=240h'
    networks:
      - monitoring
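
The compose file above bind-mounts ./grafana_data and ./alertmanager_data from the host, so these directories have to exist and be writable by the container users. A minimal preparation sketch, assuming the official images' default UIDs (Grafana runs as 472, Alertmanager as nobody/65534); adjust if your images differ:

# Create the host directories used as bind mounts.
mkdir -p grafana_data alertmanager_data
# Hand ownership to the in-container users (assumed default UIDs).
sudo chown -R 472:472 grafana_data
sudo chown -R 65534:65534 alertmanager_data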

Contents of prometheus.yml:

global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

rule_files:
  - /etc/prometheus/rules/*.yml

scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
        labels:
          cluster: "dev"

  - job_name: "nginx_stats"
    metrics_path: "/stats/format/prometheus"
    static_configs:
      - targets:
          - '10.0.0.10:8088'
          - '10.1.0.11:8088'
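
Because Prometheus is started with --web.enable-lifecycle, the configuration can be validated and reloaded without restarting the container. A sketch, assuming the service name, mount paths, and ports from the compose file above:

# Validate prometheus.yml (and the rule files it references) inside the container.
docker compose exec prometheus promtool check config /etc/prometheus/prometheus.yml
# Ask Prometheus to reload its configuration.
curl -X POST http://localhost:9090/-/reload
# Check that the scrape targets are healthy.
curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"[^"]*"'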

Contents of alertmanager.yml:

global:
  resolve_timeout: 5m
  # Placeholder SMTP settings so the email receivers below validate;
  # point these at a real smarthost and sender address.
  smtp_smarthost: 'localhost:25'
  smtp_from: '[email protected]'

# https://github.com/prometheus/alertmanager/blob/main/doc/examples/simple.yml
route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h
  receiver: team-X-mails
  routes:
    - matchers:
        - service=~"foo1|foo2|baz"
      receiver: team-X-mails
      routes:
        - matchers:
            - severity="critical"
          receiver: team-X-pager
    - matchers:
        - service="files"
      receiver: team-Y-mails
      routes:
        - matchers:
            - severity="critical"
          receiver: team-Y-pager
    - matchers:
        - service="database"
      receiver: team-DB-pager
      # Also group alerts by affected database.
      group_by: [alertname, cluster, database]
      routes:
        - matchers:
            - owner="team-X"
          receiver: team-X-pager
          continue: true
        - matchers:
            - owner="team-Y"
          receiver: team-Y-pager

inhibit_rules:
  - source_matchers: [severity="critical"]
    target_matchers: [severity="warning"]
    # Apply inhibition if the alertname is the same.
    # CAUTION:
    #   If all label names listed in `equal` are missing
    #   from both the source and target alerts,
    #   the inhibition rule will apply!
    equal: [alertname, cluster, service]

receivers:
  - name: 'webhook'
    webhook_configs:
      - url: 'http://alert.example.com/api/'
        http_config:
          basic_auth:
            username: 'user'
            password: 'pass'
        send_resolved: true

  - name: 'team-X-mails'
    email_configs:
      - to: '[email protected]'

  # Every receiver referenced in the routing tree above must be defined,
  # otherwise Alertmanager refuses to load the configuration.
  - name: 'team-X-pager'
    pagerduty_configs:
      - service_key: <team-X-key>

  - name: 'team-Y-mails'
    email_configs:
      - to: '[email protected]'

  - name: 'team-Y-pager'
    pagerduty_configs:
      - service_key: <team-Y-key>

  - name: 'team-DB-pager'
    pagerduty_configs:
      - service_key: <team-DB-key>
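
The prom/alertmanager image ships with amtool, which can lint this configuration and fire a synthetic alert to exercise the routing tree. A sketch, assuming the service name and port defined in the compose file:

# Validate alertmanager.yml inside the container.
docker compose exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml
# Send a test alert that should match the service="files" route.
docker compose exec alertmanager amtool alert add --alertmanager.url=http://localhost:9093 alertname=TestAlert service=files severity=critical
# List the alerts Alertmanager currently knows about.
curl -s http://localhost:9093/api/v2/alerts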

Contents of nginx.yml (the alerting rules mounted at /etc/prometheus/rules/nginx.yml):

# https://github.com/prometheus/prometheus/blob/release-3.2/config/testdata/first.rules
groups:
  - name: nginx
    rules:
      - alert: NginxHigh4xxRate
        expr: rate(nginx_vts_server_requests_total{code="4xx"}[5m]) > 100
        for: 5m
        labels:
          severity: warning
          group: nginx
        annotations:
          summary: "High rate of 4xx errors on {{ $labels.host }}"
          description: "The Nginx server at {{ $labels.host }} has been serving 4xx errors at a high rate ({{ $value }} requests per second) for the last 5 minutes."
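
The rule file can be linted on its own, and once loaded its state (inactive, pending, firing) can be inspected through the Prometheus HTTP API. A sketch, assuming the mount path and port from the compose file:

# Lint the alerting rules.
docker compose exec prometheus promtool check rules /etc/prometheus/rules/nginx.yml
# Inspect loaded rules and any currently firing alerts.
curl -s http://localhost:9090/api/v1/rules
curl -s http://localhost:9090/api/v1/alerts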
- Start the stack with the following command (a quick smoke test is sketched below):
docker compose up -d
- Stop it with:
docker compose down
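
Once the containers are up, each component exposes a simple health endpoint. A minimal smoke test, assuming the ports published in the compose file above:

# Show container status for the stack.
docker compose ps
# Each endpoint should return a "healthy"/OK style response.
curl -s http://localhost:9090/-/healthy    # Prometheus
curl -s http://localhost:9093/-/healthy    # Alertmanager
curl -s http://localhost:3000/api/health   # Grafana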