Система мониторинга
Комплексное руководство по настройке системы мониторинга на базе Prometheus, Grafana и Alertmanager, а также экспортеров метрик и Loki для сбора логов.
Prometheus
Docker Compose для Prometheus
# prometheus/docker-compose.yml
# Core monitoring stack: Prometheus, Grafana, Alertmanager and node-exporter,
# all attached to one shared bridge network.
# NOTE: images are tracked by the floating `latest` tag; pin explicit versions
# for reproducible deployments.
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      # Allows config reload / shutdown over HTTP (POST /-/reload, /-/quit).
      - '--web.enable-lifecycle'
      # WARNING: the admin API can delete series and take snapshots, and port
      # 9090 is published on the host without authentication. Drop this flag
      # or restrict access to the port if the host is reachable by others.
      - '--web.enable-admin-api'
    ports:
      - '9090:9090'
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./rules:/etc/prometheus/rules
      - prometheus_data:/prometheus
    networks:
      - monitoring
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - '3000:3000'
    environment:
      # The admin password is taken from the environment (or an .env file next
      # to this compose file) instead of being committed here; Compose aborts
      # with the message below when the variable is missing or empty.
      - 'GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:?set GF_SECURITY_ADMIN_PASSWORD in the environment or .env}'
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
      - ./grafana/datasources:/etc/grafana/provisioning/datasources
    networks:
      - monitoring
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    ports:
      - '9093:9093'
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - alertmanager_data:/alertmanager
    networks:
      - monitoring
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    command:
      # Host root filesystem is mounted read-only at /host (see volumes).
      - '--path.rootfs=/host'
    ports:
      - '9100:9100'
    volumes:
      - /:/host:ro
    networks:
      - monitoring
    restart: unless-stopped

volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:

networks:
  monitoring:
    # Fixed name: without it Compose prefixes the network with the project
    # name, and other compose files that declare `monitoring` as an external
    # network would not find it.
    name: monitoring
    driver: bridge
Конфигурация Prometheus
# prometheus/prometheus.yml
# Main Prometheus configuration: global timings, rule files, the Alertmanager
# endpoint and the list of scrape jobs.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Glob for alerting/recording rule files.
rule_files:
  - 'rules/*.yml'

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

scrape_configs:
  # Prometheus scraping itself.
  - job_name: prometheus
    static_configs:
      - targets:
          - localhost:9090

  - job_name: node-exporter
    static_configs:
      - targets:
          - node-exporter:9100

  - job_name: nginx
    static_configs:
      - targets:
          - nginx-exporter:9113

  - job_name: mysql
    static_configs:
      - targets:
          - mysql-exporter:9104

  - job_name: docker
    static_configs:
      - targets:
          - cadvisor:8080

  # HTTP probes: the listed URLs are passed to the exporter as the `target`
  # query parameter, while the scrape itself goes to blackbox-exporter:9115.
  - job_name: blackbox
    metrics_path: /probe
    params:
      module:
        - http_2xx
    static_configs:
      - targets:
          - https://example.com
          - https://api.example.com
    relabel_configs:
      # Copy the configured URL into the ?target= parameter ...
      - source_labels: [__address__]
        target_label: __param_target
      # ... keep it as the `instance` label ...
      - source_labels: [__param_target]
        target_label: instance
      # ... and replace the scrape address with the exporter's address.
      - target_label: __address__
        replacement: blackbox-exporter:9115
Правила алертов
# prometheus/rules/alerts.yml
# Alerting rules, grouped by subsystem. Every rule carries a `severity` label
# (critical / warning) and `summary` / `description` annotations.
groups:
  - name: system-alerts
    rules:
      # Any scrape target that failed its last scrape.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} has been down for more than 1 minute."

      # rate() rather than irate(): irate() looks only at the last two samples
      # of the window, which makes an alert flap on short spikes.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% for more than 5 minutes."

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 85% for more than 5 minutes."

      # tmpfs mounts are excluded on both sides of the division.
      - alert: DiskSpaceLow
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk usage is above 85% on {{ $labels.mountpoint }}."

      # NOTE(review): the threshold 2 is absolute and does not account for the
      # number of CPU cores on the host - confirm it suits the target machines.
      - alert: HighLoadAverage
        expr: node_load15 > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High load average on {{ $labels.instance }}"
          description: "Load average is {{ $value }} for more than 5 minutes."

  - name: nginx-alerts
    rules:
      - alert: NginxDown
        expr: nginx_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Nginx is down on {{ $labels.instance }}"
          description: "Nginx has been down for more than 1 minute."

      # Both sides are aggregated by instance before dividing. Without the
      # aggregation the division matches series on identical label sets, so a
      # series with status="500" is divided by itself and the ratio is always
      # 100%.
      # NOTE(review): this assumes the exporter exposes a `status` label on
      # nginx_http_requests_total - confirm against the exporter in use.
      - alert: NginxHighErrorRate
        expr: >-
          sum by (instance) (rate(nginx_http_requests_total{status=~"5.."}[5m]))
          / sum by (instance) (rate(nginx_http_requests_total[5m]))
          * 100 > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.instance }}"
          description: "Error rate is {{ $value }}% for more than 5 minutes."

  - name: mysql-alerts
    rules:
      - alert: MySQLDown
        expr: mysql_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "MySQL is down on {{ $labels.instance }}"
          description: "MySQL has been down for more than 1 minute."

      # Fires when the slow-query counter grows faster than 0.1 per second.
      - alert: MySQLSlowQueries
        expr: rate(mysql_global_status_slow_queries[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "MySQL slow queries on {{ $labels.instance }}"
          description: "MySQL slow query rate is {{ $value }} per second."
Alertmanager
Конфигурация Alertmanager
# prometheus/alertmanager.yml
# Alertmanager configuration: SMTP defaults, a single default route, three
# receivers (e-mail, Slack, Telegram webhook) and one inhibition rule.
global:
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  # Placeholder - never commit a real password to version control.
  smtp_auth_password: 'app_password'

# All alerts go to the default receiver; there are no child routes, so the
# Slack and Telegram receivers below are defined but not yet routed to.
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    email_configs:
      - to: 'admin@example.com'
        # email_config has no `subject` / `body` fields (unknown fields make
        # Alertmanager reject the file): the subject is set through `headers`
        # and the plain-text body through `text`.
        headers:
          Subject: 'Alert: {{ .GroupLabels.alertname }}'
        # Turn off the default HTML part so the custom text body below is
        # what recipients see.
        html: ''
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Instance: {{ .Labels.instance }}
          Severity: {{ .Labels.severity }}
          {{ end }}

  - name: 'slack-notifications'
    slack_configs:
      # Placeholder webhook URL - replace with the real incoming-webhook URL.
      - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
        channel: '#alerts'
        title: 'Alert: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *Alert:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Instance:* {{ .Labels.instance }}
          *Severity:* {{ .Labels.severity }}
          {{ end }}

  - name: 'telegram-notifications'
    webhook_configs:
      - url: 'http://telegram-bot:8080/webhook'
        send_resolved: true

# Mute a warning while a critical alert with the same alertname and instance
# is firing. Uses the `*_matchers` form; `source_match` / `target_match` are
# deprecated.
inhibit_rules:
  - source_matchers:
      - 'severity = "critical"'
    target_matchers:
      - 'severity = "warning"'
    equal: ['alertname', 'instance']
Grafana
Конфигурация источника данных (Datasource)
# grafana/datasources/prometheus.yml
# Grafana provisioning file that registers Prometheus as the default,
# UI-editable datasource, queried through the Grafana backend (proxy access).
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    url: 'http://prometheus:9090'
    access: proxy
    isDefault: true
    editable: true
Dashboard для системного мониторинга
{
  "dashboard": {
    "id": null,
    "title": "System Monitoring",
    "tags": [
      "prometheus",
      "system"
    ],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "CPU Usage",
        "type": "stat",
        "targets": [
          {
            "expr": "100 - (avg by(instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
            "legendFormat": "{{ instance }}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {
                  "color": "green",
                  "value": 0
                },
                {
                  "color": "yellow",
                  "value": 60
                },
                {
                  "color": "red",
                  "value": 80
                }
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "Memory Usage",
        "type": "stat",
        "targets": [
          {
            "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
            "legendFormat": "{{ instance }}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {
                  "color": "green",
                  "value": 0
                },
                {
                  "color": "yellow",
                  "value": 70
                },
                {
                  "color": "red",
                  "value": 85
                }
              ]
            }
          }
        }
      },
      {
        "id": 3,
        "title": "Disk Usage",
        "type": "bargauge",
        "targets": [
          {
            "expr": "(1 - (node_filesystem_avail_bytes{fstype!=\"tmpfs\"} / node_filesystem_size_bytes{fstype!=\"tmpfs\"})) * 100",
            "legendFormat": "{{ instance }} - {{ mountpoint }}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "min": 0,
            "max": 100,
            "thresholds": {
              "steps": [
                {
                  "color": "green",
                  "value": 0
                },
                {
                  "color": "yellow",
                  "value": 75
                },
                {
                  "color": "red",
                  "value": 85
                }
              ]
            }
          }
        }
      },
      {
        "id": 4,
        "title": "Network Traffic",
        "type": "timeseries",
        "targets": [
          {
            "expr": "rate(node_network_receive_bytes_total{device!=\"lo\"}[5m]) * 8",
            "legendFormat": "{{ instance }} - {{ device }} RX"
          },
          {
            "expr": "rate(node_network_transmit_bytes_total{device!=\"lo\"}[5m]) * 8",
            "legendFormat": "{{ instance }} - {{ device }} TX"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "bps"
          }
        }
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "30s"
  }
}
Exporters
Nginx Exporter
# nginx-exporter.yml
# Runs the NGINX Prometheus exporter, which reads the status page at
# http://nginx/nginx_status and serves metrics on port 9113.
version: '3.8'

services:
  nginx-exporter:
    image: nginx/nginx-prometheus-exporter:latest
    container_name: nginx-exporter
    command:
      # Double-dash form: 1.x releases of the exporter reject single-dash
      # long flags, while older releases accept both spellings.
      - '--nginx.scrape-uri=http://nginx/nginx_status'
    ports:
      - '9113:9113'
    networks:
      - monitoring
    restart: unless-stopped

# The service joins the `monitoring` network, so it must be declared here;
# without this section Compose fails with "refers to undefined network".
# It is expected to exist already (created by the main monitoring stack).
networks:
  monitoring:
    external: true
MySQL Exporter
# mysql-exporter.yml
# Runs mysqld-exporter against the `mysql` host (port 3306) as user
# `exporter` and serves metrics on port 9104.
version: '3.8'

services:
  mysql-exporter:
    image: prom/mysqld-exporter:latest
    container_name: mysql-exporter
    # Current mysqld-exporter releases no longer read DATA_SOURCE_NAME: the
    # address and user are passed as flags and the password through
    # MYSQLD_EXPORTER_PASSWORD.
    # NOTE(review): if the image is pinned to a release older than 0.15,
    # switch back to DATA_SOURCE_NAME=exporter:password@(mysql:3306)/
    command:
      - '--mysqld.address=mysql:3306'
      - '--mysqld.username=exporter'
    environment:
      # Placeholder - replace with the real password of the `exporter` user.
      - MYSQLD_EXPORTER_PASSWORD=password
    ports:
      - '9104:9104'
    networks:
      - monitoring
    restart: unless-stopped

# The service joins the `monitoring` network, so it must be declared here;
# without this section Compose fails with "refers to undefined network".
# It is expected to exist already (created by the main monitoring stack).
networks:
  monitoring:
    external: true
Blackbox Exporter
# blackbox-exporter.yml
# Runs the Blackbox exporter on port 9115 with the probe modules taken from
# ./blackbox.yml.
version: '3.8'

services:
  blackbox-exporter:
    image: prom/blackbox-exporter:latest
    container_name: blackbox-exporter
    ports:
      - '9115:9115'
    volumes:
      - ./blackbox.yml:/etc/blackbox_exporter/config.yml
    networks:
      - monitoring
    restart: unless-stopped

# The service joins the `monitoring` network, so it must be declared here;
# without this section Compose fails with "refers to undefined network".
# It is expected to exist already (created by the main monitoring stack).
networks:
  monitoring:
    external: true
# blackbox.yml
# Probe modules for the Blackbox exporter. Every module uses a 5 s timeout.
modules:
  # HTTP GET over IPv4.
  http_2xx:
    prober: http
    timeout: 5s
    http:
      valid_http_versions:
        - "HTTP/1.1"
        - "HTTP/2.0"
      valid_status_codes: []
      method: GET
      preferred_ip_protocol: "ip4"

  # HTTP POST with an empty JSON object as the body.
  http_post_2xx:
    prober: http
    timeout: 5s
    http:
      method: POST
      headers:
        Content-Type: application/json
      body: '{}'

  # Plain TCP connect check.
  tcp_connect:
    prober: tcp
    timeout: 5s

  # ICMP probe over IPv4.
  icmp:
    prober: icmp
    timeout: 5s
    icmp:
      preferred_ip_protocol: "ip4"

  # DNS probe: queries the A record of example.com.
  dns:
    prober: dns
    timeout: 5s
    dns:
      query_name: "example.com"
      query_type: "A"
Loki для логов
Docker Compose с Loki
# loki/docker-compose.yml
# Log stack: Loki (storage/query, port 3100) and Promtail (log shipper).
# Both join the pre-existing `monitoring` network.
version: '3.8'

services:
  loki:
    image: grafana/loki:latest
    container_name: loki
    ports:
      - '3100:3100'
    volumes:
      - ./loki-config.yml:/etc/loki/local-config.yaml
      - loki_data:/loki
    command: -config.file=/etc/loki/local-config.yaml
    networks:
      - monitoring
    restart: unless-stopped

  promtail:
    image: grafana/promtail:latest
    container_name: promtail
    volumes:
      - ./promtail-config.yml:/etc/promtail/config.yml
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      # Needed by the `docker` scrape job in promtail-config.yml, whose
      # docker_sd_configs talks to unix:///var/run/docker.sock; without this
      # mount the socket does not exist inside the container.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: -config.file=/etc/promtail/config.yml
    networks:
      - monitoring
    restart: unless-stopped

volumes:
  loki_data:

networks:
  # Not created by this file: a network named `monitoring` must already exist.
  monitoring:
    external: true
Конфигурация Loki
# loki/loki-config.yml
# Single-node Loki: in-memory ring, BoltDB index and filesystem chunk storage
# under /loki, authentication disabled.
# NOTE(review): the compose file runs grafana/loki:latest; several keys below
# (boltdb store, enforce_metric_name, max_look_back_period, table_manager)
# look like they belong to older Loki releases - confirm the config loads on
# the version actually deployed, or pin the image.
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 5m
  chunk_retain_period: 30s

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 168h

# Index and chunks both live under /loki.
storage_config:
  boltdb:
    directory: /loki/index
  filesystem:
    directory: /loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

chunk_store_config:
  max_look_back_period: 0s

table_manager:
  retention_deletes_enabled: false
  retention_period: 0s
Конфигурация Promtail
# loki/promtail-config.yml
# Promtail: tails host log files and Docker containers and pushes the lines
# to Loki at http://loki:3100.
server:
  http_listen_port: 9080
  grpc_listen_port: 0

# File where Promtail records how far it has read each log.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  # Files matching /var/log/*log, labelled job=varlogs.
  - job_name: system
    static_configs:
      - targets: [localhost]
        labels:
          job: varlogs
          __path__: /var/log/*log

  # Files matching /var/log/nginx/*.log, labelled job=nginx.
  - job_name: nginx
    static_configs:
      - targets: [localhost]
        labels:
          job: nginx
          __path__: /var/log/nginx/*.log

  # Containers discovered through the Docker socket, refreshed every 5 s.
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      # The regex captures the container name without its leading "/" and
      # stores it in the `container` label.
      - source_labels: ['__meta_docker_container_name']
        regex: '/(.*)'
        target_label: 'container'
Установка через скрипт
Автоматизированная установка
#!/bin/bash
# monitoring-setup.sh
#
# Installs a minimal monitoring stack (Prometheus, Grafana, Alertmanager,
# node-exporter) under /opt/monitoring:
#   1. creates the directory and hands it to the invoking user,
#   2. downloads the standalone docker-compose binary,
#   3. writes prometheus.yml, alertmanager.yml and docker-compose.yml,
#   4. starts the stack and prints the service URLs.
# Requires sudo, curl and a running Docker daemon.

# Abort on errors, on unset variables and on failures inside pipelines.
set -euo pipefail

MONITORING_DIR="/opt/monitoring"
COMPOSE_VERSION="2.20.0"

echo "Создание директории для мониторинга..."
sudo mkdir -p "$MONITORING_DIR"
# The config files below are written without sudo, so the invoking user has
# to own the directory; otherwise the `cat > file` redirections fail with
# "Permission denied".
sudo chown "$(id -u):$(id -g)" "$MONITORING_DIR"
cd "$MONITORING_DIR"

echo "Установка Docker Compose..."
# Compose v2 release assets use a lowercase OS name (docker-compose-linux-...),
# while `uname -s` prints "Linux".
compose_os="$(uname -s | tr '[:upper:]' '[:lower:]')"
compose_arch="$(uname -m)"
# -f makes curl fail on HTTP errors instead of saving an error page as the
# binary.
sudo curl -fL "https://github.com/docker/compose/releases/download/v${COMPOSE_VERSION}/docker-compose-${compose_os}-${compose_arch}" -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose

echo "Создание конфигурационных файлов..."

# Prometheus config (quoted 'EOF': the heredoc is written verbatim).
cat > prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
EOF

# Alertmanager config
cat > alertmanager.yml << 'EOF'
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@localhost'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://localhost:5001/'
EOF

# Docker Compose
cat > docker-compose.yml << 'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
    ports:
      - '9090:9090'
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus
    networks:
      - monitoring
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - '3000:3000'
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin123
    volumes:
      - grafana_data:/var/lib/grafana
    networks:
      - monitoring
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
    ports:
      - '9093:9093'
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    networks:
      - monitoring
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    command:
      - '--path.rootfs=/host'
    ports:
      - '9100:9100'
    volumes:
      - /:/host:ro
    networks:
      - monitoring
    restart: unless-stopped

volumes:
  prometheus_data:
  grafana_data:

networks:
  monitoring:
    driver: bridge
EOF

echo "Запуск системы мониторинга..."
docker-compose up -d

echo "Ожидание запуска сервисов..."
# Fixed grace period before the status check.
sleep 30

echo "Проверка статуса сервисов..."
docker-compose ps

echo ""
echo "Система мониторинга установлена!"
echo "Prometheus: http://localhost:9090"
# The default password printed here is the one written into docker-compose.yml
# above; change it after the first login.
echo "Grafana: http://localhost:3000 (admin/admin123)"
echo "Alertmanager: http://localhost:9093"
echo "Node Exporter: http://localhost:9100"
Полезные запросы PromQL
Системные метрики
# CPU utilisation per core, in percent: 100 minus the idle share, averaged by instance and cpu
100 - (avg by(instance, cpu) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Overall CPU utilisation per instance, in percent
100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Memory utilisation, in percent (1 - available / total)
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
# Disk utilisation, in percent, tmpfs filesystems excluded
(1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100
# Load average (1, 5 and 15 minute)
node_load1
node_load5
node_load15
# Network traffic in bits per second (bytes * 8), loopback device excluded
rate(node_network_receive_bytes_total{device!="lo"}[5m]) * 8
rate(node_network_transmit_bytes_total{device!="lo"}[5m]) * 8
# Number of processes in the running / sleeping state
node_processes_state{state="running"}
node_processes_state{state="sleeping"}
Nginx метрики
# Requests per second, averaged over 5 minutes
rate(nginx_http_requests_total[5m])
# Request rate by response status class (2xx / 4xx / 5xx)
# NOTE(review): these assume the exporter exposes a `status` label on
# nginx_http_requests_total - confirm against the exporter in use
rate(nginx_http_requests_total{status=~"2.."}[5m])
rate(nginx_http_requests_total{status=~"4.."}[5m])
rate(nginx_http_requests_total{status=~"5.."}[5m])
# Connections: active, reading, writing, waiting
nginx_connections_active
nginx_connections_reading
nginx_connections_writing
nginx_connections_waiting
MySQL метрики
# Connections: currently connected threads and the configured max_connections
mysql_global_status_threads_connected
mysql_global_variables_max_connections
# Queries per second, averaged over 5 minutes
rate(mysql_global_status_queries[5m])
# Slow queries per second, averaged over 5 minutes
rate(mysql_global_status_slow_queries[5m])
# InnoDB buffer pool pages: total and free
mysql_global_status_innodb_buffer_pool_pages_total
mysql_global_status_innodb_buffer_pool_pages_free