Andre Lorbach 757cc7922b feat(rosi-collector): impstats sidecar on server, Grafana rate limits, docs
- Add optional impstats sidecar on collector server via init.sh
  - Prompts or SERVER_IMPSTATS_SIDECAR=true
  - check_sidecar_requirements() prompts for python3-venv on Debian/Ubuntu
  - Auto-add server to node and impstats Prometheus targets (no duplicates)
- Traefik: separate rate-limit-grafana (600/min, burst 300) for Grafana
  - Reduces 429 errors on dashboard reload
- Host Metrics: Node Status Overview panel height set to 10
  - Fixed height; table scrolls for many hosts
- Docs: installation, grafana_dashboards, client_setup, troubleshooting
  - SERVER_IMPSTATS_SIDECAR, server auto-registration, 429, ensurepip
2026-02-11 12:10:26 +00:00

323 lines
10 KiB
YAML

services:
  # Reverse proxy and TLS terminator. Obtains certificates from Let's Encrypt
  # (TLS challenge), redirects the "web" entrypoint to "websecure", and builds
  # routes from container labels plus the mounted dynamic.yml file.
  traefik:
    image: traefik:v3.6.2
    container_name: traefik-central
    restart: unless-stopped
    command:
      - --api.dashboard=true
      - --api.insecure=false
      - --ping=true
      # Only containers labelled traefik.enable=true are exposed.
      - --providers.docker=true
      - --providers.docker.exposedbydefault=false
      - --providers.docker.network=rosi-collector-net
      # File provider: source of the "@file" middlewares used in the labels
      # of the services below.
      - --providers.file.filename=/etc/traefik/dynamic.yml
      - --providers.file.watch=true
      - --entrypoints.web.address=:80
      - --entrypoints.websecure.address=:443
      - --entrypoints.prometheus.address=:9090
      - --certificatesresolvers.letsencrypt.acme.tlschallenge=true
      - --certificatesresolvers.letsencrypt.acme.email=${TRAEFIK_EMAIL}
      - --certificatesresolvers.letsencrypt.acme.storage=/letsencrypt/acme.json
      - --entrypoints.web.http.redirections.entrypoint.to=websecure
      - --entrypoints.web.http.redirections.entrypoint.scheme=https
    logging:
      driver: json-file
      options:
        max-size: "10m"
        max-file: "3"
    ports:
      - "80:80"
      - "443:443"
      - "9090:9090"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./traefik/letsencrypt:/letsencrypt
      - ./traefik/dynamic.yml:/etc/traefik/dynamic.yml:ro
    networks:
      - rosi-collector-net
    healthcheck:
      # Relies on --ping=true above.
      test: ["CMD", "traefik", "healthcheck", "--ping"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
    deploy:
      resources:
        limits:
          cpus: '1'
          memory: 512M
        reservations:
          cpus: '0.25'
          memory: 128M

  # Log store. Published on the host loopback interface only.
  loki:
    image: grafana/loki:2.9.6
    command: "-config.file=/etc/loki/local-config.yml"
    ports:
      - "127.0.0.1:3100:3100"
    volumes:
      - loki-data:/loki
      - ./loki-config.yml:/etc/loki/local-config.yml:ro
    restart: unless-stopped
    networks:
      - rosi-collector-net
    healthcheck:
      # The rsyslog services wait for this check (condition: service_healthy).
      test: ["CMD-SHELL", "wget -qO- http://localhost:3100/ready >/dev/null 2>&1 || exit 1"]
      interval: 10s
      timeout: 3s
      retries: 10
      start_period: 10s
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL

  # Dashboards and alerting UI, served at https://${TRAEFIK_DOMAIN}/ through
  # Traefik; the direct port is bound to the host loopback interface only.
  grafana:
    image: grafana/grafana:11.4.0
    env_file: .env
    environment:
      # Falls back to "admin" when GRAFANA_ADMIN_PASSWORD is unset; set a
      # strong value in .env for any reachable deployment.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-admin}"
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_ROOT_URL: "https://${TRAEFIK_DOMAIN}/"
      GF_SERVER_DOMAIN: "${TRAEFIK_DOMAIN}"
      # Increase datasource proxy limits to handle more concurrent requests.
      # Names follow Grafana's GF_<SECTION>_<KEY> override scheme, so they
      # must match the [dataproxy] keys (max_idle_connections,
      # keep_alive_seconds, timeout, response_limit).
      GF_DATAPROXY_MAX_IDLE_CONNECTIONS: "300"
      GF_DATAPROXY_KEEP_ALIVE_SECONDS: "60"
      GF_DATAPROXY_TIMEOUT: "60"
      # Increase response limit for large queries
      GF_DATAPROXY_RESPONSE_LIMIT: "0"
      # NOTE(review): the three names below do not appear among Grafana's
      # documented [dataproxy] keys and are presumably ignored; kept as-is
      # pending confirmation.
      GF_DATAPROXY_MAX_IDLE_CONNS_PER_HOST: "300"
      GF_DATAPROXY_QUERY_TIMEOUT: "60"
      GF_DATAPROXY_MAX_CONCURRENT_REQUESTS: "250"
      # SMTP configuration for alerting
      # NOTE(review): Grafana's [smtp] host setting is documented as
      # host:port; confirm SMTP_HOST carries the port or that GF_SMTP_PORT
      # is honored.
      GF_SMTP_ENABLED: "${SMTP_ENABLED:-false}"
      GF_SMTP_HOST: "${SMTP_HOST:-}"
      GF_SMTP_PORT: "${SMTP_PORT:-587}"
      GF_SMTP_USER: "${SMTP_USER:-}"
      GF_SMTP_PASSWORD: "${SMTP_PASSWORD:-}"
      GF_SMTP_FROM_ADDRESS: "${ALERT_EMAIL_FROM:-}"
      GF_SMTP_FROM_NAME: "Rsyslog Central Alerts"
      GF_SMTP_SKIP_VERIFY: "${SMTP_SKIP_VERIFY:-false}"
      # Set default home dashboard to Syslog Explorer
      GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH: /etc/grafana/provisioning/dashboards/generated/syslog-explorer.json
    ports:
      - "127.0.0.1:3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    depends_on:
      - loki
      - traefik
    restart: unless-stopped
    networks:
      - rosi-collector-net
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 30s
    deploy:
      resources:
        limits:
          cpus: '1'
          memory: 1G
        reservations:
          cpus: '0.25'
          memory: 256M
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    labels:
      - "traefik.enable=true"
      # Catch-all for the domain except /downloads; priority 1 keeps it below
      # the downloads router (priority 100).
      - 'traefik.http.routers.grafana.rule=Host(`${TRAEFIK_DOMAIN}`) && !PathPrefix(`/downloads`)'
      - "traefik.http.routers.grafana.entrypoints=websecure"
      - "traefik.http.routers.grafana.tls.certresolver=letsencrypt"
      # Uses the Grafana-specific rate limit rather than the generic one.
      - "traefik.http.routers.grafana.middlewares=security-headers@file,rate-limit-grafana@file"
      - "traefik.http.services.grafana.loadbalancer.server.port=3000"
      - "traefik.http.routers.grafana.priority=1"

  # Metrics store. No host port is published; it is reachable only through
  # Traefik's "prometheus" entrypoint (:9090) behind admin-auth.
  prometheus:
    image: prom/prometheus:v3.1.0
    container_name: prometheus-central
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --web.console.libraries=/usr/share/prometheus/console_libraries
      - --web.console.templates=/usr/share/prometheus/consoles
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus-targets:/etc/prometheus/targets:ro
      - prometheus-data:/prometheus
    restart: unless-stopped
    networks:
      - rosi-collector-net
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:9090/-/healthy || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 15s
    deploy:
      resources:
        limits:
          cpus: '1'
          memory: 1G
        reservations:
          cpus: '0.25'
          memory: 256M
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    labels:
      - "traefik.enable=true"
      - 'traefik.http.routers.prometheus.rule=Host(`${TRAEFIK_DOMAIN}`)'
      - "traefik.http.routers.prometheus.entrypoints=prometheus"
      - "traefik.http.routers.prometheus.tls.certresolver=letsencrypt"
      - "traefik.http.routers.prometheus.middlewares=admin-auth@file,security-headers@file,rate-limit@file"
      - "traefik.http.services.prometheus.loadbalancer.server.port=9090"

  # Plain syslog receiver: UDP 514 and TCP 10514 (mapped to container TCP 514)
  # on both IPv4 and IPv6.
  rsyslog:
    # NOTE: Defaulting to :latest is intentional - we want users to always get
    # the newest rsyslog-collector image with security fixes. For production
    # pinning, set the RSYSLOG_IMAGE environment variable (e.g. in .env) or
    # override the image in docker-compose.override.yml.
    image: "${RSYSLOG_IMAGE:-rsyslog/rsyslog-collector:latest}"
    env_file: .env
    environment:
      WRITE_JSON_FILE: "${WRITE_JSON_FILE:-off}"
    container_name: rsyslog-central
    ports:
      - "0.0.0.0:514:514/udp"
      - "[::]:514:514/udp"
      - "0.0.0.0:10514:514/tcp"
      - "[::]:10514:514/tcp"
    volumes:
      - /var/log/rsyslog-central:/var/log
      - ./rsyslog.conf/30-send-loki-http.conf:/etc/rsyslog.d/30-send-loki-http.conf:ro
      - ./rsyslog.conf/80-file-output.conf:/etc/rsyslog.d/80-file-output.conf:ro
    restart: unless-stopped
    depends_on:
      loki:
        condition: service_healthy
    networks:
      - rosi-collector-net
    # Healthcheck: container runs rsyslog in foreground (-n), so no PID file
    # exists; check process instead.
    healthcheck:
      test: ["CMD-SHELL", "pidof rsyslogd >/dev/null || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    cap_add:
      - NET_BIND_SERVICE
      - SYSLOG

  # TLS-enabled rsyslog on port 6514 (only starts when SYSLOG_TLS_ENABLED=true)
  # Uses the built-in TLS support from rsyslog-collector image (PR #6336)
  # In this file the service is gated by the "tls" Compose profile.
  rsyslog-tls:
    # NOTE: Defaulting to :latest is intentional; RSYSLOG_IMAGE pins this
    # service together with the plain rsyslog service.
    image: "${RSYSLOG_IMAGE:-rsyslog/rsyslog-collector:latest}"
    env_file: .env
    environment:
      WRITE_JSON_FILE: "${WRITE_JSON_FILE:-off}"
      # Disable plain TCP/UDP since the main rsyslog container handles those
      ENABLE_UDP: "off"
      ENABLE_TCP: "off"
      # Enable TLS on port 6514
      ENABLE_TLS: "on"
      TLS_CA_FILE: "/etc/rsyslog.d/certs/ca.pem"
      TLS_CERT_FILE: "/etc/rsyslog.d/certs/server-cert.pem"
      TLS_KEY_FILE: "/etc/rsyslog.d/certs/server-key.pem"
      # x509/certvalid = accept any client with valid CA-signed cert
      # x509/name = require StreamDriverPermittedPeers config (stricter)
      TLS_AUTH_MODE: "x509/certvalid"
    container_name: rsyslog-tls-central
    profiles:
      - tls
    ports:
      - "0.0.0.0:6514:6514/tcp"
      - "[::]:6514:6514/tcp"
    volumes:
      - /var/log/rsyslog-central:/var/log
      - ./rsyslog.conf/30-send-loki-http.conf:/etc/rsyslog.d/30-send-loki-http.conf:ro
      - ./rsyslog.conf/80-file-output.conf:/etc/rsyslog.d/80-file-output.conf:ro
      # Certificate directory referenced by the TLS_*_FILE variables above.
      - ./certs:/etc/rsyslog.d/certs:ro
    restart: unless-stopped
    depends_on:
      loki:
        condition: service_healthy
    networks:
      - rosi-collector-net
    # Healthcheck: container runs rsyslog in foreground (-n), so no PID file
    # exists; check process instead.
    healthcheck:
      test: ["CMD-SHELL", "pidof rsyslogd >/dev/null || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    cap_add:
      - NET_BIND_SERVICE
      - SYSLOG

  # Static file server for ./downloads, exposed under /downloads on the main
  # domain. Runs as UID/GID 101 on a read-only root filesystem.
  downloads:
    image: nginx:1.27.3-alpine
    container_name: downloads-central
    restart: unless-stopped
    user: "101:101"
    read_only: true
    # Writable scratch locations, needed because the root filesystem is
    # read-only.
    tmpfs:
      - /var/cache/nginx:uid=101,gid=101
      - /var/run:uid=101,gid=101
      - /tmp:uid=101,gid=101
    volumes:
      - ./downloads:/usr/share/nginx/html/downloads:ro
    networks:
      - rosi-collector-net
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://127.0.0.1:80/ || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
    deploy:
      resources:
        limits:
          cpus: '0.5'
          memory: 128M
        reservations:
          cpus: '0.1'
          memory: 32M
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    labels:
      - "traefik.enable=true"
      - 'traefik.http.routers.downloads.rule=Host(`${TRAEFIK_DOMAIN}`) && PathPrefix(`/downloads`)'
      - "traefik.http.routers.downloads.entrypoints=websecure"
      - "traefik.http.routers.downloads.tls.certresolver=letsencrypt"
      - "traefik.http.routers.downloads.middlewares=security-headers@file"
      - "traefik.http.services.downloads.loadbalancer.server.port=80"
      # Priority 100 outranks the Grafana router (priority 1) for /downloads.
      - "traefik.http.routers.downloads.priority=100"
volumes:
  # Named volumes with default settings, one per stateful service.
  # Mounted at /loki by the loki service.
  loki-data: {}
  # Mounted at /var/lib/grafana by the grafana service.
  grafana-data: {}
  # Mounted at /prometheus by the prometheus service.
  prometheus-data: {}
networks:
  # Network shared by every service in this file. It is declared external, so
  # Compose does not create it: it must already exist before `up`
  # (e.g. `docker network create rosi-collector-net`). No driver is given
  # here because an external network accepts no attributes besides `name`.
  rosi-collector-net:
    name: rosi-collector-net
    external: true