# NixOS module: Prometheus server settings for cloonar.com.
#
# Declares the sops secrets used for monitoring, the alerting rules, and the
# scrape jobs (telegraf on every host, Home Assistant, Gitea).
#
# Arguments:
#   config - the evaluated NixOS configuration; only read for
#            `config.sops.secrets.hass-token.path`.
{ config, ... }:
let
  # Hosts that expose telegraf's Prometheus endpoint on port 9273.
  telegrafHosts = [
    "web-01"
    "web-arm"
    "fw"
    "mail"
    "git"
    "home-assistant"
  ];

  # "<host>.cloonar.com:9273" scrape address for a short host name.
  telegrafTarget = host: "${host}.cloonar.com:9273";

  # Build one alerting rule carrying the `severity="page"` label that all
  # node_* alerts share.
  #   alert       - rule name
  #   expr        - PromQL condition
  #   duration    - how long `expr` must hold before the alert fires (`for`)
  #   summary     - short annotation
  #   description - long annotation
  pageAlert = { alert, expr, duration, summary, description }: {
    inherit alert expr;
    for = duration;
    labels.severity = "page";
    annotations = { inherit summary description; };
  };

  # Alerting rules. Expressions and annotation texts are unchanged from the
  # previous 1.x-style rule text; only the container format differs.
  # NOTE(review): metric names such as node_cpu / node_filesystem_free look
  # like older node_exporter naming - confirm that some scrape job actually
  # exports them.
  alertRules = [
    (pageAlert {
      alert = "node_down";
      expr = "up == 0";
      duration = "5m";
      summary = "{{$labels.alias}}: Node is down.";
      description = "{{$labels.alias}} has been down for more than 5 minutes.";
    })
    (pageAlert {
      alert = "node_systemd_service_failed";
      expr = ''node_systemd_unit_state{state="failed"} == 1'';
      duration = "4m";
      summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.";
      description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}.";
    })
    (pageAlert {
      alert = "node_filesystem_full_90percent";
      expr = ''sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024^3'';
      duration = "5m";
      summary = "{{$labels.alias}}: Filesystem is running out of space soon.";
      description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem.";
    })
    (pageAlert {
      alert = "node_filesystem_full_in_4h";
      expr = ''predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4*3600) <= 0'';
      duration = "5m";
      summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours.";
      description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space of in approx. 4 hours";
    })
    (pageAlert {
      alert = "node_filedescriptors_full_in_3h";
      expr = "predict_linear(node_filefd_allocated[1h], 3*3600) >= node_filefd_maximum";
      duration = "20m";
      summary = "{{$labels.alias}} is running out of available file descriptors in 3 hours.";
      description = "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours";
    })
    (pageAlert {
      alert = "node_load1_90percent";
      expr = ''node_load1 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0.9'';
      duration = "1h";
      summary = "{{$labels.alias}}: Running on high load.";
      description = "{{$labels.alias}} is running with > 90% total load for at least 1h.";
    })
    (pageAlert {
      alert = "node_cpu_util_90percent";
      expr = ''100 - (avg by (alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90'';
      duration = "1h";
      summary = "{{$labels.alias}}: High CPU utilization.";
      description = "{{$labels.alias}} has total CPU utilization over 90% for at least 1h.";
    })
    (pageAlert {
      alert = "node_ram_using_90percent";
      expr = "node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1";
      duration = "30m";
      summary = "{{$labels.alias}}: Using lots of RAM.";
      description = "{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.";
    })
    (pageAlert {
      alert = "node_swap_using_80percent";
      expr = "node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8";
      duration = "10m";
      summary = "{{$labels.alias}}: Running out of swap soon.";
      description = "{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now.";
    })
    # The two rules below have no `for` and no labels, as before. The stray
    # `= {` that followed the `homeassistant` rule name has been dropped.
    {
      alert = "homeassistant";
      expr = ''homeassistant_entity_available{domain="persistent_notification", entity!~"persistent_notification.http_login|persistent_notification.recorder_database_migration"} >= 0'';
      annotations.description = "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}";
    }
    {
      alert = "gitea";
      expr = ''rate(promhttp_metric_handler_requests_total{job="gitea", code="500"}[5m]) > 3'';
      annotations.description = "{{$labels.instance}}: gitea instances error rate went up: {{$value}} errors in 5 minutes";
    }
  ];
in
{
  # NOTE(review): in this file the alertmanager secret is only referenced by
  # the commented-out alertmanager block at the bottom.
  sops.secrets.alertmanager = { };
  sops.secrets.hass-token.owner = "prometheus";

  # imports = [
  #   ./matrix-alertmanager.nix
  #   ./irc-alertmanager.nix
  #   ./rules.nix
  # ];

  services.prometheus = {
    webExternalUrl = "https://prometheus.cloonar.com";

    alertmanagers = [
      {
        static_configs = [
          { targets = [ "localhost:9093" ]; }
        ];
      }
    ];

    # Prometheus 2.x only loads rule files written as a YAML `groups:`
    # document; the former `ALERT ... IF ...` text is the 1.x rule language.
    # JSON is valid YAML, so the rules are kept as Nix data and serialised.
    rules = [
      (builtins.toJSON {
        groups = [
          {
            name = "cloonar-alerts";
            rules = alertRules;
          }
        ];
      })
    ];

    scrapeConfigs = [
      {
        job_name = "telegraf";
        scrape_interval = "60s";
        metrics_path = "/metrics";
        # One entry per host labelled with its FQDN, followed by one entry
        # listing all hosts labelled with the organisation.
        # NOTE(review): every host therefore appears twice with different
        # label sets, which presumably makes Prometheus scrape it twice -
        # confirm this is intended rather than putting both labels on a
        # single entry.
        static_configs =
          map
            (host: {
              targets = [ (telegrafTarget host) ];
              labels.host = "${host}.cloonar.com";
            })
            telegrafHosts
          ++ [
            {
              targets = map telegrafTarget telegrafHosts;
              labels.org = "cloonar";
            }
          ];
      }
      {
        job_name = "homeassistant";
        scrape_interval = "60s";
        metrics_path = "/api/prometheus";
        # Credentials are read from the sops-managed secret file.
        authorization.credentials_file = config.sops.secrets.hass-token.path;
        scheme = "https";
        static_configs = [
          { targets = [ "home-assistant.cloonar.com:443" ]; }
        ];
      }
      {
        job_name = "gitea";
        scrape_interval = "60s";
        metrics_path = "/metrics";
        scheme = "https";
        static_configs = [
          { targets = [ "git.cloonar.com:443" ]; }
        ];
      }
    ];
  };

  # Disabled alertmanager configuration, kept for reference.
  # services.prometheus.alertmanager = {
  #   enable = true;
  #   environmentFile = config.sops.secrets.alertmanager.path;
  #   webExternalUrl = "https://alertmanager.cloonar.com";
  #   listenAddress = "[::1]";
  #   configuration = {
  #     global = {
  #       # The smarthost and SMTP sender used for mail notifications.
  #       smtp_smarthost = "mail.cloonar.com:587";
  #       smtp_from = "alertmanager@cloonar.com";
  #       smtp_auth_username = "alertmanager@cloonar.com";
  #       smtp_auth_password = "$SMTP_PASSWORD";
  #     };
  #     route = {
  #       receiver = "default";
  #       routes = [
  #         {
  #           group_by = [ "host" ];
  #           match_re.org = "krebs";
  #           group_wait = "5m";
  #           group_interval = "5m";
  #           repeat_interval = "4h";
  #           receiver = "krebs";
  #         }
  #         {
  #           group_by = [ "host" ];
  #           match_re.org = "nix-community";
  #           group_wait = "5m";
  #           group_interval = "5m";
  #           repeat_interval = "4h";
  #           receiver = "nix-community";
  #         }
  #         {
  #           group_by = [ "host" ];
  #           match_re.org = "clan-lol";
  #           group_wait = "5m";
  #           group_interval = "5m";
  #           repeat_interval = "4h";
  #           receiver = "clan-lol";
  #         }
  #         {
  #           group_by = [ "host" ];
  #           group_wait = "30s";
  #           group_interval = "2m";
  #           repeat_interval = "2h";
  #           receiver = "all";
  #         }
  #       ];
  #     };
  #     receivers = [
  #       {
  #         name = "krebs";
  #         webhook_configs = [
  #           {
  #             url = "http://127.0.0.1:9223/";
  #             max_alerts = 5;
  #           }
  #         ];
  #       }
  #       #{
  #       #  name = "numtide";
  #       #  slack_configs = [
  #       #    {
  #       #      token = "$SLACK_TOKEN";
  #       #      api_url = "https://";
  #       #    }
  #       #  ];
  #       #}
  #       {
  #         name = "nix-community";
  #         webhook_configs = [
  #           {
  #             url = "http://localhost:9088/alert";
  #             max_alerts = 5;
  #           }
  #         ];
  #       }
  #       {
  #         name = "clan-lol";
  #         webhook_configs = [
  #           # TODO
  #           #{
  #           #  url = "http://localhost:4050/services/hooks/YWxlcnRtYW5hZ2VyX3NlcnZpY2U";
  #           #  max_alerts = 5;
  #           #}
  #         ];
  #       }
  #       {
  #         name = "all";
  #         pushover_configs = [
  #           {
  #             user_key = "$PUSHOVER_USER_KEY";
  #             token = "$PUSHOVER_TOKEN";
  #             priority = "0";
  #           }
  #         ];
  #       }
  #       {
  #         name = "default";
  #       }
  #     ];
  #   };
  # };
}