From dd976ab3bce9e04b78d132552c0cf3c696efa76e Mon Sep 17 00:00:00 2001 From: Dominik Polakovics Date: Sat, 19 Aug 2023 06:33:21 +0200 Subject: [PATCH] add sops --- .../web-01.cloonar.com/modules/prometheus.nix | 404 ++++++++++-------- .../modules/victoriametrics.nix | 2 + 2 files changed, 233 insertions(+), 173 deletions(-) diff --git a/hosts/web-01.cloonar.com/modules/prometheus.nix b/hosts/web-01.cloonar.com/modules/prometheus.nix index 4b71f90..83f265c 100644 --- a/hosts/web-01.cloonar.com/modules/prometheus.nix +++ b/hosts/web-01.cloonar.com/modules/prometheus.nix @@ -11,6 +11,120 @@ services.prometheus = { webExternalUrl = "https://prometheus.cloonar.com"; + alertmanagers = [ + { + static_configs = [ + { + targets = [ "localhost:9093" ]; + } + ]; + } + ]; + rules = [ + '' + ALERT node_down + IF up == 0 + FOR 5m + LABELS { + severity="page" + } + ANNOTATIONS { + summary = "{{$labels.alias}}: Node is down.", + description = "{{$labels.alias}} has been down for more than 5 minutes." + } + ALERT node_systemd_service_failed + IF node_systemd_unit_state{state="failed"} == 1 + FOR 4m + LABELS { + severity="page" + } + ANNOTATIONS { + summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.", + description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}." + } + ALERT node_filesystem_full_90percent + IF sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024^3 + FOR 5m + LABELS { + severity="page" + } + ANNOTATIONS { + summary = "{{$labels.alias}}: Filesystem is running out of space soon.", + description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem." 
+ } + ALERT node_filesystem_full_in_4h + IF predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4*3600) <= 0 + FOR 5m + LABELS { + severity="page" + } + ANNOTATIONS { + summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours.", + description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space in approx. 4 hours" + } + ALERT node_filedescriptors_full_in_3h + IF predict_linear(node_filefd_allocated[1h], 3*3600) >= node_filefd_maximum + FOR 20m + LABELS { + severity="page" + } + ANNOTATIONS { + summary = "{{$labels.alias}} is running out of available file descriptors in 3 hours.", + description = "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours" + } + ALERT node_load1_90percent + IF node_load1 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0.9 + FOR 1h + LABELS { + severity="page" + } + ANNOTATIONS { + summary = "{{$labels.alias}}: Running on high load.", + description = "{{$labels.alias}} is running with > 90% total load for at least 1h." + } + ALERT node_cpu_util_90percent + IF 100 - (avg by (alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90 + FOR 1h + LABELS { + severity="page" + } + ANNOTATIONS { + summary = "{{$labels.alias}}: High CPU utilization.", + description = "{{$labels.alias}} has total CPU utilization over 90% for at least 1h."
+ } + ALERT node_ram_using_90percent + IF node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1 + FOR 30m + LABELS { + severity="page" + } + ANNOTATIONS { + summary="{{$labels.alias}}: Using lots of RAM.", + description="{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.", + } + ALERT node_swap_using_80percent + IF node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8 + FOR 10m + LABELS { + severity="page" + } + ANNOTATIONS { + summary="{{$labels.alias}}: Running out of swap soon.", + description="{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now." + } + ALERT homeassistant + IF homeassistant_entity_available{domain="persistent_notification", entity!~"persistent_notification.http_login|persistent_notification.recorder_database_migration"} >= 0 + ANNOTATIONS { + description="homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}" + } + + ALERT gitea + IF rate(promhttp_metric_handler_requests_total{job="gitea", code="500"}[5m]) > 3 + ANNOTATIONS { + description="{{$labels.instance}}: gitea instances error rate went up: {{$value}} errors in 5 minutes" + } + '' + ]; scrapeConfigs = [ { job_name = "telegraf"; @@ -19,84 +133,37 @@ static_configs = [ { targets = [ - "turingmachine.r:9273" - "bernie.r:9273" - #"rock.r:9273" + "web-01.cloonar.com:9273" ]; - labels.type = "mobile"; + labels.host = "web-01.cloonar.com"; } { targets = [ - "eva.r:9273" - "eve.r:9273" - "blob64.r:9273" - "matchbox.r:9273" - "alertmanager.r:80" - "prometheus.r:80" - #"rock.r:9273" + "mail.cloonar.com:9273" ]; + labels.host = "mail.cloonar.com"; } { targets = [ - "rauter.r:9273" + "git.cloonar.com:9273" ]; - # to make it compatible with the node-exporter dashboard - labels.host = "rauter.r:9273"; + labels.host = "git.cloonar.com"; } { targets = [ - "prism.r:9273" - "gum.r:9273" - "kelle.r:9273" +
"home-assistant.cloonar.com:9273" ]; - - labels.org = "krebs"; + labels.host = "home-assistant.cloonar.com"; } { - targets = [ - "clan.lol:9273" + targets = map (host: "${host}.cloonar.com:9273") [ + "web-01" + "mail" + "git" + "home-assistant" ]; - labels.org = "clan-lol"; - } - #{ - # targets = [ - # "dev1.numtide.com.r:9273" - # ]; - - # labels.org = "numtide"; - #} - { - targets = map (host: "${host}.r:9273") [ - # university - "amy" - "clara" - "rose" - - "astrid" - "dan" - "mickey" - "bill" - "nardole" - "yasmin" - "ryan" - "graham" - - "astrid" - "dan" - "mickey" - - "jackson" - "christina" - "adelaide" - "wilfred" - "river" - "jack" - - "ruby" - ]; - - labels.org = "uni"; + labels.org = "cloonar"; } ]; } @@ -131,118 +198,109 @@ ]; } ]; - alertmanagers = [ - { - static_configs = [ - { - targets = [ "localhost:9093" ]; - } - ]; - } - ]; - }; - services.prometheus.alertmanager = { - enable = true; - environmentFile = config.sops.secrets.alertmanager.path; - webExternalUrl = "https://alertmanager.cloonar.com"; - listenAddress = "[::1]"; - configuration = { - global = { - # The smarthost and SMTP sender used for mail notifications. 
- smtp_smarthost = "mail.cloonar.com:587"; - smtp_from = "alertmanager@cloonar.com"; - smtp_auth_username = "alertmanager@cloonar.com"; - smtp_auth_password = "$SMTP_PASSWORD"; - }; - route = { - receiver = "default"; - routes = [ - { - group_by = [ "host" ]; - match_re.org = "krebs"; - group_wait = "5m"; - group_interval = "5m"; - repeat_interval = "4h"; - receiver = "krebs"; - } - { - group_by = [ "host" ]; - match_re.org = "nix-community"; - group_wait = "5m"; - group_interval = "5m"; - repeat_interval = "4h"; - receiver = "nix-community"; - } - { - group_by = [ "host" ]; - match_re.org = "clan-lol"; - group_wait = "5m"; - group_interval = "5m"; - repeat_interval = "4h"; - receiver = "clan-lol"; - } - { - group_by = [ "host" ]; - group_wait = "30s"; - group_interval = "2m"; - repeat_interval = "2h"; - receiver = "all"; - } - ]; - }; - receivers = [ - { - name = "krebs"; - webhook_configs = [ - { - url = "http://127.0.0.1:9223/"; - max_alerts = 5; - } - ]; - } - #{ - # name = "numtide"; - # slack_configs = [ - # { - # token = "$SLACK_TOKEN"; - # api_url = "https://"; - # } - # ]; - #} - { - name = "nix-community"; - webhook_configs = [ - { - url = "http://localhost:9088/alert"; - max_alerts = 5; - } - ]; - } - { - name = "clan-lol"; - webhook_configs = [ - # TODO - #{ - # url = "http://localhost:4050/services/hooks/YWxlcnRtYW5hZ2VyX3NlcnZpY2U"; - # max_alerts = 5; - #} - ]; - } - { - name = "all"; - pushover_configs = [ - { - user_key = "$PUSHOVER_USER_KEY"; - token = "$PUSHOVER_TOKEN"; - priority = "0"; - } - ]; - } - { - name = "default"; - } - ]; - }; }; + # services.prometheus.alertmanager = { + # enable = true; + # environmentFile = config.sops.secrets.alertmanager.path; + # webExternalUrl = "https://alertmanager.cloonar.com"; + # listenAddress = "[::1]"; + # configuration = { + # global = { + # # The smarthost and SMTP sender used for mail notifications. 
+ # smtp_smarthost = "mail.cloonar.com:587"; + # smtp_from = "alertmanager@cloonar.com"; + # smtp_auth_username = "alertmanager@cloonar.com"; + # smtp_auth_password = "$SMTP_PASSWORD"; + # }; + # route = { + # receiver = "default"; + # routes = [ + # { + # group_by = [ "host" ]; + # match_re.org = "krebs"; + # group_wait = "5m"; + # group_interval = "5m"; + # repeat_interval = "4h"; + # receiver = "krebs"; + # } + # { + # group_by = [ "host" ]; + # match_re.org = "nix-community"; + # group_wait = "5m"; + # group_interval = "5m"; + # repeat_interval = "4h"; + # receiver = "nix-community"; + # } + # { + # group_by = [ "host" ]; + # match_re.org = "clan-lol"; + # group_wait = "5m"; + # group_interval = "5m"; + # repeat_interval = "4h"; + # receiver = "clan-lol"; + # } + # { + # group_by = [ "host" ]; + # group_wait = "30s"; + # group_interval = "2m"; + # repeat_interval = "2h"; + # receiver = "all"; + # } + # ]; + # }; + # receivers = [ + # { + # name = "krebs"; + # webhook_configs = [ + # { + # url = "http://127.0.0.1:9223/"; + # max_alerts = 5; + # } + # ]; + # } + # #{ + # # name = "numtide"; + # # slack_configs = [ + # # { + # # token = "$SLACK_TOKEN"; + # # api_url = "https://"; + # # } + # # ]; + # #} + # { + # name = "nix-community"; + # webhook_configs = [ + # { + # url = "http://localhost:9088/alert"; + # max_alerts = 5; + # } + # ]; + # } + # { + # name = "clan-lol"; + # webhook_configs = [ + # # TODO + # #{ + # # url = "http://localhost:4050/services/hooks/YWxlcnRtYW5hZ2VyX3NlcnZpY2U"; + # # max_alerts = 5; + # #} + # ]; + # } + # { + # name = "all"; + # pushover_configs = [ + # { + # user_key = "$PUSHOVER_USER_KEY"; + # token = "$PUSHOVER_TOKEN"; + # priority = "0"; + # } + # ]; + # } + # { + # name = "default"; + # } + # ]; + # }; + # }; } diff --git a/hosts/web-01.cloonar.com/modules/victoriametrics.nix b/hosts/web-01.cloonar.com/modules/victoriametrics.nix index a2f2bb7..1c320e0 100644 --- a/hosts/web-01.cloonar.com/modules/victoriametrics.nix +++ 
b/hosts/web-01.cloonar.com/modules/victoriametrics.nix @@ -3,6 +3,8 @@ services.victoriametrics.enable = true; services.prometheus.exporters.node.enable = true; + sops.secrets.promtail-nginx-password.owner = "root"; + services.nginx.virtualHosts."victoria-server.cloonar.com" = { forceSSL = true; enableACME = true;