From f1ea4b9b20c2e85792a19159af5626038d0a8fe0 Mon Sep 17 00:00:00 2001 From: Dominik Polakovics Date: Sun, 1 Jun 2025 00:47:43 +0200 Subject: [PATCH] feat: implement website alerting plan with Blackbox Exporter and VictoriaMetrics integration --- hosts/fw/configuration.nix | 1 - hosts/mail/configuration.nix | 1 - hosts/web-arm/configuration.nix | 4 +- hosts/web-arm/modules/blackbox-exporter.nix | 56 ++++++++++ .../grafana/alerting/websites/default.nix | 72 +++++++++++++ hosts/web-arm/modules/grafana/default.nix | 1 + hosts/web-arm/modules/victoriametrics.nix | 101 ++++++++++++------ hosts/web-arm/modules/web/typo3.nix | 51 --------- scripts/test-configuration | 4 +- utils/modules/victoriametrics/default.nix | 51 +++++---- 10 files changed, 236 insertions(+), 106 deletions(-) create mode 100644 hosts/web-arm/modules/blackbox-exporter.nix create mode 100644 hosts/web-arm/modules/grafana/alerting/websites/default.nix diff --git a/hosts/fw/configuration.nix b/hosts/fw/configuration.nix index 930d428..8a00734 100644 --- a/hosts/fw/configuration.nix +++ b/hosts/fw/configuration.nix @@ -9,7 +9,6 @@ ./utils/modules/autoupgrade.nix ./utils/modules/promtail ./utils/modules/borgbackup.nix - # ./utils/modules/netdata.nix # fw ./modules/network-prefix.nix diff --git a/hosts/mail/configuration.nix b/hosts/mail/configuration.nix index 0bc9d7b..e4fd7e2 100644 --- a/hosts/mail/configuration.nix +++ b/hosts/mail/configuration.nix @@ -14,7 +14,6 @@ ./utils/modules/borgbackup.nix ./utils/modules/promtail ./utils/modules/victoriametrics - ./utils/modules/netdata.nix ./modules/set-nix-channel.nix # Automatically manage nix-channel from /var/bento/channel ./hardware-configuration.nix diff --git a/hosts/web-arm/configuration.nix b/hosts/web-arm/configuration.nix index 54c74d9..be836b4 100644 --- a/hosts/web-arm/configuration.nix +++ b/hosts/web-arm/configuration.nix @@ -1,4 +1,4 @@ -{ lib, pkgs, ... }: { +{ config, lib, pkgs, ... 
}: { imports = [ ./utils/bento.nix ./utils/modules/sops.nix @@ -17,12 +17,12 @@ ./modules/grafana/default.nix ./modules/loki.nix ./modules/victoriametrics.nix + ./modules/blackbox-exporter.nix ./modules/updns.nix ./utils/modules/autoupgrade.nix ./utils/modules/promtail ./utils/modules/borgbackup.nix - ./utils/modules/netdata.nix ./modules/set-nix-channel.nix # Automatically manage nix-channel from /var/bento/channel ./hardware-configuration.nix
diff --git a/hosts/web-arm/modules/blackbox-exporter.nix b/hosts/web-arm/modules/blackbox-exporter.nix
new file mode 100644
index 0000000..9b35d67
--- /dev/null
+++ b/hosts/web-arm/modules/blackbox-exporter.nix
@@ -0,0 +1,56 @@
+{ config, pkgs, lib, ... }:
+
+with lib;
+
+let
+  nginxVHosts = config.services.nginx.virtualHosts or {};
+  allDomains = lib.attrNames nginxVHosts;
+  httpsDomains = lib.map (d: "https://${d}") allDomains;
+  # Where the exporter started below listens; 9115 is its default port.
+  blackboxAddress = "127.0.0.1:9115";
+in {
+  config = {
+    # Systemd service for Blackbox Exporter
+    systemd.services.blackbox-exporter = {
+      description = "Blackbox Exporter";
+      after = [ "network-online.target" ];
+      wantedBy = [ "multi-user.target" ];
+      serviceConfig.ExecStart = ''
+        ${pkgs.prometheus-blackbox-exporter}/bin/blackbox_exporter \
+          --config.file=/etc/blackbox_exporter/blackbox.yml
+      '';
+    };
+
+    # Configuration file for Blackbox Exporter
+    environment.etc."blackbox_exporter/blackbox.yml".text = ''
+      modules:
+        http_2xx:
+          prober: http
+    '';
+
+    # Add scrape config for VictoriaMetrics agent
+    services.victoriametrics.extraScrapeConfigs = [
+      ''
+        - job_name: "blackbox_http_all_domains"
+          metrics_path: "/probe"
+          params:
+            module: ["http_2xx"]
+
+          static_configs:
+            # A JSON array is a valid YAML flow sequence.
+            - targets: ${builtins.toJSON httpsDomains}
+
+          relabel_configs:
+            # Pass the probed URL to the exporter as the "target" query parameter.
+            - source_labels: ["__address__"]
+              target_label: "__param_target"
+            # Keep the probed URL as the "instance" label of the resulting series.
+            - source_labels: ["__param_target"]
+              target_label: "instance"
+            # Send the scrape request itself to the exporter, not to the probed site.
+            - target_label: "__address__"
+              replacement: "${blackboxAddress}"
+      ''
+    ];
+  };
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/websites/default.nix b/hosts/web-arm/modules/grafana/alerting/websites/default.nix
new file mode 100644
index 0000000..c495194
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/websites/default.nix
@@ -0,0 +1,72 @@
+{ lib, pkgs, config, ... }:
+
+let
+  nginxVHosts = config.services.nginx.virtualHosts or {};
+  allDomains = lib.attrNames nginxVHosts;
+  httpsDomains = lib.map (d: "https://${d}") allDomains;
+  websiteAlertRules = lib.map (target: # one alert rule per probed URL
+    let
+      domain = lib.replaceStrings ["://" "." "-" "/" ] ["-" "-" "_" "_"] target + "-down-alert";
+      uid = builtins.hashString "sha1" domain; # stable rule uid derived from the URL
+    in {
+      uid = uid;
+      title = "Website " + target + " Down";
+      condition = "C";
+
+      data = [
+        {
+          refId = "A";
+          relativeTimeRange = { from = 300; to = 0; };
+          datasourceUid = "vm-datasource-uid";
+          model = {
+            editorMode = "code";
+            expr = "probe_success{instance=\"" + target + "\"} OR on() vector(0)"; # the blackbox scrape job relabels the probed URL into "instance"; 0 when no sample exists
+            hide = false;
+            intervalMs = 1000;
+            legendFormat = target;
+            maxDataPoints = 43200;
+            range = true;
+            refId = "A";
+          };
+        }
+        {
+          refId = "B";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A";
+            reducer = "last";
+          };
+        }
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$B < 1"; # condition: last probe_success value is below 1
+          };
+        }
+      ];
+      noDataState = "Alerting";
+      execErrState = "Alerting";
+      for = "5m";
+      annotations = {
+        description = "Website " + target + " is unreachable.";
+        summary = "Website Down";
+      };
+      labels = {
+        severity = "critical";
+        website_url = target;
+      };
+    }
+  ) httpsDomains;
+in {
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "Website Alerts";
+      folder = "Websites";
+      interval = "1m";
+      rules = websiteAlertRules;
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/default.nix b/hosts/web-arm/modules/grafana/default.nix
index 8bf0771..6ec5819 100644
--- a/hosts/web-arm/modules/grafana/default.nix
+++ 
b/hosts/web-arm/modules/grafana/default.nix @@ -31,6 +31,7 @@ in # Individual alert files removed, now handled by alerting/system/default.nix ./alerting/system/default.nix # Added: Imports the consolidated system alerts module ./alerting/service/default.nix # Added: Imports the new service alerts module + ./alerting/websites/default.nix # Added: Imports the new websites alerts module # ... other rule files can be added here ... ./datasources/victoriametrics.nix ./datasources/loki.nix # Add Loki datasource diff --git a/hosts/web-arm/modules/victoriametrics.nix b/hosts/web-arm/modules/victoriametrics.nix index a2788e7..0dbd915 100644 --- a/hosts/web-arm/modules/victoriametrics.nix +++ b/hosts/web-arm/modules/victoriametrics.nix @@ -1,43 +1,84 @@ -{ config, ... }: +{ config, lib, ... }: +with lib; let + # configure_prom = builtins.toFile "prometheus.yml" '' + # scrape_configs: + # - job_name: 'server' + # stream_parse: true + # static_configs: + # - targets: + # - ${config.networking.hostName}:9100 + # ''; configure_prom = builtins.toFile "prometheus.yml" '' scrape_configs: - - job_name: 'server' + # System metrics + - job_name: 'node' stream_parse: true static_configs: - targets: - ${config.networking.hostName}:9100 + + # Systemd service monitoring + - job_name: 'systemd' + metrics_path: /metrics + params: + collect[]: + - 'systemd.service.state' + - 'systemd.service.start_time_seconds' + - 'systemd.unit_file.state' + static_configs: + - targets: + - ${config.networking.hostName}:9100 + relabel_configs: + # Filter for specific services we want to monitor + - source_labels: [__name__] + regex: 'node_systemd_unit_state' + action: keep + - source_labels: [name] + regex: '(container@git|microvm@git-runner-|postfix|dovecot|openldap|wireguard-wg_cloonar).*\.service' + action: keep + + ${concatStringsSep "\n" config.services.victoriametrics.extraScrapeConfigs} ''; in { - services.prometheus.exporters.node.enable = true; - - sops.secrets.victoria-nginx-password.owner = 
"nginx"; - - services.victoriametrics = { - enable = true; - extraOptions = [ - "-promscrape.config=${configure_prom}" - ]; - }; - - services.nginx.virtualHosts."victoria-server.cloonar.com" = { - forceSSL = true; - enableACME = true; - acmeRoot = null; - locations."/" = { - proxyWebsockets = true; - extraConfig = '' - auth_basic "Victoria password"; - auth_basic_user_file ${config.sops.secrets.victoria-nginx-password.path}; - - proxy_read_timeout 1800s; - proxy_redirect off; - proxy_connect_timeout 1600s; - - access_log off; - proxy_pass http://127.0.0.1:8428; - ''; + options.services.victoriametrics = { + extraScrapeConfigs = mkOption { + type = types.listOf types.str; + default = []; + description = "Additional Prometheus scrape job YAML snippets for Blackbox Exporter probes"; }; }; + config = { + services.prometheus.exporters.node.enable = true; + + sops.secrets.victoria-nginx-password.owner = "nginx"; + + services.victoriametrics = { + enable = true; + extraOptions = [ + "-promscrape.config=${configure_prom}" + ]; + }; + + services.nginx.virtualHosts."victoria-server.cloonar.com" = { + forceSSL = true; + enableACME = true; + acmeRoot = null; + locations."/" = { + proxyWebsockets = true; + extraConfig = '' + auth_basic "Victoria password"; + auth_basic_user_file ${config.sops.secrets.victoria-nginx-password.path}; + + proxy_read_timeout 1800s; + proxy_redirect off; + proxy_connect_timeout 1600s; + + access_log off; + proxy_pass http://127.0.0.1:8428; + ''; + }; + }; + }; } diff --git a/hosts/web-arm/modules/web/typo3.nix b/hosts/web-arm/modules/web/typo3.nix index 1e7db9c..409228f 100644 --- a/hosts/web-arm/modules/web/typo3.nix +++ b/hosts/web-arm/modules/web/typo3.nix @@ -97,18 +97,6 @@ in }; config = { - # systemd.services = mapAttrs' (instance: instanceOpts: - # let - # domain = if instanceOpts.domain != null then instanceOpts.domain else instance; - # in - # nameValuePair "phpfpm-${domain}" { - # serviceConfig = { - # ProtectHome = lib.mkForce "tmpfs"; - # 
BindPaths = "BindPaths=/var/www/${domain}:/var/www/${domain}"; - # }; - # } - # ) cfg.instances; - systemd.timers = mapAttrs' (instance: instanceOpts: let domain = if instanceOpts.domain != null then instanceOpts.domain else instance; @@ -244,45 +232,6 @@ in } ''; - # locations."/typo3/login" = { - # extraConfig = '' - # # Basic Authelia Config - # # Send a subsequent request to Authelia to verify if the user is authenticated - # # and has the right permissions to access the resource. - # auth_request /authelia; - # # Set the `target_url` variable based on the request. It will be used to build the portal - # # URL with the correct redirection parameter. - # auth_request_set $target_url $scheme://$http_host$request_uri; - # # Set the X-Forwarded-User and X-Forwarded-Groups with the headers - # # returned by Authelia for the backends which can consume them. - # # This is not safe, as the backend must make sure that they come from the - # # proxy. In the future, it's gonna be safe to just use OAuth. - # auth_request_set $user $upstream_http_remote_user; - # auth_request_set $groups $upstream_http_remote_groups; - # auth_request_set $name $upstream_http_remote_name; - # auth_request_set $email $upstream_http_remote_email; - # proxy_set_header Remote-User $user; - # proxy_set_header Remote-Groups $groups; - # proxy_set_header Remote-Name $name; - # proxy_set_header Remote-Email $email; - # # If Authelia returns 401, then nginx redirects the user to the login portal. - # # If it returns 200, then the request pass through to the backend. - # # For other type of errors, nginx will handle them as usual. 
- # error_page 401 =302 https://auth.cloonar.com/?rd=$target_url; - # - # fastcgi_param REMOTE_USER $user; - # - # include ${pkgs.nginx}/conf/fastcgi.conf; - # fastcgi_buffer_size 32k; - # fastcgi_buffers 8 16k; - # fastcgi_connect_timeout 240s; - # fastcgi_read_timeout 240s; - # fastcgi_send_timeout 240s; - # fastcgi_pass unix:${config.services.phpfpm.pools."${domain}".socket}; - # fastcgi_param SCRIPT_FILENAME ${cfg.dataDir}/${domain}/public/typo3/index.php; - # ''; - # }; - locations."/favicon.ico".extraConfig = '' log_not_found off; access_log off; diff --git a/scripts/test-configuration b/scripts/test-configuration index f01e24f..6c6dcd1 100755 --- a/scripts/test-configuration +++ b/scripts/test-configuration @@ -45,7 +45,7 @@ fi # Execute nixos-rebuild dry-build # Store the output and error streams, and the exit code -NIX_OUTPUT_ERR=$(nixos-rebuild dry-build $SHOW_TRACE_OPT -I nixos-config="$CONFIG_PATH" 2>&1) +NIX_OUTPUT_ERR=$(nixos-rebuild dry-build $SHOW_TRACE_OPT -I nixos-config="$CONFIG_PATH" --show-trace 2>&1) NIX_EXIT_STATUS=$? # Check the exit status @@ -61,4 +61,4 @@ else echo "Output from nixos-rebuild:" >&2 echo "$NIX_OUTPUT_ERR" >&2 exit "$NIX_EXIT_STATUS" -fi \ No newline at end of file +fi diff --git a/utils/modules/victoriametrics/default.nix b/utils/modules/victoriametrics/default.nix index fd14825..b39821c 100644 --- a/utils/modules/victoriametrics/default.nix +++ b/utils/modules/victoriametrics/default.nix @@ -1,4 +1,5 @@ -{ config, pkgs, ... }: +{ config, lib, pkgs, ... 
}: +with lib; let configure_prom = builtins.toFile "prometheus.yml" '' scrape_configs: @@ -28,29 +29,41 @@ let - source_labels: [name] regex: '(container@git|microvm@git-runner-|postfix|dovecot|openldap|wireguard-wg_cloonar).*\.service' action: keep + + ${concatStringsSep "\n " config.services.victoriametrics.extraScrapeConfigs} ''; in { - sops.secrets.victoria-agent-env = { - sopsFile = ./secrets.yaml; + options.services.victoriametrics = { + extraScrapeConfigs = mkOption { + type = types.listOf types.str; + default = []; + description = "Additional Prometheus scrape job YAML snippets for Blackbox Exporter probes"; + }; }; - # Node exporter for system metrics - services.prometheus.exporters.node = { - enable = true; - enabledCollectors = [ - "systemd" # Enable systemd collector for service monitoring - ]; - }; - - systemd.services.export-to-prometheus = { - path = with pkgs; [victoriametrics]; - enable = true; - after = ["network-online.target"]; - wantedBy = ["multi-user.target"]; - script = "vmagent -promscrape.config=${configure_prom} -envflag.enable -remoteWrite.url=https://victoria-server.cloonar.com/api/v1/write"; + config = { + sops.secrets.victoria-agent-env = { + sopsFile = ./secrets.yaml; + }; - serviceConfig = { - EnvironmentFile=config.sops.secrets.victoria-agent-env.path; + # Node exporter for system metrics + services.prometheus.exporters.node = { + enable = true; + enabledCollectors = [ + "systemd" # Enable systemd collector for service monitoring + ]; + }; + + systemd.services.export-to-prometheus = { + path = with pkgs; [victoriametrics]; + enable = true; + after = ["network-online.target"]; + wantedBy = ["multi-user.target"]; + script = "vmagent -promscrape.config=${configure_prom} -envflag.enable -remoteWrite.url=https://victoria-server.cloonar.com/api/v1/write"; + + serviceConfig = { + EnvironmentFile=config.sops.secrets.victoria-agent-env.path; + }; }; }; }