Review notes (text before the first "diff --git" line is ignored by git apply):
* The blackbox-exporter unit now also `wants` network-online.target.
* The unused `hostname` binding was removed from blackbox-exporter.nix.
* The `ambebs*` bindings were renamed to `amzebs*` to match the host name.
* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; verify the context lines against the target tree. The
  `index` blob hashes are carried over unchanged and are therefore stale.

diff --git a/hosts/amzebs-01/configuration.nix b/hosts/amzebs-01/configuration.nix
index 28c34a6..9ccb1e1 100644
--- a/hosts/amzebs-01/configuration.nix
+++ b/hosts/amzebs-01/configuration.nix
@@ -7,6 +7,7 @@
     ./modules/mysql.nix
     ./modules/web/stack.nix
     ./modules/laravel-storage.nix
+    ./modules/blackbox-exporter.nix
 
     ./utils/modules/autoupgrade.nix
     ./utils/modules/promtail
diff --git a/hosts/amzebs-01/modules/blackbox-exporter.nix b/hosts/amzebs-01/modules/blackbox-exporter.nix
new file mode 100644
index 0000000..dda9122
--- /dev/null
+++ b/hosts/amzebs-01/modules/blackbox-exporter.nix
@@ -0,0 +1,89 @@
+# Runs a local Prometheus Blackbox Exporter and registers an HTTP probe
+# scrape job that targets every non-blacklisted nginx virtual host of this host.
+{ config, pkgs, lib, ... }:
+
+with lib;
+
+let
+  cfg = config.services.blackbox-exporter;
+  nginxVHosts = config.services.nginx.virtualHosts or {};
+  allDomains = lib.attrNames nginxVHosts;
+  # Drop every virtual host listed in blacklistDomains.
+  filteredDomains = builtins.filter (d: !builtins.elem d cfg.blacklistDomains) allDomains;
+  httpsDomains = lib.map (d: "https://${d}") filteredDomains;
+  # One quoted, comma-terminated URL per line; the separator's indentation
+  # keeps continuation lines aligned inside the `targets` flow sequence below.
+  domainsString = builtins.concatStringsSep "\n          "
+    (map (d: "\"${d}\",") httpsDomains);
+in {
+  options.services.blackbox-exporter.blacklistDomains = mkOption {
+    type = types.listOf types.str;
+    default = [];
+    description = "List of domains to exclude from Blackbox Exporter monitoring";
+  };
+
+  config = {
+    services.blackbox-exporter = {
+      blacklistDomains = [
+        # Currently no domains blacklisted - monitoring all nginx virtualHosts
+      ];
+    };
+
+    # Systemd service for Blackbox Exporter
+    systemd.services.blackbox-exporter = {
+      description = "Blackbox Exporter";
+      # `after` only orders the unit; `wants` is what actually pulls
+      # network-online.target into the boot transaction.
+      wants = [ "network-online.target" ];
+      after = [ "network-online.target" ];
+      wantedBy = [ "multi-user.target" ];
+      serviceConfig.ExecStart = ''
+        ${pkgs.prometheus-blackbox-exporter}/bin/blackbox_exporter \
+          --config.file=/etc/blackbox_exporter/blackbox.yml
+      '';
+    };
+
+    # Configuration file for Blackbox Exporter
+    environment.etc."blackbox_exporter/blackbox.yml".text = ''
+      modules:
+        http_200_final:
+          prober: http
+          http:
+            method: GET
+            follow_redirects: true
+            preferred_ip_protocol: "ip4" # avoid blanket IPv6 failures
+            valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+            valid_status_codes: [200]
+    '';
+
+    # Add scrape config for VictoriaMetrics agent
+    services.victoriametrics.extraScrapeConfigs = [
+      ''
+        - job_name: "blackbox_http_all_domains"
+          metrics_path: "/probe"
+          params:
+            module: ["http_200_final"]
+
+          static_configs:
+            - targets:
+                [
+                  ${domainsString}
+                ]
+
+          relabel_configs:
+            - source_labels: ["__address__"]
+              target_label: "__param_target"
+              regex: '(.*)'
+              replacement: "$1"
+            - source_labels: ["__param_target"]
+              target_label: "instance"
+            - target_label: "__address__"
+              replacement: "127.0.0.1:9115"
+            - source_labels: ["__address__"]
+              regex: "127\\.0\\.0\\.1:9115"
+              target_label: "__scheme__"
+              replacement: "http"
+      ''
+    ];
+  };
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/service/amzebs_mysql_down.nix b/hosts/web-arm/modules/grafana/alerting/service/amzebs_mysql_down.nix
new file mode 100644
index 0000000..9416794
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/service/amzebs_mysql_down.nix
@@ -0,0 +1,61 @@
+# Alert rule: mysql.service on amzebs-01 is not in the systemd "active" state.
+# `OR on() vector(0)` substitutes 0 when the query returns no series, so a
+# missing metric also trips the `$B < 1` threshold.
+{ lib, pkgs, config, ... }:
+{
+  grafanaAlertRuleDefinitions = [
+    {
+      uid = "amzebs-mysql-service-down-alert-uid";
+      title = "MySQL Service Down on amzebs-01";
+      condition = "C";
+      data = [
+        {
+          refId = "A";
+          relativeTimeRange = {
+            from = 300;
+            to = 0;
+          };
+          datasourceUid = "vm-datasource-uid";
+          model = {
+            editorMode = "code";
+            expr = "node_systemd_unit_state{state=\"active\", name=\"mysql.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)";
+            hide = false;
+            intervalMs = 1000;
+            legendFormat = "__auto";
+            maxDataPoints = 43200;
+            range = true;
+            refId = "A";
+          };
+        }
+        {
+          refId = "B";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A";
+            reducer = "last";
+          };
+        }
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$B < 1";
+          };
+        }
+      ];
+      noDataState = "Alerting";
+      execErrState = "Alerting";
+      for = "5m";
+      annotations = {
+        description = "MySQL service is down on amzebs-01";
+        summary = "MySQL Service Down on amzebs-01";
+      };
+      labels = {
+        severity = "critical";
+        host = "amzebs-01";
+      };
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/service/amzebs_nginx_down.nix b/hosts/web-arm/modules/grafana/alerting/service/amzebs_nginx_down.nix
new file mode 100644
index 0000000..a3b2119
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/service/amzebs_nginx_down.nix
@@ -0,0 +1,61 @@
+# Alert rule: nginx.service on amzebs-01 is not in the systemd "active" state.
+# `OR on() vector(0)` substitutes 0 when the query returns no series, so a
+# missing metric also trips the `$B < 1` threshold.
+{ lib, pkgs, config, ... }:
+{
+  grafanaAlertRuleDefinitions = [
+    {
+      uid = "amzebs-nginx-service-down-alert-uid";
+      title = "Nginx Service Down on amzebs-01";
+      condition = "C";
+      data = [
+        {
+          refId = "A";
+          relativeTimeRange = {
+            from = 300;
+            to = 0;
+          };
+          datasourceUid = "vm-datasource-uid";
+          model = {
+            editorMode = "code";
+            expr = "node_systemd_unit_state{state=\"active\", name=\"nginx.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)";
+            hide = false;
+            intervalMs = 1000;
+            legendFormat = "__auto";
+            maxDataPoints = 43200;
+            range = true;
+            refId = "A";
+          };
+        }
+        {
+          refId = "B";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A";
+            reducer = "last";
+          };
+        }
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$B < 1";
+          };
+        }
+      ];
+      noDataState = "Alerting";
+      execErrState = "Alerting";
+      for = "5m";
+      annotations = {
+        description = "Nginx service is down on amzebs-01";
+        summary = "Nginx Service Down on amzebs-01";
+      };
+      labels = {
+        severity = "critical";
+        host = "amzebs-01";
+      };
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/service/amzebs_phpfpm_down.nix b/hosts/web-arm/modules/grafana/alerting/service/amzebs_phpfpm_down.nix
new file mode 100644
index 0000000..bfe52fa
--- /dev/null
+++ b/hosts/web-arm/modules/grafana/alerting/service/amzebs_phpfpm_down.nix
@@ -0,0 +1,61 @@
+# Alert rule: a unit matching phpfpm-*.service on amzebs-01 is not "active".
+# `OR on() vector(0)` substitutes 0 when the query returns no series, so a
+# missing metric also trips the `$B < 1` threshold.
+{ lib, pkgs, config, ... }:
+{
+  grafanaAlertRuleDefinitions = [
+    {
+      uid = "amzebs-phpfpm-service-down-alert-uid";
+      title = "PHP-FPM Service Down on amzebs-01";
+      condition = "C";
+      data = [
+        {
+          refId = "A";
+          relativeTimeRange = {
+            from = 300;
+            to = 0;
+          };
+          datasourceUid = "vm-datasource-uid";
+          model = {
+            editorMode = "code";
+            expr = "node_systemd_unit_state{state=\"active\", name=~\"phpfpm-.*\\\\.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)";
+            hide = false;
+            intervalMs = 1000;
+            legendFormat = "__auto";
+            maxDataPoints = 43200;
+            range = true;
+            refId = "A";
+          };
+        }
+        {
+          refId = "B";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A";
+            reducer = "min";
+          };
+        }
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$B < 1";
+          };
+        }
+      ];
+      noDataState = "Alerting";
+      execErrState = "Alerting";
+      for = "5m";
+      annotations = {
+        description = "One or more PHP-FPM services are down on amzebs-01";
+        summary = "PHP-FPM Service Down on amzebs-01";
+      };
+      labels = {
+        severity = "critical";
+        host = "amzebs-01";
+      };
+    }
+  ];
+}
diff --git a/hosts/web-arm/modules/grafana/alerting/service/default.nix b/hosts/web-arm/modules/grafana/alerting/service/default.nix
index 07f1848..cdcd759 100644
--- a/hosts/web-arm/modules/grafana/alerting/service/default.nix
+++ b/hosts/web-arm/modules/grafana/alerting/service/default.nix
@@ -7,12 +7,20 @@ let
   openldapDownAlertRules = (import ./openldap_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
   wireguardDownAlertRules = (import ./wireguard_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
 
+  # amzebs-01 service alerts
+  amzebsMysqlDownAlertRules = (import ./amzebs_mysql_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  amzebsNginxDownAlertRules = (import ./amzebs_nginx_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+  amzebsPhpfpmDownAlertRules = (import ./amzebs_phpfpm_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
+
   allServiceRules = giteaDownAlertRules
     ++ giteaRunnerDownAlertRules
     ++ postfixDownAlertRules
     ++ dovecotDownAlertRules
     ++ openldapDownAlertRules
-    ++ wireguardDownAlertRules;
+    ++ wireguardDownAlertRules
+    ++ amzebsMysqlDownAlertRules
+    ++ amzebsNginxDownAlertRules
+    ++ amzebsPhpfpmDownAlertRules;
 in
 {
   services.grafana.provision.alerting.rules.settings.groups = [
diff --git a/hosts/web-arm/modules/grafana/alerting/websites/default.nix b/hosts/web-arm/modules/grafana/alerting/websites/default.nix
index fe17a9c..8df31ff 100644
--- a/hosts/web-arm/modules/grafana/alerting/websites/default.nix
+++ b/hosts/web-arm/modules/grafana/alerting/websites/default.nix
@@ -6,6 +6,9 @@ let
   allDomains = (lib.attrNames nginxVHosts)
     ++ [
       "foundry-vtt.cloonar.com"
+      # amzebs-01 domains
+      "ebs.cloonar.dev"
+      "api.ebs.cloonar.dev"
     ];
   filteredDomains = builtins.filter (d: !builtins.elem d cfg.blacklistDomains) allDomains;
   httpsDomains = lib.map (d: "https://${d}") filteredDomains;