feat: add alerting for amz ebs server and websites blackbox

This commit is contained in:
2025-11-14 23:08:27 +01:00
parent 01d3ab1357
commit 8a2a68a91c
7 changed files with 270 additions and 1 deletions

View File

@@ -7,6 +7,7 @@
./modules/mysql.nix ./modules/mysql.nix
./modules/web/stack.nix ./modules/web/stack.nix
./modules/laravel-storage.nix ./modules/laravel-storage.nix
./modules/blackbox-exporter.nix
./utils/modules/autoupgrade.nix ./utils/modules/autoupgrade.nix
./utils/modules/promtail ./utils/modules/promtail

View File

@@ -0,0 +1,83 @@
# Blackbox Exporter module.
#
# Runs prometheus-blackbox-exporter as a systemd service, writes its probe
# configuration to /etc/blackbox_exporter/blackbox.yml and registers a scrape
# job that probes every nginx virtual host over HTTPS, except the domains
# listed in services.blackbox-exporter.blacklistDomains.
{ config, pkgs, lib, ... }:
let
  inherit (lib) mkOption types;

  cfg = config.services.blackbox-exporter;

  # Path of the exporter configuration, relative to /etc.
  configPath = "blackbox_exporter/blackbox.yml";

  # Attribute names of all nginx virtual hosts; empty when none are defined.
  nginxVHosts = config.services.nginx.virtualHosts or { };
  allDomains = lib.attrNames nginxVHosts;
  filteredDomains = builtins.filter (d: !builtins.elem d cfg.blacklistDomains) allDomains;
  httpsDomains = map (d: "https://${d}") filteredDomains;

  # A JSON array is also a valid YAML flow sequence, so let Nix do the quoting
  # and escaping instead of gluing quotes and commas together by hand. An empty
  # domain list renders as "[]".
  targetsYaml = builtins.toJSON httpsDomains;
in
{
  options.services.blackbox-exporter.blacklistDomains = mkOption {
    type = types.listOf types.str;
    default = [ ];
    description = "List of domains to exclude from Blackbox Exporter monitoring";
  };

  config = {
    services.blackbox-exporter = {
      blacklistDomains = [
        # Currently no domains blacklisted - monitoring all nginx virtualHosts
      ];
    };

    # Systemd service for Blackbox Exporter
    systemd.services.blackbox-exporter = {
      description = "Blackbox Exporter";
      # "after" only orders the unit; "wants" is what actually pulls
      # network-online.target into the boot transaction.
      wants = [ "network-online.target" ];
      after = [ "network-online.target" ];
      wantedBy = [ "multi-user.target" ];
      # The unit text does not change when only the YAML below changes, so
      # restart explicitly whenever the generated config file differs.
      restartTriggers = [ config.environment.etc.${configPath}.source ];
      serviceConfig = {
        ExecStart = "${pkgs.prometheus-blackbox-exporter}/bin/blackbox_exporter --config.file=/etc/${configPath}";
        # Without this a crashed exporter stays down and every probe fails.
        Restart = "on-failure";
      };
    };

    # Configuration file for Blackbox Exporter
    environment.etc.${configPath}.text = ''
      modules:
        http_200_final:
          prober: http
          http:
            method: GET
            follow_redirects: true
            preferred_ip_protocol: "ip4" # avoid blanket IPv6 failures
            valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
            valid_status_codes: [200]
    '';

    # Add scrape config for VictoriaMetrics agent.
    # The relabeling moves the probed URL into the "target" query parameter and
    # the "instance" label, then points the scrape itself at the local exporter.
    services.victoriametrics.extraScrapeConfigs = [
      ''
        - job_name: "blackbox_http_all_domains"
          metrics_path: "/probe"
          params:
            module: ["http_200_final"]
          static_configs:
            - targets: ${targetsYaml}
          relabel_configs:
            - source_labels: ["__address__"]
              target_label: "__param_target"
              regex: '(.*)'
              replacement: "$1"
            - source_labels: ["__param_target"]
              target_label: "instance"
            - target_label: "__address__"
              replacement: "127.0.0.1:9115"
            - source_labels: ["__address__"]
              regex: "127\\.0\\.0\\.1:9115"
              target_label: "__scheme__"
              replacement: "http"
      ''
    ];
  };
}

View File

@@ -0,0 +1,58 @@
# Grafana alert rule for the MySQL systemd unit on amzebs-01.
#
# The function arguments are accepted so the file can be imported with the
# same argument set as the other rule files; the body does not use them.
{ lib, pkgs, config, ... }:
let
  host = "amzebs-01";
  unit = "mysql.service";

  # Step A: "active" state of the unit over the last 300 seconds, with a
  # constant 0 substituted when the selector matches no series.
  unitStateQuery = {
    refId = "A";
    datasourceUid = "vm-datasource-uid";
    relativeTimeRange = {
      from = 300;
      to = 0;
    };
    model = {
      refId = "A";
      editorMode = "code";
      expr = ''node_systemd_unit_state{state="active", name="${unit}", instance="${host}:9100"} OR on() vector(0)'';
      range = true;
      hide = false;
      intervalMs = 1000;
      maxDataPoints = 43200;
      legendFormat = "__auto";
    };
  };

  # Step B: collapse step A to its most recent value.
  latestValue = {
    refId = "B";
    datasourceUid = "__expr__";
    model = {
      type = "reduce";
      expression = "A";
      reducer = "last";
    };
  };

  # Step C: the alert condition, true when the reduced value is below 1.
  belowOne = {
    refId = "C";
    datasourceUid = "__expr__";
    model = {
      type = "math";
      expression = "$B < 1";
    };
  };
in
{
  grafanaAlertRuleDefinitions = [
    {
      uid = "amzebs-mysql-service-down-alert-uid";
      title = "MySQL Service Down on ${host}";
      condition = "C";
      data = [ unitStateQuery latestValue belowOne ];
      # Missing data and evaluation errors are both treated as alerting.
      noDataState = "Alerting";
      execErrState = "Alerting";
      for = "5m";
      labels = {
        severity = "critical";
        inherit host;
      };
      annotations = {
        summary = "MySQL Service Down on ${host}";
        description = "MySQL service is down on ${host}";
      };
    }
  ];
}

View File

@@ -0,0 +1,58 @@
# Grafana alert rule for the nginx systemd unit on amzebs-01.
#
# The function arguments are accepted so the file can be imported with the
# same argument set as the other rule files; the body does not use them.
{ lib, pkgs, config, ... }:
let
  host = "amzebs-01";
  unit = "nginx.service";

  # Step A: "active" state of the unit over the last 300 seconds, with a
  # constant 0 substituted when the selector matches no series.
  unitStateQuery = {
    refId = "A";
    datasourceUid = "vm-datasource-uid";
    relativeTimeRange = {
      from = 300;
      to = 0;
    };
    model = {
      refId = "A";
      editorMode = "code";
      expr = ''node_systemd_unit_state{state="active", name="${unit}", instance="${host}:9100"} OR on() vector(0)'';
      range = true;
      hide = false;
      intervalMs = 1000;
      maxDataPoints = 43200;
      legendFormat = "__auto";
    };
  };

  # Step B: collapse step A to its most recent value.
  latestValue = {
    refId = "B";
    datasourceUid = "__expr__";
    model = {
      type = "reduce";
      expression = "A";
      reducer = "last";
    };
  };

  # Step C: the alert condition, true when the reduced value is below 1.
  belowOne = {
    refId = "C";
    datasourceUid = "__expr__";
    model = {
      type = "math";
      expression = "$B < 1";
    };
  };
in
{
  grafanaAlertRuleDefinitions = [
    {
      uid = "amzebs-nginx-service-down-alert-uid";
      title = "Nginx Service Down on ${host}";
      condition = "C";
      data = [ unitStateQuery latestValue belowOne ];
      # Missing data and evaluation errors are both treated as alerting.
      noDataState = "Alerting";
      execErrState = "Alerting";
      for = "5m";
      labels = {
        severity = "critical";
        inherit host;
      };
      annotations = {
        summary = "Nginx Service Down on ${host}";
        description = "Nginx service is down on ${host}";
      };
    }
  ];
}

View File

@@ -0,0 +1,58 @@
# Grafana alert rule for the PHP-FPM systemd units on amzebs-01.
#
# The function arguments are accepted so the file can be imported with the
# same argument set as the other rule files; the body does not use them.
{ lib, pkgs, config, ... }:
let
  host = "amzebs-01";

  # Step A: "active" state of every unit whose name matches phpfpm-*.service
  # over the last 300 seconds, with a constant 0 substituted when the selector
  # matches no series. Backslashes are literal inside a '' string, so the
  # query text carries the two backslashes PromQL needs to escape the dot.
  unitStateQuery = {
    refId = "A";
    datasourceUid = "vm-datasource-uid";
    relativeTimeRange = {
      from = 300;
      to = 0;
    };
    model = {
      refId = "A";
      editorMode = "code";
      expr = ''node_systemd_unit_state{state="active", name=~"phpfpm-.*\\.service", instance="${host}:9100"} OR on() vector(0)'';
      range = true;
      hide = false;
      intervalMs = 1000;
      maxDataPoints = 43200;
      legendFormat = "__auto";
    };
  };

  # Step B: collapse step A using the "min" reducer.
  # NOTE(review): the MySQL and nginx rules reduce with "last"; confirm that
  # "min" over the 300 second window is intended here.
  lowestValue = {
    refId = "B";
    datasourceUid = "__expr__";
    model = {
      type = "reduce";
      expression = "A";
      reducer = "min";
    };
  };

  # Step C: the alert condition, true when the reduced value is below 1.
  belowOne = {
    refId = "C";
    datasourceUid = "__expr__";
    model = {
      type = "math";
      expression = "$B < 1";
    };
  };
in
{
  grafanaAlertRuleDefinitions = [
    {
      uid = "amzebs-phpfpm-service-down-alert-uid";
      title = "PHP-FPM Service Down on ${host}";
      condition = "C";
      data = [ unitStateQuery lowestValue belowOne ];
      # Missing data and evaluation errors are both treated as alerting.
      noDataState = "Alerting";
      execErrState = "Alerting";
      for = "5m";
      labels = {
        severity = "critical";
        inherit host;
      };
      annotations = {
        summary = "PHP-FPM Service Down on ${host}";
        description = "One or more PHP-FPM services are down on ${host}";
      };
    }
  ];
}

View File

@@ -7,12 +7,20 @@ let
openldapDownAlertRules = (import ./openldap_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; openldapDownAlertRules = (import ./openldap_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
wireguardDownAlertRules = (import ./wireguard_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions; wireguardDownAlertRules = (import ./wireguard_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
# amzebs-01 service alerts
ambebsMysqlDownAlertRules = (import ./amzebs_mysql_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
ambebsNginxDownAlertRules = (import ./amzebs_nginx_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
ambebsPhpfpmDownAlertRules = (import ./amzebs_phpfpm_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
allServiceRules = giteaDownAlertRules allServiceRules = giteaDownAlertRules
++ giteaRunnerDownAlertRules ++ giteaRunnerDownAlertRules
++ postfixDownAlertRules ++ postfixDownAlertRules
++ dovecotDownAlertRules ++ dovecotDownAlertRules
++ openldapDownAlertRules ++ openldapDownAlertRules
++ wireguardDownAlertRules; ++ wireguardDownAlertRules
++ ambebsMysqlDownAlertRules
++ ambebsNginxDownAlertRules
++ ambebsPhpfpmDownAlertRules;
in in
{ {
services.grafana.provision.alerting.rules.settings.groups = [ services.grafana.provision.alerting.rules.settings.groups = [

View File

@@ -6,6 +6,9 @@ let
allDomains = allDomains =
(lib.attrNames nginxVHosts) ++ [ (lib.attrNames nginxVHosts) ++ [
"foundry-vtt.cloonar.com" "foundry-vtt.cloonar.com"
# amzebs-01 domains
"ebs.cloonar.dev"
"api.ebs.cloonar.dev"
]; ];
filteredDomains = builtins.filter (d: !builtins.elem d cfg.blacklistDomains) allDomains; filteredDomains = builtins.filter (d: !builtins.elem d cfg.blacklistDomains) allDomains;
httpsDomains = lib.map (d: "https://${d}") filteredDomains; httpsDomains = lib.map (d: "https://${d}") filteredDomains;