feat: add alerting for amz ebs server and websites blackbox
This commit is contained in:
@@ -7,6 +7,7 @@
|
|||||||
./modules/mysql.nix
|
./modules/mysql.nix
|
||||||
./modules/web/stack.nix
|
./modules/web/stack.nix
|
||||||
./modules/laravel-storage.nix
|
./modules/laravel-storage.nix
|
||||||
|
./modules/blackbox-exporter.nix
|
||||||
|
|
||||||
./utils/modules/autoupgrade.nix
|
./utils/modules/autoupgrade.nix
|
||||||
./utils/modules/promtail
|
./utils/modules/promtail
|
||||||
|
|||||||
83
hosts/amzebs-01/modules/blackbox-exporter.nix
Normal file
83
hosts/amzebs-01/modules/blackbox-exporter.nix
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
# Blackbox Exporter module for this host.
#
# - declares `services.blackbox-exporter.blacklistDomains`
# - runs the Prometheus blackbox exporter as a systemd service
# - writes the exporter's probe configuration to /etc/blackbox_exporter/blackbox.yml
# - registers a VictoriaMetrics scrape job that probes every nginx virtual
#   host (minus the blacklisted ones) over HTTPS
{ config, pkgs, lib, ... }:

with lib;

let
  cfg = config.services.blackbox-exporter;

  # Names of all nginx virtual hosts configured on this machine
  # (empty attrset when nginx has no virtualHosts attribute).
  nginxVHosts = config.services.nginx.virtualHosts or {};
  allDomains = lib.attrNames nginxVHosts;

  # Remove every domain listed in `blacklistDomains`.
  filteredDomains = builtins.filter (d: !builtins.elem d cfg.blacklistDomains) allDomains;

  # Probe targets: each remaining domain as an https:// URL.
  httpsDomains = lib.map (d: "https://${d}") filteredDomains;

  # The targets rendered as a JSON array. A JSON array of strings is also a
  # valid single-line YAML flow sequence, so it can be spliced into the scrape
  # config without hand-written quoting or indentation-sensitive line breaks,
  # and it degrades to `[]` when no domains are left.
  targetsYaml = builtins.toJSON httpsDomains;
in {
  options.services.blackbox-exporter.blacklistDomains = mkOption {
    type = types.listOf types.str;
    default = [];
    description = "List of domains to exclude from Blackbox Exporter monitoring";
  };

  config = {
    services.blackbox-exporter = {
      blacklistDomains = [
        # Currently no domains blacklisted - monitoring all nginx virtualHosts
      ];
    };

    # Systemd service for Blackbox Exporter
    systemd.services.blackbox-exporter = {
      description = "Blackbox Exporter";
      # `after` only orders the unit; `wants` is what actually pulls
      # network-online.target into the boot transaction. Without it the
      # ordering is a no-op unless some other unit activates the target.
      wants = [ "network-online.target" ];
      after = [ "network-online.target" ];
      wantedBy = [ "multi-user.target" ];
      serviceConfig.ExecStart = ''
        ${pkgs.prometheus-blackbox-exporter}/bin/blackbox_exporter \
          --config.file=/etc/blackbox_exporter/blackbox.yml
      '';
    };

    # Configuration file for Blackbox Exporter
    environment.etc."blackbox_exporter/blackbox.yml".text = ''
      modules:
        http_200_final:
          prober: http
          http:
            method: GET
            follow_redirects: true
            preferred_ip_protocol: "ip4" # avoid blanket IPv6 failures
            valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
            valid_status_codes: [200]
    '';

    # Add scrape config for VictoriaMetrics agent.
    # The relabel rules move the original target URL into the `target` query
    # parameter and the `instance` label, then point the scrape itself at
    # 127.0.0.1:9115 over plain HTTP.
    # NOTE(review): assumes the exporter listens on 127.0.0.1:9115; no listen
    # address is passed in ExecStart, so this relies on the exporter default.
    services.victoriametrics.extraScrapeConfigs = [
      ''
        - job_name: "blackbox_http_all_domains"
          metrics_path: "/probe"
          params:
            module: ["http_200_final"]

          static_configs:
            - targets: ${targetsYaml}

          relabel_configs:
            - source_labels: ["__address__"]
              target_label: "__param_target"
              regex: '(.*)'
              replacement: "$1"
            - source_labels: ["__param_target"]
              target_label: "instance"
            - target_label: "__address__"
              replacement: "127.0.0.1:9115"
            - source_labels: ["__address__"]
              regex: "127\\.0\\.0\\.1:9115"
              target_label: "__scheme__"
              replacement: "http"
      ''
    ];
  };
}
|
||||||
@@ -0,0 +1,58 @@
|
|||||||
|
# Grafana alert rule definition: mysql.service not active on amzebs-01.
#
# Returns an attrset whose `grafanaAlertRuleDefinitions` list holds a single
# rule built from three chained steps (A -> B -> C); C is the alert condition.
{ lib, pkgs, config, ... }:

let
  # Step A: query the systemd unit state metric for mysql.service.
  # `OR on() vector(0)` supplies a 0 sample when the selector matches nothing.
  stateQuery = {
    refId = "A";
    relativeTimeRange = { from = 300; to = 0; };
    datasourceUid = "vm-datasource-uid";
    model = {
      refId = "A";
      editorMode = "code";
      expr = ''node_systemd_unit_state{state="active", name="mysql.service", instance="amzebs-01:9100"} OR on() vector(0)'';
      range = true;
      hide = false;
      intervalMs = 1000;
      maxDataPoints = 43200;
      legendFormat = "__auto";
    };
  };

  # Step B: reduce the series from A to its last value.
  lastValue = {
    refId = "B";
    datasourceUid = "__expr__";
    model = {
      type = "reduce";
      reducer = "last";
      expression = "A";
    };
  };

  # Step C: condition is true when the reduced value is below 1.
  belowOne = {
    refId = "C";
    datasourceUid = "__expr__";
    model = {
      type = "math";
      expression = "$B < 1";
    };
  };
in
{
  grafanaAlertRuleDefinitions = [
    {
      uid = "amzebs-mysql-service-down-alert-uid";
      title = "MySQL Service Down on amzebs-01";

      condition = "C";
      data = [ stateQuery lastValue belowOne ];

      # Missing data and evaluation errors are both treated as alerting.
      noDataState = "Alerting";
      execErrState = "Alerting";
      for = "5m";

      labels = {
        host = "amzebs-01";
        severity = "critical";
      };
      annotations = {
        summary = "MySQL Service Down on amzebs-01";
        description = "MySQL service is down on amzebs-01";
      };
    }
  ];
}
|
||||||
@@ -0,0 +1,58 @@
|
|||||||
|
# Grafana alert rule definition: nginx.service not active on amzebs-01.
#
# Returns an attrset whose `grafanaAlertRuleDefinitions` list holds a single
# rule built from three chained steps (A -> B -> C); C is the alert condition.
{ lib, pkgs, config, ... }:

let
  # Step A: query the systemd unit state metric for nginx.service.
  # `OR on() vector(0)` supplies a 0 sample when the selector matches nothing.
  stateQuery = {
    refId = "A";
    relativeTimeRange = { from = 300; to = 0; };
    datasourceUid = "vm-datasource-uid";
    model = {
      refId = "A";
      editorMode = "code";
      expr = ''node_systemd_unit_state{state="active", name="nginx.service", instance="amzebs-01:9100"} OR on() vector(0)'';
      range = true;
      hide = false;
      intervalMs = 1000;
      maxDataPoints = 43200;
      legendFormat = "__auto";
    };
  };

  # Step B: reduce the series from A to its last value.
  lastValue = {
    refId = "B";
    datasourceUid = "__expr__";
    model = {
      type = "reduce";
      reducer = "last";
      expression = "A";
    };
  };

  # Step C: condition is true when the reduced value is below 1.
  belowOne = {
    refId = "C";
    datasourceUid = "__expr__";
    model = {
      type = "math";
      expression = "$B < 1";
    };
  };
in
{
  grafanaAlertRuleDefinitions = [
    {
      uid = "amzebs-nginx-service-down-alert-uid";
      title = "Nginx Service Down on amzebs-01";

      condition = "C";
      data = [ stateQuery lastValue belowOne ];

      # Missing data and evaluation errors are both treated as alerting.
      noDataState = "Alerting";
      execErrState = "Alerting";
      for = "5m";

      labels = {
        host = "amzebs-01";
        severity = "critical";
      };
      annotations = {
        summary = "Nginx Service Down on amzebs-01";
        description = "Nginx service is down on amzebs-01";
      };
    }
  ];
}
|
||||||
@@ -0,0 +1,58 @@
|
|||||||
|
# Grafana alert rule definition: a phpfpm-*.service unit not active on amzebs-01.
#
# Returns an attrset whose `grafanaAlertRuleDefinitions` list holds a single
# rule built from three chained steps (A -> B -> C); C is the alert condition.
{ lib, pkgs, config, ... }:

let
  # Step A: query the systemd unit state metric for every unit whose name
  # matches the regex phpfpm-.*\.service (the backslash is doubled inside the
  # PromQL string literal).
  # `OR on() vector(0)` supplies a 0 sample when the selector matches nothing.
  stateQuery = {
    refId = "A";
    relativeTimeRange = { from = 300; to = 0; };
    datasourceUid = "vm-datasource-uid";
    model = {
      refId = "A";
      editorMode = "code";
      expr = ''node_systemd_unit_state{state="active", name=~"phpfpm-.*\\.service", instance="amzebs-01:9100"} OR on() vector(0)'';
      range = true;
      hide = false;
      intervalMs = 1000;
      maxDataPoints = 43200;
      legendFormat = "__auto";
    };
  };

  # Step B: reduce A with the `min` reducer (the MySQL and Nginx rules use `last`).
  minValue = {
    refId = "B";
    datasourceUid = "__expr__";
    model = {
      type = "reduce";
      reducer = "min";
      expression = "A";
    };
  };

  # Step C: condition is true when the reduced value is below 1.
  belowOne = {
    refId = "C";
    datasourceUid = "__expr__";
    model = {
      type = "math";
      expression = "$B < 1";
    };
  };
in
{
  grafanaAlertRuleDefinitions = [
    {
      uid = "amzebs-phpfpm-service-down-alert-uid";
      title = "PHP-FPM Service Down on amzebs-01";

      condition = "C";
      data = [ stateQuery minValue belowOne ];

      # Missing data and evaluation errors are both treated as alerting.
      noDataState = "Alerting";
      execErrState = "Alerting";
      for = "5m";

      labels = {
        host = "amzebs-01";
        severity = "critical";
      };
      annotations = {
        summary = "PHP-FPM Service Down on amzebs-01";
        description = "One or more PHP-FPM services are down on amzebs-01";
      };
    }
  ];
}
|
||||||
@@ -7,12 +7,20 @@ let
|
|||||||
openldapDownAlertRules = (import ./openldap_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
openldapDownAlertRules = (import ./openldap_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
||||||
wireguardDownAlertRules = (import ./wireguard_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
wireguardDownAlertRules = (import ./wireguard_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
||||||
|
|
||||||
|
# amzebs-01 service alerts
|
||||||
|
ambebsMysqlDownAlertRules = (import ./amzebs_mysql_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
||||||
|
ambebsNginxDownAlertRules = (import ./amzebs_nginx_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
||||||
|
ambebsPhpfpmDownAlertRules = (import ./amzebs_phpfpm_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
||||||
|
|
||||||
allServiceRules = giteaDownAlertRules
|
allServiceRules = giteaDownAlertRules
|
||||||
++ giteaRunnerDownAlertRules
|
++ giteaRunnerDownAlertRules
|
||||||
++ postfixDownAlertRules
|
++ postfixDownAlertRules
|
||||||
++ dovecotDownAlertRules
|
++ dovecotDownAlertRules
|
||||||
++ openldapDownAlertRules
|
++ openldapDownAlertRules
|
||||||
++ wireguardDownAlertRules;
|
++ wireguardDownAlertRules
|
||||||
|
++ ambebsMysqlDownAlertRules
|
||||||
|
++ ambebsNginxDownAlertRules
|
||||||
|
++ ambebsPhpfpmDownAlertRules;
|
||||||
in
|
in
|
||||||
{
|
{
|
||||||
services.grafana.provision.alerting.rules.settings.groups = [
|
services.grafana.provision.alerting.rules.settings.groups = [
|
||||||
|
|||||||
@@ -6,6 +6,9 @@ let
|
|||||||
allDomains =
|
allDomains =
|
||||||
(lib.attrNames nginxVHosts) ++ [
|
(lib.attrNames nginxVHosts) ++ [
|
||||||
"foundry-vtt.cloonar.com"
|
"foundry-vtt.cloonar.com"
|
||||||
|
# amzebs-01 domains
|
||||||
|
"ebs.cloonar.dev"
|
||||||
|
"api.ebs.cloonar.dev"
|
||||||
];
|
];
|
||||||
filteredDomains = builtins.filter (d: !builtins.elem d cfg.blacklistDomains) allDomains;
|
filteredDomains = builtins.filter (d: !builtins.elem d cfg.blacklistDomains) allDomains;
|
||||||
httpsDomains = lib.map (d: "https://${d}") filteredDomains;
|
httpsDomains = lib.map (d: "https://${d}") filteredDomains;
|
||||||
|
|||||||
Reference in New Issue
Block a user