feat: add alerting for amz ebs server and websites blackbox
This commit is contained in:
@@ -7,6 +7,7 @@
|
||||
./modules/mysql.nix
|
||||
./modules/web/stack.nix
|
||||
./modules/laravel-storage.nix
|
||||
./modules/blackbox-exporter.nix
|
||||
|
||||
./utils/modules/autoupgrade.nix
|
||||
./utils/modules/promtail
|
||||
|
||||
83
hosts/amzebs-01/modules/blackbox-exporter.nix
Normal file
83
hosts/amzebs-01/modules/blackbox-exporter.nix
Normal file
@@ -0,0 +1,83 @@
|
||||
# Blackbox Exporter module for this host.
#
# Probes every nginx virtual host over HTTPS and registers a scrape job that
# sends those probes through the local Blackbox Exporter (127.0.0.1:9115).
# Domains listed in `services.blackbox-exporter.blacklistDomains` are skipped.
{ config, pkgs, lib, ... }:

with lib;

let
  cfg = config.services.blackbox-exporter;

  # The attribute names of `services.nginx.virtualHosts` are used as the
  # domains to probe; falls back to an empty set when nginx is not configured.
  nginxVHosts = config.services.nginx.virtualHosts or {};
  allDomains = lib.attrNames nginxVHosts;
  filteredDomains = builtins.filter (d: !builtins.elem d cfg.blacklistDomains) allDomains;
  httpsDomains = lib.map (d: "https://${d}") filteredDomains;

  # A JSON array is also a valid YAML flow sequence. Rendering the target list
  # with toJSON gives correct quoting/escaping and keeps it on a single line,
  # so the result does not depend on indentation or on trailing commas.
  targetsYaml = builtins.toJSON httpsDomains;
in {
  options.services.blackbox-exporter.blacklistDomains = mkOption {
    type = types.listOf types.str;
    default = [];
    description = "List of domains to exclude from Blackbox Exporter monitoring";
  };

  config = {
    services.blackbox-exporter = {
      blacklistDomains = [
        # Currently no domains blacklisted - monitoring all nginx virtualHosts
      ];
    };

    # Systemd service for Blackbox Exporter
    systemd.services.blackbox-exporter = {
      description = "Blackbox Exporter";
      # `after` only orders the unit; `wants` is what actually pulls
      # network-online.target into the boot transaction.
      wants = [ "network-online.target" ];
      after = [ "network-online.target" ];
      wantedBy = [ "multi-user.target" ];
      serviceConfig.ExecStart = ''
        ${pkgs.prometheus-blackbox-exporter}/bin/blackbox_exporter \
          --config.file=/etc/blackbox_exporter/blackbox.yml
      '';
    };

    # Configuration file for Blackbox Exporter
    environment.etc."blackbox_exporter/blackbox.yml".text = ''
      modules:
        http_200_final:
          prober: http
          http:
            method: GET
            follow_redirects: true
            preferred_ip_protocol: "ip4" # avoid blanket IPv6 failures
            valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
            valid_status_codes: [200]
    '';

    # Add scrape config for VictoriaMetrics agent
    services.victoriametrics.extraScrapeConfigs = [
      ''
        - job_name: "blackbox_http_all_domains"
          metrics_path: "/probe"
          params:
            module: ["http_200_final"]

          static_configs:
            - targets: ${targetsYaml}

          relabel_configs:
            # Pass the original target URL to the exporter as ?target=...
            - source_labels: ["__address__"]
              target_label: "__param_target"
              regex: '(.*)'
              replacement: "$1"
            # Keep the probed URL as the `instance` label on the resulting series.
            - source_labels: ["__param_target"]
              target_label: "instance"
            # Scrape the local Blackbox Exporter instead of the target itself.
            - target_label: "__address__"
              replacement: "127.0.0.1:9115"
            # The exporter endpoint is plain HTTP even though the targets are HTTPS.
            - source_labels: ["__address__"]
              regex: "127\\.0\\.0\\.1:9115"
              target_label: "__scheme__"
              replacement: "http"
      ''
    ];
  };
}
|
||||
@@ -0,0 +1,58 @@
|
||||
# Grafana alert rule definition: MySQL service down on amzebs-01.
#
# Returns an attribute set whose `grafanaAlertRuleDefinitions` list holds one
# rule. The rule is a three-step pipeline (A -> B -> C) and fires when the
# condition step C is true for 5 minutes.
{ lib, pkgs, config, ... }:

let
  # Step A: query the "active" state of mysql.service for instance
  # amzebs-01:9100. `OR on() vector(0)` supplies a 0 when the selector
  # matches no series.
  unitStateQuery = {
    refId = "A";
    datasourceUid = "vm-datasource-uid";
    relativeTimeRange = { from = 300; to = 0; };
    model = {
      refId = "A";
      editorMode = "code";
      expr = "node_systemd_unit_state{state=\"active\", name=\"mysql.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)";
      range = true;
      hide = false;
      intervalMs = 1000;
      maxDataPoints = 43200;
      legendFormat = "__auto";
    };
  };

  # Step B: reduce query A with the `last` reducer.
  lastSample = {
    refId = "B";
    datasourceUid = "__expr__";
    model = { type = "reduce"; expression = "A"; reducer = "last"; };
  };

  # Step C: the alert condition - true when the reduced value is below 1.
  belowOne = {
    refId = "C";
    datasourceUid = "__expr__";
    model = { type = "math"; expression = "$B < 1"; };
  };
in
{
  grafanaAlertRuleDefinitions = [
    {
      uid = "amzebs-mysql-service-down-alert-uid";
      title = "MySQL Service Down on amzebs-01";

      data = [ unitStateQuery lastSample belowOne ];
      condition = "C";
      for = "5m";

      # Missing data and evaluation errors are both treated as alerting.
      noDataState = "Alerting";
      execErrState = "Alerting";

      labels = { severity = "critical"; host = "amzebs-01"; };
      annotations = {
        summary = "MySQL Service Down on amzebs-01";
        description = "MySQL service is down on amzebs-01";
      };
    }
  ];
}
|
||||
@@ -0,0 +1,58 @@
|
||||
# Grafana alert rule definition: Nginx service down on amzebs-01.
#
# Returns an attribute set whose `grafanaAlertRuleDefinitions` list holds one
# rule. The rule is a three-step pipeline (A -> B -> C) and fires when the
# condition step C is true for 5 minutes.
{ lib, pkgs, config, ... }:

let
  # Step A: query the "active" state of nginx.service for instance
  # amzebs-01:9100. `OR on() vector(0)` supplies a 0 when the selector
  # matches no series.
  unitStateQuery = {
    refId = "A";
    datasourceUid = "vm-datasource-uid";
    relativeTimeRange = { from = 300; to = 0; };
    model = {
      refId = "A";
      editorMode = "code";
      expr = "node_systemd_unit_state{state=\"active\", name=\"nginx.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)";
      range = true;
      hide = false;
      intervalMs = 1000;
      maxDataPoints = 43200;
      legendFormat = "__auto";
    };
  };

  # Step B: reduce query A with the `last` reducer.
  lastSample = {
    refId = "B";
    datasourceUid = "__expr__";
    model = { type = "reduce"; expression = "A"; reducer = "last"; };
  };

  # Step C: the alert condition - true when the reduced value is below 1.
  belowOne = {
    refId = "C";
    datasourceUid = "__expr__";
    model = { type = "math"; expression = "$B < 1"; };
  };
in
{
  grafanaAlertRuleDefinitions = [
    {
      uid = "amzebs-nginx-service-down-alert-uid";
      title = "Nginx Service Down on amzebs-01";

      data = [ unitStateQuery lastSample belowOne ];
      condition = "C";
      for = "5m";

      # Missing data and evaluation errors are both treated as alerting.
      noDataState = "Alerting";
      execErrState = "Alerting";

      labels = { severity = "critical"; host = "amzebs-01"; };
      annotations = {
        summary = "Nginx Service Down on amzebs-01";
        description = "Nginx service is down on amzebs-01";
      };
    }
  ];
}
|
||||
@@ -0,0 +1,58 @@
|
||||
# Grafana alert rule definition: PHP-FPM service down on amzebs-01.
#
# Returns an attribute set whose `grafanaAlertRuleDefinitions` list holds one
# rule. The rule is a three-step pipeline (A -> B -> C) and fires when the
# condition step C is true for 5 minutes.
{ lib, pkgs, config, ... }:

let
  # Step A: query the "active" state of every unit whose name matches the
  # regex `phpfpm-.*\.service` for instance amzebs-01:9100.
  # `OR on() vector(0)` supplies a 0 when the selector matches no series.
  unitStateQuery = {
    refId = "A";
    datasourceUid = "vm-datasource-uid";
    relativeTimeRange = { from = 300; to = 0; };
    model = {
      refId = "A";
      editorMode = "code";
      expr = "node_systemd_unit_state{state=\"active\", name=~\"phpfpm-.*\\\\.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)";
      range = true;
      hide = false;
      intervalMs = 1000;
      maxDataPoints = 43200;
      legendFormat = "__auto";
    };
  };

  # Step B: reduce query A with the `min` reducer (the single-unit service
  # alerts use `last` here; this rule uses `min`).
  minSample = {
    refId = "B";
    datasourceUid = "__expr__";
    model = { type = "reduce"; expression = "A"; reducer = "min"; };
  };

  # Step C: the alert condition - true when the reduced value is below 1.
  belowOne = {
    refId = "C";
    datasourceUid = "__expr__";
    model = { type = "math"; expression = "$B < 1"; };
  };
in
{
  grafanaAlertRuleDefinitions = [
    {
      uid = "amzebs-phpfpm-service-down-alert-uid";
      title = "PHP-FPM Service Down on amzebs-01";

      data = [ unitStateQuery minSample belowOne ];
      condition = "C";
      for = "5m";

      # Missing data and evaluation errors are both treated as alerting.
      noDataState = "Alerting";
      execErrState = "Alerting";

      labels = { severity = "critical"; host = "amzebs-01"; };
      annotations = {
        summary = "PHP-FPM Service Down on amzebs-01";
        description = "One or more PHP-FPM services are down on amzebs-01";
      };
    }
  ];
}
|
||||
@@ -7,12 +7,20 @@ let
|
||||
openldapDownAlertRules = (import ./openldap_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
||||
wireguardDownAlertRules = (import ./wireguard_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
||||
|
||||
# amzebs-01 service alerts
|
||||
ambebsMysqlDownAlertRules = (import ./amzebs_mysql_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
||||
ambebsNginxDownAlertRules = (import ./amzebs_nginx_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
||||
ambebsPhpfpmDownAlertRules = (import ./amzebs_phpfpm_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
|
||||
|
||||
allServiceRules = giteaDownAlertRules
|
||||
++ giteaRunnerDownAlertRules
|
||||
++ postfixDownAlertRules
|
||||
++ dovecotDownAlertRules
|
||||
++ openldapDownAlertRules
|
||||
++ wireguardDownAlertRules;
|
||||
++ wireguardDownAlertRules
|
||||
++ ambebsMysqlDownAlertRules
|
||||
++ ambebsNginxDownAlertRules
|
||||
++ ambebsPhpfpmDownAlertRules;
|
||||
in
|
||||
{
|
||||
services.grafana.provision.alerting.rules.settings.groups = [
|
||||
|
||||
@@ -6,6 +6,9 @@ let
|
||||
allDomains =
|
||||
(lib.attrNames nginxVHosts) ++ [
|
||||
"foundry-vtt.cloonar.com"
|
||||
# amzebs-01 domains
|
||||
"ebs.cloonar.dev"
|
||||
"api.ebs.cloonar.dev"
|
||||
];
|
||||
filteredDomains = builtins.filter (d: !builtins.elem d cfg.blacklistDomains) allDomains;
|
||||
httpsDomains = lib.map (d: "https://${d}") filteredDomains;
|
||||
|
||||
Reference in New Issue
Block a user