feat: implement website alerting plan with Blackbox Exporter and VictoriaMetrics integration

This commit is contained in:
2025-06-01 00:47:43 +02:00
parent b6b90bca7d
commit f1ea4b9b20
10 changed files with 236 additions and 106 deletions

View File

@@ -9,7 +9,6 @@
./utils/modules/autoupgrade.nix
./utils/modules/promtail
./utils/modules/borgbackup.nix
# ./utils/modules/netdata.nix
# fw
./modules/network-prefix.nix

View File

@@ -14,7 +14,6 @@
./utils/modules/borgbackup.nix
./utils/modules/promtail
./utils/modules/victoriametrics
./utils/modules/netdata.nix
./modules/set-nix-channel.nix # Automatically manage nix-channel from /var/bento/channel
./hardware-configuration.nix

View File

@@ -1,4 +1,4 @@
{ lib, pkgs, ... }: {
{ config, lib, pkgs, ... }: {
imports = [
./utils/bento.nix
./utils/modules/sops.nix
@@ -17,12 +17,12 @@
./modules/grafana/default.nix
./modules/loki.nix
./modules/victoriametrics.nix
./modules/blackbox-exporter.nix
./modules/updns.nix
./utils/modules/autoupgrade.nix
./utils/modules/promtail
./utils/modules/borgbackup.nix
./utils/modules/netdata.nix
./modules/set-nix-channel.nix # Automatically manage nix-channel from /var/bento/channel
./hardware-configuration.nix

View File

@@ -0,0 +1,56 @@
{ config, pkgs, lib, ... }:
# Probes every nginx virtual host on this machine over HTTPS with the
# Prometheus Blackbox Exporter and registers a matching scrape job with
# the VictoriaMetrics promscrape config (via extraScrapeConfigs).
with lib;
let
  # All domains currently served by nginx on this host.
  nginxVHosts = config.services.nginx.virtualHosts or {};
  allDomains = lib.attrNames nginxVHosts;
  # Probe each vhost over HTTPS.
  httpsDomains = map (d: "https://${d}") allDomains;
  # Render the targets as a single-line YAML flow sequence so the result
  # is independent of the surrounding indentation of the '' string
  # (a trailing comma before ']' is valid YAML flow syntax).
  targetsList = "[ " + builtins.concatStringsSep " "
    (map (d: "\"${d}\",") httpsDomains) + " ]";
in {
  config = {
    # Systemd service for the Blackbox Exporter (listens on :9115 by default).
    systemd.services.blackbox-exporter = {
      description = "Blackbox Exporter";
      # After= alone only orders the unit; Wants= is what actually pulls
      # network-online.target in so probing waits for the network.
      wants = [ "network-online.target" ];
      after = [ "network-online.target" ];
      wantedBy = [ "multi-user.target" ];
      serviceConfig = {
        # Probing is best-effort; restart the exporter if it crashes.
        Restart = "on-failure";
        ExecStart = ''
          ${pkgs.prometheus-blackbox-exporter}/bin/blackbox_exporter \
            --config.file=/etc/blackbox_exporter/blackbox.yml
        '';
      };
    };
    # Minimal prober configuration: a plain HTTP(S) GET expecting a 2xx.
    environment.etc."blackbox_exporter/blackbox.yml".text = ''
      modules:
        http_2xx:
          prober: http
    '';
    # Scrape job consumed by the VictoriaMetrics module's
    # extraScrapeConfigs option (concatenated into prometheus.yml there).
    services.victoriametrics.extraScrapeConfigs = [
      ''
        - job_name: "blackbox_http_all_domains"
          metrics_path: "/probe"
          params:
            module: ["http_2xx"]
          static_configs:
            - targets: ${targetsList}
          relabel_configs:
            # Copy the probed URL into the ?target= query parameter ...
            - source_labels: ["__address__"]
              target_label: "__param_target"
            # ... keep it visible as the instance label ...
            - source_labels: ["__param_target"]
              target_label: "instance"
            # ... and point the actual scrape at the local exporter.
            # Without this rewrite the scraper would try to fetch
            # https://<domain>/probe instead of the blackbox exporter.
            - target_label: "__address__"
              replacement: "127.0.0.1:9115"
      ''
    ];
  };
}

View File

@@ -0,0 +1,72 @@
{ lib, pkgs, config, ... }:
# Provisions one Grafana alert rule per nginx virtual host, firing when
# the corresponding Blackbox Exporter probe fails (or stops reporting).
let
  # Every nginx virtual host on this machine, probed over HTTPS.
  nginxVHosts = config.services.nginx.virtualHosts or {};
  allDomains = lib.attrNames nginxVHosts;
  # NOTE: `map` is the Nix builtin; `lib.map` is not a documented
  # nixpkgs lib function.
  httpsDomains = map (d: "https://${d}") allDomains;

  # Build a single Grafana alert rule for one probed URL.
  mkWebsiteRule = target:
    let
      # Stable slug for the rule: strip URL punctuation, then append a
      # suffix. (":" "//" and "." become "-", "-" and "/" become "_".)
      domain = lib.replaceStrings ["://" "." "-" "/" ] ["-" "-" "_" "_"] target + "-down-alert";
      # sha1 hex digest is 40 chars — within Grafana's UID length limit
      # and deterministic across rebuilds.
      uid = builtins.hashString "sha1" domain;
    in {
      uid = uid;
      title = "Website " + target + " Down";
      condition = "C";
      data = [
        # A: probe_success for this target over the last 5 minutes;
        #    `OR on() vector(0)` substitutes 0 when the series is absent.
        {
          refId = "A";
          relativeTimeRange = { from = 300; to = 0; };
          # NOTE(review): hard-coded datasource UID — must match the UID
          # assigned in datasources/victoriametrics.nix; verify.
          datasourceUid = "vm-datasource-uid";
          model = {
            editorMode = "code";
            expr = "probe_success{target=\"" + target + "\"} OR on() vector(0)";
            hide = false;
            intervalMs = 1000;
            legendFormat = target;
            maxDataPoints = 43200;
            range = true;
            refId = "A";
          };
        }
        # B: collapse the series to its most recent sample.
        {
          refId = "B";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A";
            reducer = "last";
          };
        }
        # C: fire when the last sample is below 1 (probe failed/absent).
        {
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = "$B < 1";
          };
        }
      ];
      # Treat missing data and evaluation errors as outages.
      noDataState = "Alerting";
      execErrState = "Alerting";
      for = "5m";
      annotations = {
        description = "Website " + target + " is unreachable.";
        summary = "Website Down";
      };
      labels = {
        severity = "critical";
        website_url = target;
      };
    };

  websiteAlertRules = map mkWebsiteRule httpsDomains;
in {
  services.grafana.provision.alerting.rules.settings.groups = [
    {
      name = "Website Alerts";
      folder = "Websites";
      interval = "1m";
      rules = websiteAlertRules;
    }
  ];
}

View File

@@ -31,6 +31,7 @@ in
# Individual alert files removed, now handled by alerting/system/default.nix
./alerting/system/default.nix # Added: Imports the consolidated system alerts module
./alerting/service/default.nix # Added: Imports the new service alerts module
./alerting/websites/default.nix # Added: Imports the new websites alerts module
# ... other rule files can be added here ...
./datasources/victoriametrics.nix
./datasources/loki.nix # Add Loki datasource

View File

@@ -1,43 +1,84 @@
{ config, ... }:
{ config, lib, ... }:
with lib;
let
# configure_prom = builtins.toFile "prometheus.yml" ''
# scrape_configs:
# - job_name: 'server'
# stream_parse: true
# static_configs:
# - targets:
# - ${config.networking.hostName}:9100
# '';
configure_prom = builtins.toFile "prometheus.yml" ''
scrape_configs:
- job_name: 'server'
# System metrics
- job_name: 'node'
stream_parse: true
static_configs:
- targets:
- ${config.networking.hostName}:9100
# Systemd service monitoring
- job_name: 'systemd'
metrics_path: /metrics
params:
collect[]:
- 'systemd.service.state'
- 'systemd.service.start_time_seconds'
- 'systemd.unit_file.state'
static_configs:
- targets:
- ${config.networking.hostName}:9100
relabel_configs:
# Filter for specific services we want to monitor
- source_labels: [__name__]
regex: 'node_systemd_unit_state'
action: keep
- source_labels: [name]
regex: '(container@git|microvm@git-runner-|postfix|dovecot|openldap|wireguard-wg_cloonar).*\.service'
action: keep
${concatStringsSep "\n" config.services.victoriametrics.extraScrapeConfigs}
'';
in {
services.prometheus.exporters.node.enable = true;
sops.secrets.victoria-nginx-password.owner = "nginx";
services.victoriametrics = {
enable = true;
extraOptions = [
"-promscrape.config=${configure_prom}"
];
};
services.nginx.virtualHosts."victoria-server.cloonar.com" = {
forceSSL = true;
enableACME = true;
acmeRoot = null;
locations."/" = {
proxyWebsockets = true;
extraConfig = ''
auth_basic "Victoria password";
auth_basic_user_file ${config.sops.secrets.victoria-nginx-password.path};
proxy_read_timeout 1800s;
proxy_redirect off;
proxy_connect_timeout 1600s;
access_log off;
proxy_pass http://127.0.0.1:8428;
'';
options.services.victoriametrics = {
extraScrapeConfigs = mkOption {
type = types.listOf types.str;
default = [];
description = "Additional Prometheus scrape job YAML snippets for Blackbox Exporter probes";
};
};
config = {
services.prometheus.exporters.node.enable = true;
sops.secrets.victoria-nginx-password.owner = "nginx";
services.victoriametrics = {
enable = true;
extraOptions = [
"-promscrape.config=${configure_prom}"
];
};
services.nginx.virtualHosts."victoria-server.cloonar.com" = {
forceSSL = true;
enableACME = true;
acmeRoot = null;
locations."/" = {
proxyWebsockets = true;
extraConfig = ''
auth_basic "Victoria password";
auth_basic_user_file ${config.sops.secrets.victoria-nginx-password.path};
proxy_read_timeout 1800s;
proxy_redirect off;
proxy_connect_timeout 1600s;
access_log off;
proxy_pass http://127.0.0.1:8428;
'';
};
};
};
}

View File

@@ -97,18 +97,6 @@ in
};
config = {
# systemd.services = mapAttrs' (instance: instanceOpts:
# let
# domain = if instanceOpts.domain != null then instanceOpts.domain else instance;
# in
# nameValuePair "phpfpm-${domain}" {
# serviceConfig = {
# ProtectHome = lib.mkForce "tmpfs";
# BindPaths = "BindPaths=/var/www/${domain}:/var/www/${domain}";
# };
# }
# ) cfg.instances;
systemd.timers = mapAttrs' (instance: instanceOpts:
let
domain = if instanceOpts.domain != null then instanceOpts.domain else instance;
@@ -244,45 +232,6 @@ in
}
'';
# locations."/typo3/login" = {
# extraConfig = ''
# # Basic Authelia Config
# # Send a subsequent request to Authelia to verify if the user is authenticated
# # and has the right permissions to access the resource.
# auth_request /authelia;
# # Set the `target_url` variable based on the request. It will be used to build the portal
# # URL with the correct redirection parameter.
# auth_request_set $target_url $scheme://$http_host$request_uri;
# # Set the X-Forwarded-User and X-Forwarded-Groups with the headers
# # returned by Authelia for the backends which can consume them.
# # This is not safe, as the backend must make sure that they come from the
# # proxy. In the future, it's gonna be safe to just use OAuth.
# auth_request_set $user $upstream_http_remote_user;
# auth_request_set $groups $upstream_http_remote_groups;
# auth_request_set $name $upstream_http_remote_name;
# auth_request_set $email $upstream_http_remote_email;
# proxy_set_header Remote-User $user;
# proxy_set_header Remote-Groups $groups;
# proxy_set_header Remote-Name $name;
# proxy_set_header Remote-Email $email;
# # If Authelia returns 401, then nginx redirects the user to the login portal.
# # If it returns 200, then the request pass through to the backend.
# # For other type of errors, nginx will handle them as usual.
# error_page 401 =302 https://auth.cloonar.com/?rd=$target_url;
#
# fastcgi_param REMOTE_USER $user;
#
# include ${pkgs.nginx}/conf/fastcgi.conf;
# fastcgi_buffer_size 32k;
# fastcgi_buffers 8 16k;
# fastcgi_connect_timeout 240s;
# fastcgi_read_timeout 240s;
# fastcgi_send_timeout 240s;
# fastcgi_pass unix:${config.services.phpfpm.pools."${domain}".socket};
# fastcgi_param SCRIPT_FILENAME ${cfg.dataDir}/${domain}/public/typo3/index.php;
# '';
# };
locations."/favicon.ico".extraConfig = ''
log_not_found off;
access_log off;