feat: implement website alerting plan with Blackbox Exporter and VictoriaMetrics integration
@@ -9,7 +9,6 @@
     ./utils/modules/autoupgrade.nix
     ./utils/modules/promtail
     ./utils/modules/borgbackup.nix
-    # ./utils/modules/netdata.nix

     # fw
     ./modules/network-prefix.nix

@@ -14,7 +14,6 @@
     ./utils/modules/borgbackup.nix
     ./utils/modules/promtail
     ./utils/modules/victoriametrics
-    ./utils/modules/netdata.nix
     ./modules/set-nix-channel.nix # Automatically manage nix-channel from /var/bento/channel

     ./hardware-configuration.nix

@@ -1,4 +1,4 @@
-{ lib, pkgs, ... }: {
+{ config, lib, pkgs, ... }: {
   imports = [
     ./utils/bento.nix
     ./utils/modules/sops.nix
@@ -17,12 +17,12 @@
     ./modules/grafana/default.nix
     ./modules/loki.nix
     ./modules/victoriametrics.nix
+    ./modules/blackbox-exporter.nix
     ./modules/updns.nix

     ./utils/modules/autoupgrade.nix
     ./utils/modules/promtail
     ./utils/modules/borgbackup.nix
-    ./utils/modules/netdata.nix
     ./modules/set-nix-channel.nix # Automatically manage nix-channel from /var/bento/channel

     ./hardware-configuration.nix
hosts/web-arm/modules/blackbox-exporter.nix | 57 (new file)
@@ -0,0 +1,57 @@
+{ config, pkgs, lib, ... }:
+
+with lib;
+
+let
+  hostname = config.networking.hostName;
+
+  nginxVHosts = config.services.nginx.virtualHosts or {};
+  allDomains = lib.attrNames nginxVHosts;
+  httpsDomains = map (d: "https://${d}") allDomains;
+  # Rendered as a single-line YAML flow sequence so the generated
+  # scrape config stays valid however the snippet is re-indented.
+  domainsString = builtins.concatStringsSep ", " (map (d: "\"${d}\"") httpsDomains);
+in {
+  config = {
+    # Systemd service for Blackbox Exporter
+    systemd.services.blackbox-exporter = {
+      description = "Blackbox Exporter";
+      after = [ "network-online.target" ];
+      wantedBy = [ "multi-user.target" ];
+      # Kept on one line: a leading newline in ExecStart breaks the generated unit file.
+      serviceConfig.ExecStart = "${pkgs.prometheus-blackbox-exporter}/bin/blackbox_exporter --config.file=/etc/blackbox_exporter/blackbox.yml";
+    };
+
+    # Configuration file for Blackbox Exporter
+    environment.etc."blackbox_exporter/blackbox.yml".text = ''
+      modules:
+        http_2xx:
+          prober: http
+    '';
+
+    # Add scrape config for VictoriaMetrics agent
+    services.victoriametrics.extraScrapeConfigs = [
+      ''
+        - job_name: "blackbox_http_all_domains"
+          metrics_path: "/probe"
+          params:
+            module: ["http_2xx"]
+
+          static_configs:
+            - targets: [ ${domainsString} ]
+
+          relabel_configs:
+            # Copy the probed URL into the ?target= query parameter ...
+            - source_labels: ["__address__"]
+              target_label: "__param_target"
+            # ... keep it as the instance label on the stored series ...
+            - source_labels: ["__param_target"]
+              target_label: "instance"
+            # ... and point the scrape itself at the local Blackbox Exporter
+            # (assumes the exporter's default listen address, 127.0.0.1:9115).
+            - target_label: "__address__"
+              replacement: "127.0.0.1:9115"
+      ''
+    ];
+  };
+}
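Two notes on the scrape job above. First, `domainsString` expands to a one-line flow sequence; a minimal sketch of the evaluation, runnable with `nix-instantiate --eval` (the two domains are hypothetical stand-ins for `lib.attrNames config.services.nginx.virtualHosts`):

    let
      allDomains = [ "example.org" "blog.example.org" ];  # hypothetical vhost names
      httpsDomains = map (d: "https://${d}") allDomains;
    in builtins.concatStringsSep ", " (map (d: "\"${d}\"") httpsDomains)
    # => "\"https://example.org\", \"https://blog.example.org\""

Second, the final relabel step is what makes blackbox probing work at all: without rewriting `__address__` to the exporter's own address, the agent would try to scrape each website directly instead of asking the exporter to probe it.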
hosts/web-arm/modules/grafana/alerting/websites/default.nix | 75 (new file)
@@ -0,0 +1,75 @@
+{ lib, pkgs, config, ... }:
+
+let
+  nginxVHosts = config.services.nginx.virtualHosts or {};
+  allDomains = lib.attrNames nginxVHosts;
+  httpsDomains = map (d: "https://${d}") allDomains;
+  websiteAlertRules = map (target:
+    let
+      domain = lib.replaceStrings [ "://" "." "-" "/" ] [ "-" "-" "_" "_" ] target + "-down-alert";
+      uid = builtins.hashString "sha1" domain;
+    in {
+      uid = uid;
+      title = "Website " + target + " Down";
+      condition = "C";
+
+      data = [
+        {
+          refId = "A";
+          relativeTimeRange = { from = 300; to = 0; };
+          # Must match the UID of the provisioned VictoriaMetrics datasource.
+          datasourceUid = "vm-datasource-uid";
+          model = {
+            editorMode = "code";
+            # The blackbox relabeling stores the probed URL in the `instance`
+            # label; `OR on() vector(0)` turns a missing series into a firing 0.
+            expr = "probe_success{instance=\"" + target + "\"} OR on() vector(0)";
+            hide = false;
+            intervalMs = 1000;
+            legendFormat = target;
+            maxDataPoints = 43200;
+            range = true;
+            refId = "A";
+          };
+        }
+        {
+          refId = "B";
+          datasourceUid = "__expr__";
+          model = {
+            type = "reduce";
+            expression = "A";
+            reducer = "last";
+          };
+        }
+        {
+          refId = "C";
+          datasourceUid = "__expr__";
+          model = {
+            type = "math";
+            expression = "$B < 1";
+          };
+        }
+      ];
+      noDataState = "Alerting";
+      execErrState = "Alerting";
+      for = "5m";
+      annotations = {
+        description = "Website " + target + " is unreachable.";
+        summary = "Website Down";
+      };
+      labels = {
+        severity = "critical";
+        website_url = target;
+      };
+    }
+  ) httpsDomains;
+in {
+  services.grafana.provision.alerting.rules.settings.groups = [
+    {
+      name = "Website Alerts";
+      folder = "Websites";
+      interval = "1m";
+      rules = websiteAlertRules;
+    }
+  ];
+}
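The rule UIDs are derived deterministically from the target URL, so re-provisioning stays stable across rebuilds. A minimal sketch of the slug and UID derivation for one hypothetical target, runnable in `nix repl` with `lib` taken from nixpkgs:

    let
      lib = (import <nixpkgs> {}).lib;
      target = "https://example.org";  # hypothetical probe target
      # replaceStrings scans the input once, left to right, so the "-" -> "_"
      # rule only hits hyphens already present in the domain, not the "-"
      # characters inserted by the "://" and "." replacements.
      domain = lib.replaceStrings [ "://" "." "-" "/" ] [ "-" "-" "_" "_" ] target + "-down-alert";
    in {
      inherit domain;                           # "https-example-org-down-alert"
      uid = builtins.hashString "sha1" domain;  # 40 hex chars, exactly Grafana's UID length limit
    }

Note that the query in refId A matches on the `instance` label because that is where the blackbox relabeling puts the probed URL; `probe_success` carries no `target` label.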
@@ -31,6 +31,7 @@ in
     # Individual alert files removed, now handled by alerting/system/default.nix
     ./alerting/system/default.nix # Added: Imports the consolidated system alerts module
     ./alerting/service/default.nix # Added: Imports the new service alerts module
+    ./alerting/websites/default.nix # Added: Imports the new websites alerts module
     # ... other rule files can be added here ...
     ./datasources/victoriametrics.nix
     ./datasources/loki.nix # Add Loki datasource
@@ -1,14 +1,55 @@
-{ config, ... }:
+{ config, lib, ... }:
+with lib;
 let
+  # configure_prom = builtins.toFile "prometheus.yml" ''
+  #   scrape_configs:
+  #   - job_name: 'server'
+  #     stream_parse: true
+  #     static_configs:
+  #     - targets:
+  #       - ${config.networking.hostName}:9100
+  # '';
   configure_prom = builtins.toFile "prometheus.yml" ''
     scrape_configs:
-    - job_name: 'server'
+    # System metrics
+    - job_name: 'node'
       stream_parse: true
       static_configs:
       - targets:
         - ${config.networking.hostName}:9100
+
+    # Systemd service monitoring
+    - job_name: 'systemd'
+      metrics_path: /metrics
+      params:
+        collect[]:
+          - 'systemd.service.state'
+          - 'systemd.service.start_time_seconds'
+          - 'systemd.unit_file.state'
+      static_configs:
+      - targets:
+        - ${config.networking.hostName}:9100
+      relabel_configs:
+      # Filter for specific services we want to monitor
+      - source_labels: [__name__]
+        regex: 'node_systemd_unit_state'
+        action: keep
+      - source_labels: [name]
+        regex: '(container@git|microvm@git-runner-|postfix|dovecot|openldap|wireguard-wg_cloonar).*\.service'
+        action: keep
+
+    ${concatStringsSep "\n" config.services.victoriametrics.extraScrapeConfigs}
   '';
 in {
+  options.services.victoriametrics = {
+    extraScrapeConfigs = mkOption {
+      type = types.listOf types.str;
+      default = [];
+      description = "Additional Prometheus scrape job YAML snippets for Blackbox Exporter probes";
+    };
+  };
+
+  config = {
   services.prometheus.exporters.node.enable = true;

   sops.secrets.victoria-nginx-password.owner = "nginx";

@@ -39,5 +80,5 @@ in {
   '';
   };
   };
+  };
 }
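A subtlety worth keeping in mind when splicing `extraScrapeConfigs` into the indented string: Nix does not re-indent interpolated multi-line values, so only the first line of each snippet inherits the surrounding indentation. A minimal sketch of the effect, runnable in `nix repl`:

    let snippet = "- job_name: a\n  static_configs: []";
    in ''
      scrape_configs:
        ${snippet}
    ''
    # => "scrape_configs:\n  - job_name: a\n  static_configs: []\n"

The second snippet line keeps only its own two spaces, so every snippet must already carry the indentation the final YAML expects (which is why the blackbox snippet above indents its body itself).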
@@ -97,18 +97,6 @@ in
   };

   config = {
-    # systemd.services = mapAttrs' (instance: instanceOpts:
-    #   let
-    #     domain = if instanceOpts.domain != null then instanceOpts.domain else instance;
-    #   in
-    #   nameValuePair "phpfpm-${domain}" {
-    #     serviceConfig = {
-    #       ProtectHome = lib.mkForce "tmpfs";
-    #       BindPaths = "BindPaths=/var/www/${domain}:/var/www/${domain}";
-    #     };
-    #   }
-    # ) cfg.instances;
-
     systemd.timers = mapAttrs' (instance: instanceOpts:
       let
         domain = if instanceOpts.domain != null then instanceOpts.domain else instance;

@@ -244,45 +232,6 @@ in
       }
     '';

-    # locations."/typo3/login" = {
-    #   extraConfig = ''
-    #     # Basic Authelia Config
-    #     # Send a subsequent request to Authelia to verify if the user is authenticated
-    #     # and has the right permissions to access the resource.
-    #     auth_request /authelia;
-    #     # Set the `target_url` variable based on the request. It will be used to build the portal
-    #     # URL with the correct redirection parameter.
-    #     auth_request_set $target_url $scheme://$http_host$request_uri;
-    #     # Set the X-Forwarded-User and X-Forwarded-Groups with the headers
-    #     # returned by Authelia for the backends which can consume them.
-    #     # This is not safe, as the backend must make sure that they come from the
-    #     # proxy. In the future, it's gonna be safe to just use OAuth.
-    #     auth_request_set $user $upstream_http_remote_user;
-    #     auth_request_set $groups $upstream_http_remote_groups;
-    #     auth_request_set $name $upstream_http_remote_name;
-    #     auth_request_set $email $upstream_http_remote_email;
-    #     proxy_set_header Remote-User $user;
-    #     proxy_set_header Remote-Groups $groups;
-    #     proxy_set_header Remote-Name $name;
-    #     proxy_set_header Remote-Email $email;
-    #     # If Authelia returns 401, then nginx redirects the user to the login portal.
-    #     # If it returns 200, then the request pass through to the backend.
-    #     # For other type of errors, nginx will handle them as usual.
-    #     error_page 401 =302 https://auth.cloonar.com/?rd=$target_url;
-    #
-    #     fastcgi_param REMOTE_USER $user;
-    #
-    #     include ${pkgs.nginx}/conf/fastcgi.conf;
-    #     fastcgi_buffer_size 32k;
-    #     fastcgi_buffers 8 16k;
-    #     fastcgi_connect_timeout 240s;
-    #     fastcgi_read_timeout 240s;
-    #     fastcgi_send_timeout 240s;
-    #     fastcgi_pass unix:${config.services.phpfpm.pools."${domain}".socket};
-    #     fastcgi_param SCRIPT_FILENAME ${cfg.dataDir}/${domain}/public/typo3/index.php;
-    #   '';
-    # };
-
     locations."/favicon.ico".extraConfig = ''
       log_not_found off;
       access_log off;
@@ -45,7 +45,7 @@ fi

 # Execute nixos-rebuild dry-build
 # Store the output and error streams, and the exit code
-NIX_OUTPUT_ERR=$(nixos-rebuild dry-build $SHOW_TRACE_OPT -I nixos-config="$CONFIG_PATH" 2>&1)
+NIX_OUTPUT_ERR=$(nixos-rebuild dry-build $SHOW_TRACE_OPT -I nixos-config="$CONFIG_PATH" --show-trace 2>&1)
 NIX_EXIT_STATUS=$?

 # Check the exit status
@@ -1,4 +1,5 @@
-{ config, pkgs, ... }:
+{ config, lib, pkgs, ... }:
+with lib;
 let
   configure_prom = builtins.toFile "prometheus.yml" ''
     scrape_configs:

@@ -28,8 +29,19 @@ let
       - source_labels: [name]
         regex: '(container@git|microvm@git-runner-|postfix|dovecot|openldap|wireguard-wg_cloonar).*\.service'
        action: keep
+
+    ${concatStringsSep "\n " config.services.victoriametrics.extraScrapeConfigs}
   '';
 in {
+  options.services.victoriametrics = {
+    extraScrapeConfigs = mkOption {
+      type = types.listOf types.str;
+      default = [];
+      description = "Additional Prometheus scrape job YAML snippets for Blackbox Exporter probes";
+    };
+  };
+
+  config = {
   sops.secrets.victoria-agent-env = {
     sopsFile = ./secrets.yaml;
   };

@@ -53,4 +65,5 @@ in {
       EnvironmentFile=config.sops.secrets.victoria-agent-env.path;
     };
   };
+  };
 }
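One caveat with this approach: both VictoriaMetrics modules now declare `options.services.victoriametrics.extraScrapeConfigs`. That is fine while the two files are imported by different hosts, but if they ever met in one configuration, evaluation would fail because the option is already declared. A sketch of a shared declaration both hosts could import instead (module path hypothetical):

    # utils/modules/vm-extra-scrape-option.nix (hypothetical shared module)
    { lib, ... }: {
      options.services.victoriametrics.extraScrapeConfigs = lib.mkOption {
        type = lib.types.listOf lib.types.str;
        default = [];
        description = "Additional Prometheus scrape job YAML snippets";
      };
    }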