feat: implement website alerting plan with Blackbox Exporter and VictoriaMetrics integration
This commit is contained in:
@@ -9,7 +9,6 @@
|
||||
./utils/modules/autoupgrade.nix
|
||||
./utils/modules/promtail
|
||||
./utils/modules/borgbackup.nix
|
||||
# ./utils/modules/netdata.nix
|
||||
|
||||
# fw
|
||||
./modules/network-prefix.nix
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
./utils/modules/borgbackup.nix
|
||||
./utils/modules/promtail
|
||||
./utils/modules/victoriametrics
|
||||
./utils/modules/netdata.nix
|
||||
./modules/set-nix-channel.nix # Automatically manage nix-channel from /var/bento/channel
|
||||
|
||||
./hardware-configuration.nix
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
{ lib, pkgs, ... }: {
|
||||
{ config, lib, pkgs, ... }: {
|
||||
imports = [
|
||||
./utils/bento.nix
|
||||
./utils/modules/sops.nix
|
||||
@@ -17,12 +17,12 @@
|
||||
./modules/grafana/default.nix
|
||||
./modules/loki.nix
|
||||
./modules/victoriametrics.nix
|
||||
./modules/blackbox-exporter.nix
|
||||
./modules/updns.nix
|
||||
|
||||
./utils/modules/autoupgrade.nix
|
||||
./utils/modules/promtail
|
||||
./utils/modules/borgbackup.nix
|
||||
./utils/modules/netdata.nix
|
||||
./modules/set-nix-channel.nix # Automatically manage nix-channel from /var/bento/channel
|
||||
|
||||
./hardware-configuration.nix
|
||||
|
||||
56
hosts/web-arm/modules/blackbox-exporter.nix
Normal file
56
hosts/web-arm/modules/blackbox-exporter.nix
Normal file
@@ -0,0 +1,56 @@
|
||||
# Blackbox Exporter for this host.
#
# Runs the exporter as a systemd service, installs its configuration under
# /etc/blackbox_exporter, and registers one HTTP probe job that covers every
# nginx virtual host declared on this machine.
{ config, pkgs, lib, ... }:

let
  # Address the exporter listens on and the scraper connects to. Kept in one
  # place so the service flag and the relabel rule below cannot drift apart.
  exporterAddress = "127.0.0.1:9115";

  # Probe targets are derived from the attribute names of the nginx virtual
  # hosts, each prefixed with https://.
  # NOTE(review): assumes every attribute name is a resolvable host name that
  # answers 2xx without authentication - confirm for vhosts that set
  # `serverName` separately or sit behind basic auth.
  nginxVHosts = config.services.nginx.virtualHosts or {};
  httpsDomains = map (d: "https://${d}") (lib.attrNames nginxVHosts);

  # Scrape job in the usual Blackbox Exporter shape:
  #   1. the listed URL becomes the `target` query parameter of /probe,
  #   2. the same URL is kept as the `instance` label,
  #   3. the connection itself is redirected to the exporter.
  # No explicit `replacement` is set on the first two rules: the default
  # ("$1") already copies the whole source value. Writing "$$1" would be
  # expanded to the literal text "$1" instead of the URL.
  probeJob = {
    job_name = "blackbox_http_all_domains";
    metrics_path = "/probe";
    params.module = [ "http_2xx" ];
    static_configs = [ { targets = httpsDomains; } ];
    relabel_configs = [
      { source_labels = [ "__address__" ]; target_label = "__param_target"; }
      { source_labels = [ "__param_target" ]; target_label = "instance"; }
      { target_label = "__address__"; replacement = exporterAddress; }
    ];
  };
in {
  config = {
    # Systemd service for Blackbox Exporter
    systemd.services.blackbox-exporter = {
      description = "Blackbox Exporter";
      # `after` only orders the unit; `wants` is what actually pulls
      # network-online.target into the transaction.
      wants = [ "network-online.target" ];
      after = [ "network-online.target" ];
      wantedBy = [ "multi-user.target" ];
      serviceConfig.ExecStart = lib.concatStringsSep " " [
        "${pkgs.prometheus-blackbox-exporter}/bin/blackbox_exporter"
        "--config.file=/etc/blackbox_exporter/blackbox.yml"
        "--web.listen-address=${exporterAddress}"
      ];
    };

    # Configuration file for Blackbox Exporter
    environment.etc."blackbox_exporter/blackbox.yml".text = ''
      modules:
        http_2xx:
          prober: http
    '';

    # Add scrape config for VictoriaMetrics agent.
    # The job is rendered as a single-line JSON object (JSON is valid YAML
    # flow syntax), so the list item stays well-formed whatever indentation
    # is in effect where the option's entries are interpolated into the
    # final scrape configuration.
    services.victoriametrics.extraScrapeConfigs = [
      "- ${builtins.toJSON probeJob}"
    ];
  };
}
|
||||
72
hosts/web-arm/modules/grafana/alerting/websites/default.nix
Normal file
72
hosts/web-arm/modules/grafana/alerting/websites/default.nix
Normal file
@@ -0,0 +1,72 @@
|
||||
# Grafana alert rules for website availability.
#
# Generates one "Website <url> Down" rule per nginx virtual host declared on
# this machine and provisions them as the "Website Alerts" group in the
# "Websites" folder.
{ lib, pkgs, config, ... }:

let
  # Same derivation of probe URLs as the Blackbox Exporter scrape job:
  # attribute names of the nginx virtual hosts, prefixed with https://.
  nginxVHosts = config.services.nginx.virtualHosts or {};
  httpsDomains = map (d: "https://${d}") (lib.attrNames nginxVHosts);

  # Build the alert rule for a single probed URL.
  mkWebsiteAlert = target:
    let
      # Flatten the URL into a slug; the rule uid is the SHA-1 of that slug,
      # so it is derived purely from the URL.
      slug = lib.replaceStrings [ "://" "." "-" "/" ] [ "-" "-" "_" "_" ] target
        + "-down-alert";
    in {
      uid = builtins.hashString "sha1" slug;
      title = "Website ${target} Down";
      condition = "C";

      data = [
        # A: latest probe result for this URL. The Blackbox scrape job stores
        # the probed URL in the `instance` label, so that is the label to
        # select on. `OR on() vector(0)` makes a missing series evaluate to 0
        # (down) instead of producing no data.
        {
          refId = "A";
          relativeTimeRange = { from = 300; to = 0; };
          datasourceUid = "vm-datasource-uid";
          model = {
            editorMode = "code";
            expr = "probe_success{instance=\"${target}\"} OR on() vector(0)";
            hide = false;
            intervalMs = 1000;
            legendFormat = target;
            maxDataPoints = 43200;
            range = true;
            refId = "A";
          };
        }
        # B: reduce the range result of A to its last value.
        {
          refId = "B";
          datasourceUid = "__expr__";
          model = {
            type = "reduce";
            expression = "A";
            reducer = "last";
          };
        }
        # C: alert condition - fires when the last probe was not successful.
        {
          refId = "C";
          datasourceUid = "__expr__";
          model = {
            type = "math";
            expression = "$B < 1";
          };
        }
      ];

      noDataState = "Alerting";
      execErrState = "Alerting";
      for = "5m";

      annotations = {
        description = "Website ${target} is unreachable.";
        summary = "Website Down";
      };
      labels = {
        severity = "critical";
        website_url = target;
      };
    };

  websiteAlertRules = map mkWebsiteAlert httpsDomains;
in {
  services.grafana.provision.alerting.rules.settings.groups = [
    {
      name = "Website Alerts";
      folder = "Websites";
      interval = "1m";
      rules = websiteAlertRules;
    }
  ];
}
|
||||
@@ -31,6 +31,7 @@ in
|
||||
# Individual alert files removed, now handled by alerting/system/default.nix
|
||||
./alerting/system/default.nix # Added: Imports the consolidated system alerts module
|
||||
./alerting/service/default.nix # Added: Imports the new service alerts module
|
||||
./alerting/websites/default.nix # Added: Imports the new websites alerts module
|
||||
# ... other rule files can be added here ...
|
||||
./datasources/victoriametrics.nix
|
||||
./datasources/loki.nix # Add Loki datasource
|
||||
|
||||
@@ -1,43 +1,84 @@
|
||||
{ config, ... }:
|
||||
{ config, lib, ... }:
|
||||
with lib;
|
||||
let
|
||||
# configure_prom = builtins.toFile "prometheus.yml" ''
|
||||
# scrape_configs:
|
||||
# - job_name: 'server'
|
||||
# stream_parse: true
|
||||
# static_configs:
|
||||
# - targets:
|
||||
# - ${config.networking.hostName}:9100
|
||||
# '';
|
||||
configure_prom = builtins.toFile "prometheus.yml" ''
|
||||
scrape_configs:
|
||||
- job_name: 'server'
|
||||
# System metrics
|
||||
- job_name: 'node'
|
||||
stream_parse: true
|
||||
static_configs:
|
||||
- targets:
|
||||
- ${config.networking.hostName}:9100
|
||||
|
||||
# Systemd service monitoring
|
||||
- job_name: 'systemd'
|
||||
metrics_path: /metrics
|
||||
params:
|
||||
collect[]:
|
||||
- 'systemd.service.state'
|
||||
- 'systemd.service.start_time_seconds'
|
||||
- 'systemd.unit_file.state'
|
||||
static_configs:
|
||||
- targets:
|
||||
- ${config.networking.hostName}:9100
|
||||
relabel_configs:
|
||||
# Filter for specific services we want to monitor
|
||||
- source_labels: [__name__]
|
||||
regex: 'node_systemd_unit_state'
|
||||
action: keep
|
||||
- source_labels: [name]
|
||||
regex: '(container@git|microvm@git-runner-|postfix|dovecot|openldap|wireguard-wg_cloonar).*\.service'
|
||||
action: keep
|
||||
|
||||
${concatStringsSep "\n" config.services.victoriametrics.extraScrapeConfigs}
|
||||
'';
|
||||
in {
|
||||
services.prometheus.exporters.node.enable = true;
|
||||
|
||||
sops.secrets.victoria-nginx-password.owner = "nginx";
|
||||
|
||||
services.victoriametrics = {
|
||||
enable = true;
|
||||
extraOptions = [
|
||||
"-promscrape.config=${configure_prom}"
|
||||
];
|
||||
};
|
||||
|
||||
services.nginx.virtualHosts."victoria-server.cloonar.com" = {
|
||||
forceSSL = true;
|
||||
enableACME = true;
|
||||
acmeRoot = null;
|
||||
locations."/" = {
|
||||
proxyWebsockets = true;
|
||||
extraConfig = ''
|
||||
auth_basic "Victoria password";
|
||||
auth_basic_user_file ${config.sops.secrets.victoria-nginx-password.path};
|
||||
|
||||
proxy_read_timeout 1800s;
|
||||
proxy_redirect off;
|
||||
proxy_connect_timeout 1600s;
|
||||
|
||||
access_log off;
|
||||
proxy_pass http://127.0.0.1:8428;
|
||||
'';
|
||||
options.services.victoriametrics = {
|
||||
extraScrapeConfigs = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [];
|
||||
description = "Additional Prometheus scrape job YAML snippets for Blackbox Exporter probes";
|
||||
};
|
||||
};
|
||||
|
||||
config = {
|
||||
services.prometheus.exporters.node.enable = true;
|
||||
|
||||
sops.secrets.victoria-nginx-password.owner = "nginx";
|
||||
|
||||
services.victoriametrics = {
|
||||
enable = true;
|
||||
extraOptions = [
|
||||
"-promscrape.config=${configure_prom}"
|
||||
];
|
||||
};
|
||||
|
||||
services.nginx.virtualHosts."victoria-server.cloonar.com" = {
|
||||
forceSSL = true;
|
||||
enableACME = true;
|
||||
acmeRoot = null;
|
||||
locations."/" = {
|
||||
proxyWebsockets = true;
|
||||
extraConfig = ''
|
||||
auth_basic "Victoria password";
|
||||
auth_basic_user_file ${config.sops.secrets.victoria-nginx-password.path};
|
||||
|
||||
proxy_read_timeout 1800s;
|
||||
proxy_redirect off;
|
||||
proxy_connect_timeout 1600s;
|
||||
|
||||
access_log off;
|
||||
proxy_pass http://127.0.0.1:8428;
|
||||
'';
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
@@ -97,18 +97,6 @@ in
|
||||
};
|
||||
|
||||
config = {
|
||||
# systemd.services = mapAttrs' (instance: instanceOpts:
|
||||
# let
|
||||
# domain = if instanceOpts.domain != null then instanceOpts.domain else instance;
|
||||
# in
|
||||
# nameValuePair "phpfpm-${domain}" {
|
||||
# serviceConfig = {
|
||||
# ProtectHome = lib.mkForce "tmpfs";
|
||||
# BindPaths = "BindPaths=/var/www/${domain}:/var/www/${domain}";
|
||||
# };
|
||||
# }
|
||||
# ) cfg.instances;
|
||||
|
||||
systemd.timers = mapAttrs' (instance: instanceOpts:
|
||||
let
|
||||
domain = if instanceOpts.domain != null then instanceOpts.domain else instance;
|
||||
@@ -244,45 +232,6 @@ in
|
||||
}
|
||||
'';
|
||||
|
||||
# locations."/typo3/login" = {
|
||||
# extraConfig = ''
|
||||
# # Basic Authelia Config
|
||||
# # Send a subsequent request to Authelia to verify if the user is authenticated
|
||||
# # and has the right permissions to access the resource.
|
||||
# auth_request /authelia;
|
||||
# # Set the `target_url` variable based on the request. It will be used to build the portal
|
||||
# # URL with the correct redirection parameter.
|
||||
# auth_request_set $target_url $scheme://$http_host$request_uri;
|
||||
# # Set the X-Forwarded-User and X-Forwarded-Groups with the headers
|
||||
# # returned by Authelia for the backends which can consume them.
|
||||
# # This is not safe, as the backend must make sure that they come from the
|
||||
# # proxy. In the future, it's gonna be safe to just use OAuth.
|
||||
# auth_request_set $user $upstream_http_remote_user;
|
||||
# auth_request_set $groups $upstream_http_remote_groups;
|
||||
# auth_request_set $name $upstream_http_remote_name;
|
||||
# auth_request_set $email $upstream_http_remote_email;
|
||||
# proxy_set_header Remote-User $user;
|
||||
# proxy_set_header Remote-Groups $groups;
|
||||
# proxy_set_header Remote-Name $name;
|
||||
# proxy_set_header Remote-Email $email;
|
||||
# # If Authelia returns 401, then nginx redirects the user to the login portal.
|
||||
# # If it returns 200, then the request pass through to the backend.
|
||||
# # For other type of errors, nginx will handle them as usual.
|
||||
# error_page 401 =302 https://auth.cloonar.com/?rd=$target_url;
|
||||
#
|
||||
# fastcgi_param REMOTE_USER $user;
|
||||
#
|
||||
# include ${pkgs.nginx}/conf/fastcgi.conf;
|
||||
# fastcgi_buffer_size 32k;
|
||||
# fastcgi_buffers 8 16k;
|
||||
# fastcgi_connect_timeout 240s;
|
||||
# fastcgi_read_timeout 240s;
|
||||
# fastcgi_send_timeout 240s;
|
||||
# fastcgi_pass unix:${config.services.phpfpm.pools."${domain}".socket};
|
||||
# fastcgi_param SCRIPT_FILENAME ${cfg.dataDir}/${domain}/public/typo3/index.php;
|
||||
# '';
|
||||
# };
|
||||
|
||||
locations."/favicon.ico".extraConfig = ''
|
||||
log_not_found off;
|
||||
access_log off;
|
||||
|
||||
@@ -45,7 +45,7 @@ fi
|
||||
|
||||
# Execute nixos-rebuild dry-build
|
||||
# Store the output and error streams, and the exit code
|
||||
NIX_OUTPUT_ERR=$(nixos-rebuild dry-build $SHOW_TRACE_OPT -I nixos-config="$CONFIG_PATH" 2>&1)
|
||||
NIX_OUTPUT_ERR=$(nixos-rebuild dry-build $SHOW_TRACE_OPT -I nixos-config="$CONFIG_PATH" --show-trace 2>&1)
|
||||
NIX_EXIT_STATUS=$?
|
||||
|
||||
# Check the exit status
|
||||
@@ -61,4 +61,4 @@ else
|
||||
echo "Output from nixos-rebuild:" >&2
|
||||
echo "$NIX_OUTPUT_ERR" >&2
|
||||
exit "$NIX_EXIT_STATUS"
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
{ config, pkgs, ... }:
|
||||
{ config, lib, pkgs, ... }:
|
||||
with lib;
|
||||
let
|
||||
configure_prom = builtins.toFile "prometheus.yml" ''
|
||||
scrape_configs:
|
||||
@@ -28,29 +29,41 @@ let
|
||||
- source_labels: [name]
|
||||
regex: '(container@git|microvm@git-runner-|postfix|dovecot|openldap|wireguard-wg_cloonar).*\.service'
|
||||
action: keep
|
||||
|
||||
${concatStringsSep "\n " config.services.victoriametrics.extraScrapeConfigs}
|
||||
'';
|
||||
in {
|
||||
sops.secrets.victoria-agent-env = {
|
||||
sopsFile = ./secrets.yaml;
|
||||
options.services.victoriametrics = {
|
||||
extraScrapeConfigs = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [];
|
||||
description = "Additional Prometheus scrape job YAML snippets for Blackbox Exporter probes";
|
||||
};
|
||||
};
|
||||
|
||||
# Node exporter for system metrics
|
||||
services.prometheus.exporters.node = {
|
||||
enable = true;
|
||||
enabledCollectors = [
|
||||
"systemd" # Enable systemd collector for service monitoring
|
||||
];
|
||||
};
|
||||
|
||||
systemd.services.export-to-prometheus = {
|
||||
path = with pkgs; [victoriametrics];
|
||||
enable = true;
|
||||
after = ["network-online.target"];
|
||||
wantedBy = ["multi-user.target"];
|
||||
script = "vmagent -promscrape.config=${configure_prom} -envflag.enable -remoteWrite.url=https://victoria-server.cloonar.com/api/v1/write";
|
||||
config = {
|
||||
sops.secrets.victoria-agent-env = {
|
||||
sopsFile = ./secrets.yaml;
|
||||
};
|
||||
|
||||
serviceConfig = {
|
||||
EnvironmentFile=config.sops.secrets.victoria-agent-env.path;
|
||||
# Node exporter for system metrics
|
||||
services.prometheus.exporters.node = {
|
||||
enable = true;
|
||||
enabledCollectors = [
|
||||
"systemd" # Enable systemd collector for service monitoring
|
||||
];
|
||||
};
|
||||
|
||||
systemd.services.export-to-prometheus = {
|
||||
path = with pkgs; [victoriametrics];
|
||||
enable = true;
|
||||
after = ["network-online.target"];
|
||||
wantedBy = ["multi-user.target"];
|
||||
script = "vmagent -promscrape.config=${configure_prom} -envflag.enable -remoteWrite.url=https://victoria-server.cloonar.com/api/v1/write";
|
||||
|
||||
serviceConfig = {
|
||||
EnvironmentFile=config.sops.secrets.victoria-agent-env.path;
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user