fix: alerting

This commit is contained in:
2026-01-05 10:45:38 +01:00
parent ed451e3b95
commit 21c5c6dbd5
6 changed files with 32 additions and 31 deletions

View File

@@ -60,6 +60,9 @@
}; };
}; };
# Systemd services to monitor
services.victoriametrics.monitoredServices = [ "mysql" "nginx" "phpfpm-.*" ];
# backups - adjust repo for this host # backups - adjust repo for this host
borgbackup.repo = "u149513-sub10@u149513-sub10.your-backup.de:borg"; borgbackup.repo = "u149513-sub10@u149513-sub10.your-backup.de:borg";

View File

@@ -76,6 +76,9 @@
networkPrefix = "10.42"; networkPrefix = "10.42";
# Systemd services to monitor
services.victoriametrics.monitoredServices = [ "ai-mailer" "container@git" "microvm@git-runner-" ];
nixpkgs.overlays = [ nixpkgs.overlays = [
(import ./utils/overlays/packages.nix) (import ./utils/overlays/packages.nix)
]; ];

View File

@@ -2,41 +2,18 @@
let let
configure_prom = builtins.toFile "prometheus.yml" '' configure_prom = builtins.toFile "prometheus.yml" ''
scrape_configs: scrape_configs:
# System metrics - job_name: 'server'
- job_name: 'node'
stream_parse: true stream_parse: true
static_configs: static_configs:
- targets: - targets:
- ${config.networking.hostName}:9100 - ${config.networking.hostName}:9100
# Systemd service monitoring
- job_name: 'systemd'
metrics_path: /metrics
params:
collect[]:
- 'systemd.service.state'
- 'systemd.service.start_time_seconds'
- 'systemd.unit_file.state'
static_configs:
- targets:
- ${config.networking.hostName}:9100
relabel_configs:
- source_labels: [__name__]
regex: 'node_systemd_unit_state'
action: keep
- source_labels: [name]
regex: '(ai-mailer|container@git|microvm@git-runner-).*\.service'
action: keep
''; '';
in { in {
sops.secrets.victoria-agent-env = { sops.secrets.victoria-agent-env = {
sopsFile = ../utils/modules/victoriametrics/secrets.yaml; sopsFile = ../utils/modules/victoriametrics/secrets.yaml;
}; };
services.prometheus.exporters.node = { services.prometheus.exporters.node.enable = true;
enable = true;
enabledCollectors = [ "systemd" ];
};
systemd.services.export-fw-to-prometheus = { systemd.services.export-fw-to-prometheus = {
path = with pkgs; [victoriametrics]; path = with pkgs; [victoriametrics];

View File

@@ -5,4 +5,7 @@
./postfix-exporter.nix ./postfix-exporter.nix
./dovecot-exporter.nix ./dovecot-exporter.nix
]; ];
# Systemd services to monitor
services.victoriametrics.monitoredServices = [ "postfix" "dovecot" "openldap" "wireguard-wg_cloonar" ];
} }

View File

@@ -9,10 +9,10 @@ let
{ name = "OpenLDAP"; service = "openldap.service"; instance = "mail:9100"; } { name = "OpenLDAP"; service = "openldap.service"; instance = "mail:9100"; }
{ name = "Gitea"; service = "container@git.service"; instance = "fw:9100"; } { name = "Gitea"; service = "container@git.service"; instance = "fw:9100"; }
{ name = "Gitea Runner"; service = "microvm@git-runner-1.service"; instance = "fw:9100"; } { name = "Gitea Runner"; service = "microvm@git-runner-1.service"; instance = "fw:9100"; }
{ name = "WireGuard"; service = "wireguard-wg_cloonar.service"; instance = "mail:9100"; } { name = "WireGuard"; service = "wireguard-wg_cloonar.service"; instance = "fw:9100"; }
{ name = "MySQL"; service = "mysql.service"; instance = "amzebs-01:9100"; } { name = "MySQL"; service = "mysql.service"; instance = "amzebs-01:9100"; }
{ name = "Nginx"; service = "nginx.service"; instance = "amzebs-01:9100"; } { name = "Nginx"; service = "nginx.service"; instance = "amzebs-01:9100"; }
{ name = "PHP-FPM"; service = "phpfpm-.*\\.service"; instance = "amzebs-01:9100"; } { name = "PHP-FPM"; service = "phpfpm-.*[.]service"; instance = "amzebs-01:9100"; }
]; ];
# Extract host from instance (e.g., "fw:9100" -> "fw") # Extract host from instance (e.g., "fw:9100" -> "fw")
@@ -25,12 +25,17 @@ let
isRegex = svc: lib.hasInfix ".*" svc || lib.hasInfix "\\" svc; isRegex = svc: lib.hasInfix ".*" svc || lib.hasInfix "\\" svc;
# Build the PromQL expression # Build the PromQL expression
# For regex patterns: use min() to alert if ANY matching service is down
# For single services: use OR vector(0) to handle missing metrics
mkExpr = svc: mkExpr = svc:
let let
nameMatch = if isRegex svc.service nameMatch = if isRegex svc.service
then "name=~\"${svc.service}\"" then "name=~\"${svc.service}\""
else "name=\"${svc.service}\""; else "name=\"${svc.service}\"";
in "node_systemd_unit_state{state=\"active\", ${nameMatch}, instance=\"${svc.instance}\"} OR on() vector(0)"; baseQuery = "node_systemd_unit_state{state=\"active\", ${nameMatch}, instance=\"${svc.instance}\"}";
in if isRegex svc.service
then "min(${baseQuery})"
else "${baseQuery} OR on() vector(0)";
mkServiceAlert = svc: { mkServiceAlert = svc: {
uid = mkUid svc.name; uid = mkUid svc.name;

View File

@@ -1,6 +1,9 @@
{ config, lib, pkgs, ... }: { config, lib, pkgs, ... }:
with lib; with lib;
let let
cfg = config.services.victoriametrics;
serviceRegex = concatStringsSep "|" cfg.monitoredServices;
configure_prom = builtins.toFile "prometheus.yml" '' configure_prom = builtins.toFile "prometheus.yml" ''
scrape_configs: scrape_configs:
# System metrics # System metrics
@@ -27,13 +30,20 @@ let
regex: 'node_systemd_unit_state' regex: 'node_systemd_unit_state'
action: keep action: keep
- source_labels: [name] - source_labels: [name]
regex: '(container@git|microvm@git-runner-|postfix|dovecot|openldap|wireguard-wg_cloonar).*\.service' regex: '(${serviceRegex}).*\.service'
action: keep action: keep
${concatStringsSep "\n" config.services.victoriametrics.extraScrapeConfigs} ${concatStringsSep "\n" cfg.extraScrapeConfigs}
''; '';
in { in {
options.services.victoriametrics = { options.services.victoriametrics = {
monitoredServices = mkOption {
type = types.listOf types.str;
default = [];
description = "List of systemd service name patterns to monitor (without .service suffix)";
example = [ "mysql" "nginx" "phpfpm-.*" ];
};
extraScrapeConfigs = mkOption { extraScrapeConfigs = mkOption {
type = types.listOf types.str; type = types.listOf types.str;
default = []; default = [];