fix: alerting
This commit is contained in:
@@ -60,6 +60,9 @@
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Systemd services to monitor
|
||||||
|
services.victoriametrics.monitoredServices = [ "mysql" "nginx" "phpfpm-.*" ];
|
||||||
|
|
||||||
# backups - adjust repo for this host
|
# backups - adjust repo for this host
|
||||||
borgbackup.repo = "u149513-sub10@u149513-sub10.your-backup.de:borg";
|
borgbackup.repo = "u149513-sub10@u149513-sub10.your-backup.de:borg";
|
||||||
|
|
||||||
|
|||||||
@@ -76,6 +76,9 @@
|
|||||||
|
|
||||||
networkPrefix = "10.42";
|
networkPrefix = "10.42";
|
||||||
|
|
||||||
|
# Systemd services to monitor
|
||||||
|
services.victoriametrics.monitoredServices = [ "ai-mailer" "container@git" "microvm@git-runner-" ];
|
||||||
|
|
||||||
nixpkgs.overlays = [
|
nixpkgs.overlays = [
|
||||||
(import ./utils/overlays/packages.nix)
|
(import ./utils/overlays/packages.nix)
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -2,42 +2,19 @@
|
|||||||
let
|
let
|
||||||
configure_prom = builtins.toFile "prometheus.yml" ''
|
configure_prom = builtins.toFile "prometheus.yml" ''
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
# System metrics
|
- job_name: 'server'
|
||||||
- job_name: 'node'
|
|
||||||
stream_parse: true
|
stream_parse: true
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets:
|
- targets:
|
||||||
- ${config.networking.hostName}:9100
|
- ${config.networking.hostName}:9100
|
||||||
|
|
||||||
# Systemd service monitoring
|
|
||||||
- job_name: 'systemd'
|
|
||||||
metrics_path: /metrics
|
|
||||||
params:
|
|
||||||
collect[]:
|
|
||||||
- 'systemd.service.state'
|
|
||||||
- 'systemd.service.start_time_seconds'
|
|
||||||
- 'systemd.unit_file.state'
|
|
||||||
static_configs:
|
|
||||||
- targets:
|
|
||||||
- ${config.networking.hostName}:9100
|
|
||||||
relabel_configs:
|
|
||||||
- source_labels: [__name__]
|
|
||||||
regex: 'node_systemd_unit_state'
|
|
||||||
action: keep
|
|
||||||
- source_labels: [name]
|
|
||||||
regex: '(ai-mailer|container@git|microvm@git-runner-).*\.service'
|
|
||||||
action: keep
|
|
||||||
'';
|
'';
|
||||||
in {
|
in {
|
||||||
sops.secrets.victoria-agent-env = {
|
sops.secrets.victoria-agent-env = {
|
||||||
sopsFile = ../utils/modules/victoriametrics/secrets.yaml;
|
sopsFile = ../utils/modules/victoriametrics/secrets.yaml;
|
||||||
};
|
};
|
||||||
|
|
||||||
services.prometheus.exporters.node = {
|
services.prometheus.exporters.node.enable = true;
|
||||||
enable = true;
|
|
||||||
enabledCollectors = [ "systemd" ];
|
|
||||||
};
|
|
||||||
|
|
||||||
systemd.services.export-fw-to-prometheus = {
|
systemd.services.export-fw-to-prometheus = {
|
||||||
path = with pkgs; [victoriametrics];
|
path = with pkgs; [victoriametrics];
|
||||||
enable = true;
|
enable = true;
|
||||||
|
|||||||
@@ -5,4 +5,7 @@
|
|||||||
./postfix-exporter.nix
|
./postfix-exporter.nix
|
||||||
./dovecot-exporter.nix
|
./dovecot-exporter.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
|
# Systemd services to monitor
|
||||||
|
services.victoriametrics.monitoredServices = [ "postfix" "dovecot" "openldap" "wireguard-wg_cloonar" ];
|
||||||
}
|
}
|
||||||
@@ -9,10 +9,10 @@ let
|
|||||||
{ name = "OpenLDAP"; service = "openldap.service"; instance = "mail:9100"; }
|
{ name = "OpenLDAP"; service = "openldap.service"; instance = "mail:9100"; }
|
||||||
{ name = "Gitea"; service = "container@git.service"; instance = "fw:9100"; }
|
{ name = "Gitea"; service = "container@git.service"; instance = "fw:9100"; }
|
||||||
{ name = "Gitea Runner"; service = "microvm@git-runner-1.service"; instance = "fw:9100"; }
|
{ name = "Gitea Runner"; service = "microvm@git-runner-1.service"; instance = "fw:9100"; }
|
||||||
{ name = "WireGuard"; service = "wireguard-wg_cloonar.service"; instance = "mail:9100"; }
|
{ name = "WireGuard"; service = "wireguard-wg_cloonar.service"; instance = "fw:9100"; }
|
||||||
{ name = "MySQL"; service = "mysql.service"; instance = "amzebs-01:9100"; }
|
{ name = "MySQL"; service = "mysql.service"; instance = "amzebs-01:9100"; }
|
||||||
{ name = "Nginx"; service = "nginx.service"; instance = "amzebs-01:9100"; }
|
{ name = "Nginx"; service = "nginx.service"; instance = "amzebs-01:9100"; }
|
||||||
{ name = "PHP-FPM"; service = "phpfpm-.*\\.service"; instance = "amzebs-01:9100"; }
|
{ name = "PHP-FPM"; service = "phpfpm-.*[.]service"; instance = "amzebs-01:9100"; }
|
||||||
];
|
];
|
||||||
|
|
||||||
# Extract host from instance (e.g., "fw:9100" -> "fw")
|
# Extract host from instance (e.g., "fw:9100" -> "fw")
|
||||||
@@ -25,12 +25,17 @@ let
|
|||||||
isRegex = svc: lib.hasInfix ".*" svc || lib.hasInfix "\\" svc;
|
isRegex = svc: lib.hasInfix ".*" svc || lib.hasInfix "\\" svc;
|
||||||
|
|
||||||
# Build the PromQL expression
|
# Build the PromQL expression
|
||||||
|
# For regex patterns: use min() to alert if ANY matching service is down
|
||||||
|
# For single services: use OR vector(0) to handle missing metrics
|
||||||
mkExpr = svc:
|
mkExpr = svc:
|
||||||
let
|
let
|
||||||
nameMatch = if isRegex svc.service
|
nameMatch = if isRegex svc.service
|
||||||
then "name=~\"${svc.service}\""
|
then "name=~\"${svc.service}\""
|
||||||
else "name=\"${svc.service}\"";
|
else "name=\"${svc.service}\"";
|
||||||
in "node_systemd_unit_state{state=\"active\", ${nameMatch}, instance=\"${svc.instance}\"} OR on() vector(0)";
|
baseQuery = "node_systemd_unit_state{state=\"active\", ${nameMatch}, instance=\"${svc.instance}\"}";
|
||||||
|
in if isRegex svc.service
|
||||||
|
then "min(${baseQuery})"
|
||||||
|
else "${baseQuery} OR on() vector(0)";
|
||||||
|
|
||||||
mkServiceAlert = svc: {
|
mkServiceAlert = svc: {
|
||||||
uid = mkUid svc.name;
|
uid = mkUid svc.name;
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
{ config, lib, pkgs, ... }:
|
{ config, lib, pkgs, ... }:
|
||||||
with lib;
|
with lib;
|
||||||
let
|
let
|
||||||
|
cfg = config.services.victoriametrics;
|
||||||
|
serviceRegex = concatStringsSep "|" cfg.monitoredServices;
|
||||||
|
|
||||||
configure_prom = builtins.toFile "prometheus.yml" ''
|
configure_prom = builtins.toFile "prometheus.yml" ''
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
# System metrics
|
# System metrics
|
||||||
@@ -27,13 +30,20 @@ let
|
|||||||
regex: 'node_systemd_unit_state'
|
regex: 'node_systemd_unit_state'
|
||||||
action: keep
|
action: keep
|
||||||
- source_labels: [name]
|
- source_labels: [name]
|
||||||
regex: '(container@git|microvm@git-runner-|postfix|dovecot|openldap|wireguard-wg_cloonar).*\.service'
|
regex: '(${serviceRegex}).*\.service'
|
||||||
action: keep
|
action: keep
|
||||||
|
|
||||||
${concatStringsSep "\n" config.services.victoriametrics.extraScrapeConfigs}
|
${concatStringsSep "\n" cfg.extraScrapeConfigs}
|
||||||
'';
|
'';
|
||||||
in {
|
in {
|
||||||
options.services.victoriametrics = {
|
options.services.victoriametrics = {
|
||||||
|
monitoredServices = mkOption {
|
||||||
|
type = types.listOf types.str;
|
||||||
|
default = [];
|
||||||
|
description = "List of systemd service name patterns to monitor (without .service suffix)";
|
||||||
|
example = [ "mysql" "nginx" "phpfpm-.*" ];
|
||||||
|
};
|
||||||
|
|
||||||
extraScrapeConfigs = mkOption {
|
extraScrapeConfigs = mkOption {
|
||||||
type = types.listOf types.str;
|
type = types.listOf types.str;
|
||||||
default = [];
|
default = [];
|
||||||
|
|||||||
Reference in New Issue
Block a user