Compare commits

2 commits

12 changed files with 118 additions and 541 deletions

View file

@@ -2,18 +2,41 @@
 let
   configure_prom = builtins.toFile "prometheus.yml" ''
     scrape_configs:
-      - job_name: 'server'
+      # System metrics
+      - job_name: 'node'
        stream_parse: true
        static_configs:
          - targets:
            - ${config.networking.hostName}:9100
+      # Systemd service monitoring
+      - job_name: 'systemd'
+        metrics_path: /metrics
+        params:
+          collect[]:
+            - 'systemd.service.state'
+            - 'systemd.service.start_time_seconds'
+            - 'systemd.unit_file.state'
+        static_configs:
+          - targets:
+            - ${config.networking.hostName}:9100
+        relabel_configs:
+          - source_labels: [__name__]
+            regex: 'node_systemd_unit_state'
+            action: keep
+          - source_labels: [name]
+            regex: '(ai-mailer|container@git|microvm@git-runner-).*\.service'
+            action: keep
   '';
 in {
   sops.secrets.victoria-agent-env = {
     sopsFile = ../utils/modules/victoriametrics/secrets.yaml;
   };
-  services.prometheus.exporters.node.enable = true;
+  services.prometheus.exporters.node = {
+    enable = true;
+    enabledCollectors = [ "systemd" ];
+  };
   systemd.services.export-fw-to-prometheus = {
     path = with pkgs; [victoriametrics];
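
Side note, not part of this change: the relabel_configs above keep only node_systemd_unit_state series for the three unit patterns. A complementary sketch, assuming the deployed node_exporter supports the --collector.systemd.unit-include flag, would limit which units the systemd collector exports in the first place:

# Sketch only (assumption: the node_exporter in use understands --collector.systemd.unit-include);
# restricts the systemd collector to the same units the relabel rules keep.
services.prometheus.exporters.node = {
  enable = true;
  enabledCollectors = [ "systemd" ];
  extraFlags = [
    "--collector.systemd.unit-include=(ai-mailer|container@git|microvm@git-runner-).*\\.service"
  ];
};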

View file

@@ -1,58 +0,0 @@
{ lib, pkgs, config, ... }:
{
grafanaAlertRuleDefinitions = [
{
uid = "amzebs-mysql-service-down-alert-uid";
title = "MySQL Service Down on amzebs-01";
condition = "C";
data = [
{
refId = "A";
relativeTimeRange = {
from = 300;
to = 0;
};
datasourceUid = "vm-datasource-uid";
model = {
editorMode = "code";
expr = "node_systemd_unit_state{state=\"active\", name=\"mysql.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)";
hide = false;
intervalMs = 1000;
legendFormat = "__auto";
maxDataPoints = 43200;
range = true;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "math";
expression = "$B < 1";
};
}
];
noDataState = "Alerting";
execErrState = "Alerting";
for = "5m";
annotations = {
description = "MySQL service is down on amzebs-01";
summary = "MySQL Service Down on amzebs-01";
};
labels = {
severity = "critical";
host = "amzebs-01";
};
}
];
}

View file

@@ -1,58 +0,0 @@
{ lib, pkgs, config, ... }:
{
grafanaAlertRuleDefinitions = [
{
uid = "amzebs-nginx-service-down-alert-uid";
title = "Nginx Service Down on amzebs-01";
condition = "C";
data = [
{
refId = "A";
relativeTimeRange = {
from = 300;
to = 0;
};
datasourceUid = "vm-datasource-uid";
model = {
editorMode = "code";
expr = "node_systemd_unit_state{state=\"active\", name=\"nginx.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)";
hide = false;
intervalMs = 1000;
legendFormat = "__auto";
maxDataPoints = 43200;
range = true;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "math";
expression = "$B < 1";
};
}
];
noDataState = "Alerting";
execErrState = "Alerting";
for = "5m";
annotations = {
description = "Nginx service is down on amzebs-01";
summary = "Nginx Service Down on amzebs-01";
};
labels = {
severity = "critical";
host = "amzebs-01";
};
}
];
}

View file

@@ -1,58 +0,0 @@
{ lib, pkgs, config, ... }:
{
grafanaAlertRuleDefinitions = [
{
uid = "amzebs-phpfpm-service-down-alert-uid";
title = "PHP-FPM Service Down on amzebs-01";
condition = "C";
data = [
{
refId = "A";
relativeTimeRange = {
from = 300;
to = 0;
};
datasourceUid = "vm-datasource-uid";
model = {
editorMode = "code";
expr = "node_systemd_unit_state{state=\"active\", name=~\"phpfpm-.*\\\\.service\", instance=\"amzebs-01:9100\"} OR on() vector(0)";
hide = false;
intervalMs = 1000;
legendFormat = "__auto";
maxDataPoints = 43200;
range = true;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
expression = "A";
reducer = "min";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "math";
expression = "$B < 1";
};
}
];
noDataState = "Alerting";
execErrState = "Alerting";
for = "5m";
annotations = {
description = "One or more PHP-FPM services are down on amzebs-01";
summary = "PHP-FPM Service Down on amzebs-01";
};
labels = {
severity = "critical";
host = "amzebs-01";
};
}
];
}

View file

@@ -1,26 +1,6 @@
 { lib, pkgs, config, ... }:
 let
-  giteaDownAlertRules = (import ./gitea_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  giteaRunnerDownAlertRules = (import ./gitea_runner_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  postfixDownAlertRules = (import ./postfix_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  dovecotDownAlertRules = (import ./dovecot_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  openldapDownAlertRules = (import ./openldap_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  wireguardDownAlertRules = (import ./wireguard_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  # amzebs-01 service alerts
-  ambebsMysqlDownAlertRules = (import ./amzebs_mysql_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  ambebsNginxDownAlertRules = (import ./amzebs_nginx_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  ambebsPhpfpmDownAlertRules = (import ./amzebs_phpfpm_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
-  allServiceRules = giteaDownAlertRules
-    ++ giteaRunnerDownAlertRules
-    ++ postfixDownAlertRules
-    ++ dovecotDownAlertRules
-    ++ openldapDownAlertRules
-    ++ wireguardDownAlertRules
-    ++ ambebsMysqlDownAlertRules
-    ++ ambebsNginxDownAlertRules
-    ++ ambebsPhpfpmDownAlertRules;
+  servicesDownAlertRules = (import ./services_down.nix { inherit lib pkgs config; }).grafanaAlertRuleDefinitions;
 in
 {
   services.grafana.provision.alerting.rules.settings.groups = [
@@ -28,7 +8,7 @@ in
       name = "Service Alerts";
       folder = "Service Monitoring";
       interval = "1m";
-      rules = allServiceRules;
+      rules = servicesDownAlertRules;
     }
   ];
 }

View file

@@ -1,57 +0,0 @@
{ lib, pkgs, config, ... }:
{
grafanaAlertRuleDefinitions = [
{
uid = "dovecot-service-down-alert-uid";
title = "Dovecot Service Down";
condition = "C";
data = [
{
refId = "A";
relativeTimeRange = {
from = 300;
to = 0;
};
datasourceUid = "vm-datasource-uid";
model = {
editorMode = "code";
expr = "node_systemd_unit_state{state=\"active\", name=\"dovecot.service\"} OR on() vector(0)";
hide = false;
intervalMs = 1000;
legendFormat = "__auto";
maxDataPoints = 43200;
range = true;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "math";
expression = "$B < 1";
};
}
];
noDataState = "Alerting";
execErrState = "Alerting";
for = "5m";
annotations = {
description = "Dovecot service is down on {{ $labels.instance }}";
summary = "Dovecot Service Down";
};
labels = {
severity = "critical";
};
}
];
}

View file

@@ -1,57 +0,0 @@
{ lib, pkgs, config, ... }:
{
grafanaAlertRuleDefinitions = [
{
uid = "gitea-service-down-alert-uid";
title = "Gitea Service Down";
condition = "C";
data = [
{
refId = "A";
relativeTimeRange = {
from = 300;
to = 0;
};
datasourceUid = "vm-datasource-uid";
model = {
editorMode = "code";
expr = "node_systemd_unit_state{state=\"active\", name=\"container@git.service\"} OR on() vector(0)";
hide = false;
intervalMs = 1000;
legendFormat = "__auto";
maxDataPoints = 43200;
range = true;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "math";
expression = "$B < 1";
};
}
];
noDataState = "Alerting";
execErrState = "Alerting";
for = "5m";
annotations = {
description = "Gitea service is down on {{ $labels.instance }}";
summary = "Gitea Service Down";
};
labels = {
severity = "critical";
};
}
];
}

View file

@@ -1,57 +0,0 @@
{ lib, pkgs, config, ... }:
{
grafanaAlertRuleDefinitions = [
{
uid = "gitea-runner-service-down-alert-uid";
title = "Gitea Runner Service Down";
condition = "C";
data = [
{
refId = "A";
relativeTimeRange = {
from = 300;
to = 0;
};
datasourceUid = "vm-datasource-uid";
model = {
editorMode = "code";
expr = "node_systemd_unit_state{state=\"active\", name=\"microvm@git-runner-1.service\"} OR on() vector(0)";
hide = false;
intervalMs = 1000;
legendFormat = "__auto";
maxDataPoints = 43200;
range = true;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "math";
expression = "$B < 1";
};
}
];
noDataState = "Alerting";
execErrState = "Alerting";
for = "5m";
annotations = {
description = "Gitea Runner service is down on {{ $labels.instance }}";
summary = "Gitea Runner Service Down";
};
labels = {
severity = "critical";
};
}
];
}

View file

@@ -1,57 +0,0 @@
{ lib, pkgs, config, ... }:
{
grafanaAlertRuleDefinitions = [
{
uid = "openldap-service-down-alert-uid";
title = "OpenLDAP Service Down";
condition = "C";
data = [
{
refId = "A";
relativeTimeRange = {
from = 300;
to = 0;
};
datasourceUid = "vm-datasource-uid";
model = {
editorMode = "code";
expr = "node_systemd_unit_state{state=\"active\", name=\"openldap.service\"} OR on() vector(0)";
hide = false;
intervalMs = 1000;
legendFormat = "__auto";
maxDataPoints = 43200;
range = true;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "math";
expression = "$B < 1";
};
}
];
noDataState = "Alerting";
execErrState = "Alerting";
for = "5m";
annotations = {
description = "OpenLDAP service is down on {{ $labels.instance }}";
summary = "OpenLDAP Service Down";
};
labels = {
severity = "critical";
};
}
];
}

View file

@@ -1,57 +0,0 @@
{ lib, pkgs, config, ... }:
{
grafanaAlertRuleDefinitions = [
{
uid = "postfix-service-down-alert-uid";
title = "Postfix Service Down";
condition = "C";
data = [
{
refId = "A";
relativeTimeRange = {
from = 300;
to = 0;
};
datasourceUid = "vm-datasource-uid";
model = {
editorMode = "code";
expr = "node_systemd_unit_state{state=\"active\", name=\"postfix.service\"} OR on() vector(0)";
hide = false;
intervalMs = 1000;
legendFormat = "__auto";
maxDataPoints = 43200;
range = true;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "math";
expression = "$B < 1";
};
}
];
noDataState = "Alerting";
execErrState = "Alerting";
for = "5m";
annotations = {
description = "Postfix service is down on {{ $labels.instance }}";
summary = "Postfix Service Down";
};
labels = {
severity = "critical";
};
}
];
}

View file

@@ -0,0 +1,90 @@
{ lib, pkgs, config, ... }:
let
# Add services here - each entry generates an alert rule
# instance = which node exporter to query (hostname:9100)
monitoredServices = [
{ name = "AI-Mailer"; service = "ai-mailer.service"; instance = "fw:9100"; }
{ name = "Postfix"; service = "postfix.service"; instance = "mail:9100"; }
{ name = "Dovecot"; service = "dovecot.service"; instance = "mail:9100"; }
{ name = "OpenLDAP"; service = "openldap.service"; instance = "mail:9100"; }
{ name = "Gitea"; service = "container@git.service"; instance = "fw:9100"; }
{ name = "Gitea Runner"; service = "microvm@git-runner-1.service"; instance = "fw:9100"; }
{ name = "WireGuard"; service = "wireguard-wg_cloonar.service"; instance = "mail:9100"; }
{ name = "MySQL"; service = "mysql.service"; instance = "amzebs-01:9100"; }
{ name = "Nginx"; service = "nginx.service"; instance = "amzebs-01:9100"; }
{ name = "PHP-FPM"; service = "phpfpm-.*\\.service"; instance = "amzebs-01:9100"; }
];
# Extract host from instance (e.g., "fw:9100" -> "fw")
getHost = instance: lib.head (lib.splitString ":" instance);
# Generate a unique UID from service name
mkUid = name: "${lib.toLower (lib.replaceStrings [" " "@" "."] ["-" "-" "-"] name)}-down-uid";
# Check if service pattern uses regex (contains special chars)
isRegex = svc: lib.hasInfix ".*" svc || lib.hasInfix "\\" svc;
# Build the PromQL expression
mkExpr = svc:
let
nameMatch = if isRegex svc.service
then "name=~\"${svc.service}\""
else "name=\"${svc.service}\"";
in "node_systemd_unit_state{state=\"active\", ${nameMatch}, instance=\"${svc.instance}\"} OR on() vector(0)";
mkServiceAlert = svc: {
uid = mkUid svc.name;
title = "${svc.name} Service Down on ${getHost svc.instance}";
condition = "C";
data = [
{
refId = "A";
relativeTimeRange = {
from = 300;
to = 0;
};
datasourceUid = "vm-datasource-uid";
model = {
editorMode = "code";
expr = mkExpr svc;
hide = false;
intervalMs = 1000;
legendFormat = "__auto";
maxDataPoints = 43200;
range = true;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "math";
expression = "$B < 1";
};
}
];
noDataState = "Alerting";
execErrState = "Alerting";
for = "5m";
annotations = {
description = "${svc.name} service is down on ${getHost svc.instance}";
summary = "${svc.name} Service Down";
};
labels = {
severity = "critical";
host = getHost svc.instance;
};
};
in {
grafanaAlertRuleDefinitions = map mkServiceAlert monitoredServices;
}

View file

@@ -1,57 +0,0 @@
{ lib, pkgs, config, ... }:
{
grafanaAlertRuleDefinitions = [
{
uid = "wireguard-service-down-alert-uid";
title = "WireGuard Service Down";
condition = "C";
data = [
{
refId = "A";
relativeTimeRange = {
from = 300;
to = 0;
};
datasourceUid = "vm-datasource-uid";
model = {
editorMode = "code";
expr = "node_systemd_unit_state{state=\"active\", name=\"wireguard-wg_cloonar.service\"} OR on() vector(0)";
hide = false;
intervalMs = 1000;
legendFormat = "__auto";
maxDataPoints = 43200;
range = true;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "math";
expression = "$B < 1";
};
}
];
noDataState = "Alerting";
execErrState = "Alerting";
for = "5m";
annotations = {
description = "WireGuard service is down on {{ $labels.instance }}";
summary = "WireGuard Service Down";
};
labels = {
severity = "critical";
};
}
];
}