# NixOS module: Prometheus server configuration (alerting rules and scrape jobs).
{ config, ... }:
|
|
{
|
|
# Secrets managed through sops-nix.
sops.secrets = {
  # Declared with default settings; only referenced by the (currently
  # commented-out) Alertmanager definition at the bottom of this file.
  alertmanager = { };

  # Read by the "homeassistant" scrape job via
  # config.sops.secrets.hass-token.path, hence owned by the prometheus user.
  hass-token.owner = "prometheus";
};
|
|
|
|
# imports = [
|
|
# ./matrix-alertmanager.nix
|
|
# ./irc-alertmanager.nix
|
|
# ./rules.nix
|
|
# ];
|
|
|
|
services.prometheus = {
|
|
# External URL of this Prometheus instance.
webExternalUrl = "https://prometheus.cloonar.com";

# Deliver fired alerts to a single, statically configured Alertmanager
# on localhost:9093.
# NOTE(review): the services.prometheus.alertmanager definition further
# down in this file is commented out — confirm something is actually
# listening on port 9093.
alertmanagers = [
  { static_configs = [ { targets = [ "localhost:9093" ]; } ]; }
];
|
|
# Alerting rules.
#
# Prometheus >= 2.0 only accepts rule files written as YAML rule groups; the
# legacy 1.x `ALERT <name> IF <expr> ...` syntax is rejected. The rules are
# therefore described as Nix attribute sets and serialised with
# builtins.toJSON (JSON is a subset of YAML), producing a single rule-file
# string as expected by this option.
#
# NOTE(review): the expressions rely on node_* metrics and an `alias` label;
# none of the scrape jobs visible in this file obviously provides them —
# confirm against the metrics that are really exported.
rules =
  let
    # Adds the `severity="page"` label shared by all node-level alerts.
    page = rule: rule // { labels.severity = "page"; };
  in
  [
    (builtins.toJSON {
      groups = [
        {
          name = "alerting-rules";
          rules = [
            (page {
              alert = "node_down";
              expr = "up == 0";
              for = "5m";
              annotations = {
                summary = "{{$labels.alias}}: Node is down.";
                description = "{{$labels.alias}} has been down for more than 5 minutes.";
              };
            })
            (page {
              alert = "node_systemd_service_failed";
              expr = ''node_systemd_unit_state{state="failed"} == 1'';
              for = "4m";
              annotations = {
                summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.";
                description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}.";
              };
            })
            (page {
              alert = "node_filesystem_full_90percent";
              # Fires when less than 10% of the filesystem is free; the
              # division only scales the reported value to GiB.
              expr = ''sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024^3'';
              for = "5m";
              annotations = {
                summary = "{{$labels.alias}}: Filesystem is running out of space soon.";
                description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem.";
              };
            })
            (page {
              alert = "node_filesystem_full_in_4h";
              # Linear extrapolation of the last hour, 4 hours ahead.
              expr = ''predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4*3600) <= 0'';
              for = "5m";
              annotations = {
                summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours.";
                description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space in approx. 4 hours";
              };
            })
            (page {
              alert = "node_filedescriptors_full_in_3h";
              expr = "predict_linear(node_filefd_allocated[1h], 3*3600) >= node_filefd_maximum";
              for = "20m";
              annotations = {
                summary = "{{$labels.alias}} is running out of available file descriptors in 3 hours.";
                description = "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours";
              };
            })
            (page {
              alert = "node_load1_90percent";
              # 1-minute load divided by the number of CPUs of the node.
              expr = ''node_load1 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0.9'';
              for = "1h";
              annotations = {
                summary = "{{$labels.alias}}: Running on high load.";
                description = "{{$labels.alias}} is running with > 90% total load for at least 1h.";
              };
            })
            (page {
              alert = "node_cpu_util_90percent";
              expr = ''100 - (avg by (alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90'';
              for = "1h";
              annotations = {
                summary = "{{$labels.alias}}: High CPU utilization.";
                description = "{{$labels.alias}} has total CPU utilization over 90% for at least 1h.";
              };
            })
            (page {
              alert = "node_ram_using_90percent";
              expr = "node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1";
              for = "30m";
              annotations = {
                summary = "{{$labels.alias}}: Using lots of RAM.";
                description = "{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.";
              };
            })
            (page {
              alert = "node_swap_using_80percent";
              expr = "node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8";
              for = "10m";
              annotations = {
                summary = "{{$labels.alias}}: Running out of swap soon.";
                description = "{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now.";
              };
            })
            # The two rules below carry no `for` duration and no severity
            # label, exactly as in the original definitions.
            {
              alert = "homeassistant";
              expr = ''homeassistant_entity_available{domain="persistent_notification", entity!~"persistent_notification.http_login|persistent_notification.recorder_database_migration"} >= 0'';
              annotations.description = "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}";
            }
            {
              alert = "gitea";
              expr = ''rate(promhttp_metric_handler_requests_total{job="gitea", code="500"}[5m]) > 3'';
              annotations.description = "{{$labels.instance}}: gitea instances error rate went up: {{$value}} errors in 5 minutes";
            }
          ];
        }
      ];
    })
  ];
|
|
scrapeConfigs = [
|
|
# Telegraf agents, one per machine, exposing metrics on port 9273.
{
  job_name = "telegraf";
  scrape_interval = "60s";
  metrics_path = "/metrics";

  # One static_config per machine so that every target carries both its own
  # `host` label and the shared `org` label. Each machine must appear only
  # once in this job: listing the same address in two static_configs with
  # different labels makes Prometheus treat it as two distinct targets,
  # scrape it twice and store duplicate series.
  static_configs =
    map
      (host: {
        targets = [ "${host}.cloonar.com:9273" ];
        labels = {
          host = "${host}.cloonar.com";
          org = "cloonar";
        };
      })
      [
        "web-01"
        "web-arm"
        "fw"
        "mail"
        "git"
        "home-assistant"
      ];
}
|
|
# Home Assistant's Prometheus endpoint, scraped over HTTPS.
{
  job_name = "homeassistant";
  scheme = "https";
  metrics_path = "/api/prometheus";
  scrape_interval = "60s";

  # Credentials are read from the sops-managed hass-token secret file
  # instead of being embedded in the configuration.
  authorization = {
    credentials_file = config.sops.secrets.hass-token.path;
  };

  static_configs = [
    { targets = [ "home-assistant.cloonar.com:443" ]; }
  ];
}
|
|
# Gitea's metrics endpoint, scraped over HTTPS.
{
  job_name = "gitea";
  scheme = "https";
  metrics_path = "/metrics";
  scrape_interval = "60s";

  static_configs = [
    { targets = [ "git.cloonar.com:443" ]; }
  ];
}
|
|
];
|
|
};
|
|
# services.prometheus.alertmanager = {
|
|
# enable = true;
|
|
# environmentFile = config.sops.secrets.alertmanager.path;
|
|
# webExternalUrl = "https://alertmanager.cloonar.com";
|
|
# listenAddress = "[::1]";
|
|
# configuration = {
|
|
# global = {
|
|
# # The smarthost and SMTP sender used for mail notifications.
|
|
# smtp_smarthost = "mail.cloonar.com:587";
|
|
# smtp_from = "alertmanager@cloonar.com";
|
|
# smtp_auth_username = "alertmanager@cloonar.com";
|
|
# smtp_auth_password = "$SMTP_PASSWORD";
|
|
# };
|
|
# route = {
|
|
# receiver = "default";
|
|
# routes = [
|
|
# {
|
|
# group_by = [ "host" ];
|
|
# match_re.org = "krebs";
|
|
# group_wait = "5m";
|
|
# group_interval = "5m";
|
|
# repeat_interval = "4h";
|
|
# receiver = "krebs";
|
|
# }
|
|
# {
|
|
# group_by = [ "host" ];
|
|
# match_re.org = "nix-community";
|
|
# group_wait = "5m";
|
|
# group_interval = "5m";
|
|
# repeat_interval = "4h";
|
|
# receiver = "nix-community";
|
|
# }
|
|
# {
|
|
# group_by = [ "host" ];
|
|
# match_re.org = "clan-lol";
|
|
# group_wait = "5m";
|
|
# group_interval = "5m";
|
|
# repeat_interval = "4h";
|
|
# receiver = "clan-lol";
|
|
# }
|
|
# {
|
|
# group_by = [ "host" ];
|
|
# group_wait = "30s";
|
|
# group_interval = "2m";
|
|
# repeat_interval = "2h";
|
|
# receiver = "all";
|
|
# }
|
|
# ];
|
|
# };
|
|
# receivers = [
|
|
# {
|
|
# name = "krebs";
|
|
# webhook_configs = [
|
|
# {
|
|
# url = "http://127.0.0.1:9223/";
|
|
# max_alerts = 5;
|
|
# }
|
|
# ];
|
|
# }
|
|
# #{
|
|
# # name = "numtide";
|
|
# # slack_configs = [
|
|
# # {
|
|
# # token = "$SLACK_TOKEN";
|
|
# # api_url = "https://";
|
|
# # }
|
|
# # ];
|
|
# #}
|
|
# {
|
|
# name = "nix-community";
|
|
# webhook_configs = [
|
|
# {
|
|
# url = "http://localhost:9088/alert";
|
|
# max_alerts = 5;
|
|
# }
|
|
# ];
|
|
# }
|
|
# {
|
|
# name = "clan-lol";
|
|
# webhook_configs = [
|
|
# # TODO
|
|
# #{
|
|
# # url = "http://localhost:4050/services/hooks/YWxlcnRtYW5hZ2VyX3NlcnZpY2U";
|
|
# # max_alerts = 5;
|
|
# #}
|
|
# ];
|
|
# }
|
|
# {
|
|
# name = "all";
|
|
# pushover_configs = [
|
|
# {
|
|
# user_key = "$PUSHOVER_USER_KEY";
|
|
# token = "$PUSHOVER_TOKEN";
|
|
# priority = "0";
|
|
# }
|
|
# ];
|
|
# }
|
|
# {
|
|
# name = "default";
|
|
# }
|
|
# ];
|
|
# };
|
|
# };
|
|
|
|
}
|