Files
nixos/hosts/web-01.cloonar.com/modules/prometheus.nix
2023-08-19 06:33:21 +02:00

307 lines
9.1 KiB
Nix

{ config, ... }:
{
# Secrets declared for this host.
#  - alertmanager: only referenced by the commented-out Alertmanager service
#    at the bottom of this file (as its environmentFile).
#  - hass-token: owned by the "prometheus" user; the "homeassistant" scrape
#    job reads it through `authorization.credentials_file`.
sops.secrets = {
  alertmanager = { };
  hass-token.owner = "prometheus";
};
# imports = [
# ./matrix-alertmanager.nix
# ./irc-alertmanager.nix
# ./rules.nix
# ];
services.prometheus = {
# External URL of this Prometheus instance.
webExternalUrl = "https://prometheus.cloonar.com";
# Deliver fired alerts to a single, statically configured Alertmanager on
# localhost:9093.
# NOTE(review): the Alertmanager service definition in this file is commented
# out — confirm that something else actually listens on that port.
alertmanagers = [
  { static_configs = [ { targets = [ "localhost:9093" ]; } ]; }
];
# Alerting rules.
#
# Written as a Prometheus 2.x YAML rule file (`groups:` -> `rules:`). The
# previous text used the Prometheus 1.x `ALERT ... IF ... FOR ...` syntax,
# which Prometheus 2.x no longer parses, and additionally contained a
# malformed header (`ALERT homeassistant = {`) and a stray trailing comma in
# one ANNOTATIONS block. Every expression, duration, label and annotation
# text below is carried over unchanged from the legacy definitions.
#
# Quoting convention inside the string: expressions use YAML single quotes
# (they contain double quotes), annotations use YAML double quotes (they
# start with `{{`, which YAML would otherwise read as a flow mapping).
#
# NOTE(review): the node_* expressions rely on metric names such as
# `node_cpu`, `node_memory_MemFree` and `node_filesystem_free` and on an
# `alias` label, while the scrape jobs in this file only cover telegraf,
# Home Assistant and Gitea — confirm these series actually exist.
rules = [
  ''
    groups:
      - name: node
        rules:
          - alert: node_down
            expr: 'up == 0'
            for: 5m
            labels:
              severity: page
            annotations:
              summary: "{{$labels.alias}}: Node is down."
              description: "{{$labels.alias}} has been down for more than 5 minutes."
          - alert: node_systemd_service_failed
            expr: 'node_systemd_unit_state{state="failed"} == 1'
            for: 4m
            labels:
              severity: page
            annotations:
              summary: "{{$labels.alias}}: Service {{$labels.name}} failed to start."
              description: "{{$labels.alias}} failed to (re)start service {{$labels.name}}."
          - alert: node_filesystem_full_90percent
            expr: 'sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024^3'
            for: 5m
            labels:
              severity: page
            annotations:
              summary: "{{$labels.alias}}: Filesystem is running out of space soon."
              description: "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem."
          - alert: node_filesystem_full_in_4h
            expr: 'predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4*3600) <= 0'
            for: 5m
            labels:
              severity: page
            annotations:
              summary: "{{$labels.alias}}: Filesystem is running out of space in 4 hours."
              description: "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space of in approx. 4 hours"
          - alert: node_filedescriptors_full_in_3h
            expr: 'predict_linear(node_filefd_allocated[1h], 3*3600) >= node_filefd_maximum'
            for: 20m
            labels:
              severity: page
            annotations:
              summary: "{{$labels.alias}} is running out of available file descriptors in 3 hours."
              description: "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours"
          - alert: node_load1_90percent
            expr: 'node_load1 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0.9'
            for: 1h
            labels:
              severity: page
            annotations:
              summary: "{{$labels.alias}}: Running on high load."
              description: "{{$labels.alias}} is running with > 90% total load for at least 1h."
          - alert: node_cpu_util_90percent
            expr: '100 - (avg by (alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90'
            for: 1h
            labels:
              severity: page
            annotations:
              summary: "{{$labels.alias}}: High CPU utilization."
              description: "{{$labels.alias}} has total CPU utilization over 90% for at least 1h."
          - alert: node_ram_using_90percent
            expr: 'node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1'
            for: 30m
            labels:
              severity: page
            annotations:
              summary: "{{$labels.alias}}: Using lots of RAM."
              description: "{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now."
          - alert: node_swap_using_80percent
            expr: 'node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8'
            for: 10m
            labels:
              severity: page
            annotations:
              summary: "{{$labels.alias}}: Running out of swap soon."
              description: "{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now."
      - name: homeassistant
        rules:
          - alert: homeassistant
            expr: 'homeassistant_entity_available{domain="persistent_notification", entity!~"persistent_notification.http_login|persistent_notification.recorder_database_migration"} >= 0'
            annotations:
              description: "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}"
      - name: gitea
        rules:
          - alert: gitea
            expr: 'rate(promhttp_metric_handler_requests_total{job="gitea", code="500"}[5m]) > 3'
            annotations:
              description: "{{$labels.instance}}: gitea instances error rate went up: {{$value}} errors in 5 minutes"
  ''
];
# Scrape jobs. All jobs poll once per minute.
scrapeConfigs = [
  {
    # Telegraf's Prometheus endpoint on every host, port 9273.
    #
    # One static_config is generated per host, carrying both the `host` and
    # the `org` label. Previously each host appeared twice in this job (once
    # labelled with `host`, once with `org`); since targets with different
    # label sets count as distinct targets, every endpoint was scraped twice
    # and stored as two parallel sets of series.
    job_name = "telegraf";
    scrape_interval = "60s";
    metrics_path = "/metrics";
    static_configs = map
      (host: {
        targets = [ "${host}.cloonar.com:9273" ];
        labels = {
          host = "${host}.cloonar.com";
          org = "cloonar";
        };
      })
      [
        "web-01"
        "mail"
        "git"
        "home-assistant"
      ];
  }
  {
    # Home Assistant's metrics endpoint over HTTPS. The credentials are read
    # from the sops-managed `hass-token` secret file.
    job_name = "homeassistant";
    scrape_interval = "60s";
    metrics_path = "/api/prometheus";
    authorization.credentials_file = config.sops.secrets.hass-token.path;
    scheme = "https";
    static_configs = [
      { targets = [ "home-assistant.cloonar.com:443" ]; }
    ];
  }
  {
    # Gitea's /metrics endpoint over HTTPS; no credentials are configured.
    job_name = "gitea";
    scrape_interval = "60s";
    metrics_path = "/metrics";
    scheme = "https";
    static_configs = [
      { targets = [ "git.cloonar.com:443" ]; }
    ];
  }
];
};
# services.prometheus.alertmanager = {
# enable = true;
# environmentFile = config.sops.secrets.alertmanager.path;
# webExternalUrl = "https://alertmanager.cloonar.com";
# listenAddress = "[::1]";
# configuration = {
# global = {
# # The smarthost and SMTP sender used for mail notifications.
# smtp_smarthost = "mail.cloonar.com:587";
# smtp_from = "alertmanager@cloonar.com";
# smtp_auth_username = "alertmanager@cloonar.com";
# smtp_auth_password = "$SMTP_PASSWORD";
# };
# route = {
# receiver = "default";
# routes = [
# {
# group_by = [ "host" ];
# match_re.org = "krebs";
# group_wait = "5m";
# group_interval = "5m";
# repeat_interval = "4h";
# receiver = "krebs";
# }
# {
# group_by = [ "host" ];
# match_re.org = "nix-community";
# group_wait = "5m";
# group_interval = "5m";
# repeat_interval = "4h";
# receiver = "nix-community";
# }
# {
# group_by = [ "host" ];
# match_re.org = "clan-lol";
# group_wait = "5m";
# group_interval = "5m";
# repeat_interval = "4h";
# receiver = "clan-lol";
# }
# {
# group_by = [ "host" ];
# group_wait = "30s";
# group_interval = "2m";
# repeat_interval = "2h";
# receiver = "all";
# }
# ];
# };
# receivers = [
# {
# name = "krebs";
# webhook_configs = [
# {
# url = "http://127.0.0.1:9223/";
# max_alerts = 5;
# }
# ];
# }
# #{
# # name = "numtide";
# # slack_configs = [
# # {
# # token = "$SLACK_TOKEN";
# # api_url = "https://";
# # }
# # ];
# #}
# {
# name = "nix-community";
# webhook_configs = [
# {
# url = "http://localhost:9088/alert";
# max_alerts = 5;
# }
# ];
# }
# {
# name = "clan-lol";
# webhook_configs = [
# # TODO
# #{
# # url = "http://localhost:4050/services/hooks/YWxlcnRtYW5hZ2VyX3NlcnZpY2U";
# # max_alerts = 5;
# #}
# ];
# }
# {
# name = "all";
# pushover_configs = [
# {
# user_key = "$PUSHOVER_USER_KEY";
# token = "$PUSHOVER_TOKEN";
# priority = "0";
# }
# ];
# }
# {
# name = "default";
# }
# ];
# };
# };
}