add zammad to fw vm, add web-arm machine
This commit is contained in:
306
hosts/web-arm/modules/prometheus.nix
Normal file
306
hosts/web-arm/modules/prometheus.nix
Normal file
@@ -0,0 +1,306 @@
|
||||
{ config, ... }:
|
||||
{
|
||||
# Secrets decrypted by sops-nix for the monitoring stack.
sops.secrets = {
  # Declared with default settings; the only reference to it in this file is
  # inside the commented-out alertmanager configuration.
  alertmanager = { };

  # Owned by the "prometheus" user; its path is used as the credentials file
  # of the "homeassistant" scrape job.
  hass-token.owner = "prometheus";
};
|
||||
|
||||
# imports = [
|
||||
# ./matrix-alertmanager.nix
|
||||
# ./irc-alertmanager.nix
|
||||
# ./rules.nix
|
||||
# ];
|
||||
|
||||
services.prometheus = {
|
||||
# Public URL under which this Prometheus instance is reachable.
webExternalUrl = "https://prometheus.cloonar.com";

# Alertmanager endpoint that firing alerts are sent to.
# NOTE(review): the services.prometheus.alertmanager definition at the bottom
# of this file is commented out, so nothing in this file starts a listener on
# localhost:9093 -- confirm that one is provided elsewhere.
alertmanagers = [
  { static_configs = [ { targets = [ "localhost:9093" ]; } ]; }
];
|
||||
# Alerting rules.
#
# Prometheus 2.x only accepts rule files in the YAML "groups" format; the
# legacy 1.x `ALERT <name> IF <expr> ...` syntax is rejected by the rule
# loader. The rules are therefore declared as Nix data and serialised with
# builtins.toJSON (JSON is a subset of YAML). Every expression, duration and
# annotation text is carried over unchanged from the legacy definitions; the
# malformed `ALERT homeassistant = {` header is gone as a side effect.
#
# NOTE(review): no scrape job in this file attaches an `alias` label, which
# most annotations reference -- confirm the scraped exporters provide it.
rules =
  let
    # Host-level alerts; each one gets `severity = "page"` attached below.
    pagingAlerts = map (rule: rule // { labels.severity = "page"; }) [
      {
        alert = "node_down";
        expr = ''up == 0'';
        for = "5m";
        annotations = {
          summary = "{{$labels.alias}}: Node is down.";
          description = "{{$labels.alias}} has been down for more than 5 minutes.";
        };
      }
      {
        alert = "node_systemd_service_failed";
        expr = ''node_systemd_unit_state{state="failed"} == 1'';
        for = "4m";
        annotations = {
          summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.";
          description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}.";
        };
      }
      {
        alert = "node_filesystem_full_90percent";
        expr = ''sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024^3'';
        for = "5m";
        annotations = {
          summary = "{{$labels.alias}}: Filesystem is running out of space soon.";
          description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem.";
        };
      }
      {
        alert = "node_filesystem_full_in_4h";
        expr = ''predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4*3600) <= 0'';
        for = "5m";
        annotations = {
          summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours.";
          description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space of in approx. 4 hours";
        };
      }
      {
        alert = "node_filedescriptors_full_in_3h";
        expr = ''predict_linear(node_filefd_allocated[1h], 3*3600) >= node_filefd_maximum'';
        for = "20m";
        annotations = {
          summary = "{{$labels.alias}} is running out of available file descriptors in 3 hours.";
          description = "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours";
        };
      }
      {
        alert = "node_load1_90percent";
        expr = ''node_load1 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0.9'';
        for = "1h";
        annotations = {
          summary = "{{$labels.alias}}: Running on high load.";
          description = "{{$labels.alias}} is running with > 90% total load for at least 1h.";
        };
      }
      {
        alert = "node_cpu_util_90percent";
        expr = ''100 - (avg by (alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90'';
        for = "1h";
        annotations = {
          summary = "{{$labels.alias}}: High CPU utilization.";
          description = "{{$labels.alias}} has total CPU utilization over 90% for at least 1h.";
        };
      }
      {
        alert = "node_ram_using_90percent";
        expr = ''node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1'';
        for = "30m";
        annotations = {
          summary = "{{$labels.alias}}: Using lots of RAM.";
          description = "{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.";
        };
      }
      {
        alert = "node_swap_using_80percent";
        expr = ''node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8'';
        for = "10m";
        annotations = {
          summary = "{{$labels.alias}}: Running out of swap soon.";
          description = "{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now.";
        };
      }
    ];

    # Application-level alerts. As in the legacy definitions they have no
    # `for` hold time and no severity label.
    serviceAlerts = [
      {
        alert = "homeassistant";
        expr = ''homeassistant_entity_available{domain="persistent_notification", entity!~"persistent_notification.http_login|persistent_notification.recorder_database_migration"} >= 0'';
        annotations.description = "homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}";
      }
      {
        alert = "gitea";
        expr = ''rate(promhttp_metric_handler_requests_total{job="gitea", code="500"}[5m]) > 3'';
        annotations.description = "{{$labels.instance}}: gitea instances error rate went up: {{$value}} errors in 5 minutes";
      }
    ];
  in
  [
    (builtins.toJSON {
      groups = [
        {
          name = "node";
          rules = pagingAlerts;
        }
        {
          name = "services";
          rules = serviceAlerts;
        }
      ];
    })
  ];
|
||||
# Scrape jobs.
scrapeConfigs =
  let
    # Short names of the machines exposing telegraf metrics on port 9273.
    telegrafHosts = [
      "web-01"
      "mail"
      "git"
      "home-assistant"
    ];
  in
  [
    {
      job_name = "telegraf";
      scrape_interval = "60s";
      metrics_path = "/metrics";
      # One target group per machine carrying both the `host` and the `org`
      # label. Previously every machine was listed twice (once with only
      # `host`, once with only `org`); because the label sets differed,
      # Prometheus kept both as separate targets and scraped each machine
      # twice per interval.
      static_configs = map (name: {
        targets = [ "${name}.cloonar.com:9273" ];
        labels = {
          host = "${name}.cloonar.com";
          org = "cloonar";
        };
      }) telegrafHosts;
    }
    {
      job_name = "homeassistant";
      scrape_interval = "60s";
      metrics_path = "/api/prometheus";

      # Credentials are read from the sops-managed hass-token secret file.
      authorization.credentials_file = config.sops.secrets.hass-token.path;

      scheme = "https";
      static_configs = [
        { targets = [ "home-assistant.cloonar.com:443" ]; }
      ];
    }
    {
      job_name = "gitea";
      scrape_interval = "60s";
      metrics_path = "/metrics";

      scheme = "https";
      static_configs = [
        { targets = [ "git.cloonar.com:443" ]; }
      ];
    }
  ];
|
||||
};
|
||||
# services.prometheus.alertmanager = {
|
||||
# enable = true;
|
||||
# environmentFile = config.sops.secrets.alertmanager.path;
|
||||
# webExternalUrl = "https://alertmanager.cloonar.com";
|
||||
# listenAddress = "[::1]";
|
||||
# configuration = {
|
||||
# global = {
|
||||
# # The smarthost and SMTP sender used for mail notifications.
|
||||
# smtp_smarthost = "mail.cloonar.com:587";
|
||||
# smtp_from = "alertmanager@cloonar.com";
|
||||
# smtp_auth_username = "alertmanager@cloonar.com";
|
||||
# smtp_auth_password = "$SMTP_PASSWORD";
|
||||
# };
|
||||
# route = {
|
||||
# receiver = "default";
|
||||
# routes = [
|
||||
# {
|
||||
# group_by = [ "host" ];
|
||||
# match_re.org = "krebs";
|
||||
# group_wait = "5m";
|
||||
# group_interval = "5m";
|
||||
# repeat_interval = "4h";
|
||||
# receiver = "krebs";
|
||||
# }
|
||||
# {
|
||||
# group_by = [ "host" ];
|
||||
# match_re.org = "nix-community";
|
||||
# group_wait = "5m";
|
||||
# group_interval = "5m";
|
||||
# repeat_interval = "4h";
|
||||
# receiver = "nix-community";
|
||||
# }
|
||||
# {
|
||||
# group_by = [ "host" ];
|
||||
# match_re.org = "clan-lol";
|
||||
# group_wait = "5m";
|
||||
# group_interval = "5m";
|
||||
# repeat_interval = "4h";
|
||||
# receiver = "clan-lol";
|
||||
# }
|
||||
# {
|
||||
# group_by = [ "host" ];
|
||||
# group_wait = "30s";
|
||||
# group_interval = "2m";
|
||||
# repeat_interval = "2h";
|
||||
# receiver = "all";
|
||||
# }
|
||||
# ];
|
||||
# };
|
||||
# receivers = [
|
||||
# {
|
||||
# name = "krebs";
|
||||
# webhook_configs = [
|
||||
# {
|
||||
# url = "http://127.0.0.1:9223/";
|
||||
# max_alerts = 5;
|
||||
# }
|
||||
# ];
|
||||
# }
|
||||
# #{
|
||||
# # name = "numtide";
|
||||
# # slack_configs = [
|
||||
# # {
|
||||
# # token = "$SLACK_TOKEN";
|
||||
# # api_url = "https://";
|
||||
# # }
|
||||
# # ];
|
||||
# #}
|
||||
# {
|
||||
# name = "nix-community";
|
||||
# webhook_configs = [
|
||||
# {
|
||||
# url = "http://localhost:9088/alert";
|
||||
# max_alerts = 5;
|
||||
# }
|
||||
# ];
|
||||
# }
|
||||
# {
|
||||
# name = "clan-lol";
|
||||
# webhook_configs = [
|
||||
# # TODO
|
||||
# #{
|
||||
# # url = "http://localhost:4050/services/hooks/YWxlcnRtYW5hZ2VyX3NlcnZpY2U";
|
||||
# # max_alerts = 5;
|
||||
# #}
|
||||
# ];
|
||||
# }
|
||||
# {
|
||||
# name = "all";
|
||||
# pushover_configs = [
|
||||
# {
|
||||
# user_key = "$PUSHOVER_USER_KEY";
|
||||
# token = "$PUSHOVER_TOKEN";
|
||||
# priority = "0";
|
||||
# }
|
||||
# ];
|
||||
# }
|
||||
# {
|
||||
# name = "default";
|
||||
# }
|
||||
# ];
|
||||
# };
|
||||
# };
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user