add sops
This commit is contained in:
@@ -11,6 +11,120 @@
|
||||
|
||||
services.prometheus = {
|
||||
webExternalUrl = "https://prometheus.cloonar.com";
|
||||
alertmanagers = [
|
||||
{
|
||||
static_configs = [
|
||||
{
|
||||
targets = [ "localhost:9093" ];
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
rules = [
|
||||
''
|
||||
ALERT node_down
|
||||
IF up == 0
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity="page"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{$labels.alias}}: Node is down.",
|
||||
description = "{{$labels.alias}} has been down for more than 5 minutes."
|
||||
}
|
||||
ALERT node_systemd_service_failed
|
||||
IF node_systemd_unit_state{state="failed"} == 1
|
||||
FOR 4m
|
||||
LABELS {
|
||||
severity="page"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.",
|
||||
description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}."
|
||||
}
|
||||
ALERT node_filesystem_full_90percent
|
||||
IF sort(node_filesystem_free{device!="ramfs"} < node_filesystem_size{device!="ramfs"} * 0.1) / 1024^3
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity="page"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{$labels.alias}}: Filesystem is running out of space soon.",
|
||||
description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem."
|
||||
}
|
||||
ALERT node_filesystem_full_in_4h
|
||||
IF predict_linear(node_filesystem_free{device!="ramfs"}[1h], 4*3600) <= 0
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity="page"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{$labels.alias}}: Filesystem is running out of space in 4 hours.",
|
||||
description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space in approx. 4 hours"
|
||||
}
|
||||
ALERT node_filedescriptors_full_in_3h
|
||||
IF predict_linear(node_filefd_allocated[1h], 3*3600) >= node_filefd_maximum
|
||||
FOR 20m
|
||||
LABELS {
|
||||
severity="page"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{$labels.alias}} is running out of available file descriptors in 3 hours.",
|
||||
description = "{{$labels.alias}} is running out of available file descriptors in approx. 3 hours"
|
||||
}
|
||||
ALERT node_load1_90percent
|
||||
IF node_load1 / on(alias) count(node_cpu{mode="system"}) by (alias) >= 0.9
|
||||
FOR 1h
|
||||
LABELS {
|
||||
severity="page"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{$labels.alias}}: Running on high load.",
|
||||
description = "{{$labels.alias}} is running with > 90% total load for at least 1h."
|
||||
}
|
||||
ALERT node_cpu_util_90percent
|
||||
IF 100 - (avg by (alias) (irate(node_cpu{mode="idle"}[5m])) * 100) >= 90
|
||||
FOR 1h
|
||||
LABELS {
|
||||
severity="page"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "{{$labels.alias}}: High CPU utilization.",
|
||||
description = "{{$labels.alias}} has total CPU utilization over 90% for at least 1h."
|
||||
}
|
||||
ALERT node_ram_using_90percent
|
||||
IF node_memory_MemFree + node_memory_Buffers + node_memory_Cached < node_memory_MemTotal * 0.1
|
||||
FOR 30m
|
||||
LABELS {
|
||||
severity="page"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary="{{$labels.alias}}: Using lots of RAM.",
|
||||
description="{{$labels.alias}} is using at least 90% of its RAM for at least 30 minutes now.",
|
||||
}
|
||||
ALERT node_swap_using_80percent
|
||||
IF node_memory_SwapTotal - (node_memory_SwapFree + node_memory_SwapCached) > node_memory_SwapTotal * 0.8
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity="page"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary="{{$labels.alias}}: Running out of swap soon.",
|
||||
description="{{$labels.alias}} is using 80% of its swap space for at least 10 minutes now."
|
||||
}
|
||||
ALERT homeassistant
|
||||
IF homeassistant_entity_available{domain="persistent_notification", entity!~"persistent_notification.http_login|persistent_notification.recorder_database_migration"} >= 0
|
||||
ANNOTATIONS {
|
||||
description="homeassistant notification {{$labels.entity}} ({{$labels.friendly_name}}): {{$value}}"
|
||||
}
|
||||
|
||||
ALERT gitea
|
||||
IF rate(promhttp_metric_handler_requests_total{job="gitea", code="500"}[5m]) > 3
|
||||
ANNOTATIONS {
|
||||
description="{{$labels.instance}}: gitea instances error rate went up: {{$value}} errors in 5 minutes"
|
||||
}
|
||||
''
|
||||
];
|
||||
scrapeConfigs = [
|
||||
{
|
||||
job_name = "telegraf";
|
||||
@@ -19,84 +133,37 @@
|
||||
static_configs = [
|
||||
{
|
||||
targets = [
|
||||
"turingmachine.r:9273"
|
||||
"bernie.r:9273"
|
||||
#"rock.r:9273"
|
||||
"web-01.cloonar.com:9273"
|
||||
];
|
||||
labels.type = "mobile";
|
||||
labels.host = "web-01.cloonar.com";
|
||||
}
|
||||
{
|
||||
targets = [
|
||||
"eva.r:9273"
|
||||
"eve.r:9273"
|
||||
"blob64.r:9273"
|
||||
"matchbox.r:9273"
|
||||
"alertmanager.r:80"
|
||||
"prometheus.r:80"
|
||||
#"rock.r:9273"
|
||||
"mail.cloonar.com:9273"
|
||||
];
|
||||
labels.host = "mail.cloonar.com";
|
||||
}
|
||||
{
|
||||
targets = [
|
||||
"rauter.r:9273"
|
||||
"git.cloonar.com:9273"
|
||||
];
|
||||
# to make it compatible with the node-exporter dashboard
|
||||
labels.host = "rauter.r:9273";
|
||||
labels.host = "git.cloonar.com";
|
||||
}
|
||||
{
|
||||
targets = [
|
||||
"prism.r:9273"
|
||||
"gum.r:9273"
|
||||
"kelle.r:9273"
|
||||
"home-assistant.cloonar.com:9273"
|
||||
];
|
||||
|
||||
labels.org = "krebs";
|
||||
labels.host = "home-assistant.cloonar.com";
|
||||
}
|
||||
{
|
||||
targets = [
|
||||
"clan.lol:9273"
|
||||
targets = map (host: "${host}.cloonar.com:9273") [
|
||||
"web-01"
|
||||
"mail"
|
||||
"git"
|
||||
"home-assistant"
|
||||
];
|
||||
|
||||
labels.org = "clan-lol";
|
||||
}
|
||||
#{
|
||||
# targets = [
|
||||
# "dev1.numtide.com.r:9273"
|
||||
# ];
|
||||
|
||||
# labels.org = "numtide";
|
||||
#}
|
||||
{
|
||||
targets = map (host: "${host}.r:9273") [
|
||||
# university
|
||||
"amy"
|
||||
"clara"
|
||||
"rose"
|
||||
|
||||
"astrid"
|
||||
"dan"
|
||||
"mickey"
|
||||
"bill"
|
||||
"nardole"
|
||||
"yasmin"
|
||||
"ryan"
|
||||
"graham"
|
||||
|
||||
"astrid"
|
||||
"dan"
|
||||
"mickey"
|
||||
|
||||
"jackson"
|
||||
"christina"
|
||||
"adelaide"
|
||||
"wilfred"
|
||||
"river"
|
||||
"jack"
|
||||
|
||||
"ruby"
|
||||
];
|
||||
|
||||
labels.org = "uni";
|
||||
labels.org = "cloonar";
|
||||
}
|
||||
];
|
||||
}
|
||||
@@ -131,118 +198,109 @@
|
||||
];
|
||||
}
|
||||
];
|
||||
alertmanagers = [
|
||||
{
|
||||
static_configs = [
|
||||
{
|
||||
targets = [ "localhost:9093" ];
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
services.prometheus.alertmanager = {
|
||||
enable = true;
|
||||
environmentFile = config.sops.secrets.alertmanager.path;
|
||||
webExternalUrl = "https://alertmanager.cloonar.com";
|
||||
listenAddress = "[::1]";
|
||||
configuration = {
|
||||
global = {
|
||||
# The smarthost and SMTP sender used for mail notifications.
|
||||
smtp_smarthost = "mail.cloonar.com:587";
|
||||
smtp_from = "alertmanager@cloonar.com";
|
||||
smtp_auth_username = "alertmanager@cloonar.com";
|
||||
smtp_auth_password = "$SMTP_PASSWORD";
|
||||
};
|
||||
route = {
|
||||
receiver = "default";
|
||||
routes = [
|
||||
{
|
||||
group_by = [ "host" ];
|
||||
match_re.org = "krebs";
|
||||
group_wait = "5m";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "4h";
|
||||
receiver = "krebs";
|
||||
}
|
||||
{
|
||||
group_by = [ "host" ];
|
||||
match_re.org = "nix-community";
|
||||
group_wait = "5m";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "4h";
|
||||
receiver = "nix-community";
|
||||
}
|
||||
{
|
||||
group_by = [ "host" ];
|
||||
match_re.org = "clan-lol";
|
||||
group_wait = "5m";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "4h";
|
||||
receiver = "clan-lol";
|
||||
}
|
||||
{
|
||||
group_by = [ "host" ];
|
||||
group_wait = "30s";
|
||||
group_interval = "2m";
|
||||
repeat_interval = "2h";
|
||||
receiver = "all";
|
||||
}
|
||||
];
|
||||
};
|
||||
receivers = [
|
||||
{
|
||||
name = "krebs";
|
||||
webhook_configs = [
|
||||
{
|
||||
url = "http://127.0.0.1:9223/";
|
||||
max_alerts = 5;
|
||||
}
|
||||
];
|
||||
}
|
||||
#{
|
||||
# name = "numtide";
|
||||
# slack_configs = [
|
||||
# {
|
||||
# token = "$SLACK_TOKEN";
|
||||
# api_url = "https://";
|
||||
# }
|
||||
# ];
|
||||
#}
|
||||
{
|
||||
name = "nix-community";
|
||||
webhook_configs = [
|
||||
{
|
||||
url = "http://localhost:9088/alert";
|
||||
max_alerts = 5;
|
||||
}
|
||||
];
|
||||
}
|
||||
{
|
||||
name = "clan-lol";
|
||||
webhook_configs = [
|
||||
# TODO
|
||||
#{
|
||||
# url = "http://localhost:4050/services/hooks/YWxlcnRtYW5hZ2VyX3NlcnZpY2U";
|
||||
# max_alerts = 5;
|
||||
#}
|
||||
];
|
||||
}
|
||||
{
|
||||
name = "all";
|
||||
pushover_configs = [
|
||||
{
|
||||
user_key = "$PUSHOVER_USER_KEY";
|
||||
token = "$PUSHOVER_TOKEN";
|
||||
priority = "0";
|
||||
}
|
||||
];
|
||||
}
|
||||
{
|
||||
name = "default";
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
# services.prometheus.alertmanager = {
|
||||
# enable = true;
|
||||
# environmentFile = config.sops.secrets.alertmanager.path;
|
||||
# webExternalUrl = "https://alertmanager.cloonar.com";
|
||||
# listenAddress = "[::1]";
|
||||
# configuration = {
|
||||
# global = {
|
||||
# # The smarthost and SMTP sender used for mail notifications.
|
||||
# smtp_smarthost = "mail.cloonar.com:587";
|
||||
# smtp_from = "alertmanager@cloonar.com";
|
||||
# smtp_auth_username = "alertmanager@cloonar.com";
|
||||
# smtp_auth_password = "$SMTP_PASSWORD";
|
||||
# };
|
||||
# route = {
|
||||
# receiver = "default";
|
||||
# routes = [
|
||||
# {
|
||||
# group_by = [ "host" ];
|
||||
# match_re.org = "krebs";
|
||||
# group_wait = "5m";
|
||||
# group_interval = "5m";
|
||||
# repeat_interval = "4h";
|
||||
# receiver = "krebs";
|
||||
# }
|
||||
# {
|
||||
# group_by = [ "host" ];
|
||||
# match_re.org = "nix-community";
|
||||
# group_wait = "5m";
|
||||
# group_interval = "5m";
|
||||
# repeat_interval = "4h";
|
||||
# receiver = "nix-community";
|
||||
# }
|
||||
# {
|
||||
# group_by = [ "host" ];
|
||||
# match_re.org = "clan-lol";
|
||||
# group_wait = "5m";
|
||||
# group_interval = "5m";
|
||||
# repeat_interval = "4h";
|
||||
# receiver = "clan-lol";
|
||||
# }
|
||||
# {
|
||||
# group_by = [ "host" ];
|
||||
# group_wait = "30s";
|
||||
# group_interval = "2m";
|
||||
# repeat_interval = "2h";
|
||||
# receiver = "all";
|
||||
# }
|
||||
# ];
|
||||
# };
|
||||
# receivers = [
|
||||
# {
|
||||
# name = "krebs";
|
||||
# webhook_configs = [
|
||||
# {
|
||||
# url = "http://127.0.0.1:9223/";
|
||||
# max_alerts = 5;
|
||||
# }
|
||||
# ];
|
||||
# }
|
||||
# #{
|
||||
# # name = "numtide";
|
||||
# # slack_configs = [
|
||||
# # {
|
||||
# # token = "$SLACK_TOKEN";
|
||||
# # api_url = "https://";
|
||||
# # }
|
||||
# # ];
|
||||
# #}
|
||||
# {
|
||||
# name = "nix-community";
|
||||
# webhook_configs = [
|
||||
# {
|
||||
# url = "http://localhost:9088/alert";
|
||||
# max_alerts = 5;
|
||||
# }
|
||||
# ];
|
||||
# }
|
||||
# {
|
||||
# name = "clan-lol";
|
||||
# webhook_configs = [
|
||||
# # TODO
|
||||
# #{
|
||||
# # url = "http://localhost:4050/services/hooks/YWxlcnRtYW5hZ2VyX3NlcnZpY2U";
|
||||
# # max_alerts = 5;
|
||||
# #}
|
||||
# ];
|
||||
# }
|
||||
# {
|
||||
# name = "all";
|
||||
# pushover_configs = [
|
||||
# {
|
||||
# user_key = "$PUSHOVER_USER_KEY";
|
||||
# token = "$PUSHOVER_TOKEN";
|
||||
# priority = "0";
|
||||
# }
|
||||
# ];
|
||||
# }
|
||||
# {
|
||||
# name = "default";
|
||||
# }
|
||||
# ];
|
||||
# };
|
||||
# };
|
||||
|
||||
}
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
services.victoriametrics.enable = true;
|
||||
services.prometheus.exporters.node.enable = true;
|
||||
|
||||
sops.secrets.promtail-nginx-password.owner = "root";
|
||||
|
||||
services.nginx.virtualHosts."victoria-server.cloonar.com" = {
|
||||
forceSSL = true;
|
||||
enableACME = true;
|
||||
|
||||
Reference in New Issue
Block a user