feat: add service monitoring alerts for Gitea, Postfix, Dovecot, OpenLDAP, and WireGuard, and consolidate alerting rules in Grafana
This commit is contained in:
16
hosts/web-arm/modules/grafana/alerting/service/default.nix
Normal file
16
hosts/web-arm/modules/grafana/alerting/service/default.nix
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# Grafana provisioning module that registers the "Service Alerts" rule group.
{ lib, pkgs, config, ... }:
let
  # Rule definitions for the systemd units that are expected to stay active.
  serviceDown = import ./service_down.nix { inherit lib pkgs config; };

  # Every rule that belongs to the group; currently only the service-down rules.
  groupRules = serviceDown.grafanaAlertRuleDefinitions;
in
{
  services.grafana.provision.alerting.rules.settings.groups = [
    {
      name = "Service Alerts";
      folder = "Service Monitoring";
      # Evaluation interval of the whole group.
      interval = "1m";
      rules = groupRules;
    }
  ];
}
|
||||||
318
hosts/web-arm/modules/grafana/alerting/service/service_down.nix
Normal file
318
hosts/web-arm/modules/grafana/alerting/service/service_down.nix
Normal file
@@ -0,0 +1,318 @@
|
|||||||
|
# Grafana "service down" alert rules, one per monitored systemd unit.
#
# Evaluates to an attribute set whose `grafanaAlertRuleDefinitions` list is
# imported by ./default.nix and placed into a provisioned Grafana rule group.
{ lib, pkgs, config, ... }:
let
  # Build one Grafana alert rule for a single systemd unit.
  #
  #   id          - slug used as the prefix of the rule uid
  #   displayName - human-readable service name used in the title and annotations
  #   unit        - value of the `name` label on node_systemd_unit_state; it must
  #                 match the unit name exactly (no stray whitespace), otherwise
  #                 the selector never matches and the rule fires permanently
  mkServiceDownRule = { id, displayName, unit }: {
    uid = "${id}-service-down-alert-uid";
    title = "${displayName} Service Down";
    # The rule fires on the result of expression C below.
    condition = "C";
    data = [
      {
        # A: query the "active" state series of the unit over the last 5 minutes.
        refId = "A";
        relativeTimeRange = {
          from = 300;
          to = 0;
        };
        datasourceUid = "vm-datasource-uid";
        model = {
          editorMode = "code";
          # `OR on() vector(0)` supplies a 0 sample when the series is absent,
          # so a missing unit is evaluated like an inactive one.
          expr = ''node_systemd_unit_state{state="active", name="${unit}"} OR on() vector(0)'';
          hide = false;
          intervalMs = 1000;
          legendFormat = "__auto";
          maxDataPoints = 43200;
          range = true;
          refId = "A";
        };
      }
      {
        # B: reduce each series of A to its most recent value.
        refId = "B";
        datasourceUid = "__expr__";
        model = {
          type = "reduce";
          expression = "A";
          reducer = "last";
        };
      }
      {
        # C: alert condition - the last value is below 1, i.e. the unit is not active.
        refId = "C";
        datasourceUid = "__expr__";
        model = {
          type = "math";
          expression = "$B < 1";
        };
      }
    ];
    noDataState = "Alerting";
    execErrState = "Alerting";
    for = "5m";
    annotations = {
      description = "${displayName} service is down on {{ $labels.instance }}";
      summary = "${displayName} Service Down";
    };
    labels = {
      severity = "critical";
    };
  };

  # Monitored units, in the order their rules appear in the rule group.
  monitoredServices = [
    { id = "gitea";        displayName = "Gitea";        unit = "container@git.service"; }
    { id = "gitea-runner"; displayName = "Gitea Runner"; unit = "microvm@git-runner-1.service"; }
    { id = "postfix";      displayName = "Postfix";      unit = "postfix.service"; }
    { id = "dovecot";      displayName = "Dovecot";      unit = "dovecot2.service"; }
    { id = "openldap";     displayName = "OpenLDAP";     unit = "openldap.service"; }
    { id = "wireguard";    displayName = "WireGuard";    unit = "wireguard-wg_cloonar.service"; }
  ];
in
{
  # Systemd service monitoring alerts
  grafanaAlertRuleDefinitions = map mkServiceDownRule monitoredServices;
}
|
||||||
@@ -35,6 +35,7 @@ in
|
|||||||
# ./alerting/inode_usage.nix
|
# ./alerting/inode_usage.nix
|
||||||
# ./alerting/ram_usage.nix
|
# ./alerting/ram_usage.nix
|
||||||
./alerting/system/default.nix # Added: Imports the consolidated system alerts module
|
./alerting/system/default.nix # Added: Imports the consolidated system alerts module
|
||||||
|
./alerting/service/default.nix # Added: Imports the new service alerts module
|
||||||
# ... other rule files can be added here ...
|
# ... other rule files can be added here ...
|
||||||
./datasources/victoriametrics.nix
|
./datasources/victoriametrics.nix
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -1,273 +0,0 @@
|
|||||||
# Service Monitoring & Alerting Plan
|
|
||||||
|
|
||||||
**Date:** 2025-05-31
|
|
||||||
**Prepared by:** Roo (AI Technical Leader)
|
|
||||||
|
|
||||||
## 1. Goal
|
|
||||||
|
|
||||||
Implement "service down" alerting in Grafana for OpenLDAP, Postfix, Dovecot, Gitea, Gitea Action Runners (via Gitea server metrics), and WireGuard. All configurations are to be managed via NixOS, using VictoriaMetrics as the central metrics backend and Grafana for alerting.
|
|
||||||
|
|
||||||
## 2. Hosts Involved
|
|
||||||
|
|
||||||
* **`fw`**:
|
|
||||||
* Runs Gitea (in a container), Gitea Action Runner MicroVMs, and WireGuard interfaces.
|
|
||||||
* Will host `vmagent` for metrics collection.
|
|
||||||
* Will host `wireguard_exporter`.
|
|
||||||
* **`mail`**:
|
|
||||||
* Runs OpenLDAP, Postfix, and Dovecot.
|
|
||||||
* Will host `vmagent` for metrics collection.
|
|
||||||
* Will host `openldap_exporter`, `postfix_exporter`, and `dovecot_exporter`.
|
|
||||||
* **`web-arm`**:
|
|
||||||
* Runs the central VictoriaMetrics service (accessible as `victoria-server.cloonar.com`).
|
|
||||||
* Runs Grafana.
|
|
||||||
* Will host the NixOS-provisioned alert definitions for Grafana.
|
|
||||||
|
|
||||||
## 3. Strategy Overview
|
|
||||||
|
|
||||||
The strategy involves three main phases for each service:
|
|
||||||
|
|
||||||
1. **Metrics Exposure**: Ensure each service (or an associated exporter) provides Prometheus-compatible metrics.
|
|
||||||
2. **Metrics Collection**: Configure `vmagent` on the relevant hosts (`fw`, `mail`) to scrape these metrics and send them to the central VictoriaMetrics instance on `web-arm`.
|
|
||||||
3. **Alert Definition**: Define alert rules in Grafana (on `web-arm`) using NixOS provisioning. These rules will query VictoriaMetrics and trigger notifications (via the existing Pushover contact point) if a service is detected as down.
|
|
||||||
|
|
||||||
## 4. Detailed Plan
|
|
||||||
|
|
||||||
### 4.1. Metrics Exposure & Collection (Modular NixOS Approach)
|
|
||||||
|
|
||||||
#### 4.1.1. Gitea Server (on `fw` host)
|
|
||||||
* **Action**: Enable built-in Prometheus metrics in `hosts/fw/modules/gitea.nix`.
|
|
||||||
* Modify `services.gitea.settings` within the Gitea container's configuration.
|
|
||||||
* Example:
|
|
||||||
```nix
|
|
||||||
metrics = {
|
|
||||||
ENABLED = true;
|
|
||||||
TOKEN = "your_secure_token_here"; # Optional: consider whether a token is needed/desired
|
|
||||||
};
|
|
||||||
```
|
|
||||||
* The Gitea `/metrics` endpoint will also provide status information for Gitea Action Runners.
|
|
||||||
* **`vmagent` Scrape Job**: Defined in `hosts/fw/modules/gitea.nix`.
|
|
||||||
```nix
|
|
||||||
config.services.vmagent.scrapeJobs = [
|
|
||||||
{
|
|
||||||
job_name = "gitea";
|
|
||||||
static_configs = [{
|
|
||||||
targets = ["<gitea_container_ip_or_localhost>:<gitea_port>"]; # e.g., "localhost:3001" if vmagent runs on the host and can reach the container port
|
|
||||||
}];
|
|
||||||
# metrics_path defaults to /metrics
|
|
||||||
}
|
|
||||||
];
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 4.1.2. Gitea Action Runners (on `fw` host, MicroVMs)
|
|
||||||
* **Action**: Monitoring will be performed by querying metrics exposed by the Gitea server itself (see 4.1.1). No separate `node_exporter` or specific scrape jobs for the runner VMs will be added for this phase of alerting.
|
|
||||||
|
|
||||||
#### 4.1.3. OpenLDAP (on `mail` host)
|
|
||||||
* **Action**:
|
|
||||||
1. Add `pkgs.openldap_exporter` to the `mail` host's system packages.
|
|
||||||
2. Configure `services.openldap-exporter` in `hosts/mail/modules/openldap.nix`.
|
|
||||||
* Ensure it points to the local OpenLDAP instance (e.g., `ldap:///`) and has permissions for `cn=monitor`.
|
|
||||||
* Default port is `9330`.
|
|
||||||
* **`vmagent` Scrape Job**: Defined in `hosts/mail/modules/openldap.nix`.
|
|
||||||
```nix
|
|
||||||
config.services.vmagent.scrapeJobs = [
|
|
||||||
{
|
|
||||||
job_name = "openldap";
|
|
||||||
static_configs = [{ targets = ["localhost:9330"]; }];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 4.1.4. Postfix (on `mail` host)
|
|
||||||
* **Action**:
|
|
||||||
1. Add `pkgs.postfix_exporter` to the `mail` host's system packages.
|
|
||||||
2. Configure `services.postfix-exporter` in `hosts/mail/modules/postfix.nix`.
|
|
||||||
* May require log file access or `postconf` permissions.
|
|
||||||
* Default port is `9154`.
|
|
||||||
* **`vmagent` Scrape Job**: Defined in `hosts/mail/modules/postfix.nix`.
|
|
||||||
```nix
|
|
||||||
config.services.vmagent.scrapeJobs = [
|
|
||||||
{
|
|
||||||
job_name = "postfix";
|
|
||||||
static_configs = [{ targets = ["localhost:9154"]; }];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 4.1.5. Dovecot (on `mail` host)
|
|
||||||
* **Action**:
|
|
||||||
1. In `hosts/mail/modules/dovecot.nix`, enable Dovecot's internal statistics service, making it accessible (e.g., via a local socket or TCP port).
|
|
||||||
2. Add `pkgs.dovecot_exporter` to the `mail` host's system packages.
|
|
||||||
3. Configure `services.dovecot-exporter` in `hosts/mail/modules/dovecot.nix` to connect to Dovecot's stats.
|
|
||||||
* Default port is `9166`.
|
|
||||||
* **`vmagent` Scrape Job**: Defined in `hosts/mail/modules/dovecot.nix`.
|
|
||||||
```nix
|
|
||||||
config.services.vmagent.scrapeJobs = [
|
|
||||||
{
|
|
||||||
job_name = "dovecot";
|
|
||||||
static_configs = [{ targets = ["localhost:9166"]; }];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 4.1.6. WireGuard (interfaces on `fw` host)
|
|
||||||
* **Action**:
|
|
||||||
1. Add `pkgs.wireguard_exporter` to the `fw` host's system packages.
|
|
||||||
2. Configure `services.wireguard-exporter` in `hosts/fw/modules/wireguard.nix`.
|
|
||||||
* Requires privileges for `wg show all dump`.
|
|
||||||
* Default port is `9586`.
|
|
||||||
* **`vmagent` Scrape Job**: Defined in `hosts/fw/modules/wireguard.nix`.
|
|
||||||
```nix
|
|
||||||
config.services.vmagent.scrapeJobs = [
|
|
||||||
{
|
|
||||||
job_name = "wireguard";
|
|
||||||
static_configs = [{ targets = ["localhost:9586"]; }];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 4.1.7. Central `vmagent` Configuration (`utils/modules/victoriametrics/default.nix`)
|
|
||||||
* **Action**:
|
|
||||||
1. Ensure the `services.vmagent.scrapeJobs` option is defined to allow merging of job lists from various modules:
|
|
||||||
```nix
|
|
||||||
# In utils/modules/victoriametrics/default.nix
|
|
||||||
options.services.vmagent.scrapeJobs = lib.mkOption {
|
|
||||||
type = lib.types.listOf lib.types.attrs;
|
|
||||||
default = [
|
|
||||||
# Default job for the host's own node_exporter
|
|
||||||
{ job_name = "node_exporter_${config.networking.hostName}"; # Unique job name
|
|
||||||
stream_parse = true;
|
|
||||||
static_configs = [{ targets = ["${config.networking.hostName}:9100"]; }];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
apply = lib.concatLists; // Or use lib.mkMerge for more complex merging if needed later
|
|
||||||
description = "List of scrape_configs jobs for vmagent.";
|
|
||||||
};
|
|
||||||
```
|
|
||||||
2. The `prometheus.yml` generation script (`configure_prom`) will use `config.services.vmagent.scrapeJobs`.
|
|
||||||
3. `vmagent` on `fw` and `mail` hosts will continue to use `config.sops.secrets.victoria-agent-env` for secure remote write to `https://victoria-server.cloonar.com/api/v1/write` (which is the `web-arm` host).
|
|
||||||
|
|
||||||
### 4.2. Grafana Alert Provisioning (on `web-arm` host)
|
|
||||||
|
|
||||||
* **Action**:
|
|
||||||
1. Create a new Nix file: `hosts/web-arm/modules/grafana/alerting/infrastructure/default.nix`.
|
|
||||||
2. Populate this file with alert rule groups. Example structure:
|
|
||||||
```nix
|
|
||||||
# In hosts/web-arm/modules/grafana/alerting/infrastructure/default.nix
|
|
||||||
{ lib, config, ... }:
|
|
||||||
{
|
|
||||||
services.grafana.provision.alerting.rules.settings.groups = lib.mkMerge [
|
|
||||||
{ // Gitea Server Alert
|
|
||||||
name = "Gitea Service Alerts";
|
|
||||||
folder = "Infrastructure Services";
|
|
||||||
interval = "1m";
|
|
||||||
rules = [
|
|
||||||
{
|
|
||||||
alert = "GiteaServerDown";
|
|
||||||
expr = ''up{job="gitea"} == 0'';
|
|
||||||
for = "2m";
|
|
||||||
labels = { severity = "critical", service = "gitea" };
|
|
||||||
annotations = { /* ... */ };
|
|
||||||
}
|
|
||||||
];
|
|
||||||
},
|
|
||||||
{ // Gitea Runner Alerts (via Gitea Server Metrics)
|
|
||||||
name = "Gitea Runner Alerts";
|
|
||||||
folder = "Infrastructure Services";
|
|
||||||
interval = "1m";
|
|
||||||
rules = [
|
|
||||||
{
|
|
||||||
alert = "GiteaRunnerOffline_git-runner-1";
|
|
||||||
// Verify exact metric name & labels from Gitea's /metrics endpoint
|
|
||||||
expr = ''gitea_actions_runner_status{runner_name="git-runner-1", status="offline"} == 1'';
|
|
||||||
for = "5m";
|
|
||||||
labels = { severity = "warning", service = "gitea-runner", runner = "git-runner-1" };
|
|
||||||
annotations = { /* ... */ };
|
|
||||||
},
|
|
||||||
{
|
|
||||||
alert = "GiteaRunnerOffline_git-runner-2";
|
|
||||||
expr = ''gitea_actions_runner_status{runner_name="git-runner-2", status="offline"} == 1'';
|
|
||||||
for = "5m";
|
|
||||||
labels = { severity = "warning", service = "gitea-runner", runner = "git-runner-2" };
|
|
||||||
annotations = { /* ... */ };
|
|
||||||
}
|
|
||||||
];
|
|
||||||
},
|
|
||||||
{ // OpenLDAP Alert
|
|
||||||
name = "OpenLDAP Service Alerts"; /* ... */
|
|
||||||
rules = [ { alert = "OpenLDAPDown"; expr = ''up{job="openldap"} == 0''; /* ... */ } ];
|
|
||||||
},
|
|
||||||
{ // Postfix Alert
|
|
||||||
name = "Postfix Service Alerts"; /* ... */
|
|
||||||
rules = [ { alert = "PostfixDown"; expr = ''up{job="postfix"} == 0''; /* ... */ } ];
|
|
||||||
},
|
|
||||||
{ // Dovecot Alert
|
|
||||||
name = "Dovecot Service Alerts"; /* ... */
|
|
||||||
rules = [ { alert = "DovecotDown"; expr = ''up{job="dovecot"} == 0''; /* ... */ } ];
|
|
||||||
},
|
|
||||||
{ // WireGuard Alert
|
|
||||||
name = "WireGuard Service Alerts"; /* ... */
|
|
||||||
rules = [ { alert = "WireGuardExporterDown"; expr = ''up{job="wireguard"} == 0''; /* ... */ } ];
|
|
||||||
}
|
|
||||||
];
|
|
||||||
}
|
|
||||||
```
|
|
||||||
3. Import this new rules file into `hosts/web-arm/modules/grafana/default.nix`:
|
|
||||||
```nix
|
|
||||||
imports = [
|
|
||||||
./alerting/system/default.nix
|
|
||||||
./alerting/infrastructure/default.nix // <-- Add this line
|
|
||||||
./datasources/victoriametrics.nix
|
|
||||||
];
|
|
||||||
```
|
|
||||||
4. Alerts will use the existing `cp_dominik` Pushover contact point by default.
|
|
||||||
|
|
||||||
## 5. Diagram of Metrics Flow
|
|
||||||
|
|
||||||
```mermaid
|
|
||||||
graph TD
|
|
||||||
subgraph Host_FW ["Host: fw (vmagent)"]
|
|
||||||
GiteaApp[Gitea in Container] -- :3001/metrics --> VMAgentFW
|
|
||||||
RunnerVM1[Gitea Runner VM 1] -.-> GiteaApp; subgraph Gitea Runner VMs
|
|
||||||
RunnerVM1
|
|
||||||
RunnerVM2[Gitea Runner VM 2] -.-> GiteaApp;
|
|
||||||
end
|
|
||||||
WG[WireGuard Kernel] -- wg show --> WGExporter(wireguard_exporter :9586)
|
|
||||||
WGExporter -- metrics --> VMAgentFW
|
|
||||||
VMAgentFW[vmagent] -- remoteWrite --> VictoriaMetricsSvc
|
|
||||||
end
|
|
||||||
|
|
||||||
subgraph Host_Mail ["Host: mail (vmagent)"]
|
|
||||||
OpenLDAPApp[OpenLDAP] -- cn=monitor --> OpenLDAPExporter(openldap_exporter :9330)
|
|
||||||
OpenLDAPExporter -- metrics --> VMAgentMail
|
|
||||||
PostfixApp[Postfix] -- logs/stats --> PostfixExporter(postfix_exporter :9154)
|
|
||||||
PostfixExporter -- metrics --> VMAgentMail
|
|
||||||
DovecotApp[Dovecot] -- stats --> DovecotExporter(dovecot_exporter :9166)
|
|
||||||
DovecotExporter -- metrics --> VMAgentMail
|
|
||||||
VMAgentMail[vmagent] -- remoteWrite --> VictoriaMetricsSvc
|
|
||||||
end
|
|
||||||
|
|
||||||
subgraph Host_Web_ARM ["Host: web-arm (victoria-server.cloonar.com)"]
|
|
||||||
VictoriaMetricsSvc[VictoriaMetrics Service]
|
|
||||||
Grafana[Grafana] -- queries --> VictoriaMetricsSvc
|
|
||||||
Grafana -- Alert Rules (Nix Provisioned) --> Notifications[Pushover]
|
|
||||||
end
|
|
||||||
|
|
||||||
style VMAgentFW fill:lightgreen
|
|
||||||
style VMAgentMail fill:lightgreen
|
|
||||||
style Grafana fill:lightblue
|
|
||||||
style VictoriaMetricsSvc fill:orange
|
|
||||||
```
|
|
||||||
|
|
||||||
## 6. Pre-Implementation Checklist & Notes
|
|
||||||
|
|
||||||
1. **Verify Exporter Package Names**: Confirm the exact NixOS package names in `pkgs` for:
|
|
||||||
* `openldap_exporter`
|
|
||||||
* `postfix_exporter`
|
|
||||||
* `dovecot_exporter`
|
|
||||||
* `wireguard_exporter`
|
|
||||||
2. **Gitea Metrics Token**: Decide on and implement a token strategy for Gitea's `/metrics` endpoint if desired for security.
|
|
||||||
3. **Gitea Runner Metrics**: Inspect the Gitea server's `/metrics` endpoint to confirm the exact metric names and labels for runner status (e.g., `gitea_actions_runner_status` or `gitea_actions_runners_total`) to ensure alert queries are accurate.
|
|
||||||
4. **Exporter Ports**: Default ports are assumed. Adjust configurations if non-default ports are used.
|
|
||||||
5. **Firewall Rules**: Ensure `vmagent` can reach all local exporter ports and that exporters can reach their respective services.
|
|
||||||
|
|
||||||
This plan provides a comprehensive approach to enhancing service monitoring and alerting.
|
|
||||||
@@ -2,18 +2,45 @@
|
|||||||
let
|
let
|
||||||
configure_prom = builtins.toFile "prometheus.yml" ''
|
configure_prom = builtins.toFile "prometheus.yml" ''
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
- job_name: 'server'
|
# System metrics
|
||||||
|
- job_name: 'node'
|
||||||
stream_parse: true
|
stream_parse: true
|
||||||
static_configs:
|
static_configs:
|
||||||
- targets:
|
- targets:
|
||||||
- ${config.networking.hostName}:9100
|
- ${config.networking.hostName}:9100
|
||||||
|
|
||||||
|
# Systemd service monitoring: scrapes the node exporter a second time, restricted
# to the systemd collector, and stores only the unit-state series of the
# services that have alert rules.
- job_name: 'systemd'
  metrics_path: /metrics
  params:
    # collect[] selects node_exporter collectors by collector name; metric-like
    # names such as 'systemd.service.state' are not collectors and make the
    # exporter reject the request.
    collect[]:
      - 'systemd'
  static_configs:
    - targets:
      - ${config.networking.hostName}:9100
  # Filtering on the metric name and on the `name` label must run on the scraped
  # samples (metric_relabel_configs). relabel_configs only sees target labels,
  # where neither label exists, so a `keep` there would drop the whole target.
  metric_relabel_configs:
    # Keep only the unit-state metric
    - source_labels: [__name__]
      regex: 'node_systemd_unit_state'
      action: keep
    # Keep only the specific services we want to monitor
    - source_labels: [name]
      regex: '(container@git|microvm@git-runner-1|postfix|dovecot|openldap|wireguard-wg_cloonar).*\.service'
      action: keep
|
||||||
'';
|
'';
|
||||||
in {
|
in {
|
||||||
sops.secrets.victoria-agent-env = {
|
sops.secrets.victoria-agent-env = {
|
||||||
sopsFile = ./secrets.yaml;
|
sopsFile = ./secrets.yaml;
|
||||||
};
|
};
|
||||||
|
|
||||||
services.prometheus.exporters.node.enable = true;
|
# Node exporter for system metrics
|
||||||
|
services.prometheus.exporters.node.enable = true;
# The systemd collector is switched on for service monitoring.
services.prometheus.exporters.node.enabledCollectors = [ "systemd" ];
|
||||||
|
|
||||||
systemd.services.export-to-prometheus = {
|
systemd.services.export-to-prometheus = {
|
||||||
path = with pkgs; [victoriametrics];
|
path = with pkgs; [victoriametrics];
|
||||||
|
|||||||
Reference in New Issue
Block a user