feat: power management for nas

This commit is contained in:
Dominik Polakovics 2026-04-11 09:31:43 +02:00
parent b02acb5b60
commit 46f42dab4b
5 changed files with 235 additions and 1 deletions

View file

@ -27,6 +27,7 @@
./modules/podman.nix
./modules/omada.nix
./modules/ddclient.nix
./modules/nas-wake-on-access.nix
# ./modules/wol.nix
@ -94,7 +95,6 @@
nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [
"mongodb"
"ai-mailer"
"filebot"
"claude-code"
];

View file

@ -85,6 +85,12 @@
chain forward {
type filter hook forward priority filter; policy drop;
# Wake-on-access: flag new traffic aimed at the NAS so
# nas-wake-journal.service can fire a WOL. No verdict => falls
# through, does not broaden policy. ct state new skips ongoing
# flows; rate limit caps journal spam before it leaves the kernel.
ip daddr ${config.networkPrefix}.97.11 ct state new limit rate 30/minute log prefix "nas-wake: " comment "trigger wake-on-access"
iifname "wg_cloonar" counter accept comment "test wireguard"
iifname "wg_cloonar" oifname lo counter accept comment "wireguard to server"

View file

@ -0,0 +1,110 @@
# NAS wake-on-access (fw side)
#
# Detects traffic aimed at the NAS (10.42.97.11) and sends a WOL magic
# packet so the machine comes back up on demand after it has powered itself
# off (see hosts/nas/modules/auto-shutdown.nix).
#
# Traffic reaches the NAS via two paths, so we need two detectors that feed
# the same wake script:
#
#   1. Cross-VLAN traffic is routed through fw and hits nftables' forward
#      chain. A logging rule tags these packets and a journal follower
#      translates the log line into a wake invocation.
#
#   2. Same-VLAN (server) traffic stays on the bridge and never reaches
#      nftables. A tcpdump follower watches ARP requests (who-has) for
#      10.42.97.11 on the server interface and triggers the wake from there.
{ config, lib, pkgs, ... }:
let
  nasIp = "${config.networkPrefix}.97.11";
  nasMac = "6c:1f:f7:8e:a9:86";
  serverBroadcast = "${config.networkPrefix}.97.255";
  serverIface = "server";
  stateDir = "/run/nas-wake-on-access";
  lastWakeFile = "${stateDir}/last-wake";
  cooldownSeconds = 30;

  # Shared wake path: cooldown gate -> liveness probe -> WOL. Both followers
  # funnel into this script so rate limiting lives in exactly one place.
  wakeScript = pkgs.writeShellScript "nas-wake" ''
    set -euo pipefail
    mkdir -p "${stateDir}"
    now=$(date +%s)
    # Cooldown gate: at most one WOL every ${toString cooldownSeconds}s.
    # /run is tmpfs, so the timestamp resets cleanly on every fw reboot.
    if [[ -f "${lastWakeFile}" ]]; then
      last=$(cat "${lastWakeFile}" 2>/dev/null || echo 0)
      if (( now - last < ${toString cooldownSeconds} )); then
        exit 0
      fi
    fi
    # If the NAS answers ping it is already up; skip WOL but refresh
    # the cooldown so repeated probes don't spin the CPU.
    if ${pkgs.iputils}/bin/ping -c1 -W1 -n ${nasIp} >/dev/null 2>&1; then
      echo "nas-wake: NAS already up, not sending WOL"
      echo "$now" > "${lastWakeFile}"
      exit 0
    fi
    echo "nas-wake: sending WOL to ${nasMac} via ${serverBroadcast}"
    ${pkgs.wol}/bin/wol -i ${serverBroadcast} ${nasMac} || true
    echo "$now" > "${lastWakeFile}"
  '';

  # Journal follower for cross-VLAN (routed) traffic. nftables logs a line
  # prefixed with "nas-wake: " into the kernel ring buffer for every new
  # packet headed to the NAS (rate-limited kernel-side). "--since now"
  # suppresses journalctl's default tail replay, so a restart of this
  # service cannot fire a spurious wake from stale log lines.
  journalFollowerScript = pkgs.writeShellScript "nas-wake-journal-follower" ''
    set -euo pipefail
    ${pkgs.systemd}/bin/journalctl -kf -o cat --since now \
      | ${pkgs.gnugrep}/bin/grep --line-buffered -F "nas-wake:" \
      | while IFS= read -r _line; do
          ${wakeScript} || true
        done
  '';

  # ARP follower for same-VLAN traffic. Clients on the server VLAN talk to
  # the NAS directly via the bridge, so their packets never hit nftables.
  # An ARP "who-has 10.42.97.11" is the reliable early signal that someone
  # wants to reach the NAS. "arp[6:2] = 1" restricts the capture to ARP
  # requests (opcode 1 at offset 6 of the ARP header): without it, replies
  # and gratuitous announcements — notably the one the NAS itself emits
  # right after waking — would pointlessly re-trigger the wake script.
  arpFollowerScript = pkgs.writeShellScript "nas-wake-arp-follower" ''
    set -euo pipefail
    ${pkgs.tcpdump}/bin/tcpdump -i ${serverIface} -l -n -p -Q in \
        'arp and arp[6:2] = 1 and host ${nasIp}' \
      | while IFS= read -r _line; do
          ${wakeScript} || true
        done
  '';
in
{
  systemd.services.nas-wake-journal = {
    description = "Wake NAS on cross-VLAN traffic (nftables log follower)";
    # Ordered after network-online (like nas-wake-arp below) so the ping
    # probe and WOL broadcast in wakeScript have a usable interface as soon
    # as the first tagged log line arrives.
    after = [ "nftables.service" "systemd-journald.service" "network-online.target" ];
    requires = [ "systemd-journald.service" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
    path = with pkgs; [ coreutils iputils wol systemd gnugrep ];
    serviceConfig = {
      Type = "simple";
      ExecStart = "${journalFollowerScript}";
      Restart = "always";
      RestartSec = "5s";
    };
  };

  systemd.services.nas-wake-arp = {
    description = "Wake NAS on same-VLAN ARP (server bridge)";
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
    path = with pkgs; [ coreutils iputils wol tcpdump ];
    serviceConfig = {
      Type = "simple";
      ExecStart = "${arpFollowerScript}";
      Restart = "always";
      RestartSec = "5s";
      # tcpdump needs raw-socket capture rights even when run as root
      # under systemd's sandboxing defaults.
      AmbientCapabilities = [ "CAP_NET_RAW" "CAP_NET_ADMIN" ];
    };
  };
}

View file

@ -21,6 +21,7 @@ in
./modules/audiobookshelf.nix
./modules/power-management.nix
./modules/disk-monitoring.nix
./modules/auto-shutdown.nix
./modules/ugreen-leds.nix
./hardware-configuration.nix
@ -45,6 +46,10 @@ in
networking.firewall.enable = true;
networking.firewall.allowedTCPPorts = [ 22 ];
# Wake-on-LAN: fw re-wakes the NAS on demand after auto-shutdown.
# Assumes WOL is enabled in BIOS; translates to `ethtool -s enp2s0 wol g`.
networking.interfaces.enp2s0.wakeOnLan.enable = true;
# SOPS configuration
sops.age.sshKeyPaths = [ "/etc/ssh/ssh_host_ed25519_key" ];
sops.defaultSopsFile = ./secrets.yaml;

View file

@ -0,0 +1,113 @@
# NAS auto-shutdown
# Powers the machine off when all of the following are true:
#   1. No active SSH session
#   2. pyload is not downloading (no non-local TCP peers)
#   3. pyload has no hook children (extraction, filebot, unrar, ...)
#   4. Both spinning HDDs are in standby
#   5. At least 15 minutes have passed since boot (via OnBootSec)
# The fw host re-wakes the NAS on demand via WOL
# (see hosts/fw/modules/nas-wake-on-access.nix).
{ config, lib, pkgs, ... }:
let
  # Spinning disks whose power state gates the shutdown decision.
  # Only the Toshiba HDDs; NVMe drives do not spin down.
  hdds = [
    "/dev/disk/by-id/ata-TOSHIBA_MG10ACA20TE_8582A01SF4MJ"
    "/dev/disk/by-id/ata-TOSHIBA_MG10ACA20TE_75V2A0H3F4MJ"
  ];

  # Peer addresses that do not count as "real download traffic":
  # loopback, internal VLAN (10.42.0.0/16), and IPv6 loopback / link-local.
  # NOTE(review): the 10.42 prefix is hardcoded here while other modules
  # derive it from config.networkPrefix — keep in sync if it ever changes.
  localPeerRegex = "^(127\\.|10\\.42\\.|\\[::1\\]|\\[fe80)";

  autoShutdownScript = pkgs.writeShellScript "nas-auto-shutdown" ''
    set -euo pipefail
    log() { echo "auto-shutdown: $*"; }

    # 1. SSH sessions. Cheapest check and it prevents shutting down while an
    #    admin is logged in. Uses ss at the socket layer so it catches
    #    forwarding-only sessions and `ssh host 'cmd'` runs that utmp misses.
    #    NOTE: SSH port hardcoded to 22 — keep in sync with configuration.nix.
    ssh_sessions=$(${pkgs.iproute2}/bin/ss -H -t -n state established '( sport = :22 )' || true)
    if [[ -n "$ssh_sessions" ]]; then
      log "active SSH session present, staying up"
      exit 0
    fi

    # 2. pyload active downloads: any established/half-open pyload-owned
    #    socket to a non-local peer means a download is in flight.
    #    Do NOT use ss's `state` filter here: with a state filter ss drops
    #    the State column, shifting the peer address to field 4 — the old
    #    `awk '{print $5}'` then matched the Process column against the peer
    #    regex, so local peers were never exempted. Listing all non-listening
    #    sockets and filtering the State column in awk keeps the peer address
    #    deterministically in field 5.
    pyload_conns=$(${pkgs.iproute2}/bin/ss -H -t -n -p 2>/dev/null \
      | grep -F '"pyload"' \
      | awk '$1 == "ESTAB" || $1 == "SYN-SENT" || $1 == "SYN-RECV" { print $5 }' \
      | grep -Ev '${localPeerRegex}' || true)
    if [[ -n "$pyload_conns" ]]; then
      log "pyload has active non-local connections, staying up"
      exit 0
    fi

    # 3. pyload hook children (extraction, filebot, unrar, 7z, java, ...).
    #    The package_finished hook is launched by pyload's ExternalScripts
    #    plugin, so every child lives in pyload.service's cgroup. cgroup.procs
    #    contains PIDs/TGIDs only, not TIDs, so pyload's internal thread pool
    #    cannot false-positive. Fail-safe: if the file is unreadable, stay up.
    cgroup_procs=/sys/fs/cgroup/system.slice/pyload.service/cgroup.procs
    if [[ ! -r "$cgroup_procs" ]]; then
      log "pyload cgroup procs file unreadable ($cgroup_procs), staying up"
      exit 0
    fi
    main_pid=$(${pkgs.systemd}/bin/systemctl show -p MainPID --value pyload.service)
    children=$(grep -v -x -F "$main_pid" "$cgroup_procs" || true)
    if [[ -n "$children" ]]; then
      log "pyload hook children running ($(echo "$children" | tr '\n' ' ')), staying up"
      exit 0
    fi

    # 4. Both spinning HDDs must be in standby. hdparm -C is non-disturbing
    #    (does not wake the disk). The udev rule in power-management.nix runs
    #    hdparm -S 180, so standby implies >= 15 min of firmware-level idle —
    #    no extra shell-level debounce needed.
    for disk in ${lib.concatStringsSep " " hdds}; do
      if [[ ! -e "$disk" ]]; then
        log "disk $disk missing, staying up"
        exit 0
      fi
      device=$(readlink -f "$disk")
      # Alternation order matters: "active/idle" must precede "active" and
      # "idle" so grep -o captures the full compound state string.
      power_state=$(${pkgs.hdparm}/bin/hdparm -C "$device" 2>/dev/null \
        | grep -oP '(standby|active/idle|active|idle)' | head -1 || echo "unknown")
      if [[ "$power_state" != "standby" ]]; then
        log "$disk is $power_state, staying up"
        exit 0
      fi
    done

    # 5. All clear.
    log "all checks clear, powering off"
    ${pkgs.systemd}/bin/systemctl poweroff
  '';
in
{
  systemd.services.nas-auto-shutdown = {
    description = "Power off NAS when idle (HDD standby + pyload/filebot quiet + no SSH)";
    path = with pkgs; [ coreutils gawk gnugrep iproute2 hdparm systemd ];
    serviceConfig = {
      Type = "oneshot";
      User = "root";
      ExecStart = "${autoShutdownScript}";
    };
  };

  systemd.timers.nas-auto-shutdown = {
    description = "Run NAS idle check every minute";
    wantedBy = [ "timers.target" ];
    timerConfig = {
      # Requirement: earliest shutdown is 15 min after boot, so a
      # WOL-triggered wake is not immediately followed by another poweroff.
      OnBootSec = "15min";
      OnUnitActiveSec = "1min";
      AccuracySec = "10s";
      # Persistent makes no sense for an idle check — never fire a missed
      # run at boot.
      Persistent = false;
    };
  };
}