From 57e645873079bb4b5425a210498f47288193eb83 Mon Sep 17 00:00:00 2001 From: Hoid Date: Wed, 18 Feb 2026 19:54:06 +0000 Subject: [PATCH] Add K3s infrastructure docs, backup plan, BG3 level 6, wind-down log --- memory/bg3.json | 14 +- memory/wind-down-log.json | 4 +- projects/business/memory/infrastructure.md | 201 +++++++++++++++++++++ 3 files changed, 211 insertions(+), 8 deletions(-) create mode 100644 projects/business/memory/infrastructure.md diff --git a/memory/bg3.json b/memory/bg3.json index 5d8f968..77b1271 100644 --- a/memory/bg3.json +++ b/memory/bg3.json @@ -3,8 +3,8 @@ "name": "Tam", "race": "Half-Orc", "class": "Fighter", - "subclass": "Battle Master (respec planned)", - "level": 5, + "subclass": "Battle Master", + "level": 6, "stats": {}, "feats": [], "fightingStyle": "", @@ -15,7 +15,7 @@ "name": "Astarion", "class": "Rogue", "subclass": "Thief", - "level": 5, + "level": 6, "build": "Melee Thief", "notes": "" }, @@ -23,7 +23,7 @@ "name": "Shadowheart", "class": "Cleric", "subclass": "Life Domain", - "level": 5, + "level": 6, "build": "Life Cleric", "notes": "Adamantine Armour, Shattered Flail" }, @@ -31,17 +31,17 @@ "name": "Gale", "class": "Wizard", "subclass": "Evocation", - "level": 5, + "level": 6, "build": "", "notes": "" } ], "act": 2, - "level": 5, + "level": 6, "currentQuest": "Act 2 - Shadow-Cursed Lands, has Moon Lantern with Pixie", "completedQuests": ["Rescue Halsin"], "completedAreas": ["Owlbear Cave", "Goblin Camp", "Act 1"], "decisions": [], "gold": 2000, - "notes": "Adamantine Forge: crafted armour for Tam and Shadowheart. Plays with gamepad — no keyboard shortcuts! All 4 chars already built as recommended. Astarion + Gale still on default weapons." + "notes": "Adamantine Forge: crafted armour for Tam and Shadowheart. Plays with gamepad — no keyboard shortcuts! All 4 chars already built as recommended. Astarion + Gale still on default weapons. 
Level 6 pending — recommended GWM for Tam, Expertise for Astarion, Blessed Healer for SH, Haste+Fireball for Gale." } diff --git a/memory/wind-down-log.json b/memory/wind-down-log.json index 41d416b..8ff29e5 100644 --- a/memory/wind-down-log.json +++ b/memory/wind-down-log.json @@ -1,6 +1,8 @@ { "date": "2026-02-18", "events": [ - {"time": "19:02", "type": "nudge", "note": "First wind-down nudge sent at 19:00 Vienna"} + {"time": "19:02", "type": "nudge", "note": "First wind-down nudge sent at 19:00 Vienna"}, + {"time": "19:15", "type": "activity", "note": "Playing BG3 - leveling party to 6"}, + {"time": "21:12", "type": "nudge", "note": "Nose shower reminder + wind-down suggestions"} ] } diff --git a/projects/business/memory/infrastructure.md b/projects/business/memory/infrastructure.md new file mode 100644 index 0000000..1f4d0a9 --- /dev/null +++ b/projects/business/memory/infrastructure.md @@ -0,0 +1,201 @@ +# K3s Infrastructure Documentation + +*Last updated: 2026-02-18* + +## Cluster Overview + +| Component | Details | +|-----------|---------| +| K3s Version | v1.34.4+k3s1 | +| Datacenter | Hetzner nbg1 | +| Server Type | CAX11 (ARM64, 2 vCPU, 4GB RAM) | +| Monthly Cost | €17.06 (3× CAX11 + LB) | +| Private Network | 10.0.0.0/16, ID 11949384 | +| Cluster CIDR | 10.42.0.0/16 | +| Service CIDR | 10.43.0.0/16 | +| Flannel Interface | enp7s0 (private network) | + +## Nodes + +| Node | Role | Public IP | Private IP | Hetzner ID | +|------|------|-----------|------------|------------| +| k3s-mgr | Control plane (tainted NoSchedule) | 188.34.201.101 | 10.0.1.5 | 121365837 | +| k3s-w1 | Worker | 159.69.23.121 | 10.0.1.6 | 121365839 | +| k3s-w2 | Worker | 46.225.169.60 | 10.0.1.7 | 121365840 | + +## Load Balancer + +| Field | Value | +|-------|-------| +| Name | k3s-lb | +| Hetzner ID | 5834131 | +| Public IP | 46.225.37.135 | +| Targets | k3s-w1, k3s-w2 (ports 80/443) | +| Health Checks | TCP, 15s interval, 3 retries, 10s timeout | + +## Installed Operators & 
Components + +| Component | Version | Notes | +|-----------|---------|-------| +| Traefik | Helm (DaemonSet) | Runs on all workers, handles ingress + TLS termination | +| cert-manager | 1.17.2 | Let's Encrypt ClusterIssuer `letsencrypt-prod` | +| CloudNativePG | 1.25.1 | PostgreSQL operator | + +## Database (CNPG) + +| Field | Value | +|-------|-------| +| Cluster Name | main-db | +| Namespace | postgres | +| Instances | 2 (primary + replica) | +| PostgreSQL | 17.4 | +| Storage | 10Gi local-path per instance | +| Databases | `docfast` (prod), `docfast_staging` (staging) | +| PgBouncer | `main-db-pooler`, 2 instances, transaction mode | + +### Credentials +- `docfast-db-credentials` secret: user=docfast, pass=docfast +- `main-db-superuser` secret: managed by CNPG + +## Namespaces + +| Namespace | Purpose | +|-----------|---------| +| postgres | CNPG cluster + pooler | +| docfast | Production DocFast (2 replicas) | +| docfast-staging | Staging DocFast (1 replica) | +| cnpg-system | CNPG operator | +| cert-manager | cert-manager | +| kube-system | K3s system (CoreDNS, Traefik, etc.) | + +## HA Configuration + +All spread constraints are **runtime patches** — may not survive K3s upgrades. Re-apply after updates. 
+ +| Component | Replicas | Spread Strategy | +|-----------|----------|-----------------| +| CoreDNS | 3 | `preferredDuringSchedulingIgnoredDuringExecution` podAntiAffinity (mgr + w1 + w2) | +| CNPG Operator | 2 | `topologySpreadConstraints` with `whenUnsatisfiable: DoNotSchedule` (w1 + w2) | +| PgBouncer Pooler | 2 | `requiredDuringSchedulingIgnoredDuringExecution` podAntiAffinity via Pooler CRD (w1 + w2) | +| DocFast Prod | 2 | `preferredDuringSchedulingIgnoredDuringExecution` podAntiAffinity (w1 + w2) | +| DocFast Staging | 1 | Not HA by design | + +### Failover Tuning (2026-02-18) +- **Readiness probe**: every 5s, fail after 2 = pod unhealthy in ~10s +- **Liveness probe**: every 10s, fail after 3 +- **Node tolerations** (`tolerationSeconds` on the not-ready/unreachable taints): pods evicted after 10s (default was 300s) +- **Result**: Failover window ~10-15 seconds + +### HA Test Results (2026-02-18) +- ✅ w1 down: 4/4 health checks passed +- ✅ w2 down: 4/4 health checks passed, CNPG promoted replica +- ✅ mgr down: 4/4 health checks passed (workers keep running) + +## CI/CD Pipeline + +| Field | Value | +|-------|-------| +| Registry | git.cloonar.com (Forgejo container registry) | +| Runner | Agent host (178.115.247.134), x86 → ARM64 cross-compile via QEMU | +| Build time | ~8 min | +| Deployer SA | `docfast:deployer` with namespace-scoped RBAC | + +### Workflows +- **deploy.yml**: Push to `main` → build + deploy to `docfast-staging` +- **promote.yml**: Tag `v*` → build + deploy to `docfast` (prod) + +### Secrets Required in Forgejo +- `REGISTRY_TOKEN` — PAT with write:package scope +- `KUBECONFIG` — base64-encoded deployer kubeconfig + +### Pull Secrets +- `forgejo-registry` imagePullSecret in both `docfast` and `docfast-staging` namespaces + +## DNS + +| Record | Type | Value | +|--------|------|-------| +| docfast.dev | A | 46.225.37.135 (LB) | +| staging.docfast.dev | A | **NOT SET** — needed for staging TLS | +| MX | MX | mail.cloonar.com. 
| + +## Firewall + +- Name: coolify-fw, Hetzner ID 10553199 +- Port 6443 open to: 10.0.0.0/16 (cluster internal) + 178.115.247.134/32 (CI runner) + +## SSH Access + +Config in `/home/openclaw/.ssh/config`: +- `k3s-mgr`, `k3s-w1`, `k3s-w2` — root access +- `deployer` user on k3s-mgr — limited kubeconfig at `/home/deployer/.kube-config.yaml` +- KUBECONFIG on mgr: `/etc/rancher/k3s/k3s.yaml` + +--- + +## Backup Strategy (TO IMPLEMENT) + +### Current State: ❌ NO BACKUPS + +### Plan: Borg to Hetzner Storage Box + +Target: `u149513-sub11@u149513-sub11.your-backup.de:23` (already set up, SSH key configured) + +**1. Cluster State (etcd snapshots)** +- K3s built-in: `--etcd-snapshot-schedule-cron` on k3s-mgr +- Borg repo: `./k3s-cluster/` on Storage Box +- Contents: etcd snapshot + `/var/lib/rancher/k3s/server/manifests/` + all applied YAML manifests +- Schedule: Daily +- Retention: 7 daily, 4 weekly + +**2. Database (pg_dump)** +- CronJob in `postgres` namespace → `pg_dump` both databases +- Push to Borg repo: `./k3s-db/` on Storage Box +- Schedule: Every 6 hours +- Retention: 7 daily, 4 weekly +- DB size: ~8 MB (tiny — Borg dedup makes this basically free) + +**3. Kubernetes Manifests** +- Export all namespaced resources as YAML +- Include: deployments, services, ingresses, secrets (encrypted by Borg), configmaps, CNPG cluster spec, pooler spec +- Push to Borg alongside etcd snapshots + +**4. Recovery Procedure** +1. Provision 3 fresh CAX11 nodes +2. Install K3s, restore etcd snapshot +3. Or: fresh K3s + re-apply manifests from Borg +4. Restore CNPG database from pg_dump +5. Update DNS to new LB IP +6. 
Estimated recovery time: ~15-30 minutes + +### Future: CNPG Barman/S3 (when needed) +- Hetzner Object Storage (S3-compatible) +- Continuous WAL archiving for point-in-time recovery +- Worth it when DB grows past ~1 GB or revenue justifies €5/mo +- Current DB: 7.6 MB — overkill for now + +--- + +## Future Improvements + +### Priority: High +- [ ] **Implement Borg backup** (see above) — ZERO backups currently +- [ ] **DNS: staging.docfast.dev** → 46.225.37.135 — needed for staging ingress TLS +- [ ] **Persist HA spread constraints** — CoreDNS scale, CNPG operator replicas, pooler anti-affinity are runtime patches. Need infra-as-code (manifests in Git) to survive K3s upgrades/reinstalls +- [ ] **Decommission old server** (167.235.156.214) — still running, no longer serves traffic. Stop Docker, delete VM, save €4.5/mo + +### Priority: Medium +- [ ] **CNPG backup to S3** — upgrade from pg_dump to continuous WAL archiving when DB grows +- [ ] **Monitoring/alerting** — Prometheus + Grafana stack, or lightweight alternative (VictoriaMetrics) +- [ ] **Resource limits tuning** — current: 100m-1000m CPU, 256Mi-1Gi RAM per pod. 
Profile actual usage and right-size +- [ ] **Network policies** — restrict pod-to-pod traffic (e.g., only DocFast → PgBouncer, not direct to DB) +- [ ] **Pod Disruption Budgets** — ensure at least 1 pod stays running during voluntary disruptions (upgrades, drains) +- [ ] **Automated K3s upgrades** — system-upgrade-controller for rolling node updates + +### Priority: Low +- [ ] **Multi-project namespaces** — SnapAPI and future products get own namespaces + RBAC +- [ ] **ServiceAccount per CEO agent** — scoped kubectl access for autonomous deployment +- [ ] **Horizontal Pod Autoscaler** — scale DocFast replicas based on CPU/request load +- [ ] **External Secrets Operator** — centralized secret management instead of per-namespace secrets +- [ ] **Loki for log aggregation** — centralized logging instead of `kubectl logs` +- [ ] **Node auto-scaling** — Hetzner Cloud Controller Manager + Cluster Autoscaler