From 252fcb3ad08624c043d6abdda0413d22e4da6aaa Mon Sep 17 00:00:00 2001 From: Agent Date: Tue, 3 Feb 2026 22:02:51 +0000 Subject: [PATCH] derstandard: auto-track seen articles - items now shows only NEW (unseen) articles by default - items --all shows everything including seen - articles command auto-marks URLs as seen - Added seen/reset commands for state management - State stored in memory/derstandard-seen.txt --- TOOLS.md | 21 +++++++----- bin/derstandard | 67 ++++++++++++++++++++++++++++++++----- memory/derstandard-seen.txt | 1 + memory/heartbeat-state.json | 10 +++--- 4 files changed, 78 insertions(+), 21 deletions(-) create mode 100644 memory/derstandard-seen.txt diff --git a/TOOLS.md b/TOOLS.md index b911ec0..7771450 100644 --- a/TOOLS.md +++ b/TOOLS.md @@ -45,20 +45,25 @@ When user arrives home, HA calls the webhook. Check `memory/arrival-reminders.js Helper script: `~/bin/derstandard` ```bash -derstandard items [max] # Title + URL pairs (tab-separated) -derstandard article <url> # Full article content for a specific URL -derstandard articles <url1>,<url2>,... # Fetch multiple articles (comma-separated) -derstandard urls [max] # Article URLs only (default: 50) -derstandard titles [max] # Article titles only -derstandard raw [max] # Full RSS XML +derstandard items [max] # NEW items only (filters out seen) +derstandard items --all [max] # All items including already seen +derstandard article <url> # Full article content for a specific URL +derstandard articles <url1>,<url2>,... # Fetch multiple + auto-mark as seen +derstandard urls [max] # Article URLs only (default: 50) +derstandard titles [max] # Article titles only +derstandard seen # Show seen count and recent entries +derstandard reset # Clear seen history +derstandard raw [max] # Full RSS XML ``` - Uses internal fivefilters proxy (bypasses web_fetch private IP block) - Pre-processes output for minimal token usage +- **Auto-tracks seen articles** in `memory/derstandard-seen.txt` **Workflow for news briefing:** -1. 
`derstandard items` → pick interesting titles -2. `derstandard articles <url1>,<url2>,...` → get full content for selected articles +1. `derstandard items` → only shows NEW articles (unseen) +2. `derstandard articles <url1>,<url2>,...` → fetch content + auto-mark as seen +3. Next briefing: step 1 automatically excludes previously covered articles ## Forgejo Git Access diff --git a/bin/derstandard b/bin/derstandard index 24da1d8..1346152 100755 --- a/bin/derstandard +++ b/bin/derstandard @@ -4,19 +4,27 @@ set -e FIVEFILTERS_URL="https://fivefilters.cloonar.com" RSS_SOURCE="https://www.derstandard.at/rss" +SEEN_FILE="${DERSTANDARD_SEEN_FILE:-$HOME/clawd/memory/derstandard-seen.txt}" CURL="curl -sk" +# Ensure seen file exists +mkdir -p "$(dirname "$SEEN_FILE")" +touch "$SEEN_FILE" + usage() { cat < [args] Commands: - items [max] Title + URL pairs for selection (default: 50) + items [max] NEW items only (unseen), title + URL pairs + items --all [max] All items including seen article Fetch single article content - articles Fetch multiple articles (comma-separated) - urls [max] Article URLs only - titles [max] Article titles only + articles Fetch multiple articles + mark as seen + urls [max] Article URLs only (all) + titles [max] Article titles only (all) + seen Show seen count and recent entries + reset Clear seen history raw [max] Full RSS XML EOF } @@ -41,12 +49,41 @@ decode_entities() { sed 's/&amp;/\&/g; s/&/\&/g; s/<//g; s/"/"/g; s/'/'"'"'/g' } +mark_seen() { + local url="$1" + # Only add if not already present + if ! 
grep -qF "$url" "$SEEN_FILE" 2>/dev/null; then + echo "$url" >> "$SEEN_FILE" + fi +} + +is_seen() { + grep -qF "$1" "$SEEN_FILE" 2>/dev/null +} + case "${1:-}" in items) - feed=$(fetch_feed "${2:-50}") - titles=$(echo "$feed" | grep -oP '\K[^<]+' | tail -n +2 | decode_entities) - urls=$(echo "$feed" | grep -oP '<link>\K[^<]+' | grep "derstandard.at/story") - paste <(echo "$titles") <(echo "$urls") 2>/dev/null | head -"${2:-50}" + show_all=false + max=50 + shift || true + if [ "${1:-}" = "--all" ]; then + show_all=true + shift || true + fi + [ -n "${1:-}" ] && max="$1" + + feed=$(fetch_feed "$max") + + # Build items list + while IFS=$'\t' read -r title url; do + if $show_all || ! is_seen "$url"; then + printf '%s\t%s\n' "$title" "$url" + fi + done < <( + titles=$(echo "$feed" | grep -oP '<title>\K[^<]+' | tail -n +2 | decode_entities) + urls=$(echo "$feed" | grep -oP '<link>\K[^<]+' | grep "derstandard.at/story") + paste <(echo "$titles") <(echo "$urls") 2>/dev/null | head -"$max" + ) ;; article) [ -z "${2:-}" ] && { echo "Usage: derstandard article <url>"; exit 1; } @@ -62,6 +99,8 @@ case "${1:-}" in fetch_single_article "$url" echo "" echo "" + # Mark as seen + mark_seen "$url" done ;; urls) @@ -70,6 +109,18 @@ case "${1:-}" in titles) fetch_feed "${2:-50}" | grep -oP '<title>\K[^<]+' | tail -n +2 | decode_entities ;; + seen) + count=$(wc -l < "$SEEN_FILE" 2>/dev/null | tr -d ' ') + echo "Seen: $count URLs" + if [ "$count" -gt 0 ]; then + echo "Recent:" + tail -5 "$SEEN_FILE" + fi + ;; + reset) + > "$SEEN_FILE" + echo "Cleared seen history" + ;; raw) fetch_feed "${2:-50}" ;; diff --git a/memory/derstandard-seen.txt b/memory/derstandard-seen.txt new file mode 100644 index 0000000..b873fe8 --- /dev/null +++ b/memory/derstandard-seen.txt @@ -0,0 +1 @@ +https://www.derstandard.at/story/3000000307063/usa-schiessen-iranische-drohne-nahe-flugzeugtraeger-im-arabischen-meer-ab?ref=rss diff --git a/memory/heartbeat-state.json b/memory/heartbeat-state.json index 
4addad7..c16ed26 100644 --- a/memory/heartbeat-state.json +++ b/memory/heartbeat-state.json @@ -7,10 +7,10 @@ }, "lastChecks": { "news": "2026-01-30T08:17:00Z", - "rheinmetall": "2026-02-02T04:10:00Z", - "rheinmetall_price": 1755.00, - "calendar": "2026-02-02T04:10:00Z", - "steam_hardware": "2026-01-31T07:56:00Z", - "notes": "RHM: €1,755.00, below threshold. Calendar: Gurkerl delivery 14:00 (no action needed). Steam hardware: 'early 2026' window, no official price or specific date yet." + "rheinmetall": "2026-02-03T22:00:00Z", + "rheinmetall_price": 1773.50, + "calendar": "2026-02-03T22:00:00Z", + "steam_hardware": "2026-02-03T22:00:00Z", + "notes": "RHM: €1,773.50, below threshold. Calendar Feb 4: 'nyc' meeting at 12:00 (video call, no action). Steam hardware: still 'early 2026', no official price/date." } }