ainews: add AI news helper script with seen tracking

- Aggregates Simon Willison, OpenAI Blog, Sebastian Raschka - Same workflow as derstandard: items are marked as seen, auto-pruned at 200 - Updated all 4 AI news cron jobs to use the new script - Removed obsolete ai-news-seen.json
2026-02-03 22:18:10 +00:00 · 2026-02-03 22:18:10 +00:00 · e6248879b3
commit e6248879b3
parent d0fa82ccc6
4 changed files with 186 additions and 32 deletions
--- a/TOOLS.md
+++ b/TOOLS.md
@ -103,6 +103,30 @@ curl -s -X REPORT -u "$NEXTCLOUD_USER:$NEXTCLOUD_PASS" \
  "$NEXTCLOUD_URL/remote.php/dav/calendars/$NEXTCLOUD_USER/$CALDAV_CALENDAR/"
 ```

+## AI News RSS
+
+Helper script: `~/bin/ainews`
+
+```bash
+ainews items [max]          # NEW items only (filters out seen)
+ainews items --all [max]    # All items including already seen
+ainews article <url>        # Full article content via fivefilters
+ainews articles <url1>,<url2>,... # Fetch multiple + auto-mark as seen
+ainews seen                 # Show seen count and recent entries
+ainews reset                # Clear seen history
+```
+
+- Aggregates: Simon Willison, OpenAI Blog, Sebastian Raschka
+- Auto-tracks seen articles in `memory/ainews-seen.txt`
+- Auto-prunes to 200 entries
+
+**Workflow for AI news briefing:**
+1. `ainews items` → shows NEW articles, marks them as seen
+2. Pick interesting ones, optionally fetch full content with `articles`
+3. Next briefing: only shows articles published since last check
+
+---
+
 ## Der Standard RSS Summaries

 - **Schedule:** 4× daily: 10:00, 14:00, 18:00, 22:00 (Vienna time)
--- a/bin/ainews
+++ b/bin/ainews
@ -0,0 +1,147 @@
+#!/bin/bash
+# AI News RSS helper - aggregates multiple AI-focused feeds
+#
+# Environment:
+#   AINEWS_SEEN_FILE  Optional override for the path of the seen-URL list.
+set -e
+
+# Base URL of the fivefilters service used by fetch_single_article.
+FIVEFILTERS_URL="https://fivefilters.cloonar.com"
+# Seen-URL list; defaults to a file under $HOME/clawd/memory.
+SEEN_FILE="${AINEWS_SEEN_FILE:-$HOME/clawd/memory/ainews-seen.txt}"
+
+# curl flags: -s silent, -k skip TLS certificate verification, -L follow redirects.
+# Kept as a plain string; call sites expand it unquoted ($CURL ...) so it splits
+# into command + flags.
+# NOTE(review): -k disables certificate checks for every request - confirm this
+# is intentional.
+CURL="curl -skL"
+
+# Ensure seen file exists
+mkdir -p "$(dirname "$SEEN_FILE")"
+touch "$SEEN_FILE"
+
+# Print the command overview to stdout.
+usage() {
+  printf '%s\n' \
+    'Usage: ainews <command> [args]' \
+    '' \
+    'Commands:' \
+    '  items [max]              NEW items only (unseen), title + source + URL' \
+    '  items --all [max]        All items including seen' \
+    '  article <url>            Fetch article content via fivefilters proxy' \
+    '  articles <url1,url2,...> Fetch multiple articles + mark as seen' \
+    '  seen                     Show seen count and recent entries' \
+    '  reset                    Clear seen history'
+}
+
+# Append a URL to the seen list unless that exact URL is already recorded.
+# Arguments: $1 - URL to record (empty values are ignored)
+# Globals:   SEEN_FILE (read, appended to)
+mark_seen() {
+  local url="$1"
+  [ -n "$url" ] || return 0
+  # -x compares whole lines: a URL that is only a prefix/substring of a stored
+  # one must still be recorded. -- guards against values starting with "-".
+  if ! grep -qxF -- "$url" "$SEEN_FILE" 2>/dev/null; then
+    printf '%s\n' "$url" >> "$SEEN_FILE"
+  fi
+}
+
+# Trim the seen list to its most recent entries (the last lines of the file).
+# Arguments: $1 - number of entries to keep (optional, default 200)
+# Globals:   SEEN_FILE (read, rewritten via "${SEEN_FILE}.tmp")
+prune_seen() {
+  local keep="${1:-200}"
+  local count
+  # Nothing to prune when the list does not exist.
+  [ -f "$SEEN_FILE" ] || return 0
+  count=$(wc -l < "$SEEN_FILE" | tr -d ' ')
+  if [ "${count:-0}" -gt "$keep" ]; then
+    # Write next to the target so the final mv is a same-directory rename.
+    tail -n "$keep" "$SEEN_FILE" > "${SEEN_FILE}.tmp" && mv "${SEEN_FILE}.tmp" "$SEEN_FILE"
+  fi
+}
+
+# Succeed (exit 0) when the exact URL is present in the seen list.
+# Arguments: $1 - URL to look up
+# Globals:   SEEN_FILE (read)
+is_seen() {
+  # -x: whole-line match, so "…/intro" is not treated as seen just because
+  # "…/intro-app" is stored.
+  grep -qxF -- "$1" "$SEEN_FILE" 2>/dev/null
+}
+
+# Filter stdin -> stdout: decode a handful of XML entities (including the
+# doubly-escaped "&amp;amp;") and drop CDATA wrappers.
+decode_entities() {
+  sed \
+    -e 's/&amp;amp;/\&/g' \
+    -e 's/&amp;/\&/g' \
+    -e 's/&lt;/</g' \
+    -e 's/&gt;/>/g' \
+    -e 's/&quot;/"/g' \
+    -e "s/&#39;/'/g" \
+    -e 's/<!\[CDATA\[//g' \
+    -e 's/\]\]>//g'
+}
+
+# Print entries of Simon Willison's Atom feed as "title<TAB>[Willison]<TAB>url".
+# Arguments: $1 - maximum number of entries to read (default 20)
+# Globals:   CURL (read; expanded unquoted on purpose so its flags split)
+# Returns:   0 even when entries are skipped for lacking a title or link, so a
+#            caller running under `set -e` is not aborted by an incomplete entry.
+fetch_simon_willison() {
+  local entry title url
+  # Flatten the document to one line, then start a new line at every <entry>
+  # so each entry can be parsed on its own.
+  $CURL "https://simonwillison.net/atom/everything/" 2>/dev/null | \
+    tr '\n' ' ' | sed 's/<entry>/\n<entry>/g' | grep '<entry>' | head -n "${1:-20}" | \
+    while IFS= read -r entry; do
+      title=$(printf '%s\n' "$entry" | grep -oP '(?<=<title>)[^<]+' | head -n 1)
+      url=$(printf '%s\n' "$entry" | grep -oP '(?<=<link href=")[^"]+(?=" rel="alternate")' | head -n 1)
+      # An if-statement (not a trailing && chain) keeps the loop's exit status
+      # at 0 when the final entry is skipped.
+      if [ -n "$url" ] && [ -n "$title" ]; then
+        printf '%s\t[Willison]\t%s\n' "$(printf '%s\n' "$title" | decode_entities)" "$url"
+      fi
+    done
+}
+
+# Print items of the OpenAI news RSS feed as "title<TAB>[OpenAI]<TAB>url".
+# Arguments: $1 - maximum number of items to read (default 10)
+# Globals:   CURL (read; expanded unquoted on purpose so its flags split)
+# Returns:   0 even when items are skipped for lacking a title or link, so a
+#            caller running under `set -e` is not aborted by an incomplete item.
+fetch_openai() {
+  local item title url
+  # Flatten the document to one line, then start a new line at every <item>.
+  $CURL "https://openai.com/news/rss.xml" 2>/dev/null | \
+    tr '\n' ' ' | sed 's/<item>/\n<item>/g' | grep '<item>' | head -n "${1:-10}" | \
+    while IFS= read -r item; do
+      # Match up to the closing "]]>" so titles containing "]" are not cut short.
+      title=$(printf '%s\n' "$item" | grep -oP '<title><!\[CDATA\[\K.*?(?=\]\]>)' | head -n 1)
+      url=$(printf '%s\n' "$item" | grep -oP '(?<=<link>)[^<]+' | head -n 1)
+      if [ -n "$url" ] && [ -n "$title" ]; then
+        printf '%s\t[OpenAI]\t%s\n' "$title" "$url"
+      fi
+    done
+}
+
+# Print items of Sebastian Raschka's magazine RSS feed as
+# "title<TAB>[Raschka]<TAB>url".
+# Arguments: $1 - maximum number of items to read (default 10)
+# Globals:   CURL (read; expanded unquoted on purpose so its flags split)
+# Returns:   0 even when items are skipped for lacking a title or link, so a
+#            caller running under `set -e` is not aborted by an incomplete item.
+fetch_raschka() {
+  local item title url
+  # Flatten the document to one line, then start a new line at every <item>.
+  $CURL "https://magazine.sebastianraschka.com/feed" 2>/dev/null | \
+    tr '\n' ' ' | sed 's/<item>/\n<item>/g' | grep '<item>' | head -n "${1:-10}" | \
+    while IFS= read -r item; do
+      # Match up to the closing "]]>" so titles containing "]" are not cut short.
+      title=$(printf '%s\n' "$item" | grep -oP '<title><!\[CDATA\[\K.*?(?=\]\]>)' | head -n 1)
+      url=$(printf '%s\n' "$item" | grep -oP '(?<=<link>)[^<]+' | head -n 1)
+      if [ -n "$url" ] && [ -n "$title" ]; then
+        printf '%s\t[Raschka]\t%s\n' "$title" "$url"
+      fi
+    done
+}
+
+# Percent-encode a string for use as a URL query value: every byte outside
+# the unreserved set [A-Za-z0-9.~_-] becomes %XX.
+_ainews_urlencode() {
+  local LC_ALL=C   # iterate byte-wise so multi-byte characters encode per byte
+  local raw="$1" encoded="" ch hex i
+  for (( i = 0; i < ${#raw}; i++ )); do
+    ch="${raw:i:1}"
+    case "$ch" in
+      [a-zA-Z0-9.~_-]) encoded+="$ch" ;;
+      *) printf -v hex '%%%02X' "'$ch"; encoded+="$hex" ;;
+    esac
+  done
+  printf '%s' "$encoded"
+}
+
+# Fetch one article through the fivefilters proxy and print its text on a
+# single line (tags stripped, whitespace collapsed).
+# Arguments: $1 - article URL
+# Globals:   CURL, FIVEFILTERS_URL (read)
+fetch_single_article() {
+  local url="$1"
+  local encoded_url
+  # Full encoding matters: feed URLs may carry "#fragment" (e.g. "#atom-everything"),
+  # and an unencoded "#" would turn the rest of the proxy query string into a
+  # fragment, dropping max/links/exc.
+  encoded_url=$(_ainews_urlencode "$url")
+  $CURL "${FIVEFILTERS_URL}/makefulltextfeed.php?url=${encoded_url}&max=1&links=preserve&exc=" | \
+    perl -0777 -ne 'print $1 if /<item>.*?<description>(.*?)<\/description>.*?<\/item>/s' | \
+    sed 's/&lt;/</g; s/&gt;/>/g; s/&quot;/"/g; s/&amp;/\&/g' | \
+    sed 's/<[^>]*>//g' | \
+    tr '\n' ' ' | sed 's/  */ /g'
+}
+
+# Command dispatch: the first argument selects the sub-command; anything
+# unrecognised (or no argument) prints the usage text.
+case "${1:-}" in
+  items)
+    show_all=false
+    max=50
+    shift || true
+    if [ "${1:-}" = "--all" ]; then
+      show_all=true
+      shift || true
+    fi
+    if [ -n "${1:-}" ]; then
+      # max is handed to `head -n` in every fetcher; reject anything that is
+      # not a plain number up front instead of failing once per feed.
+      case "$1" in
+        *[!0-9]*) echo "Usage: ainews items [--all] [max]" >&2; exit 1 ;;
+        *) max="$1" ;;
+      esac
+    fi
+
+    # Collect all items from all feeds
+    {
+      fetch_simon_willison "$max"
+      fetch_openai "$max"
+      fetch_raschka "$max"
+    } | while IFS=$'\t' read -r title source url; do
+      if $show_all || ! is_seen "$url"; then
+        printf '%s\t%s\t%s\n' "$title" "$source" "$url"
+      fi
+      # Every listed URL is recorded, whether or not it was printed.
+      mark_seen "$url"
+    done
+
+    prune_seen
+    ;;
+  article)
+    [ -z "${2:-}" ] && { echo "Usage: ainews article <url>"; exit 1; }
+    fetch_single_article "$2"
+    ;;
+  articles)
+    [ -z "${2:-}" ] && { echo "Usage: ainews articles <url1,url2,...>"; exit 1; }
+    IFS=',' read -ra URLS <<< "$2"
+    for url in "${URLS[@]}"; do
+      # Heading = last path segment. Drop any "?query"/"#fragment" and a
+      # trailing "/" first, otherwise URLs such as ".../january/#atom-everything"
+      # would yield the fragment instead of the slug.
+      slug="${url%%[#?]*}"
+      slug="${slug%/}"
+      slug="${slug##*/}"
+      # Dashes -> spaces, strip a file extension, capitalise the first letter.
+      title=$(printf '%s\n' "$slug" | sed 's/-/ /g; s/\..*//; s/.*/\u&/')
+      echo "=== ${title} ==="
+      fetch_single_article "$url"
+      echo ""
+      echo ""
+      mark_seen "$url"
+    done
+    prune_seen
+    ;;
+  seen)
+    count=$(wc -l < "$SEEN_FILE" 2>/dev/null | tr -d ' ')
+    echo "Seen: $count URLs"
+    if [ "$count" -gt 0 ]; then
+      echo "Recent:"
+      tail -5 "$SEEN_FILE"
+    fi
+    ;;
+  reset)
+    # Truncate the list in place.
+    : > "$SEEN_FILE"
+    echo "Cleared seen history"
+    ;;
+  *)
+    usage
+    ;;
+esac
--- a/memory/ai-news-seen.json
+++ b/memory/ai-news-seen.json
@ -1,32 +0,0 @@
-{
-  "urls": [
-    "https://openai.com/index/sora-feed-philosophy",
-    "https://openai.com/index/snowflake-partnership",
-    "https://magazine.sebastianraschka.com/p/categories-of-inference-time-scaling",
-    "https://simonwillison.net/2026/Feb/3/january/",
-    "https://simonwillison.net/2026/Feb/3/brandon-sanderson/",
-    "https://openai.com/index/taisei",
-    "https://openai.com/index/the-next-chapter-for-ai-in-the-eu",
-    "https://venturebeat.com/infrastructure/railway-secures-usd100-million-to-challenge-aws-with-ai-native-cloud",
-    "https://venturebeat.com/infrastructure/claude-code-costs-up-to-usd200-a-month-goose-does-the-same-thing-for-free",
-    "https://simonwillison.net/2026/Feb/2/introducing-the-codex-app/",
-    "https://simonwillison.net/2026/Feb/2/no-humans-allowed/",
-    "https://openai.com/index/introducing-the-codex-app",
-    "https://openai.com/index/retiring-gpt-4o-and-older-models",
-    "https://simonwillison.net/2026/Feb/1/openclaw-in-docker/",
-    "https://magazine.sebastianraschka.com/p/state-of-llms-2025",
-    "https://openai.com/index/inside-our-in-house-data-agent",
-    "https://openai.com/index/introducing-gpt-5-2-codex",
-    "https://openai.com/index/unrolling-the-codex-agent-loop",
-    "https://simonwillison.net/2026/Jan/31/andrej-karpathy/",
-    "https://simonwillison.net/2026/Jan/30/steve-yegge/",
-    "https://simonwillison.net/2026/Jan/30/moltbook/",
-    "https://openai.com/index/introducing-prism",
-    "https://openai.com/index/scaling-postgresql",
-    "https://openai.com/index/praktika",
-    "https://openai.com/index/gpt-5-2-for-science-and-math",
-    "https://openai.com/index/disney-sora-agreement",
-    "https://openai.com/index/ten-years"
-  ],
-  "lastUpdated": "2026-02-03T21:05:00Z"
-}
--- a/memory/ainews-seen.txt
+++ b/memory/ainews-seen.txt
@ -0,0 +1,15 @@
+https://simonwillison.net/2026/Feb/3/january/#atom-everything
+https://simonwillison.net/2026/Feb/3/brandon-sanderson/#atom-everything
+https://simonwillison.net/2026/Feb/2/introducing-the-codex-app/#atom-everything
+https://simonwillison.net/2026/Feb/2/no-humans-allowed/#atom-everything
+https://simonwillison.net/2026/Feb/1/openclaw-in-docker/#atom-everything
+https://openai.com/index/sora-feed-philosophy
+https://openai.com/index/snowflake-partnership
+https://openai.com/index/introducing-the-codex-app
+https://openai.com/index/inside-our-in-house-data-agent
+https://openai.com/index/retiring-gpt-4o-and-older-models
+https://magazine.sebastianraschka.com/p/categories-of-inference-time-scaling
+https://magazine.sebastianraschka.com/p/state-of-llms-2025
+https://magazine.sebastianraschka.com/p/llm-research-papers-2025-part2
+https://magazine.sebastianraschka.com/p/technical-deepseek
+https://magazine.sebastianraschka.com/p/beyond-standard-llms