derstandard: auto-track seen articles

- items now shows only NEW (unseen) articles by default
- items --all shows everything including seen
- articles command auto-marks URLs as seen
- Added seen/reset commands for state management
- State stored in memory/derstandard-seen.txt
This commit is contained in:
Agent 2026-02-03 22:02:51 +00:00
parent 1eac52a97c
commit 252fcb3ad0
4 changed files with 78 additions and 21 deletions

View file

@ -45,20 +45,25 @@ When user arrives home, HA calls the webhook. Check `memory/arrival-reminders.js
Helper script: `~/bin/derstandard` Helper script: `~/bin/derstandard`
```bash ```bash
derstandard items [max] # Title + URL pairs (tab-separated) derstandard items [max] # NEW items only (filters out seen)
derstandard article <url> # Full article content for a specific URL derstandard items --all [max] # All items including already seen
derstandard articles <url1>,<url2>,... # Fetch multiple articles (comma-separated) derstandard article <url> # Full article content for a specific URL
derstandard urls [max] # Article URLs only (default: 50) derstandard articles <url1>,<url2>,... # Fetch multiple + auto-mark as seen
derstandard titles [max] # Article titles only derstandard urls [max] # Article URLs only (default: 50)
derstandard raw [max] # Full RSS XML derstandard titles [max] # Article titles only
derstandard seen # Show seen count and recent entries
derstandard reset # Clear seen history
derstandard raw [max] # Full RSS XML
``` ```
- Uses internal fivefilters proxy (bypasses web_fetch private IP block) - Uses internal fivefilters proxy (bypasses web_fetch private IP block)
- Pre-processes output for minimal token usage - Pre-processes output for minimal token usage
- **Auto-tracks seen articles** in `memory/derstandard-seen.txt`
**Workflow for news briefing:** **Workflow for news briefing:**
1. `derstandard items` → pick interesting titles 1. `derstandard items` → only shows NEW articles (unseen)
2. `derstandard articles <url1>,<url2>,...` → get full content for selected articles 2. `derstandard articles <url1>,<url2>,...` → fetch content + auto-mark as seen
3. Next briefing: step 1 automatically excludes previously covered articles
## Forgejo Git Access ## Forgejo Git Access

View file

@ -4,19 +4,27 @@ set -e
FIVEFILTERS_URL="https://fivefilters.cloonar.com" FIVEFILTERS_URL="https://fivefilters.cloonar.com"
RSS_SOURCE="https://www.derstandard.at/rss" RSS_SOURCE="https://www.derstandard.at/rss"
SEEN_FILE="${DERSTANDARD_SEEN_FILE:-$HOME/clawd/memory/derstandard-seen.txt}"
CURL="curl -sk" CURL="curl -sk"
# Ensure seen file exists
mkdir -p "$(dirname "$SEEN_FILE")"
touch "$SEEN_FILE"
usage() { usage() {
cat <<EOF cat <<EOF
Usage: derstandard <command> [args] Usage: derstandard <command> [args]
Commands: Commands:
items [max] Title + URL pairs for selection (default: 50) items [max] NEW items only (unseen), title + URL pairs
items --all [max] All items including seen
article <url> Fetch single article content article <url> Fetch single article content
articles <url1,url2,...> Fetch multiple articles (comma-separated) articles <url1,url2,...> Fetch multiple articles + mark as seen
urls [max] Article URLs only urls [max] Article URLs only (all)
titles [max] Article titles only titles [max] Article titles only (all)
seen Show seen count and recent entries
reset Clear seen history
raw [max] Full RSS XML raw [max] Full RSS XML
EOF EOF
} }
@ -41,12 +49,41 @@ decode_entities() {
sed 's/&amp;amp;/\&/g; s/&amp;/\&/g; s/&lt;/</g; s/&gt;/>/g; s/&quot;/"/g; s/&#39;/'"'"'/g' sed 's/&amp;amp;/\&/g; s/&amp;/\&/g; s/&lt;/</g; s/&gt;/>/g; s/&quot;/"/g; s/&#39;/'"'"'/g'
} }
#######################################
# Record a URL in the seen-history file unless it is already listed.
# Globals:   SEEN_FILE (read, appended to)
# Arguments: $1 - article URL to record
# Outputs:   appends one line to SEEN_FILE when the URL is new
# Returns:   0 when the URL is empty, already recorded, or was appended
#######################################
mark_seen() {
  local url="${1:-}"
  # An empty URL carries no information; do not write a blank line.
  [ -n "$url" ] || return 0
  # -x forces a whole-line match: with plain -F a URL that is merely a
  # substring of a recorded one would be treated as present and never
  # stored. "--" keeps a value starting with "-" from being read as an option.
  # stderr is silenced so a missing file just counts as "not recorded yet".
  if ! grep -qxF -- "$url" "$SEEN_FILE" 2>/dev/null; then
    printf '%s\n' "$url" >> "$SEEN_FILE"
  fi
}
#######################################
# Report whether a URL is recorded in the seen-history file.
# Globals:   SEEN_FILE (read)
# Arguments: $1 - article URL to look up
# Returns:   0 if a line of SEEN_FILE equals the URL exactly, non-zero
#            otherwise (including when the file cannot be read)
#######################################
is_seen() {
  # -x requires the whole line to equal the URL; plain -F would also accept
  # substrings, and an empty argument would match every line.
  # "--" stops option parsing in case the value begins with "-".
  grep -qxF -- "${1:-}" "$SEEN_FILE" 2>/dev/null
}
case "${1:-}" in case "${1:-}" in
items) items)
feed=$(fetch_feed "${2:-50}") show_all=false
titles=$(echo "$feed" | grep -oP '<title>\K[^<]+' | tail -n +2 | decode_entities) max=50
urls=$(echo "$feed" | grep -oP '<link>\K[^<]+' | grep "derstandard.at/story") shift || true
paste <(echo "$titles") <(echo "$urls") 2>/dev/null | head -"${2:-50}" if [ "${1:-}" = "--all" ]; then
show_all=true
shift || true
fi
[ -n "${1:-}" ] && max="$1"
feed=$(fetch_feed "$max")
# Build items list
while IFS=$'\t' read -r title url; do
if $show_all || ! is_seen "$url"; then
printf '%s\t%s\n' "$title" "$url"
fi
done < <(
titles=$(echo "$feed" | grep -oP '<title>\K[^<]+' | tail -n +2 | decode_entities)
urls=$(echo "$feed" | grep -oP '<link>\K[^<]+' | grep "derstandard.at/story")
paste <(echo "$titles") <(echo "$urls") 2>/dev/null | head -"$max"
)
;; ;;
article) article)
[ -z "${2:-}" ] && { echo "Usage: derstandard article <url>"; exit 1; } [ -z "${2:-}" ] && { echo "Usage: derstandard article <url>"; exit 1; }
@ -62,6 +99,8 @@ case "${1:-}" in
fetch_single_article "$url" fetch_single_article "$url"
echo "" echo ""
echo "" echo ""
# Mark as seen
mark_seen "$url"
done done
;; ;;
urls) urls)
@ -70,6 +109,18 @@ case "${1:-}" in
titles) titles)
fetch_feed "${2:-50}" | grep -oP '<title>\K[^<]+' | tail -n +2 | decode_entities fetch_feed "${2:-50}" | grep -oP '<title>\K[^<]+' | tail -n +2 | decode_entities
;; ;;
# `seen` command: print how many URLs are recorded, then the last few entries.
seen)
# Count the lines of the seen file; tr drops any spaces around wc's number
# so the value can be used in the numeric test below.
count=$(wc -l < "$SEEN_FILE" 2>/dev/null | tr -d ' ')
echo "Seen: $count URLs"
# Only print the "Recent:" section when the file has at least one line.
if [ "$count" -gt 0 ]; then
echo "Recent:"
# Last five lines of the seen file.
tail -5 "$SEEN_FILE"
fi
;;
# `reset` command: discard the whole seen history.
reset)
# Truncate the seen file to zero length via an empty redirection.
> "$SEEN_FILE"
echo "Cleared seen history"
;;
raw) raw)
fetch_feed "${2:-50}" fetch_feed "${2:-50}"
;; ;;

View file

@ -0,0 +1 @@
https://www.derstandard.at/story/3000000307063/usa-schiessen-iranische-drohne-nahe-flugzeugtraeger-im-arabischen-meer-ab?ref=rss

View file

@ -7,10 +7,10 @@
}, },
"lastChecks": { "lastChecks": {
"news": "2026-01-30T08:17:00Z", "news": "2026-01-30T08:17:00Z",
"rheinmetall": "2026-02-02T04:10:00Z", "rheinmetall": "2026-02-03T22:00:00Z",
"rheinmetall_price": 1755.00, "rheinmetall_price": 1773.50,
"calendar": "2026-02-02T04:10:00Z", "calendar": "2026-02-03T22:00:00Z",
"steam_hardware": "2026-01-31T07:56:00Z", "steam_hardware": "2026-02-03T22:00:00Z",
"notes": "RHM: €1,755.00, below threshold. Calendar: Gurkerl delivery 14:00 (no action needed). Steam hardware: 'early 2026' window, no official price or specific date yet." "notes": "RHM: €1,773.50, below threshold. Calendar Feb 4: 'nyc' meeting at 12:00 (video call, no action). Steam hardware: still 'early 2026', no official price/date."
} }
} }