config/bin/derstandard
Agent dbca7fc17d derstandard: mark items as seen on display, auto-prune at 200
- items command now marks ALL returned URLs as seen (not just fetched)
- Auto-prunes seen file to 200 entries (removes oldest)
2026-02-03 22:04:57 +00:00

145 lines
4.1 KiB
Bash
Executable file

#!/bin/bash
# Der Standard RSS helper - fetches via internal fivefilters
#
# Commands are dispatched by the case statement at the bottom of the file;
# run with no arguments for usage. Persistent state is a newline-delimited
# list of already-displayed article URLs kept in SEEN_FILE.
set -e
# Full-text extraction service (fivefilters instance) used to expand the feed.
FIVEFILTERS_URL="https://fivefilters.cloonar.com"
# Upstream RSS feed to expand.
RSS_SOURCE="https://www.derstandard.at/rss"
# Seen-URL history file; location overridable via DERSTANDARD_SEEN_FILE.
SEEN_FILE="${DERSTANDARD_SEEN_FILE:-$HOME/clawd/memory/derstandard-seen.txt}"
# -s silent, -k skip TLS verification (presumably an internal/self-signed
# cert on the fivefilters host — TODO confirm). Expanded unquoted at call
# sites so the flags word-split into separate arguments.
CURL="curl -sk"
# Ensure seen file exists
mkdir -p "$(dirname "$SEEN_FILE")"
touch "$SEEN_FILE"
usage() {
  # Print the command summary to stdout. The quoted delimiter makes the
  # heredoc literal (no expansion) — the text contains none anyway.
  cat <<'HELP'
Usage: derstandard <command> [args]
Commands:
items [max] NEW items only (unseen), title + URL pairs
items --all [max] All items including seen
article <url> Fetch single article content
articles <url1,url2,...> Fetch multiple articles + mark as seen
urls [max] Article URLs only (all)
titles [max] Article titles only (all)
seen Show seen count and recent entries
reset Clear seen history
raw [max] Full RSS XML
HELP
}
fetch_feed() {
  # Fetch the RSS feed expanded to full text by the fivefilters service.
  # $1 - maximum number of items to request (default 50)
  # Outputs: raw RSS XML on stdout
  local max="${1:-50}"
  local encoded_url
  # Declaration split from assignment so the substitution's exit status is
  # not masked by 'local' (relevant under set -e).
  # Percent-encode ':' and '/' so the feed URL survives as a query parameter.
  encoded_url=$(printf '%s' "$RSS_SOURCE" | sed 's/:/%3A/g; s/\//%2F/g')
  # $CURL intentionally unquoted: expands to "curl -sk" via word splitting.
  $CURL "${FIVEFILTERS_URL}/makefulltextfeed.php?url=${encoded_url}&max=${max}&links=preserve&exc="
}
fetch_single_article() {
  # Fetch one article through fivefilters and emit its plain text on stdout.
  # $1 - article URL
  local url="$1"
  local encoded_url
  # Percent-encode the characters that would break the query string.
  encoded_url=$(printf '%s' "$url" | sed 's/:/%3A/g; s/\//%2F/g; s/\?/%3F/g; s/&/%26/g; s/=/%3D/g')
  # Pipeline: take the first <item>'s <description>, decode common XML
  # entities, strip remaining tags, then flatten newlines and collapse runs
  # of spaces into one line of text.
  # BUG FIX: the final sed was 's/ */ /g' — a zero-or-more pattern that
  # matches the empty string at every position and therefore inserted a
  # space between every character. 's/  */ /g' requires at least one space,
  # so it only collapses runs.
  $CURL "${FIVEFILTERS_URL}/makefulltextfeed.php?url=${encoded_url}&max=1&links=preserve&exc=" | \
    perl -0777 -ne 'print $1 if /<item>.*?<description>(.*?)<\/description>.*?<\/item>/s' | \
    sed 's/&lt;/</g; s/&gt;/>/g; s/&quot;/"/g; s/&amp;/\&/g' | \
    sed 's/<[^>]*>//g' | \
    tr '\n' ' ' | sed 's/  */ /g'
}
decode_entities() {
  # Filter: undo XML entity escaping on stdin, write plain text to stdout.
  # Double-encoded ampersands (&amp;amp;) are handled before single ones,
  # and ampersands before the bracket/quote entities — order matters.
  sed \
    -e 's/&amp;amp;/\&/g' \
    -e 's/&amp;/\&/g' \
    -e 's/&lt;/</g' \
    -e 's/&gt;/>/g' \
    -e 's/&quot;/"/g' \
    -e 's/&#39;/'"'"'/g'
}
mark_seen() {
  # Append $1 to SEEN_FILE unless an identical line is already present.
  # $1 - article URL
  local url="$1"
  # BUG FIX: plain 'grep -qF' is a substring match, so a URL that is a
  # prefix of an already-seen URL (e.g. .../story/1 vs .../story/12) was
  # never recorded. -x forces whole-line matching; -- guards odd patterns.
  if ! grep -qxF -- "$url" "$SEEN_FILE" 2>/dev/null; then
    echo "$url" >> "$SEEN_FILE"
  fi
}
prune_seen() {
  # Trim SEEN_FILE to its newest 200 entries. The file is append-only, so
  # the last lines are the most recent; tail keeps them.
  # (Removed an unused 'excess' local that computed but never used the
  # overflow count.)
  local count
  count=$(wc -l < "$SEEN_FILE" 2>/dev/null | tr -d ' ')
  # Default to 0 so a missing/unreadable file can't break the comparison.
  if [ "${count:-0}" -gt 200 ]; then
    tail -n 200 "$SEEN_FILE" > "${SEEN_FILE}.tmp" && mv "${SEEN_FILE}.tmp" "$SEEN_FILE"
  fi
}
is_seen() {
  # Return 0 when $1 appears as a complete line in SEEN_FILE.
  # BUG FIX: a bare 'grep -qF' substring match reported a URL as seen
  # whenever it was a prefix of a longer already-seen URL; -x forces
  # whole-line matching. -- protects against patterns starting with '-'.
  grep -qxF -- "$1" "$SEEN_FILE" 2>/dev/null
}
# Command dispatch. Branches share the globals defined at the top of the
# file (SEEN_FILE, CURL, ...) and the helper functions above.
case "${1:-}" in
items)
# List feed items as "title<TAB>url" pairs; by default only unseen ones.
# 'items [max]' or 'items --all [max]'.
show_all=false
max=50
shift || true
if [ "${1:-}" = "--all" ]; then
show_all=true
shift || true
fi
[ -n "${1:-}" ] && max="$1"
feed=$(fetch_feed "$max")
# Build items list, mark all as seen, only display unseen (unless --all)
while IFS=$'\t' read -r title url; do
if $show_all || ! is_seen "$url"; then
printf '%s\t%s\n' "$title" "$url"
fi
# Mark as seen regardless of display
mark_seen "$url"
done < <(
# The first <title> is the feed's own title, hence 'tail -n +2'; links are
# filtered to story URLs. paste pairs titles and URLs positionally — this
# assumes both lists come out in the same order and count (TODO confirm).
# grep -oP (PCRE \K) requires GNU grep.
titles=$(echo "$feed" | grep -oP '<title>\K[^<]+' | tail -n +2 | decode_entities)
urls=$(echo "$feed" | grep -oP '<link>\K[^<]+' | grep "derstandard.at/story")
paste <(echo "$titles") <(echo "$urls") 2>/dev/null | head -"$max"
)
# Prune to max 200 entries
prune_seen
;;
article)
# Print one article's plain-text content; does NOT mark it seen.
[ -z "${2:-}" ] && { echo "Usage: derstandard article <url>"; exit 1; }
fetch_single_article "$2"
;;
articles)
# Fetch several comma-separated URLs and record each as seen.
[ -z "${2:-}" ] && { echo "Usage: derstandard articles <url1,url2,...>"; exit 1; }
IFS=',' read -ra URLS <<< "$2"
for url in "${URLS[@]}"; do
# Extract title from URL slug
# (assumes .../story/<digits>/<slug> URLs: hyphens become spaces, first
# letter upper-cased via GNU sed's \u — TODO confirm URL shape)
title=$(echo "$url" | grep -oP '/\d+/\K[^?]+' | tr '-' ' ' | sed 's/.*/\u&/')
echo "=== ${title} ==="
fetch_single_article "$url"
echo ""
echo ""
# Mark as seen
mark_seen "$url"
done
prune_seen
;;
urls)
# Story URLs only, ignoring seen state.
fetch_feed "${2:-50}" | grep -oP '<link>\K[^<]+' | grep "derstandard.at/story"
;;
titles)
# Item titles only (tail -n +2 skips the feed-level <title>).
fetch_feed "${2:-50}" | grep -oP '<title>\K[^<]+' | tail -n +2 | decode_entities
;;
seen)
# Report how many URLs are recorded and show the five most recent.
count=$(wc -l < "$SEEN_FILE" 2>/dev/null | tr -d ' ')
echo "Seen: $count URLs"
if [ "$count" -gt 0 ]; then
echo "Recent:"
tail -5 "$SEEN_FILE"
fi
;;
reset)
# Truncate the seen history.
> "$SEEN_FILE"
echo "Cleared seen history"
;;
raw)
# Dump the unprocessed RSS XML from fivefilters.
fetch_feed "${2:-50}"
;;
*)
# Unknown or missing command: print help (exits 0).
usage
;;
esac