Created
February 3, 2026 21:49
-
-
Save mislav/a99e2523237594c52ddafc16f3454f31 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| # | |
| # Usage: | |
| # bash readeck-cleanup.sh <limit> | |
| # | |
| # This script is based on https://codeberg.org/gollyhatch/readeck-cleanup | |
| # | |
| # The script tries to refetch "broken" bookmarks (i.e. bookmarks that "have errors") or bookmarks | |
| # whose content is empty from either the original URL or, failing that, from the Wayback Machine. If | |
| # fresh content is refetched, it replaces the broken bookmark but inherits its creation timestamp | |
| # and labels. | |
| # | |
| set -euo pipefail | |
# Print an error message on stderr and terminate the current (sub)shell.
# Arguments: the words of the message; they are joined with the first
#            character of IFS (a space by default).
# Outputs:   the message, followed by a newline, on stderr.
# Exits:     always, with status 1.
function bail() {
  # printf rather than echo: echo would treat a message that starts with
  # -n, -e or -E as an option instead of printing it.
  printf '%s\n' "$*" >&2
  exit 1
}
# Both variables are required. The ${VAR:-} form keeps `set -u` from aborting
# with a generic "unbound variable" error before the intended message can be
# printed when a variable is missing from the environment.
[ -n "${READECK_API_TOKEN:-}" ] || bail 'READECK_API_TOKEN not set.'
[ -n "${READECK_BASE_URL:-}" ] || bail 'READECK_BASE_URL not set.'

# User agent string used for checking whether the resource at the original URL is still there.
# We want to avoid identifying as "curl" since websites are prone to outright blocking that.
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"

# Label given to every bookmark that this script re-saves; it is removed again
# once the re-saved bookmark turns out to have content.
LABEL_CLEANUP="__cleanup"
# Label given to broken bookmarks whose URL no longer responds and for which
# the Wayback Machine has no snapshot.
LABEL_GONE="gone"
# Request the bookmarks that Readeck reports with has_errors=true, using the
# label filter "-$LABEL_CLEANUP+-$LABEL_GONE" (presumably excluding bookmarks
# that already carry one of those labels -- confirm against the Readeck API).
# Globals:   READECK_BASE_URL, READECK_API_TOKEN, LABEL_CLEANUP, LABEL_GONE (read)
# Arguments: $1 - maximum number of bookmarks to request (default 1, at most 100)
# Outputs:   the API response body on stdout (JSON is requested)
# Returns:   curl's exit status; an invalid limit terminates via bail
function get_broken_bookmarks() {
  local LIMIT="${1:-1}"
  # Reject non-numeric limits up front: `[ ... -gt ... ]` would merely print
  # "integer expression expected" and let the malformed request go out.
  [[ "$LIMIT" =~ ^[0-9]+$ ]] || bail 'Limit must be a non-negative integer.'
  # API returns "404 Not Found" when limit is > 100.
  [ "$LIMIT" -gt 100 ] && bail 'Maximum limit is 100.'
  curl \
    -fsS -XGET \
    --header "Accept: application/json" \
    --header "Authorization: Bearer $READECK_API_TOKEN" \
    "${READECK_BASE_URL%/}/api/bookmarks?has_errors=true&labels=-${LABEL_CLEANUP}+-${LABEL_GONE}&limit=$LIMIT"
}
# Request the bookmarks carrying $LABEL_CLEANUP that are loaded and report no
# errors (has_errors=false, is_loaded=true).
# Globals:   READECK_BASE_URL, READECK_API_TOKEN, LABEL_CLEANUP (read)
# Arguments: $1 - maximum number of bookmarks to request (default 1, at most 100)
# Outputs:   the API response body on stdout (JSON is requested)
# Returns:   curl's exit status; an invalid limit terminates via bail
function get_fixed_bookmarks() {
  local LIMIT="${1:-1}"
  # Same limit validation as the other bookmark listing helpers, so that a bad
  # limit yields a clear message instead of an opaque HTTP error from curl.
  [[ "$LIMIT" =~ ^[0-9]+$ ]] || bail 'Limit must be a non-negative integer.'
  # API returns "404 Not Found" when limit is > 100.
  [ "$LIMIT" -gt 100 ] && bail 'Maximum limit is 100.'
  curl \
    -fsS -XGET \
    --header "Accept: application/json" \
    --header "Authorization: Bearer $READECK_API_TOKEN" \
    "${READECK_BASE_URL%/}/api/bookmarks?has_errors=false&is_loaded=true&labels=${LABEL_CLEANUP}&limit=$LIMIT"
}
# Request loaded, error-free article bookmarks using the label filter
# "-$LABEL_CLEANUP", sorted by the API's "duration" key. Callers look at the
# reading_time of each result to find the ones without content.
# Globals:   READECK_BASE_URL, READECK_API_TOKEN, LABEL_CLEANUP (read)
# Arguments: $1 - maximum number of bookmarks to request (default 1, at most 100)
# Outputs:   the API response body on stdout (JSON is requested)
# Returns:   curl's exit status; an invalid limit terminates via bail
function get_empty_bookmarks() {
  local LIMIT="${1:-1}"
  # Same limit validation as the other bookmark listing helpers, so that a bad
  # limit yields a clear message instead of an opaque HTTP error from curl.
  [[ "$LIMIT" =~ ^[0-9]+$ ]] || bail 'Limit must be a non-negative integer.'
  # API returns "404 Not Found" when limit is > 100.
  [ "$LIMIT" -gt 100 ] && bail 'Maximum limit is 100.'
  curl \
    -fsS -XGET \
    --header "Accept: application/json" \
    --header "Authorization: Bearer $READECK_API_TOKEN" \
    "${READECK_BASE_URL%/}/api/bookmarks?type=article&has_errors=false&is_loaded=true&labels=-${LABEL_CLEANUP}&sort=duration&limit=$LIMIT"
}
# Request the bookmarks carrying $LABEL_CLEANUP that are loaded but still
# report errors (has_errors=true, is_loaded=true), i.e. re-saved bookmarks
# that did not come out clean.
# Globals:   READECK_BASE_URL, READECK_API_TOKEN, LABEL_CLEANUP (read)
# Arguments: $1 - maximum number of bookmarks to request (default 1, at most 100)
# Outputs:   the API response body on stdout (JSON is requested)
# Returns:   curl's exit status; an invalid limit terminates via bail
function get_still_broken_bookmarks() {
  local LIMIT="${1:-1}"
  # Reject non-numeric limits up front: `[ ... -gt ... ]` would merely print
  # "integer expression expected" and let the malformed request go out.
  [[ "$LIMIT" =~ ^[0-9]+$ ]] || bail 'Limit must be a non-negative integer.'
  # API returns "404 Not Found" when limit is > 100.
  [ "$LIMIT" -gt 100 ] && bail 'Maximum limit is 100.'
  curl \
    -fsS -XGET \
    --header "Accept: application/json" \
    --header "Authorization: Bearer $READECK_API_TOKEN" \
    "${READECK_BASE_URL%/}/api/bookmarks?has_errors=true&is_loaded=true&labels=${LABEL_CLEANUP}&limit=$LIMIT"
}
# Attach one label to an existing bookmark.
# Globals:   READECK_BASE_URL, READECK_API_TOKEN (read)
# Arguments: $1 - bookmark ID
#            $2 - label to add
# Outputs:   nothing on stdout; the response body is discarded
# Returns:   exit status of the jq | curl pipeline
function add_bookmark_label() {
  local bookmark_id="$1"
  local label="$2"
  local endpoint="${READECK_BASE_URL%/}/api/bookmarks/${bookmark_id}"

  # jq builds the JSON body so that the label is escaped correctly; curl reads
  # that body from stdin.
  jq -n --arg l "$label" '{add_labels:[$l]}' \
    | curl --fail --silent --show-error \
        --request PATCH \
        --header "Accept: application/json" \
        --header "Content-Type: application/json" \
        --header "Authorization: Bearer $READECK_API_TOKEN" \
        --data @- \
        --output /dev/null \
        "$endpoint"
}
# Detach one label from an existing bookmark.
# Globals:   READECK_BASE_URL, READECK_API_TOKEN (read)
# Arguments: $1 - bookmark ID
#            $2 - label to remove
# Outputs:   nothing on stdout; the response body is discarded
# Returns:   exit status of the jq | curl pipeline
function remove_bookmark_label() {
  local bookmark_id="$1"
  local label="$2"
  local endpoint="${READECK_BASE_URL%/}/api/bookmarks/${bookmark_id}"

  # jq builds the JSON body so that the label is escaped correctly; curl reads
  # that body from stdin.
  jq -n --arg l "$label" '{remove_labels:[$l]}' \
    | curl --fail --silent --show-error \
        --request PATCH \
        --header "Accept: application/json" \
        --header "Content-Type: application/json" \
        --header "Authorization: Bearer $READECK_API_TOKEN" \
        --data @- \
        --output /dev/null \
        "$endpoint"
}
# Delete a bookmark through the Readeck API.
# Globals:   READECK_BASE_URL, READECK_API_TOKEN (read)
# Arguments: $1 - ID of the bookmark to delete
# Outputs:   whatever body the API returns, on stdout
# Returns:   curl's exit status
function delete_bookmark() {
  local bookmark_id="$1"
  local endpoint="${READECK_BASE_URL%/}/api/bookmarks/${bookmark_id}"

  curl --fail --silent --show-error \
    --request DELETE \
    --header "Accept: application/json" \
    --header "Authorization: Bearer $READECK_API_TOKEN" \
    "$endpoint"
}
# Create a new bookmark labelled $LABEL_CLEANUP and print its ID.
# Globals:   READECK_BASE_URL, READECK_API_TOKEN, LABEL_CLEANUP (read)
# Arguments: $1 - URL to bookmark
#            $2 - optional creation timestamp, sent as the "created" form field
#            $3 - optional path to a `curl -i` style dump (response headers,
#                 a blank line, then the body) to upload as the page content
# Outputs:   the ID from the response's "Bookmark-Id" header, on stdout
# Returns:   0 when an ID was obtained; curl's status when the request failed;
#            1 when the response carried no Bookmark-Id header or the upload
#            file could not be prepared
function post_bookmark() {
  local URL="$1"
  local CREATED_AT="${2:-}"
  local HTML_FILE="${3:-}"
  local args=(--form url="$URL" --form labels="$LABEL_CLEANUP")
  local resource_file="" content_type response bookmark_id
  local status=0

  if [ -n "$CREATED_AT" ]; then
    args+=(--form created="$CREATED_AT")
  fi

  if [ -n "$HTML_FILE" ]; then
    resource_file="$(mktemp)" || return 1
    # First Content-Type header of the dump. Header lines end in CR, which is
    # stripped here. A dump without that header is not an error, hence `|| true`.
    # NOTE(review): a dump produced with `curl -i -L` contains one header block
    # per redirect hop; this picks the first block's Content-Type and the sed
    # below copies everything after the first blank line -- confirm that this
    # is what the Readeck "resource" field expects.
    content_type="$(grep -i '^content-type:' < "$HTML_FILE" | head -1 | cut -d ' ' -f 2- | tr -d '\r')" || true
    # Upload format written here: one JSON line describing the resource,
    # followed by the dump from its first blank (CR-only) line onwards.
    if ! jq -cn \
      --arg url "$URL" \
      --arg ct "$content_type" \
      '{url:$url,headers:{"content-type":$ct}}' > "$resource_file" \
      || ! LC_ALL=C sed -n '/^\r$/,$p' "$HTML_FILE" >> "$resource_file"; then
      rm -f -- "$resource_file"
      return 1
    fi
    # A leading "<" makes curl send the file's content as the field value.
    args+=(--form resource="<${resource_file}")
  fi

  response="$(curl \
    -fsS -i -XPOST "${args[@]}" \
    --header "Accept: application/json" \
    --header "Authorization: Bearer $READECK_API_TOKEN" \
    "${READECK_BASE_URL%/}/api/bookmarks")" || status=$?

  # curl has finished reading the upload file, so it can go regardless of the
  # outcome (it used to be left behind in the temp directory).
  [ -z "$resource_file" ] || rm -f -- "$resource_file"
  [ "$status" -eq 0 ] || return "$status"

  # Response header lines end in CRLF; drop the CR so that it does not become
  # part of the ID.
  bookmark_id="$(sed -rne 's/^bookmark-id: (.*)/\1/Ip' <<<"$response" | tr -d '\r')"
  # Callers delete the old bookmark when this function succeeds, so a response
  # without an ID must count as a failure.
  [ -n "$bookmark_id" ] || return 1
  printf '%s\n' "$bookmark_id"
}
# Download the Wayback Machine snapshot closest to a URL into a temp file.
# Arguments: $1 - the original URL
# Outputs:   on success, the path of a temp file holding the snapshot response
#            (headers and body, because curl runs with -i); nothing otherwise.
#            Error details go to stderr.
# Returns:   0 with a path printed when a snapshot was downloaded;
#            1 when archive.org reports no snapshot for the URL;
#            0 with NOTHING printed when every attempt failed. That status is
#            kept as it was because existing callers read a non-zero status as
#            "the URL is gone"; callers should check for empty output.
function wayback_snapshot() {
  local URL="$1"
  local snapshot_url file error_msg
  local attempts=3
  while [ "$attempts" -gt 0 ]; do
    attempts=$((attempts - 1))
    if ! snapshot_url="$(curl -fsS --get --data-urlencode url="$URL" https://archive.org/wayback/available | \
      jq -r '.archived_snapshots.closest.url')"; then
      # Back off before the next attempt; waiting after the last one is pointless.
      [ "$attempts" -eq 0 ] || sleep 5
      continue
    fi
    # jq prints "null" when the lookup succeeded but no snapshot exists.
    [[ -n $snapshot_url && $snapshot_url != "null" ]] || return 1
    file="$(mktemp)"
    if ! error_msg="$(curl -fsSL -i -o "$file" "$snapshot_url" 2>&1)"; then
      echo "error fetching Wayback Machine snapshot: $error_msg" >&2
      # Do not leave the temp file of a failed download behind.
      [ -z "$file" ] || rm -f -- "$file"
      [ "$attempts" -eq 0 ] || sleep 10
      continue
    fi
    printf '%s' "$file"
    return 0
  done
  echo "giving up on Wayback Machine snapshot for $URL" >&2
  return 0
}
# Maximum number of bookmarks each pass below requests. An empty value lets
# the get_* helpers fall back to their own default of 1.
limit="${1:-}"

# Directory holding the readwise-*.json export files consulted by the report
# below. READWISE_EXPORT_DIR overrides the previously hard-coded location.
readwise_dir="${READWISE_EXPORT_DIR:-}"
[ -n "$readwise_dir" ] || readwise_dir=~/Projects/readeck/tmp
readwise_exports=("$readwise_dir"/readwise-*.json)

# Report-only pass (nothing is modified): for every broken bookmark whose
# error log shows that the initial fetch failed with a 4xx status or a DNS
# resolution error, print the bookmark and its error log if one of the
# Readwise export files has non-empty html_content for the same URL.
get_broken_bookmarks "$limit" | jq -r '.[] | [.id, .url, .resources.log.src, .title] | @tsv' | \
while IFS=$'\t' read -r BROKEN_BOOKMARK_ID URL LOG_URL TITLE; do
  # A failed download still aborts the script (set -e), as before.
  raw_log="$(curl -fsS -H "Authorization: Bearer $READECK_API_TOKEN" "$LOG_URL")"
  # grep exits 1 when no line matches; without `|| true` a log that has no
  # [ERRO] lines would terminate the whole script under `set -e`.
  bookmark_log="$(grep '^\[ERRO\]' <<<"$raw_log" || true)"
  if grep 'cannot load resource .\+ step\.name="start"' <<<"$bookmark_log" | head -1 | grep -q '\(Invalid status code (4..)\|cannot resolve\)'; then
    # An unmatched glob stays a literal pattern; without any export file there
    # is nothing to look up, and jq would fail on the missing path.
    [ -e "${readwise_exports[0]}" ] || continue
    readwise_content="$(jq -r --arg url "$URL" '.results[] | select(.source_url == $url) | .html_content' "${readwise_exports[@]}")"
    if [[ $readwise_content != "" && $readwise_content != "null" ]]; then
      printf '%s (%s) %s\n' "$BROKEN_BOOKMARK_ID" "${TITLE:-no title}" "$URL"
      printf '%s\n\n' "$bookmark_log"
    fi
  fi
done
# Repair pass for broken bookmarks: re-save each one from its original URL if
# that still answers a HEAD request, otherwise from a Wayback Machine
# snapshot. A successfully re-saved bookmark replaces the broken one. If the
# Wayback Machine has no snapshot, the broken bookmark is kept and labelled
# $LABEL_GONE.
get_broken_bookmarks "$limit" | jq -r '.[] | [.id, .created, .url] | @tsv' | \
while IFS=$'\t' read -r BROKEN_BOOKMARK_ID CREATED_AT URL; do
  echo "Broken bookmark: $URL"
  snapshot_file=""
  if curl -m 10 -H "User-Agent: $USER_AGENT" --output /dev/null --silent --head --fail "$URL"; then
    echo -n "URL seems to work, re-saving bookmark: "
  elif snapshot_file="$(wayback_snapshot "$URL")"; then
    # wayback_snapshot can succeed without printing a path when all of its
    # retries failed. That is neither "snapshot found" nor "URL gone", so the
    # bookmark is left untouched for a later run.
    if [ -z "$snapshot_file" ]; then
      echo "Wayback Machine lookup failed, skipping bookmark $BROKEN_BOOKMARK_ID"
      continue
    fi
    echo -n "Snapshot found in Wayback Machine: "
  else
    # echo "URL gone, deleting bookmark $BROKEN_BOOKMARK_ID: $URL"
    # delete_bookmark "$BROKEN_BOOKMARK_ID"
    echo "URL gone, updating bookmark $BROKEN_BOOKMARK_ID"
    add_bookmark_label "$BROKEN_BOOKMARK_ID" "$LABEL_GONE"
    continue
  fi
  if NEW_BOOKMARK_ID=$(post_bookmark "$URL" "$CREATED_AT" "$snapshot_file"); then
    # The ID is taken from a response header line and may end in CR.
    NEW_BOOKMARK_ID="${NEW_BOOKMARK_ID%$'\r'}"
  else
    NEW_BOOKMARK_ID=""
  fi
  # Only delete the broken bookmark once a replacement ID is in hand.
  if [ -n "$NEW_BOOKMARK_ID" ]; then
    delete_bookmark "$BROKEN_BOOKMARK_ID"
    echo "${READECK_BASE_URL%/}/bookmarks/$NEW_BOOKMARK_ID"
    # The snapshot has been uploaded; its temp file is no longer needed.
    [ -z "$snapshot_file" ] || rm -f -- "$snapshot_file"
  else
    echo "Failed to re-save bookmark $BROKEN_BOOKMARK_ID ($URL)"
    # Keep the snapshot around for inspection and say where it is.
    [ -z "$snapshot_file" ] || echo "snapshot_file: $snapshot_file"
  fi
done
# Repair pass for empty bookmarks (reading_time missing or not above 0):
# re-save each one from its original URL if that still answers a HEAD
# request, otherwise from a Wayback Machine snapshot. A successfully re-saved
# bookmark replaces the empty one. If the Wayback Machine has no snapshot,
# the empty bookmark is deleted.
get_empty_bookmarks "$limit" | jq -r '.[] | [.id, .created, .reading_time // 0, .url] | @tsv' | \
while IFS=$'\t' read -r BROKEN_BOOKMARK_ID CREATED_AT READING_TIME URL; do
  [ "$READING_TIME" -le 0 ] || continue
  echo "Empty bookmark: $URL"
  snapshot_file=""
  if curl -m 10 -H "User-Agent: $USER_AGENT" --output /dev/null --silent --head --fail "$URL"; then
    echo -n "URL seems to work, re-saving bookmark: "
  elif snapshot_file="$(wayback_snapshot "$URL")"; then
    # wayback_snapshot can succeed without printing a path when all of its
    # retries failed. That is neither "snapshot found" nor "URL gone", so the
    # bookmark is left untouched for a later run.
    if [ -z "$snapshot_file" ]; then
      echo "Wayback Machine lookup failed, skipping bookmark $BROKEN_BOOKMARK_ID"
      continue
    fi
    echo -n "Snapshot found in Wayback Machine: "
  else
    echo "URL gone, deleting bookmark $BROKEN_BOOKMARK_ID: $URL"
    delete_bookmark "$BROKEN_BOOKMARK_ID"
    continue
  fi
  if NEW_BOOKMARK_ID=$(post_bookmark "$URL" "$CREATED_AT" "$snapshot_file"); then
    # The ID is taken from a response header line and may end in CR.
    NEW_BOOKMARK_ID="${NEW_BOOKMARK_ID%$'\r'}"
  else
    NEW_BOOKMARK_ID=""
  fi
  # Only delete the empty bookmark once a replacement ID is in hand.
  if [ -n "$NEW_BOOKMARK_ID" ]; then
    delete_bookmark "$BROKEN_BOOKMARK_ID"
    echo "${READECK_BASE_URL%/}/bookmarks/$NEW_BOOKMARK_ID"
    # The snapshot has been uploaded; its temp file is no longer needed.
    [ -z "$snapshot_file" ] || rm -f -- "$snapshot_file"
  else
    echo "Failed to re-save bookmark $BROKEN_BOOKMARK_ID ($URL)"
    # Keep the snapshot around for inspection and say where it is.
    [ -z "$snapshot_file" ] || echo "snapshot_file: $snapshot_file"
  fi
done
# Follow-up pass: a re-saved bookmark that now reports a positive reading
# time is considered fixed, so its $LABEL_CLEANUP label is removed.
get_fixed_bookmarks "$limit" \
  | jq -r '.[] | [.id, .reading_time // 0, .url] | @tsv' \
  | while read -r fixed_id reading_time fixed_url; do
      if [ "$reading_time" -gt 0 ]; then
        printf '%s\n' "Fixed bookmark $fixed_id ($fixed_url)"
        remove_bookmark_label "$fixed_id" "$LABEL_CLEANUP"
      fi
    done
# Optional final pass, enabled by passing DELETE_RESAVED as the second
# argument: delete re-saved bookmarks that still report errors.
if [[ "${2:-}" == 'DELETE_RESAVED' ]]; then
  get_still_broken_bookmarks "$limit" \
    | jq -r '.[] | [.id, .url] | @tsv' \
    | while read -r stale_id stale_url; do
        printf '%s\n' "Previously re-saved bookmark is still broken, deleting it: $stale_url"
        delete_bookmark "$stale_id"
      done
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment