Skip to content

Instantly share code, notes, and snippets.

@mislav
Created February 3, 2026 21:49
Show Gist options
  • Select an option

  • Save mislav/a99e2523237594c52ddafc16f3454f31 to your computer and use it in GitHub Desktop.

Select an option

Save mislav/a99e2523237594c52ddafc16f3454f31 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
#
# Usage:
# bash readeck-cleanup.sh <limit> [DELETE_RESAVED]
#
# Arguments:
# <limit>          Number of bookmarks requested from the Readeck API per query. Defaults to 1
#                  when omitted; values above 100 are refused.
# DELETE_RESAVED   Optional literal second argument. When given, bookmarks that carry the
#                  cleanup label (i.e. were re-saved by this script) and still have errors are
#                  deleted at the end of the run.
#
# Required environment variables:
# READECK_API_TOKEN   Sent as Bearer token with every Readeck API request.
# READECK_BASE_URL    Base URL of the Readeck instance (a trailing slash is tolerated).
#
# External tools used: curl, jq, grep, sed.
#
# This script is based on https://codeberg.org/gollyhatch/readeck-cleanup
#
# The script tries to refetch "broken" bookmarks (i.e. bookmarks that "have errors") or bookmarks
# whose content is empty from either the original URL or, failing that, from the Wayback Machine. If
# fresh content is refetched, it replaces the broken bookmark but inherits its creation timestamp
# and labels.
#
# Abort on the first unhandled command failure (-e), on use of an unset variable (-u), and treat a
# pipeline as failed when any of its stages fails (pipefail).
set -euo pipefail
# Print an error message to stderr and terminate the current shell with exit status 1.
#
# Arguments: the message; several arguments are joined with single spaces.
# Outputs:   the message followed by a newline, on stderr only.
function bail() {
  # Join with a space regardless of the caller's IFS, exactly like echo would.
  local IFS=' '
  # printf instead of echo: echo would treat a leading "-n"/"-e" word as an option and drop it.
  printf '%s\n' "$*" >&2
  exit 1
}
# Fail early with a readable message when the required configuration is missing. The ${VAR:-}
# form matters: the script runs under `set -u`, so expanding an unset variable directly would
# abort with "unbound variable" before bail gets a chance to print its message.
[ -n "${READECK_API_TOKEN:-}" ] || bail 'READECK_API_TOKEN not set.'
[ -n "${READECK_BASE_URL:-}" ] || bail 'READECK_BASE_URL not set.'
# User agent string used for checking whether the resource at the original URL is still there.
# We want to avoid identifying as "curl" since websites are prone to outright blocking that.
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"
# Label attached to every bookmark this script re-saves; also used in the API queries to tell
# re-saved bookmarks apart from untouched ones.
LABEL_CLEANUP="__cleanup"
# Label attached to broken bookmarks for which neither the original URL nor a Wayback Machine
# snapshot could be fetched.
LABEL_GONE="gone"
# Query the bookmarks endpoint for bookmarks that have errors.
# The "-label" entries in the labels filter presumably exclude bookmarks carrying the cleanup or
# "gone" label — confirm against the Readeck API documentation.
#
# Arguments: $1 - number of bookmarks to request (defaults to 1, at most 100)
# Outputs:   the API response body on stdout
function get_broken_bookmarks() {
  local LIMIT="${1:-1}"
  # API returns "404 Not Found" when limit is > 100.
  if [ "$LIMIT" -gt 100 ]; then
    bail 'Maximum limit is 100.'
  fi
  local -a request=(
    --fail --silent --show-error
    --request GET
    --header "Accept: application/json"
    --header "Authorization: Bearer $READECK_API_TOKEN"
  )
  curl "${request[@]}" \
    "${READECK_BASE_URL%/}/api/bookmarks?has_errors=true&labels=-${LABEL_CLEANUP}+-${LABEL_GONE}&limit=$LIMIT"
}
# Query the bookmarks endpoint for loaded, error-free bookmarks that carry the cleanup label,
# i.e. bookmarks this script re-saved earlier.
#
# Arguments: $1 - number of bookmarks to request (defaults to 1, at most 100)
# Outputs:   the API response body on stdout
function get_fixed_bookmarks() {
  local LIMIT="${1:-1}"
  # API returns "404 Not Found" when limit is > 100. Refuse such a limit with a clear message,
  # the same way get_broken_bookmarks and get_still_broken_bookmarks do.
  if [ "$LIMIT" -gt 100 ]; then
    bail 'Maximum limit is 100.'
  fi
  curl \
    -fsS -XGET \
    --header "Accept: application/json" \
    --header "Authorization: Bearer $READECK_API_TOKEN" \
    "${READECK_BASE_URL%/}/api/bookmarks?has_errors=false&is_loaded=true&labels=${LABEL_CLEANUP}&limit=$LIMIT"
}
# Query the bookmarks endpoint for loaded, error-free article bookmarks, sorted by duration.
# Callers filter the result by reading time to find the empty ones. The "-label" filter presumably
# excludes bookmarks that already carry the cleanup label — confirm against the Readeck API.
#
# Arguments: $1 - number of bookmarks to request (defaults to 1, at most 100)
# Outputs:   the API response body on stdout
function get_empty_bookmarks() {
  local LIMIT="${1:-1}"
  # API returns "404 Not Found" when limit is > 100. Refuse such a limit with a clear message,
  # the same way get_broken_bookmarks and get_still_broken_bookmarks do.
  if [ "$LIMIT" -gt 100 ]; then
    bail 'Maximum limit is 100.'
  fi
  curl \
    -fsS -XGET \
    --header "Accept: application/json" \
    --header "Authorization: Bearer $READECK_API_TOKEN" \
    "${READECK_BASE_URL%/}/api/bookmarks?type=article&has_errors=false&is_loaded=true&labels=-${LABEL_CLEANUP}&sort=duration&limit=$LIMIT"
}
# Query the bookmarks endpoint for loaded bookmarks that carry the cleanup label and still have
# errors, i.e. bookmarks whose re-save by this script did not help.
#
# Arguments: $1 - number of bookmarks to request (defaults to 1, at most 100)
# Outputs:   the API response body on stdout
function get_still_broken_bookmarks() {
  local LIMIT="${1:-1}"
  # API returns "404 Not Found" when limit is > 100.
  if [ "$LIMIT" -gt 100 ]; then
    bail 'Maximum limit is 100.'
  fi
  local -a request=(
    --fail --silent --show-error
    --request GET
    --header "Accept: application/json"
    --header "Authorization: Bearer $READECK_API_TOKEN"
  )
  curl "${request[@]}" \
    "${READECK_BASE_URL%/}/api/bookmarks?has_errors=true&is_loaded=true&labels=${LABEL_CLEANUP}&limit=$LIMIT"
}
# Attach a label to a bookmark by PATCHing it with an {"add_labels": [...]} JSON document.
#
# Arguments: $1 - bookmark id
#            $2 - label to add
# Outputs:   nothing on stdout (the response body is discarded)
function add_bookmark_label() {
  local ID="$1"
  local -a request=(
    --fail --silent --show-error
    --request PATCH
    --header "Accept: application/json"
    --header "Content-Type: application/json"
    --header "Authorization: Bearer $READECK_API_TOKEN"
    --data @-
    --output /dev/null
  )
  # jq builds the payload so that the label is JSON-escaped; curl reads it from stdin.
  jq -n --arg l "$2" '{add_labels:[$l]}' |
    curl "${request[@]}" "${READECK_BASE_URL%/}/api/bookmarks/$ID"
}
# Detach a label from a bookmark by PATCHing it with a {"remove_labels": [...]} JSON document.
#
# Arguments: $1 - bookmark id
#            $2 - label to remove
# Outputs:   nothing on stdout (the response body is discarded)
function remove_bookmark_label() {
  local ID="$1"
  local -a request=(
    --fail --silent --show-error
    --request PATCH
    --header "Accept: application/json"
    --header "Content-Type: application/json"
    --header "Authorization: Bearer $READECK_API_TOKEN"
    --data @-
    --output /dev/null
  )
  # jq builds the payload so that the label is JSON-escaped; curl reads it from stdin.
  jq -n --arg l "$2" '{remove_labels:[$l]}' |
    curl "${request[@]}" "${READECK_BASE_URL%/}/api/bookmarks/$ID"
}
# Delete a bookmark through the API.
#
# Arguments: $1 - bookmark id
# Outputs:   whatever body the API returns, on stdout
function delete_bookmark() {
  local ID="$1"
  local -a request=(
    --fail --silent --show-error
    --request DELETE
    --header "Accept: application/json"
    --header "Authorization: Bearer $READECK_API_TOKEN"
  )
  curl "${request[@]}" "${READECK_BASE_URL%/}/api/bookmarks/$ID"
}
# Create a bookmark through the API, labelled with the cleanup label.
#
# Arguments: $1 - URL to bookmark
#            $2 - optional creation timestamp to assign to the new bookmark
#            $3 - optional path to a file holding a raw HTTP response (headers, blank line,
#                 body) that is uploaded as the page content instead of letting the server
#                 fetch the URL
# Outputs:   the id of the new bookmark (value of the Bookmark-Id response header) on stdout
# Returns:   0 on success; non-zero when the request fails or the response carries no id
function post_bookmark() {
  local URL="$1"
  local CREATED_AT="${2:-}"
  local HTML_FILE="${3:-}"
  local args=(--form url="$URL" --form labels="$LABEL_CLEANUP")
  local file="" content_type="" bookmark_id="" status=0

  if [ -n "$CREATED_AT" ]; then
    args+=(--form created="$CREATED_AT")
  fi

  if [ -n "$HTML_FILE" ]; then
    file="$(mktemp)" || return 1
    # Header lines end in CRLF, so drop the carriage return that would otherwise end up inside
    # the JSON value. -a keeps grep from answering "Binary file matches" on non-text bodies.
    # NOTE(review): when HTML_FILE holds several header blocks (e.g. written by `curl -L -i`
    # across redirects), the first block's content-type is used and the later header blocks are
    # passed on as part of the body — confirm the server copes with that.
    content_type="$(grep -a -i '^content-type:' < "$HTML_FILE" | head -1 | cut -d ' ' -f 2- | tr -d '\r')" || true
    # The resource part is one JSON line describing the response, followed by everything from
    # the first blank (CRLF-only) line of the raw response onwards.
    if ! jq -cn \
      --arg url "$URL" \
      --arg ct "$content_type" \
      '{url:$url,headers:{"content-type":$ct}}' > "$file" \
      || ! LC_ALL=C sed -n '/^\r$/,$p' "$HTML_FILE" >> "$file"; then
      rm -f -- "$file"
      return 1
    fi
    args+=(--form resource="<${file}")
  fi

  # -i prints the response headers, which is where the new id is found. Those lines end in
  # CRLF, hence the trailing tr.
  bookmark_id="$(
    curl \
      -fsS -i -XPOST "${args[@]}" \
      --header "Accept: application/json" \
      --header "Authorization: Bearer $READECK_API_TOKEN" \
      "${READECK_BASE_URL%/}/api/bookmarks" \
      | sed -rne 's/^bookmark-id: (.*)/\1/Ip' \
      | tr -d '\r'
  )" || status=$?

  # curl has read the resource file by now; do not leave it behind.
  [ -z "$file" ] || rm -f -- "$file"

  if [ "$status" -ne 0 ]; then
    return "$status"
  fi
  # Callers delete the original bookmark on success, so never report success without an id.
  if [ -z "$bookmark_id" ]; then
    echo "no Bookmark-Id header in API response for $URL" >&2
    return 1
  fi
  printf '%s\n' "$bookmark_id"
}
# Download the closest Wayback Machine snapshot of a URL into a temporary file.
#
# Arguments: $1 - the original URL
# Outputs:   on success, the path of a temporary file holding the raw HTTP response (headers
#            included, because of curl -i); the caller owns that file. Errors go to stderr.
# Returns:   0 - snapshot downloaded, path printed
#            1 - the Wayback Machine reports no snapshot for the URL
#            2 - the lookup or the download kept failing, so nothing is known about the URL
function wayback_snapshot() {
  local URL="$1"
  local snapshot_url file error_msg
  local attempts=3
  while [ "$attempts" -gt 0 ]; do
    attempts=$((attempts - 1))
    if ! snapshot_url="$(curl -fsS --get --data-urlencode url="$URL" https://archive.org/wayback/available | \
      jq -r '.archived_snapshots.closest.url')"; then
      # Back off before retrying; pointless after the last attempt.
      if [ "$attempts" -gt 0 ]; then sleep 5; fi
      continue
    fi
    [[ -n $snapshot_url && $snapshot_url != "null" ]] || return 1
    file="$(mktemp)" || return 2
    if ! error_msg="$(curl -fsSL -i -o "$file" "$snapshot_url" 2>&1)"; then
      echo "error fetching Wayback Machine snapshot: $error_msg" >&2
      # Do not leave the partial download behind.
      rm -f -- "$file"
      if [ "$attempts" -gt 0 ]; then sleep 10; fi
      continue
    fi
    printf '%s' "$file"
    return 0
  done
  # All attempts used up. Without this explicit status the loop would end on `continue` and the
  # function would report success while printing nothing.
  return 2
}
limit="${1:-}"

# Diagnostic pass: list the broken bookmarks whose server-side log shows that the initial fetch
# failed with a 4xx status or a name-resolution error AND for which a local Readwise export file
# contains HTML content for the same URL. Nothing is modified here.
#
# The export files are looked up as readwise-*.json in $READWISE_EXPORT_DIR, which defaults to
# ~/Projects/readeck/tmp.
readwise_dir="${READWISE_EXPORT_DIR:-$HOME/Projects/readeck/tmp}"
get_broken_bookmarks "$limit" | jq -r '.[] | [.id, .url, .resources.log.src, .title] | @tsv' | \
while IFS=$'\t' read -r BROKEN_BOOKMARK_ID URL LOG_URL TITLE; do
  # Under `set -e` with pipefail a failed log download, or a log without any [ERRO] line (grep
  # exits 1), would abort the whole script; skip that bookmark instead.
  bookmark_log="$(curl -fsS -H "Authorization: Bearer $READECK_API_TOKEN" "$LOG_URL" | grep '^\[ERRO\]')" || continue
  if grep 'cannot load resource .\+ step\.name="start"' <<<"$bookmark_log" | head -1 | grep -q '\(Invalid status code (4..)\|cannot resolve\)'; then
    # An unmatched glob stays literal, so check that at least one export file really exists.
    readwise_files=("$readwise_dir"/readwise-*.json)
    [ -e "${readwise_files[0]}" ] || continue
    readwise_content="$(jq -r --arg url "$URL" '.results[] | select(.source_url == $url) | .html_content' "${readwise_files[@]}")" || continue
    if [[ $readwise_content != "" && $readwise_content != "null" ]]; then
      printf '%s (%s) %s\n' "$BROKEN_BOOKMARK_ID" "${TITLE:-no title}" "$URL"
      printf '%s\n\n' "$bookmark_log"
    fi
  fi
done
# Re-save pass for broken bookmarks: when the original URL answers a HEAD request, or a Wayback
# Machine snapshot can be downloaded, post a fresh bookmark (same URL and creation timestamp) and
# delete the broken one. When the Wayback Machine has no snapshot, label the bookmark "gone".
get_broken_bookmarks "$limit" | jq -r '.[] | [.id, .created, .url] | @tsv' | \
while read -r BROKEN_BOOKMARK_ID CREATED_AT URL; do
  echo "Broken bookmark: $URL"
  snapshot_file=""
  if curl -m 10 -H "User-Agent: $USER_AGENT" --output /dev/null --silent --head --fail "$URL"; then
    printf '%s' "URL seems to work, re-saving bookmark: "
  else
    wayback_status=0
    snapshot_file="$(wayback_snapshot "$URL")" || wayback_status=$?
    if [ "$wayback_status" -eq 0 ] && [ -n "$snapshot_file" ]; then
      printf '%s' "Snapshot found in Wayback Machine: "
    elif [ "$wayback_status" -eq 1 ]; then
      # echo "URL gone, deleting bookmark $BROKEN_BOOKMARK_ID: $URL"
      # delete_bookmark "$BROKEN_BOOKMARK_ID"
      echo "URL gone, updating bookmark $BROKEN_BOOKMARK_ID"
      add_bookmark_label "$BROKEN_BOOKMARK_ID" "$LABEL_GONE"
      continue
    else
      # Neither a snapshot file nor a definite "no snapshot" answer: the lookup itself failed,
      # so nothing is known about this URL. Leave the bookmark alone for a later run.
      echo "Wayback Machine lookup failed, skipping bookmark $BROKEN_BOOKMARK_ID ($URL)" >&2
      continue
    fi
  fi
  # Only delete the original once a usable id for the replacement is in hand. The id comes from
  # an HTTP header line, so strip a trailing carriage return should one be present.
  if NEW_BOOKMARK_ID=$(post_bookmark "$URL" "$CREATED_AT" "$snapshot_file") \
    && NEW_BOOKMARK_ID="${NEW_BOOKMARK_ID%$'\r'}" \
    && [ -n "$NEW_BOOKMARK_ID" ]; then
    delete_bookmark "$BROKEN_BOOKMARK_ID"
    echo "${READECK_BASE_URL%/}/bookmarks/$NEW_BOOKMARK_ID"
    # The snapshot has been uploaded; do not leave the temporary file behind.
    [ -z "$snapshot_file" ] || rm -f -- "$snapshot_file"
  else
    echo "Failed to re-save bookmark $BROKEN_BOOKMARK_ID ($URL)"
    # The snapshot file is kept on purpose here so that the failure can be inspected.
    [ -z "$snapshot_file" ] || echo "snapshot_file: $snapshot_file"
  fi
done
# Re-save pass for empty bookmarks (reading time of 0 or less): when the original URL answers a
# HEAD request, or a Wayback Machine snapshot can be downloaded, post a fresh bookmark (same URL
# and creation timestamp) and delete the empty one. When the Wayback Machine has no snapshot, the
# empty bookmark is deleted.
get_empty_bookmarks "$limit" | jq -r '.[] | [.id, .created, .reading_time // 0, .url] | @tsv' | \
while read -r BROKEN_BOOKMARK_ID CREATED_AT READING_TIME URL; do
  [ "$READING_TIME" -le 0 ] || continue
  echo "Empty bookmark: $URL"
  snapshot_file=""
  if curl -m 10 -H "User-Agent: $USER_AGENT" --output /dev/null --silent --head --fail "$URL"; then
    printf '%s' "URL seems to work, re-saving bookmark: "
  else
    wayback_status=0
    snapshot_file="$(wayback_snapshot "$URL")" || wayback_status=$?
    if [ "$wayback_status" -eq 0 ] && [ -n "$snapshot_file" ]; then
      printf '%s' "Snapshot found in Wayback Machine: "
    elif [ "$wayback_status" -eq 1 ]; then
      echo "URL gone, deleting bookmark $BROKEN_BOOKMARK_ID: $URL"
      delete_bookmark "$BROKEN_BOOKMARK_ID"
      continue
    else
      # Neither a snapshot file nor a definite "no snapshot" answer: the lookup itself failed.
      # Deleting is irreversible, so leave the bookmark alone for a later run.
      echo "Wayback Machine lookup failed, skipping bookmark $BROKEN_BOOKMARK_ID ($URL)" >&2
      continue
    fi
  fi
  # Only delete the original once a usable id for the replacement is in hand. The id comes from
  # an HTTP header line, so strip a trailing carriage return should one be present.
  if NEW_BOOKMARK_ID=$(post_bookmark "$URL" "$CREATED_AT" "$snapshot_file") \
    && NEW_BOOKMARK_ID="${NEW_BOOKMARK_ID%$'\r'}" \
    && [ -n "$NEW_BOOKMARK_ID" ]; then
    delete_bookmark "$BROKEN_BOOKMARK_ID"
    echo "${READECK_BASE_URL%/}/bookmarks/$NEW_BOOKMARK_ID"
    # The snapshot has been uploaded; do not leave the temporary file behind.
    [ -z "$snapshot_file" ] || rm -f -- "$snapshot_file"
  else
    echo "Failed to re-save bookmark $BROKEN_BOOKMARK_ID ($URL)"
    # The snapshot file is kept on purpose here so that the failure can be inspected.
    [ -z "$snapshot_file" ] || echo "snapshot_file: $snapshot_file"
  fi
done
# Bookkeeping pass: a re-saved bookmark that is error-free and has a positive reading time counts
# as fixed, so its cleanup label is taken off again.
get_fixed_bookmarks "$limit" \
  | jq -r '.[] | [.id, .reading_time // 0, .url] | @tsv' \
  | while read -r FIXED_BOOKMARK_ID READING_TIME URL; do
      # Bookmarks without content keep their label.
      if ! [ "$READING_TIME" -gt 0 ]; then
        continue
      fi
      echo "Fixed bookmark $FIXED_BOOKMARK_ID ($URL)"
      remove_bookmark_label "$FIXED_BOOKMARK_ID" "$LABEL_CLEANUP"
    done
# Optional final pass, enabled by passing the literal word DELETE_RESAVED as second argument:
# delete the bookmarks that carry the cleanup label and still have errors.
case "${2:-}" in
  DELETE_RESAVED)
    get_still_broken_bookmarks "$limit" \
      | jq -r '.[] | [.id, .url] | @tsv' \
      | while read -r BROKEN_BOOKMARK_ID URL; do
          echo "Previously re-saved bookmark is still broken, deleting it: $URL"
          delete_bookmark "$BROKEN_BOOKMARK_ID"
        done
    ;;
esac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment