Created
September 16, 2025 19:51
-
-
Save corylanou/c832fe9168a9bdaaaa86ccaab74eafdc to your computer and use it in GitHub Desktop.
Litestream v0.5.0 Critical Bug: Restore fails after interruption + checkpoint (Data Loss)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # Litestream v0.5.0 Critical Bug Reproduction Script | |
| # | |
| # This script demonstrates a CRITICAL data loss bug where restore fails | |
| # after Litestream is interrupted and a checkpoint occurs during downtime. | |
| # | |
| # Requirements: | |
| # - litestream binary (built from current main branch) | |
| # - litestream-test binary (from PR #748 or build with: go build -o bin/litestream-test ./cmd/litestream-test) | |
| # - SQLite3 command line tool | |
| # | |
| # Expected behavior: Database should restore successfully | |
| # Actual behavior: Restore fails with "nonsequential page numbers" error | |
| set -e | |
| echo "============================================" | |
| echo "Litestream v0.5.0 Critical Bug Reproduction" | |
| echo "============================================" | |
| echo "" | |
| echo "This demonstrates a data loss scenario where restore fails after:" | |
| echo "1. Litestream is killed (simulating crash)" | |
| echo "2. Writes continue and a checkpoint occurs" | |
| echo "3. Litestream is restarted" | |
| echo "" | |
| # Configuration | |
| DB="/tmp/critical-bug-test.db" | |
| REPLICA="/tmp/critical-bug-replica" | |
| # Clean up any previous test | |
| echo "[SETUP] Cleaning up previous test files..." | |
| rm -f "$DB"* | |
| rm -rf "$REPLICA" | |
| # Check for required binaries | |
| if [ ! -f "./bin/litestream" ]; then | |
| echo "ERROR: ./bin/litestream not found. Please build with: go build -o bin/litestream ./cmd/litestream" | |
| exit 1 | |
| fi | |
| if [ ! -f "./bin/litestream-test" ]; then | |
| echo "ERROR: ./bin/litestream-test not found. Please build with: go build -o bin/litestream-test ./cmd/litestream-test" | |
| exit 1 | |
| fi | |
| # Step 1: Create and populate initial database | |
| echo "" | |
| echo "[STEP 1] Creating test database (50MB)..." | |
| ./bin/litestream-test populate -db "$DB" -target-size 50MB -table-count 2 | |
| INITIAL_SIZE=$(ls -lh "$DB" 2>/dev/null | awk '{print $5}') | |
| echo "✓ Database created: $INITIAL_SIZE" | |
| # Step 2: Start Litestream replication | |
| echo "" | |
| echo "[STEP 2] Starting Litestream replication..." | |
| ./bin/litestream replicate "$DB" "file://$REPLICA" > /tmp/litestream.log 2>&1 & | |
| LITESTREAM_PID=$! | |
| sleep 3 | |
| if ! kill -0 $LITESTREAM_PID 2>/dev/null; then | |
| echo "ERROR: Litestream failed to start. Check /tmp/litestream.log" | |
| cat /tmp/litestream.log | |
| exit 1 | |
| fi | |
| echo "✓ Litestream running (PID: $LITESTREAM_PID)" | |
| # Step 3: Start continuous writes | |
| echo "" | |
| echo "[STEP 3] Starting continuous writes..." | |
| ./bin/litestream-test load -db "$DB" -write-rate 100 -duration 2m -pattern constant > /tmp/writes.log 2>&1 & | |
| WRITE_PID=$! | |
| echo "✓ Write load started (PID: $WRITE_PID)" | |
| # Step 4: Let it run normally for 20 seconds | |
| echo "" | |
| echo "[STEP 4] Running normally for 20 seconds..." | |
| sleep 20 | |
| # Get row count before interruption | |
| ROWS_BEFORE=$(sqlite3 "$DB" "SELECT COUNT(*) FROM load_test;" 2>/dev/null || echo "0") | |
| echo "✓ Rows written before interruption: $ROWS_BEFORE" | |
| # Step 5: Kill Litestream (simulate crash) | |
| echo "" | |
| echo "[STEP 5] Killing Litestream (simulating crash)..." | |
| kill -9 $LITESTREAM_PID 2>/dev/null || true | |
| echo "✓ Litestream killed" | |
| # Step 6: Let writes continue for 15 seconds without Litestream | |
| echo "" | |
| echo "[STEP 6] Continuing writes for 15 seconds (Litestream is down)..." | |
| sleep 15 | |
| # Step 7: Execute non-PASSIVE checkpoint | |
| echo "" | |
| echo "[STEP 7] Executing FULL checkpoint while Litestream is down..." | |
| CHECKPOINT_RESULT=$(sqlite3 "$DB" "PRAGMA wal_checkpoint(FULL);" 2>&1) | |
| echo "✓ Checkpoint result: $CHECKPOINT_RESULT" | |
| ROWS_AFTER_CHECKPOINT=$(sqlite3 "$DB" "SELECT COUNT(*) FROM load_test;") | |
| echo "✓ Rows after checkpoint: $ROWS_AFTER_CHECKPOINT" | |
| # Step 8: Resume Litestream | |
| echo "" | |
| echo "[STEP 8] Resuming Litestream..." | |
| ./bin/litestream replicate "$DB" "file://$REPLICA" >> /tmp/litestream.log 2>&1 & | |
| NEW_LITESTREAM_PID=$! | |
| sleep 3 | |
| if ! kill -0 $NEW_LITESTREAM_PID 2>/dev/null; then | |
| echo "WARNING: Litestream failed to restart" | |
| fi | |
| echo "✓ Litestream restarted (PID: $NEW_LITESTREAM_PID)" | |
| # Step 9: Let Litestream catch up | |
| echo "" | |
| echo "[STEP 9] Letting Litestream catch up for 20 seconds..." | |
| sleep 20 | |
| # Stop writes | |
| kill $WRITE_PID 2>/dev/null || true | |
| echo "✓ Writes stopped" | |
| # Wait for final sync | |
| sleep 5 | |
| # Get final row count | |
| FINAL_COUNT=$(sqlite3 "$DB" "SELECT COUNT(*) FROM load_test;") | |
| echo "✓ Final row count in source database: $FINAL_COUNT" | |
| # Kill Litestream | |
| kill $NEW_LITESTREAM_PID 2>/dev/null || true | |
| # Step 10: Attempt to restore (THIS IS WHERE THE BUG OCCURS) | |
| echo "" | |
| echo "[STEP 10] Attempting to restore database..." | |
| echo "==========================================" | |
| echo "" | |
| rm -f /tmp/restored.db | |
| if ./bin/litestream restore -o /tmp/restored.db "file://$REPLICA" 2>&1 | tee /tmp/restore-output.log; then | |
| echo "" | |
| echo "✓ SUCCESS: Restore completed successfully" | |
| # Verify restored database | |
| RESTORED_COUNT=$(sqlite3 /tmp/restored.db "SELECT COUNT(*) FROM load_test;" 2>/dev/null || echo "0") | |
| INTEGRITY=$(sqlite3 /tmp/restored.db "PRAGMA integrity_check;" 2>/dev/null || echo "FAILED") | |
| echo " - Restored row count: $RESTORED_COUNT" | |
| echo " - Integrity check: $INTEGRITY" | |
| if [ "$RESTORED_COUNT" -eq "$FINAL_COUNT" ]; then | |
| echo " - Data integrity: ✓ VERIFIED (no data loss)" | |
| else | |
| LOSS=$((FINAL_COUNT - RESTORED_COUNT)) | |
| echo " - Data integrity: ✗ FAILED (lost $LOSS rows)" | |
| fi | |
| else | |
| echo "" | |
| echo "✗ CRITICAL BUG REPRODUCED: Restore failed!" | |
| echo "" | |
| echo "Error output:" | |
| echo "-------------" | |
| cat /tmp/restore-output.log | |
| echo "" | |
| echo "This is the critical bug. The database cannot be restored after" | |
| echo "Litestream was interrupted and a checkpoint occurred during downtime." | |
| echo "" | |
| echo "Original database stats:" | |
| echo " - Rows before interruption: $ROWS_BEFORE" | |
| echo " - Rows after checkpoint: $ROWS_AFTER_CHECKPOINT" | |
| echo " - Final rows: $FINAL_COUNT" | |
| echo " - DATA IS UNRECOVERABLE" | |
| fi | |
| echo "" | |
| echo "==========================================" | |
| echo "Test artifacts saved in:" | |
| echo " - Source database: $DB" | |
| echo " - Replica files: $REPLICA/" | |
| echo " - Litestream log: /tmp/litestream.log" | |
| echo " - Restore output: /tmp/restore-output.log" | |
| echo "" | |
| # Clean up processes | |
| pkill -f litestream-test 2>/dev/null || true | |
| pkill -f "litestream replicate" 2>/dev/null || true | |
| echo "Test complete." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment