Skip to content

Instantly share code, notes, and snippets.

@Oddly
Created February 3, 2026 18:03
Show Gist options
  • Select an option

  • Save Oddly/621873ffa5f98c0b044f8c289212dea2 to your computer and use it in GitHub Desktop.

Select an option

Save Oddly/621873ffa5f98c0b044f8c289212dea2 to your computer and use it in GitHub Desktop.
#!/bin/bash
#
# Elasticsearch & Kibana RPM Upgrade Script
# ==========================================
# This script upgrades Elasticsearch and/or Kibana on the local node
# using RPMs downloaded from artifacts.elastic.co.
#
# It follows the official Elastic rolling upgrade procedure:
# 1. Pre-flight checks (current versions, cluster health, disk space)
# 2. Disable shard allocation
# 3. Flush all indices (best effort)
# 4. Stop the service
# 5. Download & install the RPM
# 6. Reload systemd and start the service
# 7. Wait for the node to rejoin the cluster
# 8. Re-enable shard allocation
# 9. Wait for cluster to go green
#
# Usage:
# ./upgrade-elastic.sh <target-version> [--es-only|--kibana-only] [--arch aarch64]
#
# Examples:
# ./upgrade-elastic.sh 8.17.0 # Upgrade both ES and Kibana
# ./upgrade-elastic.sh 8.17.0 --es-only # Upgrade Elasticsearch only
# ./upgrade-elastic.sh 8.17.0 --kibana-only # Upgrade Kibana only
# ./upgrade-elastic.sh 8.17.0 --arch aarch64 # Use aarch64 RPMs
# ./upgrade-elastic.sh 8.17.0 --yes # Auto-accept non-critical prompts
# ./upgrade-elastic.sh 8.17.0 --force # Skip ES-dependent checks (ES not running)
#
# Notes:
# - Run this script on each node individually (rolling upgrade)
# - Upgrade non-master-eligible nodes first, then master-eligible nodes
# - Make sure the cluster is GREEN before starting
# - This script must be run as root or with sudo
#
set -euo pipefail
# ==============================================================================
# Configuration & Defaults
# ==============================================================================
# CLI-tunable settings (see usage()) followed by internal state variables.
ARCH="x86_64" # RPM architecture; override with --arch aarch64
UPGRADE_ES=true # set false by --kibana-only
UPGRADE_KIBANA=true # set false by --es-only
TARGET_VERSION="" # required positional arg, MAJOR.MINOR.PATCH
ES_URL="https://localhost:9200" # override with --es-url
DOWNLOAD_DIR="/tmp/elastic-upgrade" # where RPMs are downloaded (used later in file)
AUTO_YES=false # --yes: auto-accept non-critical prompts
FORCE_MODE=false # --force: skip checks needing a running ES (implies --yes)
ES_CURRENT="" # filled by pre-flight RPM version detection
KIBANA_CURRENT="" # filled by pre-flight RPM version detection
cluster_name="" # filled from /_cluster/health response
# Curl auth/TLS options -- populated during connection detection
# Stored as an array to avoid eval and shell injection
declare -a ES_CURL_AUTH=()
ES_CURL_INSECURE=false
# Lock file to prevent concurrent runs
LOCK_FILE="/var/run/elastic-upgrade.lock"
LOCK_FD=9
# Timeouts (seconds)
STOP_TIMEOUT=120 # presumably max wait for service stop — used later in file
API_TIMEOUT=60 # per-request curl timeout for ES API calls (es_curl)
STARTUP_WAIT=120 # presumably max wait for ES restart — used later in file
KIBANA_STARTUP_WAIT=300 # presumably max wait for Kibana restart — used later in file
RECOVERY_WAIT=1200 # presumably max wait for cluster recovery — used later in file
API_RETRIES=3 # attempts for es_curl_retry
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m' # No Color
# Track state for cleanup on unexpected exit
_allocation_disabled=false
_recovery_boosted=false
# ==============================================================================
# Helper Functions
# ==============================================================================
# Colored, tagged log-line helpers. Each prints all of its arguments after a
# colored tag; colors come from the RED/GREEN/YELLOW/BLUE/BOLD/NC globals.
info() {
  echo -e "${BLUE}[INFO]${NC} $*"
}

success() {
  echo -e "${GREEN}[OK]${NC} $*"
}

warn() {
  echo -e "${YELLOW}[WARN]${NC} $*"
}

error() {
  echo -e "${RED}[ERROR]${NC} $*"
}

# Prominent section banner: a bold rule, the title, and a closing rule.
header() {
  echo -e "\n${BOLD}═══════════════════════════════════════════════════════════${NC}"
  echo -e "${BOLD} $*${NC}"
  echo -e "${BOLD}═══════════════════════════════════════════════════════════${NC}\n"
}

# Highlighted step marker used between major actions.
step() {
  echo -e "\n${YELLOW}▶ STEP: $*${NC}\n"
}
# Cleanup handler for unexpected exits.
# Registered for EXIT/INT/TERM below. Undoes cluster-level state the script
# may have changed (shard allocation disable, recovery-concurrency boost),
# releases the run lock, then re-raises the original exit code so callers
# still see the real status.
cleanup_on_exit() {
local exit_code=$?
# Re-enable shard allocation if we disabled it and didn't re-enable
if $_allocation_disabled; then
echo ""
warn "Script interrupted! Attempting to re-enable shard allocation..."
# Setting both persistent and transient to null restores the default
if es_curl_quiet PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":null},"transient":{"cluster.routing.allocation.enable":null}}'; then
success "Shard allocation re-enabled."
else
error "FAILED to re-enable shard allocation! Run manually:"
error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}"
fi
fi
# Reset recovery concurrency if we boosted it
if $_recovery_boosted; then
es_curl_quiet PUT "/_cluster/settings" '{"transient":{"cluster.routing.allocation.node_concurrent_incoming_recoveries":null}}' || true
fi
# Release lock
release_lock
# Propagate the exit status the script was terminating with
exit "$exit_code"
}
trap cleanup_on_exit EXIT INT TERM
#######################################
# Acquire an exclusive flock-based lock so only one upgrade runs at a time.
# Uses bash's automatic file-descriptor allocation ({fd}>) instead of eval,
# which removes the shell-injection surface the old
#   eval "exec ${LOCK_FD}>${LOCK_FILE}"
# construct had if LOCK_FILE ever contained shell metacharacters.
# Globals:  LOCK_FILE (read), LOCK_FD (written: the fd actually allocated)
# Exits:    1 if another instance already holds the lock
#######################################
acquire_lock() {
  # Open/create the lock file on a dynamically allocated descriptor.
  exec {LOCK_FD}>"$LOCK_FILE"
  if ! flock -n "$LOCK_FD"; then
    error "Another instance of this script is already running (lock: ${LOCK_FILE})."
    error "If you are sure no other instance is running, remove the lock file:"
    error " rm -f ${LOCK_FILE}"
    exit 1
  fi
  # Write PID to lock file for debugging stale locks
  echo $$ >&"$LOCK_FD"
}

# Release the lock and remove the lock file. Best-effort: never fails,
# so it is safe to call from the EXIT trap.
release_lock() {
  flock -u "$LOCK_FD" 2>/dev/null || true
  rm -f "$LOCK_FILE" 2>/dev/null || true
}
# Interactive yes/no prompt.
# $1 = question text, $2 = default answer when the user just hits Enter
# ("n" unless given). In --yes mode the question is auto-accepted.
# Returns 0 for yes, 1 for no; re-prompts on anything else.
confirm() {
  local msg="$1"
  local fallback="${2:-n}"
  local reply

  if $AUTO_YES; then
    info "(auto-yes) $msg -> yes"
    return 0
  fi

  # Show which answer Enter will pick. NOTE: 'prompt' is intentionally
  # not declared local, matching the original's scoping.
  case "$fallback" in
    y) prompt="[Y/n]" ;;
    *) prompt="[y/N]" ;;
  esac

  while true; do
    echo -en "${BOLD}$msg ${prompt}: ${NC}"
    read -r reply
    reply="${reply:-$fallback}"
    case "${reply,,}" in
      y|yes) return 0 ;;
      n|no)  return 1 ;;
      *)     echo "Please answer y or n." ;;
    esac
  done
}
# Ask for confirmation and exit 0 ("aborted by user") when declined.
# $1 = question, $2 = default answer (defaults to "y").
confirm_or_abort() {
  local question="$1"
  local default="${2:-y}"
  if confirm "$question" "$default"; then
    return 0
  fi
  warn "Aborted by user."
  exit 0
}
# Critical confirmation gate. Unlike confirm_or_abort, --yes mode must NOT
# auto-accept: a critical problem needs a human, so we exit 1 instead.
confirm_critical() {
  local issue="$1"
  if ! $AUTO_YES; then
    confirm_or_abort "$issue"
    return
  fi
  error "(auto-yes) CRITICAL: $issue"
  error "Cannot auto-accept critical issues. Resolve the problem and re-run."
  exit 1
}
# Execute a request against the Elasticsearch API.
# Builds the curl command as an array (no eval, no shell injection) and
# stores the result in globals.
# Usage: es_curl <METHOD> <PATH> [JSON_DATA]
# Sets:  ES_CURL_HTTP_CODE (status code), ES_CURL_BODY (response body)
#
# IMPORTANT: never call this as body=$(es_curl ...) — the command
# substitution runs in a subshell and the globals would be lost. Call it
# directly, then read ES_CURL_BODY / ES_CURL_HTTP_CODE.
ES_CURL_HTTP_CODE=""
ES_CURL_BODY=""
es_curl() {
  local method="${1:-GET}"
  local api_path="${2:-/}"
  local payload="${3:-}"
  # Join base URL and path with exactly one slash between them.
  local url="${ES_URL%/}/${api_path#/}"

  # -w appends the status code on its own line after the body.
  local -a req=(curl -s -w '\n%{http_code}' --max-time "$API_TIMEOUT")
  if $ES_CURL_INSECURE; then
    req+=(-k)
  fi
  if [[ ${#ES_CURL_AUTH[@]} -gt 0 ]]; then
    req+=("${ES_CURL_AUTH[@]}")
  fi
  req+=(-X "$method")
  if [[ -n "$payload" ]]; then
    req+=(-H 'Content-Type: application/json' -d "$payload")
  fi
  req+=("$url")

  local raw
  raw=$("${req[@]}" 2>/dev/null) || true
  # Last line is the status code, the rest is the body.
  ES_CURL_HTTP_CODE=$(echo "$raw" | tail -1)
  ES_CURL_BODY=$(echo "$raw" | sed '$d')
}
# Silent API call: succeed (return 0) only on HTTP 200, otherwise return 1.
es_curl_quiet() {
  es_curl "$@"
  if [[ "$ES_CURL_HTTP_CODE" != "200" ]]; then
    return 1
  fi
  return 0
}
# Retry wrapper for critical API calls: up to API_RETRIES attempts with a
# 2-second pause between them. Returns 0 on the first HTTP 200, else 1.
es_curl_retry() {
  local max="$API_RETRIES"
  local try
  for (( try = 1; try <= max; try++ )); do
    es_curl "$@"
    if [[ "$ES_CURL_HTTP_CODE" == "200" ]]; then
      return 0
    fi
    # Don't warn/sleep after the final attempt.
    if (( try < max )); then
      warn "API call failed (HTTP ${ES_CURL_HTTP_CODE}), retrying (${try}/${max})..."
      sleep 2
    fi
  done
  return 1
}
# Probe the ES root endpoint using the current auth/TLS settings and print
# the HTTP status code on stdout ("000" when unreachable).
test_es_connection() {
  local -a probe=(curl -s -o /dev/null -w '%{http_code}' --max-time 5)
  if $ES_CURL_INSECURE; then
    probe+=(-k)
  fi
  if [[ ${#ES_CURL_AUTH[@]} -gt 0 ]]; then
    probe+=("${ES_CURL_AUTH[@]}")
  fi
  probe+=("${ES_URL}/")
  local status
  status=$("${probe[@]}" 2>/dev/null) || status="000"
  echo "$status"
}
# Check if a systemd service unit exists.
# NOTE: older systemd versions exit 0 from `list-unit-files` even when no
# unit matches the pattern, so relying on the exit status (as the original
# did) is unreliable. Testing for non-empty output works on old and new
# systemd alike.
service_exists() {
  [[ -n "$(systemctl list-unit-files --no-pager --no-legend "$1.service" 2>/dev/null)" ]]
}
# Print the installed version of an RPM package, or the literal string
# "not installed" when the package (or rpm itself) is absent.
get_rpm_version() {
  local version
  version=$(rpm -q --queryformat '%{VERSION}' "$1" 2>/dev/null) || version="not installed"
  # rpm prints "package X is not installed" on stdout for missing packages;
  # normalize that (and any query failure) to the sentinel string.
  case "$version" in
    *"not installed"*) echo "not installed" ;;
    *) echo "$version" ;;
  esac
}
# Download a URL to a destination path using curl (no wget dependency).
# $1 = url, $2 = destination file, $3 = human-readable label for messages.
# Retries transient failures; on failure removes the partial file and
# returns 1.
download_file() {
  local src="$1"
  local target="$2"
  local label="${3:-file}"

  info "Downloading ${label}..."
  local -a fetch=(
    curl --fail --location --retry 3 --retry-delay 5
    --connect-timeout 15 --max-time 600
    --progress-bar -o "$target" "$src"
  )
  if "${fetch[@]}"; then
    return 0
  fi
  error "Failed to download ${label}."
  rm -f "$target"
  return 1
}
# ==============================================================================
# Argument Parsing
# ==============================================================================
# Print usage/help text and exit 1 (also serves as the error path for bad
# arguments). The here-doc expands $0 only; the option text is literal.
usage() {
  cat <<USAGE_TEXT
Usage: $0 <target-version> [OPTIONS]

Options:
 --es-only Only upgrade Elasticsearch
 --kibana-only Only upgrade Kibana
 --arch ARCH Architecture: x86_64 (default) or aarch64
 --es-url URL Elasticsearch URL (default: https://localhost:9200)
 --yes Auto-accept all non-critical prompts; exit 1 on critical issues
 --force Skip pre-flight checks that require a running Elasticsearch
 (cluster health, node analysis, version mix, deprecation API,
 shard allocation, flush, rejoin wait, recovery wait)
 -h, --help Show this help

Flags can be combined. --force implies --yes for skipped steps.
USAGE_TEXT
  exit 1
}
# --- CLI argument parsing ---------------------------------------------------
# Flags may appear in any order; the single bare argument is the target
# version. A second bare argument is rejected.
while [[ $# -gt 0 ]]; do
case "$1" in
--es-only) UPGRADE_KIBANA=false; shift ;;
--kibana-only) UPGRADE_ES=false; shift ;;
--arch) ARCH="$2"; shift 2 ;;
--es-url) ES_URL="$2"; shift 2 ;;
--yes|-y) AUTO_YES=true; shift ;;
--force|-f) FORCE_MODE=true; AUTO_YES=true; shift ;;
-h|--help) usage ;;
-*) error "Unknown option: $1"; usage ;;
*)
if [[ -z "$TARGET_VERSION" ]]; then
TARGET_VERSION="$1"
else
error "Unexpected argument: $1"
usage
fi
shift
;;
esac
done
if [[ -z "$TARGET_VERSION" ]]; then
error "Target version is required."
usage
fi
# Validate version format
# Strict MAJOR.MINOR.PATCH; pre-release suffixes (e.g. 8.17.0-rc1) are rejected.
if ! [[ "$TARGET_VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
error "Invalid version format: ${TARGET_VERSION}"
error "Expected format: MAJOR.MINOR.PATCH (e.g., 8.17.0)"
exit 1
fi
# Show active mode banners
# FORCE implies AUTO_YES, so the force banner takes precedence.
if $FORCE_MODE; then
warn "╔══════════════════════════════════════════════════════════╗"
warn "║ FORCE MODE — skipping checks that require running ES ║"
warn "║ Shard allocation, flush, rejoin, and recovery steps ║"
warn "║ will be skipped. You must manage these manually. ║"
warn "╚══════════════════════════════════════════════════════════╝"
echo ""
elif $AUTO_YES; then
info "╔══════════════════════════════════════════════════════════╗"
info "║ AUTO-YES MODE — non-critical prompts will be accepted ║"
info "║ Critical issues will cause the script to exit 1. ║"
info "╚══════════════════════════════════════════════════════════╝"
echo ""
fi
# ==============================================================================
# Pre-flight: Root check
# ==============================================================================
# RPM installation and systemd control require root privileges.
if [[ $EUID -ne 0 ]]; then
error "This script must be run as root (or with sudo)."
exit 1
fi
# ==============================================================================
# Pre-flight: Acquire lock
# ==============================================================================
# Prevent two concurrent upgrade runs on the same host.
acquire_lock
# ==============================================================================
# Pre-flight: OS compatibility check
# ==============================================================================
# RPM-based upgrades only make sense on RHEL-family distributions.
is_rhel_compatible=false
if [[ -f /etc/redhat-release ]]; then
is_rhel_compatible=true
elif [[ -f /etc/os-release ]]; then
source /etc/os-release
case "${ID:-}" in
rhel|centos|fedora|rocky|alma|ol|scientific|amzn)
is_rhel_compatible=true
;;
esac
# ID_LIKE catches derivatives that report a custom ID but are RHEL-based.
if [[ "${ID_LIKE:-}" == *"rhel"* ]] || [[ "${ID_LIKE:-}" == *"fedora"* ]]; then
is_rhel_compatible=true
fi
fi
if ! $is_rhel_compatible; then
error "This script is designed for RHEL-compatible systems (RHEL, CentOS, Rocky, Alma, Fedora, etc.)"
error "Detected OS does not appear to be RHEL-compatible."
if [[ -f /etc/os-release ]]; then
source /etc/os-release
error " Detected: ${PRETTY_NAME:-$ID}"
fi
error ""
error "For Debian/Ubuntu systems, use .deb packages instead of RPMs."
error "For other systems, consider using the tarball distribution."
exit 1
fi
if [[ -f /etc/os-release ]]; then
source /etc/os-release
info "Detected OS: ${PRETTY_NAME:-$ID}"
fi
# ==============================================================================
# Pre-flight: Required commands check
# ==============================================================================
# Verify every external tool the script relies on before doing any work,
# and report all missing ones at once with install hints.
header "Checking Required Commands"
missing_cmds=()
for cmd in curl rpm systemctl sha512sum df grep sed awk python3 diff journalctl ldd; do
if command -v "$cmd" &>/dev/null; then
success "Found: $(command -v "$cmd")"
else
error "Missing: ${BOLD}${cmd}${NC}"
missing_cmds+=("$cmd")
fi
done
# Detect package manager: prefer dnf, fall back to yum
if command -v dnf &>/dev/null; then
PKG_MGR="dnf"
success "Found: $(command -v dnf) (package manager)"
elif command -v yum &>/dev/null; then
PKG_MGR="yum"
success "Found: $(command -v yum) (package manager)"
else
error "Missing: ${BOLD}yum/dnf${NC} — no package manager found"
missing_cmds+=("yum/dnf")
fi
if [[ ${#missing_cmds[@]} -gt 0 ]]; then
echo ""
error "The following required commands are missing: ${missing_cmds[*]}"
error "Install them before running this script."
# Map commands to their providing packages for a helpful install hint.
for cmd in "${missing_cmds[@]}"; do
case "$cmd" in
python3) error " -> ${PKG_MGR:-yum} install python3" ;;
sha512sum) error " -> Part of coreutils: ${PKG_MGR:-yum} install coreutils" ;;
journalctl) error " -> Part of systemd: ${PKG_MGR:-yum} install systemd" ;;
ldd) error " -> Part of glibc: ${PKG_MGR:-yum} install glibc-common" ;;
diff) error " -> Part of diffutils: ${PKG_MGR:-yum} install diffutils" ;;
esac
done
exit 1
fi
# ==============================================================================
# Pre-flight: Verify target version is available for download
# ==============================================================================
# HEAD-request the RPM URLs so a typo'd version fails fast instead of after
# the node has already been taken out of the cluster.
header "Verifying Target Version Availability"
info "Checking if version ${BOLD}${TARGET_VERSION}${NC} is available on artifacts.elastic.co..."
echo ""
version_available=true
if $UPGRADE_ES; then
ES_RPM_CHECK_URL="https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${TARGET_VERSION}-${ARCH}.rpm"
info "Checking Elasticsearch RPM: ${ES_RPM_CHECK_URL}"
es_http_code=$(curl -s -o /dev/null -w '%{http_code}' --head --max-time 15 "${ES_RPM_CHECK_URL}" 2>/dev/null) || es_http_code="000"
if [[ "$es_http_code" == "200" ]]; then
# Best-effort size report from the Content-Length header.
es_size=$(curl -sI --max-time 15 "${ES_RPM_CHECK_URL}" 2>/dev/null | grep -i 'Content-Length' | awk '{print $2}' | tr -d '\r') || es_size=""
if [[ -n "$es_size" && "$es_size" -gt 0 ]] 2>/dev/null; then
es_size_mb=$((es_size / 1024 / 1024))
success "Elasticsearch ${TARGET_VERSION} (${ARCH}) is available (${es_size_mb} MB)"
else
success "Elasticsearch ${TARGET_VERSION} (${ARCH}) is available"
fi
elif [[ "$es_http_code" == "404" ]]; then
error "Elasticsearch ${TARGET_VERSION} (${ARCH}) was ${BOLD}NOT FOUND${NC}${RED} (HTTP 404)${NC}"
error "URL: ${ES_RPM_CHECK_URL}"
error "Verify the version number and architecture are correct."
error "Browse available versions at: https://www.elastic.co/downloads/past-releases"
version_available=false
else
# Anything other than 200/404 is treated as a transient network problem.
warn "Could not verify Elasticsearch RPM availability (HTTP ${es_http_code})."
warn "URL: ${ES_RPM_CHECK_URL}"
warn "This may be a network issue. The download will be attempted later."
fi
fi
if $UPGRADE_KIBANA; then
KIBANA_RPM_CHECK_URL="https://artifacts.elastic.co/downloads/kibana/kibana-${TARGET_VERSION}-${ARCH}.rpm"
info "Checking Kibana RPM: ${KIBANA_RPM_CHECK_URL}"
kibana_http_code=$(curl -s -o /dev/null -w '%{http_code}' --head --max-time 15 "${KIBANA_RPM_CHECK_URL}" 2>/dev/null) || kibana_http_code="000"
if [[ "$kibana_http_code" == "200" ]]; then
kibana_size=$(curl -sI --max-time 15 "${KIBANA_RPM_CHECK_URL}" 2>/dev/null | grep -i 'Content-Length' | awk '{print $2}' | tr -d '\r') || kibana_size=""
if [[ -n "$kibana_size" && "$kibana_size" -gt 0 ]] 2>/dev/null; then
kibana_size_mb=$((kibana_size / 1024 / 1024))
success "Kibana ${TARGET_VERSION} (${ARCH}) is available (${kibana_size_mb} MB)"
else
success "Kibana ${TARGET_VERSION} (${ARCH}) is available"
fi
elif [[ "$kibana_http_code" == "404" ]]; then
error "Kibana ${TARGET_VERSION} (${ARCH}) was ${BOLD}NOT FOUND${NC}${RED} (HTTP 404)${NC}"
error "URL: ${KIBANA_RPM_CHECK_URL}"
error "Verify the version number and architecture are correct."
error "Browse available versions at: https://www.elastic.co/downloads/past-releases"
version_available=false
else
warn "Could not verify Kibana RPM availability (HTTP ${kibana_http_code})."
warn "URL: ${KIBANA_RPM_CHECK_URL}"
warn "This may be a network issue. The download will be attempted later."
fi
fi
if ! $version_available; then
echo ""
error "One or more RPMs are not available for version ${TARGET_VERSION}."
confirm_or_abort "Continue anyway (downloads will fail later)?"
fi
# ==============================================================================
# Detect Elasticsearch connection settings
# This runs for BOTH --es-only and --kibana-only because the Kibana upgrade
# section needs ES API access to check shard allocation.
# Only skipped in --force mode where all API calls are bypassed.
# ==============================================================================
# Detection ladder: plain URL -> 401 (prompt for credentials) ->
# connection failure (retry over HTTPS with -k, then optionally credentials).
if $FORCE_MODE; then
header "Detecting Elasticsearch Connection"
warn "FORCE MODE: Skipping Elasticsearch connection detection."
warn "Cluster health, node analysis, deprecation checks, shard management"
warn "and recovery steps will all be skipped."
else
header "Detecting Elasticsearch Connection"
info "Testing connection to ${ES_URL}..."
# Try plain connection first
response=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "${ES_URL}/" 2>/dev/null) || response="000"
if [[ "$response" == "200" ]]; then
success "Connected to Elasticsearch (no auth required)"
elif [[ "$response" == "401" ]]; then
info "Elasticsearch requires authentication."
echo -en "${BOLD}Enter Elasticsearch username [elastic]: ${NC}"
read -r es_user
es_user="${es_user:-elastic}"
echo -en "${BOLD}Enter Elasticsearch password: ${NC}"
read -rs es_pass
echo ""
# Store credentials in array -- never passed through eval/shell expansion
ES_CURL_AUTH=(-u "${es_user}:${es_pass}")
ES_CURL_INSECURE=true
response=$(test_es_connection)
if [[ "$response" == "200" ]]; then
success "Authenticated successfully."
else
error "Authentication failed (HTTP $response). Please check credentials."
exit 1
fi
# Clear credential variables from memory (array persists for curl calls)
unset es_pass
elif [[ "$response" == "000" ]]; then
# No TCP/HTTP response at all: retry assuming HTTPS with a self-signed cert.
# Try https with -k
ES_URL="${ES_URL/http:/https:}"
ES_CURL_INSECURE=true
response=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 -k "${ES_URL}/" 2>/dev/null) || response="000"
if [[ "$response" == "200" ]]; then
success "Connected to Elasticsearch over HTTPS (self-signed cert)"
elif [[ "$response" == "401" ]]; then
info "Elasticsearch requires authentication (HTTPS)."
echo -en "${BOLD}Enter Elasticsearch username [elastic]: ${NC}"
read -r es_user
es_user="${es_user:-elastic}"
echo -en "${BOLD}Enter Elasticsearch password: ${NC}"
read -rs es_pass
echo ""
ES_CURL_AUTH=(-u "${es_user}:${es_pass}")
response=$(test_es_connection)
if [[ "$response" == "200" ]]; then
success "Authenticated successfully over HTTPS."
else
error "Authentication failed (HTTP $response)."
exit 1
fi
unset es_pass
else
error "Cannot connect to Elasticsearch at ${ES_URL} (HTTP $response)."
error "Make sure Elasticsearch is running and accessible."
error "You can specify a custom URL with: --es-url http://hostname:9200"
exit 1
fi
else
# Any other HTTP code (403, 5xx, proxy responses, ...) is unexpected here.
error "Unexpected response from Elasticsearch (HTTP $response)."
exit 1
fi
fi # end of: if ! $FORCE_MODE (ES connection)
# ==============================================================================
# STEP 0: Pre-flight Checks
# ==============================================================================
header "Pre-flight Checks"
# --- Current versions ---
# Determine what is installed locally; a missing ES is fatal, a missing
# Kibana simply disables the Kibana part of the upgrade.
step "Checking currently installed versions"
if $UPGRADE_ES; then
ES_CURRENT=$(get_rpm_version elasticsearch)
if [[ "$ES_CURRENT" == "not installed" ]]; then
error "Elasticsearch RPM is not installed on this system."
exit 1
fi
info "Elasticsearch installed version: ${BOLD}${ES_CURRENT}${NC}"
fi
if $UPGRADE_KIBANA; then
KIBANA_CURRENT=$(get_rpm_version kibana)
if [[ "$KIBANA_CURRENT" == "not installed" ]]; then
warn "Kibana RPM is not installed on this system. Skipping Kibana upgrade."
UPGRADE_KIBANA=false
else
info "Kibana installed version: ${BOLD}${KIBANA_CURRENT}${NC}"
fi
fi
info "Target version: ${BOLD}${TARGET_VERSION}${NC}"
echo ""
# Already-at-target components are skipped unless the operator insists
# (useful for reinstalls).
if $UPGRADE_ES && [[ "$ES_CURRENT" == "$TARGET_VERSION" ]]; then
warn "Elasticsearch is already at version ${TARGET_VERSION}."
if ! confirm "Continue anyway?" "n"; then
UPGRADE_ES=false
fi
fi
if $UPGRADE_KIBANA && [[ "$KIBANA_CURRENT" == "$TARGET_VERSION" ]]; then
warn "Kibana is already at version ${TARGET_VERSION}."
if ! confirm "Continue anyway?" "n"; then
UPGRADE_KIBANA=false
fi
fi
if ! $UPGRADE_ES && ! $UPGRADE_KIBANA; then
info "Nothing to upgrade."
exit 0
fi
# --- Version jump analysis ---
# Classify the jump (patch / minor / major / downgrade / skipped-major) and
# enforce Elastic's "gateway minor" stepping-stone rules for major upgrades.
if $UPGRADE_ES; then
step "Analyzing version upgrade path"
upgrade_path_ok=true
# Split X.Y.Z into components for both current and target versions.
read -r cur_major cur_minor cur_patch <<< "$(echo "$ES_CURRENT" | awk -F. '{print $1, $2, $3}')"
read -r tgt_major tgt_minor tgt_patch <<< "$(echo "$TARGET_VERSION" | awk -F. '{print $1, $2, $3}')"
if [[ -z "$cur_major" || -z "$tgt_major" ]]; then
warn "Could not parse version numbers. Skipping upgrade path analysis."
else
info "Current: ${BOLD}${ES_CURRENT}${NC} (major=${cur_major}, minor=${cur_minor}, patch=${cur_patch})"
info "Target: ${BOLD}${TARGET_VERSION}${NC} (major=${tgt_major}, minor=${tgt_minor}, patch=${tgt_patch})"
echo ""
# Lexicographic-by-component comparison: target older than current?
if [[ "$tgt_major" -lt "$cur_major" ]] || \
{ [[ "$tgt_major" -eq "$cur_major" && "$tgt_minor" -lt "$cur_minor" ]]; } || \
{ [[ "$tgt_major" -eq "$cur_major" && "$tgt_minor" -eq "$cur_minor" && "$tgt_patch" -lt "$cur_patch" ]]; }; then
warn "Target version ${TARGET_VERSION} is OLDER than current ${ES_CURRENT}."
warn "Elasticsearch does not support downgrades. Nodes cannot be rolled back"
warn "once upgraded. This will install an older RPM but may cause problems."
confirm_or_abort "This looks like a downgrade. Are you sure?"
elif [[ "$tgt_major" -eq "$cur_major" ]]; then
if [[ "$tgt_minor" -eq "$cur_minor" ]]; then
success "Patch upgrade (${ES_CURRENT} -> ${TARGET_VERSION}). No special requirements."
else
success "Minor upgrade (${ES_CURRENT} -> ${TARGET_VERSION}). Rolling upgrade supported."
fi
elif [[ "$tgt_major" -eq $((cur_major + 1)) ]]; then
warn "This is a ${BOLD}MAJOR VERSION${NC}${YELLOW} upgrade (${cur_major}.x -> ${tgt_major}.x).${NC}"
echo ""
# "Gateway" minors: the last minor of each major from which Elastic
# supports jumping to the next major (6.8 -> 7.x, 7.17 -> 8.x, 8.19 -> 9.x).
declare -A gateway_minor
gateway_minor[6]=8
gateway_minor[7]=17
gateway_minor[8]=19
required_minor="${gateway_minor[$cur_major]:-}"
if [[ -n "$required_minor" ]]; then
# Special case: Elastic also supports 8.18.x -> 9.0.x directly.
if [[ "$cur_major" -eq 8 && "$cur_minor" -eq 18 && "$tgt_major" -eq 9 && "$tgt_minor" -eq 0 ]]; then
success "On 8.18.x targeting 9.0.x — this specific path is supported by Elastic."
elif [[ "$cur_minor" -lt "$required_minor" ]]; then
error "Major upgrade from ${cur_major}.x to ${tgt_major}.x requires being on ${cur_major}.${required_minor}.x first."
error ""
error "Current version ${ES_CURRENT} is below the required stepping stone."
error ""
error "Required upgrade path:"
error " 1. First upgrade: ${ES_CURRENT} -> ${cur_major}.${required_minor}.x (minor upgrade)"
error " 2. Then upgrade: ${cur_major}.${required_minor}.x -> ${TARGET_VERSION} (major upgrade)"
error ""
error "Skipping the stepping-stone version may cause data loss or failed startup."
upgrade_path_ok=false
confirm_critical "Override and attempt direct major upgrade anyway? THIS IS DANGEROUS."
elif [[ "$cur_minor" -eq "$required_minor" ]]; then
success "On required gateway version ${cur_major}.${required_minor}.x. Major upgrade path is valid."
else
success "On ${ES_CURRENT}, which is above the required ${cur_major}.${required_minor}.x gateway."
fi
# 8.18.x can only go directly to 9.0.x; 9.1+ requires going via 8.19.x.
if [[ "$cur_major" -eq 8 && "$cur_minor" -eq 18 && "$tgt_minor" -gt 0 ]]; then
warn "Version 8.18.x can only upgrade to 9.0.x directly."
warn "For 9.1.x or later, you must first upgrade to 8.19.x."
error ""
error "Required upgrade path:"
error " 1. First upgrade: ${ES_CURRENT} -> 8.19.x"
error " 2. Then upgrade: 8.19.x -> ${TARGET_VERSION}"
upgrade_path_ok=false
confirm_critical "Override and attempt direct upgrade anyway? THIS IS DANGEROUS."
fi
else
warn "No gateway version defined for ${cur_major}.x -> ${tgt_major}.x in this script."
warn "Check the Elastic upgrade documentation for the correct upgrade path."
fi
echo ""
warn "Major upgrade checklist:"
warn " - Run the Upgrade Assistant in Kibana before upgrading"
warn " - Resolve ALL critical deprecation issues (checked later in this script)"
warn " - Ensure no indices created before ${cur_major}.0 exist (reindex or delete them)"
warn " - Review breaking changes: https://www.elastic.co/guide/en/elasticsearch/reference/${tgt_major}.x/breaking-changes.html"
warn " - Take a full snapshot backup before proceeding"
echo ""
if $upgrade_path_ok; then
confirm_or_abort "Acknowledge major upgrade requirements and continue?"
fi
elif [[ "$tgt_major" -gt $((cur_major + 1)) ]]; then
error "Upgrading from ${cur_major}.x to ${tgt_major}.x skips one or more major versions."
error "Elasticsearch does NOT support skipping major versions."
error ""
error "Required upgrade path:"
# NOTE(review): gateway_minor is only declared in the sibling branch above,
# so it is unset here; the ${...:-last_minor} default guards the lookup
# (safe on bash 4.4+ even under set -u) — confirm intended.
v=$cur_major
while [[ $v -lt $tgt_major ]]; do
next=$((v + 1))
gw="${gateway_minor[$v]:-last_minor}"
if [[ $v -eq $cur_major ]]; then
error " ${v}.x -> ${v}.${gw}.x (get to gateway minor first)"
fi
error " ${v}.${gw}.x -> ${next}.x"
v=$next
done
error ""
error "Each major version boundary must be crossed individually."
confirm_critical "Override and attempt anyway? THIS WILL ALMOST CERTAINLY FAIL."
fi
fi
fi
# --- Cluster health ---
# Query /_cluster/health and gate the upgrade on the status: GREEN proceeds,
# YELLOW needs confirmation, RED needs a critical override.
if $FORCE_MODE; then
if $UPGRADE_ES || $UPGRADE_KIBANA; then
step "Checking cluster health"
warn "FORCE MODE: Skipping cluster health check."
fi
else
step "Checking cluster health"
es_curl GET "/_cluster/health?pretty"
health_body="$ES_CURL_BODY"
health_code="$ES_CURL_HTTP_CODE"
if [[ "$health_code" != "200" ]]; then
error "Failed to get cluster health (HTTP $health_code)."
exit 1
fi
# Extract individual fields with python3 (jq is not a declared dependency).
cluster_status=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('status','unknown'))" 2>/dev/null) || cluster_status="unknown"
cluster_name=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('cluster_name',''))" 2>/dev/null) || cluster_name=""
num_nodes=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('number_of_nodes',0))" 2>/dev/null) || num_nodes="?"
unassigned=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('unassigned_shards',0))" 2>/dev/null) || unassigned="?"
info "Cluster: ${BOLD}${cluster_name}${NC}"
info "Nodes: ${BOLD}${num_nodes}${NC}"
info "Unassigned: ${BOLD}${unassigned}${NC}"
if [[ "$cluster_status" == "green" ]]; then
success "Cluster status: ${GREEN}${BOLD}GREEN${NC}"
elif [[ "$cluster_status" == "yellow" ]]; then
warn "Cluster status: ${YELLOW}${BOLD}YELLOW${NC}"
warn "It is recommended to start with a GREEN cluster."
confirm_or_abort "Continue with YELLOW cluster status?"
else
error "Cluster status: ${RED}${BOLD}RED${NC}"
error "DO NOT upgrade while cluster is RED."
confirm_critical "Are you absolutely sure you want to continue with RED status?"
fi
# --- Node roles, versions & upgrade order advice (ES upgrade only) ---
# Gather node metadata plus every identity this machine answers to
# (short hostname, FQDN, local IPs) so the Python helper below can work
# out which cluster node is "this" node.
if $UPGRADE_ES; then
step "Analyzing cluster nodes and upgrade order"
es_curl GET "/_cat/nodes?h=name,ip,version,master,node.role&format=json"
nodes_body="$ES_CURL_BODY"
nodes_code="$ES_CURL_HTTP_CODE"
local_hostname=$(hostname -s 2>/dev/null) || local_hostname=""
local_fqdn=$(hostname -f 2>/dev/null) || local_fqdn=""
local_ips=$(hostname -I 2>/dev/null | tr ' ' '\n' | grep -v '^$') || local_ips=""
# Loopback addresses are appended so single-node/localhost setups match.
local_ips="${local_ips}
127.0.0.1
::1"
if [[ "$nodes_code" == "200" ]]; then
# The node-analysis logic is a Python heredoc written to a private temp file.
_pyfile=$(mktemp /tmp/elastic-upgrade-nodecheck.XXXXXX.py)
chmod 600 "$_pyfile"
cat > "$_pyfile" <<'PYEOF'
import json, sys, os
nodes = json.load(sys.stdin)
local_hostname = os.environ.get('LOCAL_HOSTNAME', '').lower()
local_fqdn = os.environ.get('LOCAL_FQDN', '').lower()
local_ips = set(ip.strip() for ip in os.environ.get('LOCAL_IPS', '').split('\n') if ip.strip())
target_version = os.environ.get('TARGET_VERSION', '')
# --- Identify local node ---
local_node = None
for n in nodes:
node_name = n.get('name', '').strip()
node_ip = n.get('ip', '').strip()
if (node_name.lower() == local_hostname or
node_name.lower() == local_fqdn or
node_ip in local_ips or
local_hostname and node_name.lower().startswith(local_hostname)):
local_node = node_name
break
# --- Classify nodes ---
tier1_coord = []
tier2_data = []
tier3_master = []
tier4_elected = None
all_entries = []
for n in nodes:
name = n.get('name', '').strip()
version = n.get('version', '').strip()
is_elected_master = n.get('master', '').strip() == '*'
roles = n.get('node.role', '').strip()
has_master_role = 'm' in roles
has_data_role = 'd' in roles
entry = {
'name': name,
'version': version,
'roles': roles,
'is_elected_master': is_elected_master,
'has_master_role': has_master_role,
'has_data_role': has_data_role,
'is_local': name == local_node,
'already_upgraded': version == target_version,
}
all_entries.append(entry)
if is_elected_master:
tier4_elected = entry
elif has_master_role:
tier3_master.append(entry)
elif has_data_role:
tier2_data.append(entry)
else:
tier1_coord.append(entry)
all_combined = all(e['has_master_role'] and e['has_data_role'] for e in all_entries)
# --- Print node table ---
print('NODE_TABLE_START')
for n in nodes:
name = n.get('name', '').strip()
version = n.get('version', '').strip()
roles = n.get('node.role', '').strip()
is_elected = n.get('master', '').strip() == '*'
markers = []
if is_elected:
markers.append('elected master')
if name == local_node:
markers.append('THIS NODE')
marker_str = f' ({", ".join(markers)})' if markers else ''
elected_icon = 'M' if is_elected else ' '
local_icon = '>' if name == local_node else ' '
print(f' {local_icon}{elected_icon} {name:<45} v{version:<12} roles: {roles}{marker_str}')
print('NODE_TABLE_END')
# --- Determine upgrade order ---
# Coordinating/ingest nodes first, then data nodes, then the remaining
# master-eligible nodes, and finally the elected master — each tier
# sorted by node name for a stable listing.
by_name = lambda e: e['name']
upgrade_order = (
    sorted(tier1_coord, key=by_name)
    + sorted(tier2_data, key=by_name)
    + sorted(tier3_master, key=by_name)
)
if tier4_elected:
    upgrade_order.append(tier4_elected)

def tier_label(entry):
    # Human-readable role summary for a node entry, e.g.
    # "master-eligible, data" or "coordinating/ingest".
    labels = []
    if entry['is_elected_master']:
        labels.append('elected master')
    elif entry['has_master_role']:
        labels.append('master-eligible')
    if entry['has_data_role']:
        labels.append('data')
    if not (entry['has_master_role'] or entry['has_data_role']):
        labels.append('coordinating/ingest')
    return ', '.join(labels)
# Emit the recommended order between ORDER_START/ORDER_END markers; the
# bash wrapper extracts this section with sed.
print('ORDER_START')
if all_combined:
    # With a uniform data+master topology the strict tier order adds
    # little; only "elected master last" really matters.
    print(f' NOTE: All nodes have both data and master roles (combined topology).')
    print(f' NOTE: Upgrade any non-elected node first, elected master last.')
    print(f' ---')
for i, n in enumerate(upgrade_order, 1):
    status = 'DONE' if n['already_upgraded'] else 'PENDING'
    local_marker = ' << THIS NODE' if n['is_local'] else ''
    print(f' {i:>2}. {n["name"]:<40} v{n["version"]:<12} [{tier_label(n)}] {status}{local_marker}')
print('ORDER_END')
# --- Generate advice ---
# Emit machine-parseable advice between ADVICE_START/ADVICE_END markers.
# Each line is "<LEVEL>|<message>" with LEVEL in {OK, INFO, WARN}; the
# bash wrapper maps these to its success/info/warn helpers.
print('ADVICE_START')
if local_node is None:
    print('WARN|Could not identify which cluster node corresponds to this machine.')
    print('WARN|Hostname: ' + local_hostname + ', FQDN: ' + local_fqdn)
    print('WARN|Local IPs: ' + ', '.join(sorted(local_ips - {'127.0.0.1', '::1'})))
    print('WARN|This is expected when --es-url points to a remote node.')
    print('WARN|Verify manually that you are upgrading nodes in the correct order.')
    local_entry = None
else:
    # Find this machine's entry and its position in the upgrade order.
    local_entry = None
    local_position = -1
    for i, n in enumerate(upgrade_order):
        if n['is_local']:
            local_entry = n
            local_position = i
            break
    if local_entry is None:
        print('WARN|This node was identified but not found in the upgrade order. This is unexpected.')
    elif local_entry['already_upgraded']:
        print(f'OK|This node ({local_node}) is already at version {target_version}.')
    else:
        # Nodes earlier in the recommended order that are still pending.
        nodes_ahead_pending = [upgrade_order[i] for i in range(local_position) if not upgrade_order[i]['already_upgraded']]
        if len(nodes_ahead_pending) > 0:
            print(f'WARN|There are {len(nodes_ahead_pending)} node(s) that should ideally be upgraded BEFORE this one:')
            for n in nodes_ahead_pending:
                print(f'WARN| - {n["name"]} (v{n["version"]}, {tier_label(n)})')
            if local_entry.get('is_elected_master'):
                print('INFO|')
                print('INFO|This node is the current elected master.')
                print('INFO|The master role will transfer automatically when this node stops.')
            elif all_combined:
                print('INFO|')
                print('INFO|All nodes share both data and master roles.')
                print('INFO|Order among non-elected peers does not strictly matter,')
                print('INFO|but the elected master should be upgraded last.')
        else:
            print(f'OK|This node ({local_node}) is next in the upgrade order. Safe to proceed.')
            if local_entry.get('is_elected_master'):
                # tier3_master holds the OTHER master-eligible nodes (the
                # elected master itself sits in tier4_elected), so this
                # checks whether any failover candidate remains.
                other_masters = [n for n in tier3_master if not n['is_local']]
                if other_masters:
                    print('OK|This is the elected master. A new master will be elected automatically when this node stops.')
                else:
                    print('WARN|This is the ONLY master-eligible node. The cluster will be unavailable during upgrade.')
print('ADVICE_END')
# --- Cluster version mix analysis ---
# Predict which versions would coexist after upgrading THIS node and emit
# OK/INFO/WARN/BLOCK lines between VERSION_MIX_START/END markers. BLOCK
# lines make the bash wrapper require an explicit override.
print('VERSION_MIX_START')
versions_in_cluster = set(e['version'] for e in all_entries)
if local_node is None or local_entry is None:
    from collections import Counter
    version_counts = Counter(e['version'] for e in all_entries)
    print(f'INFO|Versions currently in cluster:')
    for v, count in sorted(version_counts.items()):
        node_names = [e['name'] for e in all_entries if e['version'] == v]
        print(f'INFO| v{v}: {count} node(s) — {", ".join(node_names)}')
    # Without knowing which entry is this machine we cannot simulate
    # the post-upgrade mix.
    print(f'WARN|Local node not found in cluster. Cannot predict version mix after upgrade.')
else:
    # Simulate the upgrade: drop the local node's current version and
    # substitute the target version.
    versions_after = list(e['version'] for e in all_entries if not e.get('is_local'))
    versions_after.append(target_version)
    unique_after = set(versions_after)
    from collections import Counter
    version_counts = Counter(e['version'] for e in all_entries)
    print(f'INFO|Versions currently in cluster:')
    for v, count in sorted(version_counts.items()):
        node_names = [e['name'] for e in all_entries if e['version'] == v]
        print(f'INFO| v{v}: {count} node(s) — {", ".join(node_names)}')
    if len(versions_in_cluster) == 1 and list(versions_in_cluster)[0] == target_version:
        print(f'OK|All nodes are already on {target_version}.')
    elif len(unique_after) <= 2:
        if len(unique_after) == 1:
            print(f'OK|After this upgrade, all nodes will be on {target_version}.')
        else:
            # Exactly two versions remain: the target plus one older one
            # (normal mid-rolling-upgrade state).
            other_version = [v for v in unique_after if v != target_version]
            remaining_old = len([v for v in versions_after if v != target_version])
            print(f'OK|After this upgrade the cluster will have 2 versions (normal during rolling upgrade):')
            print(f'OK| v{target_version} (upgraded) and v{other_version[0]} ({remaining_old} node(s) remaining)')
    elif len(unique_after) >= 3:
        # 3+ concurrent versions is unsupported; emit BLOCK-level advice.
        print(f'BLOCK|')
        print(f'BLOCK|After upgrading this node, the cluster would have {len(unique_after)} different versions:')
        version_counts_after = Counter(versions_after)
        for v, count in sorted(version_counts_after.items()):
            print(f'BLOCK| v{v}: {count} node(s)')
        print(f'BLOCK|')
        print(f'BLOCK|Running 3+ versions simultaneously is NOT supported by Elasticsearch.')
        print(f'BLOCK|This typically means a previous rolling upgrade was not completed.')
        print(f'BLOCK|')
        print(f'BLOCK|Recommended action:')
        print(f'BLOCK| Complete the previous upgrade first — bring ALL nodes to the same')
        print(f'BLOCK| version before starting a new upgrade to {target_version}.')
print('VERSION_MIX_END')
PYEOF
# Run the analysis helper: node-list JSON on stdin, local-identity hints and
# target version via the environment. Failures are swallowed (|| true) so a
# broken analysis never aborts the upgrade itself.
upgrade_advice=$(echo "$nodes_body" | LOCAL_HOSTNAME="$local_hostname" LOCAL_FQDN="$local_fqdn" LOCAL_IPS="$local_ips" TARGET_VERSION="$TARGET_VERSION" python3 "$_pyfile" 2>/dev/null) || true
rm -f "$_pyfile"
if [[ -z "$upgrade_advice" ]]; then
  warn "Could not analyze node roles. Proceeding without upgrade order advice."
else
  # The helper emits sections bracketed by *_START/*_END marker lines;
  # sed extracts a section and grep strips the marker lines themselves.
  echo "$upgrade_advice" | sed -n '/NODE_TABLE_START/,/NODE_TABLE_END/p' | grep -v '_START\|_END'
  echo ""
  info "${BOLD}Legend:${NC} M = elected master, > = this node"
  echo ""
  info "${BOLD}Recommended upgrade order:${NC}"
  echo "$upgrade_advice" | sed -n '/ORDER_START/,/ORDER_END/p' | grep -v '_START\|_END'
  echo ""
  # Replay the ADVICE section ("LEVEL|message" lines) through the script's
  # own log helpers. The here-string (not a pipe) keeps this while-loop in
  # the current shell so has_warn survives past the loop.
  has_warn=false
  while IFS= read -r line; do
    case "$line" in
      WARN\|*)
        has_warn=true
        msg="${line#WARN|}"
        [[ -n "$msg" ]] && warn "$msg"
        ;;
      OK\|*)
        msg="${line#OK|}"
        [[ -n "$msg" ]] && success "$msg"
        ;;
      INFO\|*)
        msg="${line#INFO|}"
        [[ -n "$msg" ]] && info "$msg"
        ;;
    esac
  done <<< "$(echo "$upgrade_advice" | sed -n '/ADVICE_START/,/ADVICE_END/p' | grep -v '_START\|_END')"
  echo ""
  if $has_warn; then
    confirm_or_abort "Acknowledge the upgrade order warnings above and continue?"
  fi
  # --- Version mix analysis ---
  version_mix_section=$(echo "$upgrade_advice" | sed -n '/VERSION_MIX_START/,/VERSION_MIX_END/p' | grep -v '_START\|_END')
  if [[ -n "$version_mix_section" ]]; then
    step "Checking cluster version consistency"
    # BLOCK-level lines mean the upgrade would create an unsupported 3+
    # version mix; remember that so we demand an explicit override below.
    vmix_has_block=false
    while IFS= read -r line; do
      case "$line" in
        BLOCK\|*)
          vmix_has_block=true
          msg="${line#BLOCK|}"
          [[ -n "$msg" ]] && error "$msg"
          ;;
        WARN\|*)
          msg="${line#WARN|}"
          [[ -n "$msg" ]] && warn "$msg"
          ;;
        OK\|*)
          msg="${line#OK|}"
          [[ -n "$msg" ]] && success "$msg"
          ;;
        INFO\|*)
          msg="${line#INFO|}"
          [[ -n "$msg" ]] && info "$msg"
          ;;
      esac
    done <<< "$version_mix_section"
    echo ""
    if $vmix_has_block; then
      error "Upgrading this node would introduce 3+ versions into the cluster."
      confirm_critical "Override version mix warning and continue anyway? THIS IS NOT SUPPORTED."
    fi
  fi
fi
else
warn "Could not retrieve node list (HTTP ${nodes_code}). Skipping upgrade order analysis."
fi
fi # end $UPGRADE_ES node analysis
fi # end of: if ! $FORCE_MODE (cluster health, node analysis, version mix)
# --- Disk space ---
# Warn (but do not hard-fail) when the filesystems used for download,
# installation and data/logs look too small for the upgrade.
step "Checking disk space"

# Print the free space of a mount point in MiB; prints nothing when df
# fails (e.g. path does not exist).
_free_mb() { df -m "$1" 2>/dev/null | awk 'NR==2{print $4}'; }

disk_space_ok=true
free_tmp_mb=$(_free_mb /tmp)
info "Free space in /tmp (downloads): ${free_tmp_mb:-unknown} MB"
if [[ -n "$free_tmp_mb" && "$free_tmp_mb" -lt 1024 ]]; then
  warn "Less than 1 GB free in /tmp. RPM download may fail."
  disk_space_ok=false
fi
free_usr_mb=$(_free_mb /usr)
info "Free space in /usr (installation): ${free_usr_mb:-unknown} MB"
if [[ -n "$free_usr_mb" && "$free_usr_mb" -lt 1024 ]]; then
  warn "Less than 1 GB free in /usr. RPM installation may fail."
  disk_space_ok=false
fi
free_var_mb=$(_free_mb /var)
info "Free space in /var (data/logs): ${free_var_mb:-unknown} MB"
if [[ -n "$free_var_mb" && "$free_var_mb" -lt 512 ]]; then
  warn "Less than 512 MB free in /var. May have issues with logs during upgrade."
  disk_space_ok=false
fi
# Low space is a soft failure: let the operator decide.
if ! $disk_space_ok; then
  confirm_or_abort "Continue despite low disk space warnings?"
fi
# --- glibc version check ---
# The JDK bundled with newer Elasticsearch releases needs a reasonably
# recent glibc; flag known-problematic versions before installing.
if $UPGRADE_ES; then
  step "Checking glibc version"
  info "Newer Elasticsearch versions bundle a JDK that may require a newer glibc."
  info "If glibc is too old (e.g. CentOS 7 ships glibc 2.17), ES may fail to start."
  # First "major.minor" token on the first line of `ldd --version`.
  glibc_version=$(ldd --version 2>&1 | head -1 | grep -oP '[0-9]+\.[0-9]+' | head -1) || glibc_version="unknown"
  info "Detected glibc version: ${BOLD}${glibc_version}${NC}"
  if [[ "$glibc_version" != "unknown" ]]; then
    glibc_major=$(echo "$glibc_version" | cut -d. -f1)
    glibc_minor=$(echo "$glibc_version" | cut -d. -f2)
    # Compare as one combined number so versions order correctly
    # (2.5 < 2.17 < 2.31 < 3.0). The previous major/minor pair test
    # mis-classified any hypothetical 1.x with minor >= 31 as compatible.
    glibc_num=$(( glibc_major * 1000 + glibc_minor ))
    if (( glibc_num < 2017 )); then
      error "glibc ${glibc_version} is very old and likely incompatible with ES ${TARGET_VERSION}."
      confirm_or_abort "Continue despite potential glibc incompatibility?"
    elif (( glibc_num < 2031 )); then
      warn "glibc ${glibc_version} may be too old for the bundled JDK in newer ES releases."
      warn "If ES fails to start after upgrade, this is likely the cause."
      warn "Consider testing this upgrade on one node first before rolling out."
      confirm_or_abort "Acknowledge glibc risk and continue?"
    else
      success "glibc ${glibc_version} should be compatible."
    fi
  else
    warn "Could not detect glibc version. Verify manually that your OS is compatible."
  fi
fi
# --- Deprecation API check ---
# Query GET /_migration/deprecations and count critical vs warning level
# entries. Critical items usually mean removed settings that will stop
# the upgraded node from starting at all.
if $UPGRADE_ES && ! $FORCE_MODE; then
  step "Checking for deprecated settings"
  info "Querying the deprecation API to find settings that may block the upgrade."
  info "Command: GET /_migration/deprecations"
  echo ""
  es_curl GET "/_migration/deprecations"
  deprec_body="$ES_CURL_BODY"
  deprec_code="$ES_CURL_HTTP_CODE"
  if [[ "$deprec_code" == "200" ]]; then
    # Count critical-level deprecations. index_settings is a dict keyed by
    # index name (lists of items); the other categories are flat lists.
    # -1 signals a parse failure.
    crit_count=$(echo "$deprec_body" | python3 -c "
import json, sys
try:
    data = json.load(sys.stdin)
    count = 0
    for category in ['cluster_settings', 'node_settings', 'index_settings', 'ml_settings']:
        items = data.get(category, [])
        if isinstance(items, list):
            for item in items:
                if item.get('level') == 'critical':
                    count += 1
        elif isinstance(items, dict):
            for index_name, idx_items in items.items():
                for item in idx_items:
                    if item.get('level') == 'critical':
                        count += 1
    print(count)
except:
    print(-1)
" 2>/dev/null) || crit_count="-1"
    # Same walk again for warning-level entries.
    warn_count=$(echo "$deprec_body" | python3 -c "
import json, sys
try:
    data = json.load(sys.stdin)
    count = 0
    for category in ['cluster_settings', 'node_settings', 'index_settings', 'ml_settings']:
        items = data.get(category, [])
        if isinstance(items, list):
            for item in items:
                if item.get('level') == 'warning':
                    count += 1
        elif isinstance(items, dict):
            for index_name, idx_items in items.items():
                for item in idx_items:
                    if item.get('level') == 'warning':
                        count += 1
    print(count)
except:
    print(-1)
" 2>/dev/null) || warn_count="-1"
    if [[ "$crit_count" == "-1" ]]; then
      warn "Could not parse deprecation response. Review manually:"
      warn " GET /_migration/deprecations"
    elif [[ "$crit_count" -gt 0 ]]; then
      error "Found ${BOLD}${crit_count} CRITICAL${NC}${RED} deprecation(s) that may prevent startup after upgrade!${NC}"
      # Print each critical item; any failure here is non-fatal (|| true).
      echo "$deprec_body" | python3 -c "
import json, sys
data = json.load(sys.stdin)
for category in ['cluster_settings', 'node_settings', 'index_settings', 'ml_settings']:
    items = data.get(category, [])
    if isinstance(items, list):
        for item in items:
            if item.get('level') == 'critical':
                print(f\" [CRITICAL] [{category}] {item.get('message', 'unknown')}\")
    elif isinstance(items, dict):
        for idx_name, idx_items in items.items():
            for item in idx_items:
                if item.get('level') == 'critical':
                    print(f\" [CRITICAL] [index: {idx_name}] {item.get('message', 'unknown')}\")
" 2>/dev/null || true
      echo ""
      error "Critical deprecations MUST be resolved before upgrading."
      error "Removed settings will prevent Elasticsearch from starting."
      confirm_critical "Continue despite critical deprecation warnings?"
    elif [[ "$warn_count" -gt 0 ]]; then
      warn "Found ${warn_count} deprecation warning(s) (non-critical)."
      info "Review with: GET /_migration/deprecations"
      info "These won't block the upgrade but should be addressed."
    else
      success "No deprecation issues found."
    fi
  elif [[ "$deprec_code" == "404" ]]; then
    warn "Deprecation API not available (older ES version). Skipping check."
    warn "Manually review release notes for removed settings between ${ES_CURRENT} and ${TARGET_VERSION}."
  else
    warn "Deprecation API returned HTTP ${deprec_code}. Skipping check."
  fi
fi
# --- Monitoring exporter config check ---
# Flag legacy self-monitoring settings that are deprecated in 8.x.
if $UPGRADE_ES; then
  step "Checking for legacy monitoring configuration"
  es_config="/etc/elasticsearch/elasticsearch.yml"
  if [[ -f "$es_config" ]] && grep -q "xpack.monitoring.exporters" "$es_config" 2>/dev/null; then
    warn "Found ${BOLD}xpack.monitoring.exporters${NC}${YELLOW} in elasticsearch.yml${NC}"
    warn "Legacy monitoring (internal collection) is deprecated since 8.x."
    warn "Plan to migrate to Elastic Agent or Metricbeat monitoring collection."
    echo ""
  fi
  if [[ -f "$es_config" ]] && grep -q "xpack.monitoring.collection.enabled" "$es_config" 2>/dev/null; then
    warn "Found ${BOLD}xpack.monitoring.collection.enabled${NC}${YELLOW} in elasticsearch.yml${NC}"
    warn "Legacy internal monitoring collection is deprecated."
    echo ""
  fi
fi
# --- Check for open ML jobs (warn about potential disruption) ---
# Open anomaly detection jobs are interrupted when the node stops; warn
# the operator so the pause in results is expected.
if $UPGRADE_ES && ! $FORCE_MODE; then
  step "Checking for running ML jobs"
  # FIX: job *state* is only reported by the _stats endpoint. The plain
  # /_ml/anomaly_detectors/_all listing returns job configurations with
  # no 'state' field, so the previous check always counted 0 open jobs.
  es_curl GET "/_ml/anomaly_detectors/_all/_stats?allow_no_match=true"
  ml_body="$ES_CURL_BODY"
  ml_code="$ES_CURL_HTTP_CODE"
  if [[ "$ml_code" == "200" ]]; then
    # Count jobs whose state is 'opened'; parse failures fall back to 0
    # so this stays a best-effort check.
    open_jobs=$(echo "$ml_body" | python3 -c "
import json, sys
try:
    data = json.load(sys.stdin)
    jobs = data.get('jobs', [])
    opened = [j['job_id'] for j in jobs if j.get('state') == 'opened']
    print(len(opened))
except:
    print(0)
" 2>/dev/null) || open_jobs="0"
    if [[ "$open_jobs" -gt 0 ]]; then
      warn "There are ${BOLD}${open_jobs}${NC}${YELLOW} open ML anomaly detection jobs.${NC}"
      warn "These will be interrupted when Elasticsearch stops."
      warn "They should recover automatically after restart, but may need"
      warn "a few minutes to re-open and resume processing."
      info "To see open jobs: GET /_ml/anomaly_detectors/_all/_stats?allow_no_match=true"
    else
      success "No open ML jobs."
    fi
  fi
fi
# ==============================================================================
# Summary & Confirmation
# ==============================================================================
# Show the full plan and get one final go/no-go before touching anything.
header "Upgrade Plan"
# cluster_name may be unset when the ES-dependent pre-flight checks were
# skipped (e.g. --force / --kibana-only), hence the :- guard.
if [[ -n "${cluster_name:-}" ]]; then
  echo -e " Cluster: ${BOLD}${cluster_name}${NC}"
fi
echo -e " Target version: ${BOLD}${TARGET_VERSION}${NC}"
echo -e " Architecture: ${BOLD}${ARCH}${NC}"
# Each $FLAG && / $FLAG || pair prints exactly one YES or NO line.
$UPGRADE_ES && echo -e " Upgrade ES: ${GREEN}YES${NC} (${ES_CURRENT} -> ${TARGET_VERSION})"
$UPGRADE_ES || echo -e " Upgrade ES: ${YELLOW}NO${NC}"
$UPGRADE_KIBANA && echo -e " Upgrade Kibana: ${GREEN}YES${NC} (${KIBANA_CURRENT} -> ${TARGET_VERSION})"
$UPGRADE_KIBANA || echo -e " Upgrade Kibana: ${YELLOW}NO${NC}"
echo ""
echo -e "${BOLD}The following steps will be performed:${NC}"
echo " Pre-upgrade:"
echo " - Snapshot reminder"
echo " - Remove version locks (if any)"
echo " - Download and verify all RPMs"
echo " - Backup configuration files"
# Quoted 'EOF' delimiters keep the step lists literal (no expansion).
$UPGRADE_ES && cat <<'EOF'
 Elasticsearch:
 1. Disable shard allocation
 2. Flush all indices (best effort)
 3. Stop Elasticsearch
 4. Install Elasticsearch RPM
 4a. Keystore check
 4b. Check for .rpmnew files
 4c. Verify JVM heap settings
 5. Reload systemd daemon
 6. Start Elasticsearch
 7. Wait for node to rejoin and re-enable allocation
 7a. Re-enable shard allocation
 7b. Wait for cluster recovery
EOF
$UPGRADE_KIBANA && cat <<'EOF'
 Kibana:
 K1. Verify shard allocation is enabled
 K2. Stop Kibana
 K3. Install Kibana RPM
 K4. Reload systemd daemon
 K5. Start Kibana and wait for ready
EOF
echo ""
confirm_or_abort "Proceed with the upgrade?"
# ==============================================================================
# Pre-upgrade: Snapshot Reminder
# ==============================================================================
# Snapshots are the only reliable rollback path for Elasticsearch data,
# so make the operator acknowledge the risk explicitly (skipped in force
# mode, where only a warning is printed).
if $UPGRADE_ES; then
  if $FORCE_MODE; then
    warn "FORCE MODE: Skipping snapshot reminder. Ensure you have a backup!"
  else
    header "Snapshot Reminder"
    warn "Before upgrading, you should have a recent snapshot of your data."
    warn "Snapshots are the ONLY reliable way to roll back Elasticsearch data."
    echo ""
    info "To check existing snapshots:"
    info " GET /_snapshot/_all"
    info " GET /_snapshot/<repo>/_all"
    echo ""
    info "To create a snapshot:"
    info " PUT /_snapshot/<repo>/<snapshot_name>?wait_for_completion=true"
    echo ""
    confirm_or_abort "I have a recent snapshot or accept the risk of proceeding without one"
  fi
fi
# ==============================================================================
# Pre-upgrade: Remove version locks
# ==============================================================================
# yum/dnf versionlock entries would pin the old package and silently make
# localinstall a no-op, so remove them for the components being upgraded.
header "Checking Version Locks"
versionlock_removed_es=false
versionlock_removed_kibana=false
# Fetch the lock list exactly once (the old code re-ran `versionlock list`
# per component, and carried a redundant `2>&1` after `&>/dev/null`).
have_versionlock=false
lock_list=""
if command -v "${PKG_MGR}" &>/dev/null && lock_list=$(${PKG_MGR} versionlock list 2>/dev/null); then
  have_versionlock=true
fi
if $have_versionlock; then
  info "Checking for version locks..."
  if $UPGRADE_ES && grep -q "elasticsearch" <<< "$lock_list"; then
    warn "Found version lock for Elasticsearch"
    info "Removing lock: ${PKG_MGR} versionlock delete elasticsearch*"
    # Older plugin versions need the '0:' epoch prefix; try both forms.
    ${PKG_MGR} versionlock delete "elasticsearch*" || ${PKG_MGR} versionlock delete "0:elasticsearch*" || true
    versionlock_removed_es=true
    success "Elasticsearch version lock removed."
  fi
  if $UPGRADE_KIBANA && grep -q "kibana" <<< "$lock_list"; then
    warn "Found version lock for Kibana"
    info "Removing lock: ${PKG_MGR} versionlock delete kibana*"
    ${PKG_MGR} versionlock delete "kibana*" || ${PKG_MGR} versionlock delete "0:kibana*" || true
    versionlock_removed_kibana=true
    success "Kibana version lock removed."
  fi
  if ! $versionlock_removed_es && ! $versionlock_removed_kibana; then
    success "No version locks found."
  fi
else
  info "Version lock plugin not available or not installed. Skipping."
fi
# ==============================================================================
# Pre-upgrade: Download all RPMs upfront
# ==============================================================================
# Fetch everything before stopping any service so a download failure
# cannot strand a half-upgraded node.
header "Downloading RPM Packages"
mkdir -p "$DOWNLOAD_DIR"
chmod 700 "$DOWNLOAD_DIR"
ES_RPM="elasticsearch-${TARGET_VERSION}-${ARCH}.rpm"
ES_RPM_URL="https://artifacts.elastic.co/downloads/elasticsearch/${ES_RPM}"
ES_SHA_URL="${ES_RPM_URL}.sha512"
KIBANA_RPM="kibana-${TARGET_VERSION}-${ARCH}.rpm"
KIBANA_RPM_URL="https://artifacts.elastic.co/downloads/kibana/${KIBANA_RPM}"
KIBANA_SHA_URL="${KIBANA_RPM_URL}.sha512"

# Download one RPM plus its .sha512 and verify the checksum.
# The ES and Kibana flows were verbatim duplicates; this helper is the
# single copy. Exits on download failure; checksum mismatch requires an
# explicit critical confirmation.
#   $1 = product label (e.g. "Elasticsearch")
#   $2 = RPM filename   $3 = RPM URL   $4 = checksum URL
_fetch_and_verify_rpm() {
  local label=$1 rpm=$2 rpm_url=$3 sha_url=$4
  step "Downloading ${label} RPM"
  info "URL: ${rpm_url}"
  if [[ -f "${DOWNLOAD_DIR}/${rpm}" ]]; then
    warn "File already exists: ${DOWNLOAD_DIR}/${rpm}"
    if confirm "Re-download and overwrite?" "n"; then
      rm -f "${DOWNLOAD_DIR}/${rpm}" "${DOWNLOAD_DIR}/${rpm}.sha512"
    else
      info "Using existing file."
    fi
  fi
  if [[ ! -f "${DOWNLOAD_DIR}/${rpm}" ]]; then
    if ! download_file "${rpm_url}" "${DOWNLOAD_DIR}/${rpm}" "${label} ${TARGET_VERSION} RPM"; then
      exit 1
    fi
    success "${label} RPM downloaded."
  fi
  info "Verifying SHA512 checksum..."
  if ! curl -sf --max-time 30 --retry 3 -o "${DOWNLOAD_DIR}/${rpm}.sha512" "${sha_url}"; then
    error "Failed to download ${label} checksum file."
    exit 1
  fi
  # The .sha512 file format is "<hash> <filename>"; compare hash only.
  # (Assigned without 'local' on purpose: the original left these global.)
  expected_hash=$(awk '{print $1}' "${DOWNLOAD_DIR}/${rpm}.sha512")
  actual_hash=$(sha512sum "${DOWNLOAD_DIR}/${rpm}" | awk '{print $1}')
  if [[ "$expected_hash" == "$actual_hash" ]]; then
    success "${label} checksum verified."
  else
    error "${label} checksum mismatch!"
    error "Expected: ${expected_hash}"
    error "Actual:   ${actual_hash}"
    confirm_critical "This is dangerous. Continue despite checksum failure?"
  fi
}

if $UPGRADE_ES; then
  _fetch_and_verify_rpm "Elasticsearch" "$ES_RPM" "$ES_RPM_URL" "$ES_SHA_URL"
fi
if $UPGRADE_KIBANA; then
  _fetch_and_verify_rpm "Kibana" "$KIBANA_RPM" "$KIBANA_RPM_URL" "$KIBANA_SHA_URL"
fi
success "All RPM packages downloaded and verified."
# ==============================================================================
# Pre-upgrade: Backup configuration files
# ==============================================================================
# Copy /etc/<component> and any systemd drop-ins into a timestamped
# directory under /var/backup, restricted to root.
header "Backing Up Configuration Files"
BACKUP_TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_BASE="/var/backup"
BACKUP_DIR="${BACKUP_BASE}/elastic-upgrade-${BACKUP_TIMESTAMP}"
if [[ ! -d "$BACKUP_BASE" ]]; then
  info "Creating backup directory: ${BACKUP_BASE}"
  mkdir -p "$BACKUP_BASE"
  chmod 700 "$BACKUP_BASE"
fi
# Check backup disk space
free_backup_mb=$(df -m "$BACKUP_BASE" 2>/dev/null | awk 'NR==2{print $4}')
if [[ -n "$free_backup_mb" && "$free_backup_mb" -lt 256 ]]; then
  warn "Less than 256 MB free on backup filesystem (${BACKUP_BASE})."
  confirm_or_abort "Continue without sufficient backup space?"
fi
mkdir -p "$BACKUP_DIR" || {
  error "Failed to create backup directory: ${BACKUP_DIR}"
  error "Check permissions on ${BACKUP_BASE}"
  exit 1
}
chmod 700 "$BACKUP_DIR"
if $UPGRADE_ES; then
  if [[ ! -d /etc/elasticsearch ]]; then
    warn "No /etc/elasticsearch directory found to backup."
  else
    step "Backing up Elasticsearch configuration"
    cp -a /etc/elasticsearch "$BACKUP_DIR/elasticsearch"
    success "Elasticsearch config backed up to: ${BACKUP_DIR}/elasticsearch/"
    config_count=$(find "$BACKUP_DIR/elasticsearch" -type f | wc -l)
    info " Backed up ${config_count} file(s)"
  fi
  # Also backup systemd overrides (contains LimitNOFILE, LimitMEMLOCK, etc.)
  if [[ -d /etc/systemd/system/elasticsearch.service.d ]]; then
    step "Backing up Elasticsearch systemd overrides"
    mkdir -p "$BACKUP_DIR/systemd-elasticsearch"
    cp -a /etc/systemd/system/elasticsearch.service.d/* "$BACKUP_DIR/systemd-elasticsearch/" 2>/dev/null || true
    success "Systemd overrides backed up to: ${BACKUP_DIR}/systemd-elasticsearch/"
  fi
fi
if $UPGRADE_KIBANA; then
  if [[ ! -d /etc/kibana ]]; then
    warn "No /etc/kibana directory found to backup."
  else
    step "Backing up Kibana configuration"
    cp -a /etc/kibana "$BACKUP_DIR/kibana"
    success "Kibana config backed up to: ${BACKUP_DIR}/kibana/"
    config_count=$(find "$BACKUP_DIR/kibana" -type f | wc -l)
    info " Backed up ${config_count} file(s)"
  fi
  if [[ -d /etc/systemd/system/kibana.service.d ]]; then
    mkdir -p "$BACKUP_DIR/systemd-kibana"
    cp -a /etc/systemd/system/kibana.service.d/* "$BACKUP_DIR/systemd-kibana/" 2>/dev/null || true
    success "Kibana systemd overrides backed up."
  fi
fi
info "All backups stored in: ${BOLD}${BACKUP_DIR}${NC}"
echo ""
# ==============================================================================
# Elasticsearch Upgrade
# ==============================================================================
if $UPGRADE_ES; then
header "Upgrading Elasticsearch: ${ES_CURRENT} -> ${TARGET_VERSION}"
info "Using pre-downloaded RPM: ${DOWNLOAD_DIR}/${ES_RPM}"
echo ""
# --- Step 1: Disable shard allocation ---
if $FORCE_MODE; then
  # In force mode ES may be down, so the REST-based steps are skipped.
  step "Step 1 - Disable shard allocation"
  warn "FORCE MODE: Skipping (Elasticsearch may not be running)."
  step "Step 2 - Flush all indices"
  warn "FORCE MODE: Skipping (Elasticsearch may not be running)."
else
  step "Step 1 - Disable shard allocation"
  info "This prevents the cluster from rebalancing shards while the node is down."
  info "Command: PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":\"primaries\"}}"
  echo ""
  confirm_or_abort "Disable shard allocation now?"
  # "primaries" is the rolling-upgrade setting: primaries may still
  # allocate, but replicas will not be rebalanced while this node is down.
  if es_curl_retry PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":"primaries"}}'; then
    # NOTE(review): _allocation_disabled is presumably read elsewhere
    # (e.g. a trap/cleanup handler that restores allocation) — confirm.
    _allocation_disabled=true
    success "Shard allocation disabled (set to primaries only)."
  else
    error "Failed to disable shard allocation (HTTP $ES_CURL_HTTP_CODE)."
    confirm_or_abort "Continue despite this failure?"
  fi
  # --- Step 2: Flush ---
  step "Step 2 - Flush all indices"
  info "Flushing ensures all data is written to disk before stopping."
  info "Command: POST /_flush"
  echo ""
  confirm_or_abort "Flush all indices now?"
  # Use longer timeout for flush -- large clusters may take a while
  old_timeout="$API_TIMEOUT"
  API_TIMEOUT=120
  es_curl POST "/_flush"
  flush_code="$ES_CURL_HTTP_CODE"
  API_TIMEOUT="$old_timeout"
  if [[ "$flush_code" == "200" ]]; then
    success "Flush completed."
  else
    # Flush is best-effort; a non-200 response is not treated as fatal.
    warn "Flush returned HTTP $flush_code (this is usually okay to continue)."
  fi
fi # end of: if ! $FORCE_MODE (steps 1-2)
# --- Step 3: Stop Elasticsearch ---
step "Step 3 - Stop Elasticsearch service"
if ! systemctl is-active elasticsearch &>/dev/null; then
  info "Elasticsearch service is not running. Skipping stop."
else
  info "Command: systemctl stop elasticsearch (timeout: ${STOP_TIMEOUT}s)"
  echo ""
  confirm_or_abort "Stop Elasticsearch now?"
  # Bound the stop with timeout(1) so a hung shutdown cannot stall the
  # script indefinitely; offer a SIGKILL escalation if it expires.
  if ! timeout "$STOP_TIMEOUT" systemctl stop elasticsearch; then
    error "Elasticsearch did not stop within ${STOP_TIMEOUT} seconds."
    warn "The service may be stuck. Options:"
    warn " 1. Wait longer: systemctl stop elasticsearch"
    warn " 2. Force kill: systemctl kill -s SIGKILL elasticsearch"
    confirm_or_abort "Force kill the Elasticsearch process?"
    systemctl kill -s SIGKILL elasticsearch
    sleep 2  # give the process a moment to exit before continuing
  fi
  success "Elasticsearch stopped."
fi
# --- Step 4: Install RPM ---
step "Step 4 - Install Elasticsearch RPM"
info "Command: ${PKG_MGR} -y localinstall ${DOWNLOAD_DIR}/${ES_RPM}"
# Modified configs stay in place; new defaults arrive as .rpmnew files
# (checked in step 4b below).
info "Note: Config files in /etc/elasticsearch/ will NOT be overwritten."
echo ""
confirm_or_abort "Install the Elasticsearch RPM now?"
# Deliberately unguarded: under set -e a failed install aborts the script.
${PKG_MGR} -y localinstall "${DOWNLOAD_DIR}/${ES_RPM}"
success "Elasticsearch RPM installed."
# --- Step 4a: Fix keystore permissions and upgrade ---
# After the RPM install, make sure the keystore is readable by the
# service account and migrate it to the new on-disk format if needed.
step "Step 4a - Checking Elasticsearch keystore"
keystore_file="/etc/elasticsearch/elasticsearch.keystore"
keystore_tmp="/etc/elasticsearch/elasticsearch.keystore.tmp"
# Resolve the service account: prefer systemd's --value output, fall back
# to parsing "User=..." on older systemd releases that lack --value.
ES_USER=$(systemctl show elasticsearch -p User --value 2>/dev/null) || \
  ES_USER=$(systemctl show elasticsearch -p User 2>/dev/null | cut -d= -f2) || true
ES_GROUP=$(systemctl show elasticsearch -p Group --value 2>/dev/null) || \
  ES_GROUP=$(systemctl show elasticsearch -p Group 2>/dev/null | cut -d= -f2) || true
# ${var:-default} covers both unset and empty, so one defaulting step is
# enough (the previous extra [[ -z ]] checks were dead code).
ES_USER="${ES_USER:-elasticsearch}"
ES_GROUP="${ES_GROUP:-elasticsearch}"
info "Elasticsearch runs as user: ${ES_USER}, group: ${ES_GROUP}"
if [[ -f "$keystore_file" ]]; then
  info "Keystore exists, checking permissions..."
  # Best-effort permission repair; failures are non-fatal by design.
  chown -R "${ES_USER}:${ES_GROUP}" /etc/elasticsearch 2>/dev/null || true
  chmod 750 /etc/elasticsearch 2>/dev/null || true
  chmod 660 "$keystore_file" 2>/dev/null || true
  if [[ -f "$keystore_tmp" ]]; then
    # A leftover .tmp file from an interrupted earlier upgrade can block
    # the keystore tool.
    warn "Found stale keystore temp file, removing..."
    rm -f "$keystore_tmp"
  fi
  info "Running keystore upgrade as ${ES_USER} user..."
  if [[ -x /usr/share/elasticsearch/bin/elasticsearch-keystore ]]; then
    if sudo -u "$ES_USER" /usr/share/elasticsearch/bin/elasticsearch-keystore upgrade 2>&1; then
      success "Keystore upgraded successfully."
    else
      warn "Keystore upgrade returned non-zero (may already be current format)."
      info "If ES fails to start with keystore errors, check:"
      info " ls -la /etc/elasticsearch/elasticsearch.keystore"
      info " sudo -u ${ES_USER} /usr/share/elasticsearch/bin/elasticsearch-keystore upgrade"
    fi
  else
    warn "elasticsearch-keystore binary not found or not executable."
  fi
else
  info "No existing keystore found (will be created on first start if needed)."
fi
# --- Step 4b: Check for .rpmnew config files ---
step "Step 4b - Checking for .rpmnew configuration files"
info "RPM upgrades may create .rpmnew files when your config has been modified."
echo ""
rpmnew_found=false
# NUL-delimited find/read pairing is safe for odd filenames; process
# substitution (not a pipe) keeps the loop in the current shell so
# rpmnew_found survives past the loop.
while IFS= read -r -d '' rpmnew_file; do
  rpmnew_found=true
  original="${rpmnew_file%.rpmnew}"  # path of the admin's kept config
  warn "Found: ${BOLD}${rpmnew_file}${NC}"
  if [[ -f "$original" ]]; then
    diff_output=$(diff --brief "$original" "$rpmnew_file" 2>/dev/null) || true
    if [[ -n "$diff_output" ]]; then
      warn " -> Differs from current ${original}"
      info " -> Review with: diff ${original} ${rpmnew_file}"
    fi
  fi
done < <(find /etc/elasticsearch -name '*.rpmnew' -print0 2>/dev/null)
if $rpmnew_found; then
  echo ""
  warn "Review the .rpmnew files above and merge any needed changes into your config."
  warn "The .rpmnew files contain the new defaults from version ${TARGET_VERSION}."
  confirm_or_abort "Have you noted the .rpmnew files? Continue?"
else
  success "No .rpmnew config files found (your configs were preserved cleanly)."
fi
# --- Step 4c: Verify JVM heap settings ---
step "Step 4c - Verifying JVM heap settings"
info "Checking that your JVM heap settings (-Xms / -Xmx) are still in place."
echo ""
jvm_opts_file="/etc/elasticsearch/jvm.options"
jvm_opts_d="/etc/elasticsearch/jvm.options.d"
heap_found=false
xms=""
xmx=""
# Scan jvm.options.d/*.options first. Within a file the LAST -Xms/-Xmx
# wins (tail -1); across files, the last file scanned wins.
if [[ -d "$jvm_opts_d" ]]; then
  for f in "$jvm_opts_d"/*.options; do
    if [[ -f "$f" ]]; then  # guards the literal glob when nothing matches
      _xms=$(grep -oP '^\s*-Xms\K\S+' "$f" 2>/dev/null | tail -1) || true
      _xmx=$(grep -oP '^\s*-Xmx\K\S+' "$f" 2>/dev/null | tail -1) || true
      if [[ -n "$_xms" || -n "$_xmx" ]]; then
        heap_found=true
        [[ -n "$_xms" ]] && xms="$_xms"
        [[ -n "$_xmx" ]] && xmx="$_xmx"
        info " Found in ${BOLD}${f}${NC}:"
        [[ -n "$_xms" ]] && info " -Xms${_xms}"
        [[ -n "$_xmx" ]] && info " -Xmx${_xmx}"
      fi
    fi
  done
fi
# The base jvm.options only fills values jvm.options.d did not provide,
# i.e. jvm.options.d takes precedence.
if [[ -f "$jvm_opts_file" ]]; then
  _xms=$(grep -oP '^\s*-Xms\K\S+' "$jvm_opts_file" 2>/dev/null | tail -1) || true
  _xmx=$(grep -oP '^\s*-Xmx\K\S+' "$jvm_opts_file" 2>/dev/null | tail -1) || true
  if [[ -n "$_xms" || -n "$_xmx" ]]; then
    if ! $heap_found; then
      heap_found=true
    fi
    [[ -n "$_xms" && -z "$xms" ]] && xms="$_xms"
    [[ -n "$_xmx" && -z "$xmx" ]] && xmx="$_xmx"
    info " Found in ${BOLD}${jvm_opts_file}${NC}:"
    [[ -n "$_xms" ]] && info " -Xms${_xms}"
    [[ -n "$_xmx" ]] && info " -Xmx${_xmx}"
  fi
fi
if [[ -f "${jvm_opts_file}.rpmnew" ]]; then
  warn "Found ${BOLD}${jvm_opts_file}.rpmnew${NC} — new JVM defaults from ${TARGET_VERSION}."
  info " Review with: diff ${jvm_opts_file} ${jvm_opts_file}.rpmnew"
fi
if $heap_found; then
  echo ""
  if [[ -n "$xms" && -n "$xmx" && "$xms" != "$xmx" ]]; then
    warn "Xms (${xms}) and Xmx (${xmx}) differ. Elastic recommends setting them equal."
  fi
  info "Verify these heap values are correct for this node before starting."
  confirm_or_abort "JVM heap settings look correct? Continue?"
else
  warn "No explicit -Xms/-Xmx found in jvm.options or jvm.options.d/"
  warn "Elasticsearch will use its built-in defaults (typically 50% of RAM, max 31g)."
  warn "The default may have changed between versions. Verify this is acceptable."
  confirm_or_abort "Continue with default heap settings?"
fi
# --- Step 5: Reload systemd ---
step "Step 5 - Reload systemd daemon"
# The RPM may have replaced the unit file; pick up the new definition.
systemctl daemon-reload
success "Systemd daemon reloaded."
# --- Step 6: Start Elasticsearch ---
step "Step 6 - Start Elasticsearch service"
info "Command: systemctl start elasticsearch"
echo ""
confirm_or_abort "Start Elasticsearch now?"
systemctl start elasticsearch
info "Elasticsearch starting... waiting for it to become available."
# Poll the root endpoint once per second until it answers HTTP 200 or the
# STARTUP_WAIT budget (in seconds) is exhausted.
retries=0
while [[ $retries -lt $STARTUP_WAIT ]]; do
  es_curl GET "/" >/dev/null 2>&1
  if [[ "$ES_CURL_HTTP_CODE" == "200" ]]; then
    break
  fi
  retries=$((retries + 1))
  # \r keeps the countdown on a single terminal line.
  echo -ne "\r Waiting for Elasticsearch... (${retries}/${STARTUP_WAIT}s)"
  sleep 1
done
echo ""
# If the poll loop above exhausted STARTUP_WAIT, scan the journal for the
# most common failure signatures and print targeted guidance; otherwise
# confirm the version the node is now running.
if [[ $retries -ge $STARTUP_WAIT ]]; then
  error "Elasticsearch did not start within ${STARTUP_WAIT} seconds."
  error "Check the logs: journalctl -u elasticsearch -f"
  # Best effort — the journal may be unavailable on this host.
  recent_logs=$(journalctl -u elasticsearch --no-pager -n 30 2>/dev/null) || recent_logs=""
  # Signature 1: bundled JDK needs a newer glibc than the host provides.
  if echo "$recent_logs" | grep -qi "GLIBC\|glibc\|libc\.so\|GLIBCXX"; then
    error ""
    error "=== GLIBC INCOMPATIBILITY DETECTED ==="
    error "The Elasticsearch JDK requires a newer glibc than this system provides."
    error "Options:"
    error " 1. Upgrade your OS (e.g. CentOS 7 -> RHEL 8/9, Rocky 8/9, etc.)"
    error " 2. Install a compatible system JDK and configure ES_JAVA_HOME"
    error " 3. Roll back: ${PKG_MGR} -y downgrade elasticsearch-${ES_CURRENT}"
  fi
  # Signature 2: settings removed or renamed in the target version.
  if echo "$recent_logs" | grep -qi "unknown setting\|unsupported setting\|IllegalArgument"; then
    error ""
    error "=== CONFIGURATION ERROR DETECTED ==="
    error "Elasticsearch may have failed due to removed/deprecated settings."
    error "Check elasticsearch.yml for settings that were removed in ${TARGET_VERSION}."
  fi
  # Signature 3: keystore format needs upgrading after the package swap.
  if echo "$recent_logs" | grep -qi "keystore\|KeyStoreException"; then
    error ""
    error "=== KEYSTORE ERROR DETECTED ==="
    error "Try running:"
    error " sudo -u ${ES_USER:-elasticsearch} /usr/share/elasticsearch/bin/elasticsearch-keystore upgrade"
  fi
  error ""
  error "To roll back: ${PKG_MGR} -y downgrade elasticsearch-${ES_CURRENT}"
  error "Config backup: ${BACKUP_DIR}"
  confirm_or_abort "Continue anyway (maybe it needs more time)?"
else
  # Node answered within the timeout — report the version it is running.
  es_curl GET "/"
  version_body="$ES_CURL_BODY"
  new_version=$(echo "$version_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('version',{}).get('number','unknown'))" 2>/dev/null) || new_version="unknown"
  success "Elasticsearch is running. Version: ${BOLD}${new_version}${NC}"
fi
# --- Step 7: Wait for node to rejoin ---
# In force mode (ES was not reachable before the upgrade) all
# cluster-level steps are skipped and the operator is told how to
# perform them manually.
if $FORCE_MODE; then
  step "Step 7 - Wait for node to rejoin cluster"
  warn "FORCE MODE: Skipping cluster rejoin check."
  warn "Verify manually that the node has rejoined: GET /_cat/nodes"
  # Disarm the exit trap: allocation must be handled manually from here on.
  _allocation_disabled=false
  step "Step 7a - Re-enable shard allocation"
  warn "FORCE MODE: Skipping. Re-enable manually if needed:"
  warn " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}"
  step "Step 7b - Wait for cluster recovery"
  warn "FORCE MODE: Skipping. Monitor recovery manually: GET /_cluster/health?pretty"
else
  step "Step 7 - Wait for node to rejoin cluster"
  info "Checking cluster membership..."
  es_curl GET "/_cluster/health?pretty"
  health_body="$ES_CURL_BODY"
  current_nodes=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('number_of_nodes',0))" 2>/dev/null) || current_nodes="?"
  # num_nodes was captured during pre-flight; ${num_nodes:-?} keeps this
  # safe under `set -u` if that step was skipped.
  info "Nodes in cluster: ${current_nodes} (was: ${num_nodes:-?})"
  success "Node has rejoined the cluster."
  # --- Step 7a: Re-enable shard allocation ---
  # Allocation was disabled before the restart; clear both scopes, then
  # read the settings back to confirm the override is really gone.
  step "Step 7a - Re-enable shard allocation"
  info "Clearing both persistent and transient allocation overrides."
  echo ""
  confirm_or_abort "Re-enable shard allocation now?"
  if es_curl_retry PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":null},"transient":{"cluster.routing.allocation.enable":null}}'; then
    # Verify it actually took effect
    info "Verifying allocation setting was cleared..."
    es_curl GET "/_cluster/settings?flat_settings=true"
    verify_body="$ES_CURL_BODY"
    if echo "$verify_body" | grep -q '"cluster\.routing\.allocation\.enable"' 2>/dev/null; then
      error "Allocation setting is STILL present after clearing!"
      error "Fix manually:"
      error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}"
      confirm_or_abort "Continue anyway?"
    else
      # Disarm the exit trap now that allocation is confirmed re-enabled.
      _allocation_disabled=false
      success "Verified: shard allocation overrides cleared."
    fi
  else
    error "Failed to re-enable shard allocation (HTTP $ES_CURL_HTTP_CODE)."
    error "You MUST manually re-enable it:"
    error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}"
  fi
  # --- Step 7b: Wait for cluster recovery ---
  step "Step 7b - Wait for cluster recovery"
  # Temporarily raise recovery concurrency so replicas catch up faster;
  # it is reset after the wait loop (tracked via _recovery_boosted).
  info "Temporarily increasing concurrent incoming recoveries to 10..."
  if es_curl_quiet PUT "/_cluster/settings" '{"transient":{"cluster.routing.allocation.node_concurrent_incoming_recoveries":10}}'; then
    _recovery_boosted=true
    success "Recovery concurrency boosted."
  else
    warn "Could not increase recovery concurrency. Continuing with defaults."
  fi
  info "Monitoring cluster health until green — timeout $((RECOVERY_WAIT / 60)) minutes (Ctrl+C to stop waiting)..."
  echo ""
  # Poll health every 5 s; `retries` counts elapsed seconds, not attempts.
  retries=0
  while [[ $retries -lt $RECOVERY_WAIT ]]; do
    es_curl GET "/_cluster/health"
    health_body="$ES_CURL_BODY"
    status=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('status','unknown'))" 2>/dev/null) || status="unknown"
    init=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('initializing_shards',0))" 2>/dev/null) || init="?"
    reloc=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('relocating_shards',0))" 2>/dev/null) || reloc="?"
    unass=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('unassigned_shards',0))" 2>/dev/null) || unass="?"
    recovery_info=""
    # Enrich the status line with active recovery progress when available.
    es_curl GET "/_cat/recovery?active_only=true&h=index,shard,stage,bytes_percent&format=json"
    recovery_body="$ES_CURL_BODY"
    if [[ "$ES_CURL_HTTP_CODE" == "200" ]]; then
      active_recoveries=$(echo "$recovery_body" | python3 -c "import json,sys; print(len(json.load(sys.stdin)))" 2>/dev/null) || active_recoveries=0
      if [[ "$active_recoveries" -gt 0 ]]; then
        avg_pct=$(echo "$recovery_body" | python3 -c "
import json, sys
data = json.load(sys.stdin)
pcts = [float(r.get('bytes_percent','0').rstrip('%')) for r in data if r.get('bytes_percent')]
print(f'{sum(pcts)/len(pcts):.1f}' if pcts else '0')
" 2>/dev/null) || avg_pct="?"
        recovery_info=" | Recoveries: ${active_recoveries} (avg ${avg_pct}%)"
      fi
    fi
    if [[ "$status" == "green" ]]; then
      echo ""
      success "Cluster is ${GREEN}${BOLD}GREEN${NC}!"
      break
    fi
    elapsed_min=$((retries / 60))
    elapsed_sec=$((retries % 60))
    printf "\r Status: %-6s | Init: %-3s | Reloc: %-3s | Unassigned: %-3s%s | %dm%02ds " \
      "$status" "$init" "$reloc" "$unass" "$recovery_info" "$elapsed_min" "$elapsed_sec"
    retries=$((retries + 5))
    sleep 5
  done
  # Always undo the recovery-concurrency boost, whether green or not.
  if $_recovery_boosted; then
    info "Resetting recovery concurrency to default..."
    if es_curl_quiet PUT "/_cluster/settings" '{"transient":{"cluster.routing.allocation.node_concurrent_incoming_recoveries":null}}'; then
      _recovery_boosted=false
      success "Recovery concurrency reset to default."
    else
      warn "Could not reset recovery concurrency. Reset manually:"
      warn " PUT /_cluster/settings {\"transient\":{\"cluster.routing.allocation.node_concurrent_incoming_recoveries\":null}}"
    fi
  fi
  # Not reaching green here is expected mid-rolling-upgrade; warn only.
  if [[ $retries -ge $RECOVERY_WAIT ]]; then
    echo ""
    warn "Cluster did not reach GREEN within $((RECOVERY_WAIT / 60)) minutes."
    warn "Current status: ${status:-unknown}"
    warn "This may be normal if you are doing a rolling upgrade across multiple nodes."
    info "Monitor with: GET /_cluster/health?pretty"
    info "Active recoveries: GET /_cat/recovery?active_only=true&v"
  fi
fi # end of: if ! $FORCE_MODE (steps 7-7b)
success "Elasticsearch upgrade complete on this node."
fi
# ==============================================================================
# Kibana Upgrade
# ==============================================================================
if $UPGRADE_KIBANA; then
header "Upgrading Kibana: ${KIBANA_CURRENT} -> ${TARGET_VERSION}"
# Kibana migrations require cluster.routing.allocation.enable to be "all" (or unset).
# Kibana saved-object migrations create/relocate indices, which fails if
# shard allocation was left disabled by an earlier (aborted) ES upgrade.
if ! $FORCE_MODE; then
  step "Verifying shard allocation is enabled"
  info "Kibana migrations will fail if cluster.routing.allocation.enable is not 'all'."
  es_curl GET "/_cluster/settings?flat_settings=true&include_defaults=true"
  alloc_body="$ES_CURL_BODY"
  alloc_code="$ES_CURL_HTTP_CODE"
  if [[ "$alloc_code" == "200" ]]; then
    # Read both the persistent and the transient override — either one
    # can block migrations.
    persistent_alloc=$(echo "$alloc_body" | python3 -c "
import json, sys
d = json.load(sys.stdin)
v = d.get('persistent', {}).get('cluster.routing.allocation.enable', '')
print(v)
" 2>/dev/null) || persistent_alloc=""
    transient_alloc=$(echo "$alloc_body" | python3 -c "
import json, sys
d = json.load(sys.stdin)
v = d.get('transient', {}).get('cluster.routing.allocation.enable', '')
print(v)
" 2>/dev/null) || transient_alloc=""
    alloc_blocked=false
    if [[ -n "$persistent_alloc" && "$persistent_alloc" != "all" ]]; then
      warn "Persistent cluster.routing.allocation.enable = '${persistent_alloc}' (must be 'all' or unset)"
      alloc_blocked=true
    fi
    if [[ -n "$transient_alloc" && "$transient_alloc" != "all" ]]; then
      warn "Transient cluster.routing.allocation.enable = '${transient_alloc}' (must be 'all' or unset)"
      alloc_blocked=true
    fi
    if $alloc_blocked; then
      warn ""
      warn "Kibana migrations WILL FAIL with these settings."
      info "Fixing: resetting allocation to default (all)..."
      # Setting the key to null removes the override so the default
      # ('all') applies again.
      if es_curl_retry PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":null},"transient":{"cluster.routing.allocation.enable":null}}'; then
        # Verify
        es_curl GET "/_cluster/settings?flat_settings=true"
        verify_body="$ES_CURL_BODY"
        if echo "$verify_body" | grep -q '"cluster\.routing\.allocation\.enable"' 2>/dev/null; then
          error "Allocation setting is STILL present after clearing!"
          error "Fix manually before starting Kibana."
          confirm_or_abort "Continue anyway? (Kibana will likely fail to start)"
        else
          success "Verified: allocation overrides cleared. Kibana migrations can proceed."
        fi
      else
        error "Failed to reset allocation (HTTP $ES_CURL_HTTP_CODE)."
        error "Fix manually before starting Kibana:"
        error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}"
        confirm_or_abort "Continue anyway? (Kibana will likely fail to start)"
      fi
    else
      success "Shard allocation is set to 'all'. Kibana migrations can proceed."
    fi
  else
    warn "Could not check allocation settings (HTTP $alloc_code). Kibana may fail if allocation is restricted."
  fi
  echo ""
fi
# Point at the RPM fetched during the pre-flight download phase.
info "Using pre-downloaded RPM: ${DOWNLOAD_DIR}/${KIBANA_RPM}"
echo ""
# --- K2: Stop Kibana ---
# Gracefully stop the running service before swapping the package; if the
# stop exceeds STOP_TIMEOUT, offer a SIGKILL fallback.
step "K2 - Stop Kibana service"
if systemctl is-active kibana &>/dev/null; then
  info "Command: systemctl stop kibana"
  echo ""
  confirm_or_abort "Stop Kibana now?"
  timeout "$STOP_TIMEOUT" systemctl stop kibana || {
    error "Kibana did not stop within ${STOP_TIMEOUT} seconds."
    confirm_or_abort "Force kill the Kibana process?"
    systemctl kill -s SIGKILL kibana
    sleep 2
  }
  success "Kibana stopped."
else
  info "Kibana service is not running. Skipping stop."
fi
# --- K3: Install RPM ---
# localinstall performs an in-place upgrade; RPM config protection keeps
# locally modified files under /etc/kibana/ (new defaults land as .rpmnew).
step "K3 - Install Kibana RPM"
info "Command: ${PKG_MGR} -y localinstall ${DOWNLOAD_DIR}/${KIBANA_RPM}"
info "Note: Config files in /etc/kibana/ will NOT be overwritten."
echo ""
confirm_or_abort "Install the Kibana RPM now?"
${PKG_MGR} -y localinstall "${DOWNLOAD_DIR}/${KIBANA_RPM}"
success "Kibana RPM installed."
# --- K4: Reload systemd ---
# Pick up the unit file shipped with the new RPM.
step "K4 - Reload systemd daemon"
systemctl daemon-reload
success "Systemd daemon reloaded."
# --- K5: Start Kibana ---
step "K5 - Start Kibana service"
info "Command: systemctl start kibana"
echo ""
confirm_or_abort "Start Kibana now?"
systemctl start kibana
info "Kibana starting... waiting for it to become ready."
info "Kibana runs migrations after an upgrade, which may take several minutes."
# Detect Kibana listen address from config
# Best-effort parse of kibana.yml with grep/sed/tr (a YAML parser is not
# guaranteed on the host). Defaults below mirror Kibana's own defaults.
kibana_config="/etc/kibana/kibana.yml"
kibana_host="localhost"
kibana_port="5601"
kibana_scheme="http"
if [[ -f "$kibana_config" ]]; then
  # Wildcard binds (0.0.0.0 / ::) are still reachable via localhost,
  # so the default host is kept in that case.
  cfg_host=$(grep -E '^\s*server\.host\s*:' "$kibana_config" 2>/dev/null | sed 's/^[^:]*:\s*//' | tr -d '"'"'" | xargs) || true
  if [[ -n "$cfg_host" && "$cfg_host" != "0.0.0.0" && "$cfg_host" != "::" ]]; then
    kibana_host="$cfg_host"
  fi
  cfg_port=$(grep -E '^\s*server\.port\s*:' "$kibana_config" 2>/dev/null | sed 's/^[^:]*:\s*//' | tr -d '"'"'" | xargs) || true
  if [[ -n "$cfg_port" ]]; then
    kibana_port="$cfg_port"
  fi
  cfg_ssl=$(grep -E '^\s*server\.ssl\.enabled\s*:' "$kibana_config" 2>/dev/null | sed 's/^[^:]*:\s*//' | tr -d '"'"'" | xargs) || true
  if [[ "$cfg_ssl" == "true" ]]; then
    kibana_scheme="https"
  fi
fi
KIBANA_URL="${kibana_scheme}://${kibana_host}:${kibana_port}"
info "Kibana URL: ${KIBANA_URL}"
# Build the status-poll command as an array so additions stay safely
# quoted; -k skips TLS verification (likely self-signed certs).
declare -a kibana_curl_cmd=(curl -s --max-time 10 -o /dev/null -w '%{http_code}')
if [[ "$kibana_scheme" == "https" ]]; then
  kibana_curl_cmd+=(-k)
fi
retries=0
last_code="000"
while [[ $retries -lt $KIBANA_STARTUP_WAIT ]]; do
last_code=$("${kibana_curl_cmd[@]}" "${KIBANA_URL}/api/status" 2>/dev/null) || last_code="000"
if [[ "$last_code" == "200" ]]; then
break
fi
retries=$((retries + 1))
if [[ "$last_code" == "503" ]]; then
echo -ne "\r Kibana is running migrations... (${retries}s, HTTP ${last_code}) "
elif [[ "$last_code" == "000" ]]; then
echo -ne "\r Waiting for Kibana to start listening... (${retries}s) "
else
echo -ne "\r Waiting for Kibana... (${retries}s, HTTP ${last_code}) "
fi
sleep 1
done
echo ""
# Report the final readiness state observed by the polling loop above.
case "$last_code" in
  200)
    success "Kibana is ready and accepting requests."
    ;;
  503)
    warn "Kibana is still running migrations after ${KIBANA_STARTUP_WAIT}s (HTTP 503)."
    warn "This is normal for large deployments. It should become available shortly."
    warn "Monitor with: curl -s ${KIBANA_URL}/api/status | python3 -m json.tool"
    ;;
  000)
    warn "Kibana did not start responding within ${KIBANA_STARTUP_WAIT}s."
    warn "Check logs: journalctl -u kibana -f"
    warn " tail -f /var/log/kibana/kibana.log"
    recent_logs=$(journalctl -u kibana --no-pager -n 30 2>/dev/null) || recent_logs=""
    if echo "$recent_logs" | grep -qi "ECONNREFUSED\|connect.*elasticsearch"; then
      warn ""
      warn "=== ELASTICSEARCH CONNECTION ISSUE ==="
      warn "Kibana cannot connect to Elasticsearch."
      warn "Verify Elasticsearch is running and reachable."
    fi
    ;;
  *)
    warn "Kibana returned unexpected status: HTTP ${last_code}"
    warn "Check logs: journalctl -u kibana -f"
    ;;
esac
KIBANA_NEW=$(get_rpm_version kibana)
success "Kibana upgrade complete. Version: ${BOLD}${KIBANA_NEW}${NC}"
fi
# ==============================================================================
# Post-upgrade: Restore version locks
# ==============================================================================
# Restore any dnf/yum versionlocks removed earlier so the upgraded packages
# are pinned again. Runs only when the versionlock plugin is available and
# at least one lock was actually removed during pre-flight.
# Fix: dropped the redundant "2>&1" after "&>/dev/null" (&> already
# redirects both stdout and stderr).
if command -v "${PKG_MGR}" &>/dev/null && ${PKG_MGR} versionlock list &>/dev/null; then
  if $versionlock_removed_es || $versionlock_removed_kibana; then
    header "Restoring Version Locks"
    # Re-add a version lock for one package and verify the lock is really
    # present afterwards ("versionlock add" can report success without
    # recording a lock).
    #   $1 - package name as known to the package manager
    #   $2 - human-readable display name for log messages
    restore_versionlock() {
      local pkg=$1
      local display=$2
      info "Re-adding version lock for ${display}"
      if ${PKG_MGR} versionlock add "${pkg}" 2>/dev/null; then
        if ${PKG_MGR} versionlock list 2>/dev/null | grep -q "${pkg}"; then
          success "${display} version lock restored."
        else
          warn "Versionlock add returned success but lock not found in list."
          warn "Add manually: ${PKG_MGR} versionlock add ${pkg}"
        fi
      else
        warn "Failed to restore ${display} version lock."
        warn "Add manually: ${PKG_MGR} versionlock add ${pkg}"
      fi
    }
    if $versionlock_removed_es && $UPGRADE_ES; then
      restore_versionlock elasticsearch "Elasticsearch"
    fi
    if $versionlock_removed_kibana && $UPGRADE_KIBANA; then
      restore_versionlock kibana "Kibana"
    fi
  fi
fi
# ==============================================================================
# Cleanup
# ==============================================================================
# Clear trap since we finished successfully
_allocation_disabled=false
_recovery_boosted=false
header "Upgrade Complete"
echo -e " ${GREEN}OK${NC} Upgrade finished successfully on this node."
echo ""
$UPGRADE_ES && echo -e " Elasticsearch: ${ES_CURRENT} -> ${BOLD}${TARGET_VERSION}${NC}"
$UPGRADE_KIBANA && echo -e " Kibana: ${KIBANA_CURRENT} -> ${BOLD}${TARGET_VERSION}${NC}"
echo ""
echo -e " Config backup: ${BOLD}${BACKUP_DIR}${NC}"
echo ""
if confirm "Clean up downloaded RPM files from ${DOWNLOAD_DIR}?" "y"; then
  # Fix: guard the recursive delete — ":?" aborts if DOWNLOAD_DIR is
  # empty/unset (preventing an accidental "rm -rf /"-style expansion),
  # and "--" protects against option-like paths.
  rm -rf -- "${DOWNLOAD_DIR:?DOWNLOAD_DIR is unset}"
  success "Cleaned up download directory."
else
  info "RPM files kept in ${DOWNLOAD_DIR}"
fi
echo ""
info "If this is a multi-node cluster, repeat this process on the next node."
info "Upgrade order: non-master-eligible nodes first, then master-eligible nodes."
echo ""
success "Done!"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment