Skip to content

Instantly share code, notes, and snippets.

@Oddly
Created February 3, 2026 18:03
Show Gist options
  • Select an option

  • Save Oddly/621873ffa5f98c0b044f8c289212dea2 to your computer and use it in GitHub Desktop.

Select an option

Save Oddly/621873ffa5f98c0b044f8c289212dea2 to your computer and use it in GitHub Desktop.
#!/bin/bash
#
# Elasticsearch & Kibana RPM Upgrade Script
# ==========================================
# This script upgrades Elasticsearch and/or Kibana on the local node
# using RPMs downloaded from artifacts.elastic.co.
#
# It follows the official Elastic rolling upgrade procedure:
# 1. Pre-flight checks (current versions, cluster health, disk space)
# 2. Disable shard allocation
# 3. Flush all indices (best effort)
# 4. Stop the service
# 5. Download & install the RPM
# 6. Reload systemd and start the service
# 7. Wait for the node to rejoin the cluster
# 8. Re-enable shard allocation
# 9. Wait for cluster to go green
#
# Usage:
# ./upgrade-elastic.sh <target-version> [--es-only|--kibana-only] [--arch aarch64]
#
# Examples:
# ./upgrade-elastic.sh 8.17.0 # Upgrade both ES and Kibana
# ./upgrade-elastic.sh 8.17.0 --es-only # Upgrade Elasticsearch only
# ./upgrade-elastic.sh 8.17.0 --kibana-only # Upgrade Kibana only
# ./upgrade-elastic.sh 8.17.0 --arch aarch64 # Use aarch64 RPMs
# ./upgrade-elastic.sh 8.17.0 --yes # Auto-accept non-critical prompts
# ./upgrade-elastic.sh 8.17.0 --force # Skip ES-dependent checks (ES not running)
#
# Notes:
# - Run this script on each node individually (rolling upgrade)
# - Upgrade non-master-eligible nodes first, then master-eligible nodes
# - Make sure the cluster is GREEN before starting
# - This script must be run as root or with sudo
#
set -euo pipefail
# ==============================================================================
# Configuration & Defaults
# ==============================================================================
# CLI-tunable settings (see usage()) followed by internal state variables.
ARCH="x86_64" # RPM architecture; override with --arch aarch64
UPGRADE_ES=true # set false by --kibana-only
UPGRADE_KIBANA=true # set false by --es-only
TARGET_VERSION="" # required positional arg, MAJOR.MINOR.PATCH
ES_URL="https://localhost:9200" # override with --es-url
DOWNLOAD_DIR="/tmp/elastic-upgrade" # where RPMs are downloaded (used later in file)
AUTO_YES=false # --yes: auto-accept non-critical prompts
FORCE_MODE=false # --force: skip checks needing a running ES (implies --yes)
ES_CURRENT="" # filled by pre-flight RPM version detection
KIBANA_CURRENT="" # filled by pre-flight RPM version detection
cluster_name="" # filled from /_cluster/health response
# Curl auth/TLS options -- populated during connection detection
# Stored as an array to avoid eval and shell injection
declare -a ES_CURL_AUTH=()
ES_CURL_INSECURE=false
# Lock file to prevent concurrent runs
LOCK_FILE="/var/run/elastic-upgrade.lock"
LOCK_FD=9
# Timeouts (seconds)
STOP_TIMEOUT=120 # presumably max wait for service stop — used later in file
API_TIMEOUT=60 # per-request curl timeout for ES API calls (es_curl)
STARTUP_WAIT=120 # presumably max wait for ES restart — used later in file
KIBANA_STARTUP_WAIT=300 # presumably max wait for Kibana restart — used later in file
RECOVERY_WAIT=1200 # presumably max wait for cluster recovery — used later in file
API_RETRIES=3 # attempts for es_curl_retry
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m' # No Color
# Track state for cleanup on unexpected exit
_allocation_disabled=false
_recovery_boosted=false
# ==============================================================================
# Helper Functions
# ==============================================================================
# Colored, tagged log-line helpers. Each prints all of its arguments after a
# colored tag; colors come from the RED/GREEN/YELLOW/BLUE/BOLD/NC globals.
info() {
  echo -e "${BLUE}[INFO]${NC} $*"
}

success() {
  echo -e "${GREEN}[OK]${NC} $*"
}

warn() {
  echo -e "${YELLOW}[WARN]${NC} $*"
}

error() {
  echo -e "${RED}[ERROR]${NC} $*"
}

# Prominent section banner: a bold rule, the title, and a closing rule.
header() {
  echo -e "\n${BOLD}═══════════════════════════════════════════════════════════${NC}"
  echo -e "${BOLD} $*${NC}"
  echo -e "${BOLD}═══════════════════════════════════════════════════════════${NC}\n"
}

# Highlighted step marker used between major actions.
step() {
  echo -e "\n${YELLOW}▶ STEP: $*${NC}\n"
}
# Cleanup handler for unexpected exits.
# Registered for EXIT/INT/TERM below. Undoes cluster-level state the script
# may have changed (shard allocation disable, recovery-concurrency boost),
# releases the run lock, then re-raises the original exit code so callers
# still see the real status.
cleanup_on_exit() {
local exit_code=$?
# Re-enable shard allocation if we disabled it and didn't re-enable
if $_allocation_disabled; then
echo ""
warn "Script interrupted! Attempting to re-enable shard allocation..."
# Setting both persistent and transient to null restores the default
if es_curl_quiet PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":null},"transient":{"cluster.routing.allocation.enable":null}}'; then
success "Shard allocation re-enabled."
else
error "FAILED to re-enable shard allocation! Run manually:"
error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}"
fi
fi
# Reset recovery concurrency if we boosted it
if $_recovery_boosted; then
es_curl_quiet PUT "/_cluster/settings" '{"transient":{"cluster.routing.allocation.node_concurrent_incoming_recoveries":null}}' || true
fi
# Release lock
release_lock
# Propagate the exit status the script was terminating with
exit "$exit_code"
}
trap cleanup_on_exit EXIT INT TERM
#######################################
# Acquire an exclusive flock-based lock so only one upgrade runs at a time.
# Uses bash's automatic file-descriptor allocation ({fd}>) instead of eval,
# which removes the shell-injection surface the old
#   eval "exec ${LOCK_FD}>${LOCK_FILE}"
# construct had if LOCK_FILE ever contained shell metacharacters.
# Globals:  LOCK_FILE (read), LOCK_FD (written: the fd actually allocated)
# Exits:    1 if another instance already holds the lock
#######################################
acquire_lock() {
  # Open/create the lock file on a dynamically allocated descriptor.
  exec {LOCK_FD}>"$LOCK_FILE"
  if ! flock -n "$LOCK_FD"; then
    error "Another instance of this script is already running (lock: ${LOCK_FILE})."
    error "If you are sure no other instance is running, remove the lock file:"
    error " rm -f ${LOCK_FILE}"
    exit 1
  fi
  # Write PID to lock file for debugging stale locks
  echo $$ >&"$LOCK_FD"
}

# Release the lock and remove the lock file. Best-effort: never fails,
# so it is safe to call from the EXIT trap.
release_lock() {
  flock -u "$LOCK_FD" 2>/dev/null || true
  rm -f "$LOCK_FILE" 2>/dev/null || true
}
# Interactive yes/no prompt.
# $1 = question text, $2 = default answer when the user just hits Enter
# ("n" unless given). In --yes mode the question is auto-accepted.
# Returns 0 for yes, 1 for no; re-prompts on anything else.
confirm() {
  local msg="$1"
  local fallback="${2:-n}"
  local reply

  if $AUTO_YES; then
    info "(auto-yes) $msg -> yes"
    return 0
  fi

  # Show which answer Enter will pick. NOTE: 'prompt' is intentionally
  # not declared local, matching the original's scoping.
  case "$fallback" in
    y) prompt="[Y/n]" ;;
    *) prompt="[y/N]" ;;
  esac

  while true; do
    echo -en "${BOLD}$msg ${prompt}: ${NC}"
    read -r reply
    reply="${reply:-$fallback}"
    case "${reply,,}" in
      y|yes) return 0 ;;
      n|no)  return 1 ;;
      *)     echo "Please answer y or n." ;;
    esac
  done
}
# Ask for confirmation and exit 0 ("aborted by user") when declined.
# $1 = question, $2 = default answer (defaults to "y").
confirm_or_abort() {
  local question="$1"
  local default="${2:-y}"
  if confirm "$question" "$default"; then
    return 0
  fi
  warn "Aborted by user."
  exit 0
}
# Critical confirmation gate. Unlike confirm_or_abort, --yes mode must NOT
# auto-accept: a critical problem needs a human, so we exit 1 instead.
confirm_critical() {
  local issue="$1"
  if ! $AUTO_YES; then
    confirm_or_abort "$issue"
    return
  fi
  error "(auto-yes) CRITICAL: $issue"
  error "Cannot auto-accept critical issues. Resolve the problem and re-run."
  exit 1
}
# Execute a request against the Elasticsearch API.
# Builds the curl command as an array (no eval, no shell injection) and
# stores the result in globals.
# Usage: es_curl <METHOD> <PATH> [JSON_DATA]
# Sets:  ES_CURL_HTTP_CODE (status code), ES_CURL_BODY (response body)
#
# IMPORTANT: never call this as body=$(es_curl ...) — the command
# substitution runs in a subshell and the globals would be lost. Call it
# directly, then read ES_CURL_BODY / ES_CURL_HTTP_CODE.
ES_CURL_HTTP_CODE=""
ES_CURL_BODY=""
es_curl() {
  local method="${1:-GET}"
  local api_path="${2:-/}"
  local payload="${3:-}"
  # Join base URL and path with exactly one slash between them.
  local url="${ES_URL%/}/${api_path#/}"

  # -w appends the status code on its own line after the body.
  local -a req=(curl -s -w '\n%{http_code}' --max-time "$API_TIMEOUT")
  if $ES_CURL_INSECURE; then
    req+=(-k)
  fi
  if [[ ${#ES_CURL_AUTH[@]} -gt 0 ]]; then
    req+=("${ES_CURL_AUTH[@]}")
  fi
  req+=(-X "$method")
  if [[ -n "$payload" ]]; then
    req+=(-H 'Content-Type: application/json' -d "$payload")
  fi
  req+=("$url")

  local raw
  raw=$("${req[@]}" 2>/dev/null) || true
  # Last line is the status code, the rest is the body.
  ES_CURL_HTTP_CODE=$(echo "$raw" | tail -1)
  ES_CURL_BODY=$(echo "$raw" | sed '$d')
}
# Silent API call: succeed (return 0) only on HTTP 200, otherwise return 1.
es_curl_quiet() {
  es_curl "$@"
  if [[ "$ES_CURL_HTTP_CODE" != "200" ]]; then
    return 1
  fi
  return 0
}
# Retry wrapper for critical API calls: up to API_RETRIES attempts with a
# 2-second pause between them. Returns 0 on the first HTTP 200, else 1.
es_curl_retry() {
  local max="$API_RETRIES"
  local try
  for (( try = 1; try <= max; try++ )); do
    es_curl "$@"
    if [[ "$ES_CURL_HTTP_CODE" == "200" ]]; then
      return 0
    fi
    # Don't warn/sleep after the final attempt.
    if (( try < max )); then
      warn "API call failed (HTTP ${ES_CURL_HTTP_CODE}), retrying (${try}/${max})..."
      sleep 2
    fi
  done
  return 1
}
# Probe the ES root endpoint using the current auth/TLS settings and print
# the HTTP status code on stdout ("000" when unreachable).
test_es_connection() {
  local -a probe=(curl -s -o /dev/null -w '%{http_code}' --max-time 5)
  if $ES_CURL_INSECURE; then
    probe+=(-k)
  fi
  if [[ ${#ES_CURL_AUTH[@]} -gt 0 ]]; then
    probe+=("${ES_CURL_AUTH[@]}")
  fi
  probe+=("${ES_URL}/")
  local status
  status=$("${probe[@]}" 2>/dev/null) || status="000"
  echo "$status"
}
# Check if a systemd service unit exists.
# NOTE: older systemd versions exit 0 from `list-unit-files` even when no
# unit matches the pattern, so relying on the exit status (as the original
# did) is unreliable. Testing for non-empty output works on old and new
# systemd alike.
service_exists() {
  [[ -n "$(systemctl list-unit-files --no-pager --no-legend "$1.service" 2>/dev/null)" ]]
}
# Print the installed version of an RPM package, or the literal string
# "not installed" when the package (or rpm itself) is absent.
get_rpm_version() {
  local version
  version=$(rpm -q --queryformat '%{VERSION}' "$1" 2>/dev/null) || version="not installed"
  # rpm prints "package X is not installed" on stdout for missing packages;
  # normalize that (and any query failure) to the sentinel string.
  case "$version" in
    *"not installed"*) echo "not installed" ;;
    *) echo "$version" ;;
  esac
}
# Download a URL to a destination path using curl (no wget dependency).
# $1 = url, $2 = destination file, $3 = human-readable label for messages.
# Retries transient failures; on failure removes the partial file and
# returns 1.
download_file() {
  local src="$1"
  local target="$2"
  local label="${3:-file}"

  info "Downloading ${label}..."
  local -a fetch=(
    curl --fail --location --retry 3 --retry-delay 5
    --connect-timeout 15 --max-time 600
    --progress-bar -o "$target" "$src"
  )
  if "${fetch[@]}"; then
    return 0
  fi
  error "Failed to download ${label}."
  rm -f "$target"
  return 1
}
# ==============================================================================
# Argument Parsing
# ==============================================================================
# Print usage/help text and exit 1 (also serves as the error path for bad
# arguments). The here-doc expands $0 only; the option text is literal.
usage() {
  cat <<USAGE_TEXT
Usage: $0 <target-version> [OPTIONS]

Options:
 --es-only Only upgrade Elasticsearch
 --kibana-only Only upgrade Kibana
 --arch ARCH Architecture: x86_64 (default) or aarch64
 --es-url URL Elasticsearch URL (default: https://localhost:9200)
 --yes Auto-accept all non-critical prompts; exit 1 on critical issues
 --force Skip pre-flight checks that require a running Elasticsearch
 (cluster health, node analysis, version mix, deprecation API,
 shard allocation, flush, rejoin wait, recovery wait)
 -h, --help Show this help

Flags can be combined. --force implies --yes for skipped steps.
USAGE_TEXT
  exit 1
}
# --- CLI argument parsing ---------------------------------------------------
# Flags may appear in any order; the single bare argument is the target
# version. A second bare argument is rejected.
while [[ $# -gt 0 ]]; do
case "$1" in
--es-only) UPGRADE_KIBANA=false; shift ;;
--kibana-only) UPGRADE_ES=false; shift ;;
--arch) ARCH="$2"; shift 2 ;;
--es-url) ES_URL="$2"; shift 2 ;;
--yes|-y) AUTO_YES=true; shift ;;
--force|-f) FORCE_MODE=true; AUTO_YES=true; shift ;;
-h|--help) usage ;;
-*) error "Unknown option: $1"; usage ;;
*)
if [[ -z "$TARGET_VERSION" ]]; then
TARGET_VERSION="$1"
else
error "Unexpected argument: $1"
usage
fi
shift
;;
esac
done
if [[ -z "$TARGET_VERSION" ]]; then
error "Target version is required."
usage
fi
# Validate version format
# Strict MAJOR.MINOR.PATCH; pre-release suffixes (e.g. 8.17.0-rc1) are rejected.
if ! [[ "$TARGET_VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
error "Invalid version format: ${TARGET_VERSION}"
error "Expected format: MAJOR.MINOR.PATCH (e.g., 8.17.0)"
exit 1
fi
# Show active mode banners
# FORCE implies AUTO_YES, so the force banner takes precedence.
if $FORCE_MODE; then
warn "╔══════════════════════════════════════════════════════════╗"
warn "║ FORCE MODE — skipping checks that require running ES ║"
warn "║ Shard allocation, flush, rejoin, and recovery steps ║"
warn "║ will be skipped. You must manage these manually. ║"
warn "╚══════════════════════════════════════════════════════════╝"
echo ""
elif $AUTO_YES; then
info "╔══════════════════════════════════════════════════════════╗"
info "║ AUTO-YES MODE — non-critical prompts will be accepted ║"
info "║ Critical issues will cause the script to exit 1. ║"
info "╚══════════════════════════════════════════════════════════╝"
echo ""
fi
# ==============================================================================
# Pre-flight: Root check
# ==============================================================================
# RPM installation and systemd control require root privileges.
if [[ $EUID -ne 0 ]]; then
error "This script must be run as root (or with sudo)."
exit 1
fi
# ==============================================================================
# Pre-flight: Acquire lock
# ==============================================================================
# Prevent two concurrent upgrade runs on the same host.
acquire_lock
# ==============================================================================
# Pre-flight: OS compatibility check
# ==============================================================================
# RPM-based upgrades only make sense on RHEL-family distributions.
is_rhel_compatible=false
if [[ -f /etc/redhat-release ]]; then
is_rhel_compatible=true
elif [[ -f /etc/os-release ]]; then
source /etc/os-release
case "${ID:-}" in
rhel|centos|fedora|rocky|alma|ol|scientific|amzn)
is_rhel_compatible=true
;;
esac
# ID_LIKE catches derivatives that report a custom ID but are RHEL-based.
if [[ "${ID_LIKE:-}" == *"rhel"* ]] || [[ "${ID_LIKE:-}" == *"fedora"* ]]; then
is_rhel_compatible=true
fi
fi
if ! $is_rhel_compatible; then
error "This script is designed for RHEL-compatible systems (RHEL, CentOS, Rocky, Alma, Fedora, etc.)"
error "Detected OS does not appear to be RHEL-compatible."
if [[ -f /etc/os-release ]]; then
source /etc/os-release
error " Detected: ${PRETTY_NAME:-$ID}"
fi
error ""
error "For Debian/Ubuntu systems, use .deb packages instead of RPMs."
error "For other systems, consider using the tarball distribution."
exit 1
fi
if [[ -f /etc/os-release ]]; then
source /etc/os-release
info "Detected OS: ${PRETTY_NAME:-$ID}"
fi
# ==============================================================================
# Pre-flight: Required commands check
# ==============================================================================
# Verify every external tool the script relies on before doing any work,
# and report all missing ones at once with install hints.
header "Checking Required Commands"
missing_cmds=()
for cmd in curl rpm systemctl sha512sum df grep sed awk python3 diff journalctl ldd; do
if command -v "$cmd" &>/dev/null; then
success "Found: $(command -v "$cmd")"
else
error "Missing: ${BOLD}${cmd}${NC}"
missing_cmds+=("$cmd")
fi
done
# Detect package manager: prefer dnf, fall back to yum
if command -v dnf &>/dev/null; then
PKG_MGR="dnf"
success "Found: $(command -v dnf) (package manager)"
elif command -v yum &>/dev/null; then
PKG_MGR="yum"
success "Found: $(command -v yum) (package manager)"
else
error "Missing: ${BOLD}yum/dnf${NC} — no package manager found"
missing_cmds+=("yum/dnf")
fi
if [[ ${#missing_cmds[@]} -gt 0 ]]; then
echo ""
error "The following required commands are missing: ${missing_cmds[*]}"
error "Install them before running this script."
# Map commands to their providing packages for a helpful install hint.
for cmd in "${missing_cmds[@]}"; do
case "$cmd" in
python3) error " -> ${PKG_MGR:-yum} install python3" ;;
sha512sum) error " -> Part of coreutils: ${PKG_MGR:-yum} install coreutils" ;;
journalctl) error " -> Part of systemd: ${PKG_MGR:-yum} install systemd" ;;
ldd) error " -> Part of glibc: ${PKG_MGR:-yum} install glibc-common" ;;
diff) error " -> Part of diffutils: ${PKG_MGR:-yum} install diffutils" ;;
esac
done
exit 1
fi
# ==============================================================================
# Pre-flight: Verify target version is available for download
# ==============================================================================
# HEAD-request the RPM URLs so a typo'd version fails fast instead of after
# the node has already been taken out of the cluster.
header "Verifying Target Version Availability"
info "Checking if version ${BOLD}${TARGET_VERSION}${NC} is available on artifacts.elastic.co..."
echo ""
version_available=true
if $UPGRADE_ES; then
ES_RPM_CHECK_URL="https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${TARGET_VERSION}-${ARCH}.rpm"
info "Checking Elasticsearch RPM: ${ES_RPM_CHECK_URL}"
es_http_code=$(curl -s -o /dev/null -w '%{http_code}' --head --max-time 15 "${ES_RPM_CHECK_URL}" 2>/dev/null) || es_http_code="000"
if [[ "$es_http_code" == "200" ]]; then
# Best-effort size report from the Content-Length header.
es_size=$(curl -sI --max-time 15 "${ES_RPM_CHECK_URL}" 2>/dev/null | grep -i 'Content-Length' | awk '{print $2}' | tr -d '\r') || es_size=""
if [[ -n "$es_size" && "$es_size" -gt 0 ]] 2>/dev/null; then
es_size_mb=$((es_size / 1024 / 1024))
success "Elasticsearch ${TARGET_VERSION} (${ARCH}) is available (${es_size_mb} MB)"
else
success "Elasticsearch ${TARGET_VERSION} (${ARCH}) is available"
fi
elif [[ "$es_http_code" == "404" ]]; then
error "Elasticsearch ${TARGET_VERSION} (${ARCH}) was ${BOLD}NOT FOUND${NC}${RED} (HTTP 404)${NC}"
error "URL: ${ES_RPM_CHECK_URL}"
error "Verify the version number and architecture are correct."
error "Browse available versions at: https://www.elastic.co/downloads/past-releases"
version_available=false
else
# Anything other than 200/404 is treated as a transient network problem.
warn "Could not verify Elasticsearch RPM availability (HTTP ${es_http_code})."
warn "URL: ${ES_RPM_CHECK_URL}"
warn "This may be a network issue. The download will be attempted later."
fi
fi
if $UPGRADE_KIBANA; then
KIBANA_RPM_CHECK_URL="https://artifacts.elastic.co/downloads/kibana/kibana-${TARGET_VERSION}-${ARCH}.rpm"
info "Checking Kibana RPM: ${KIBANA_RPM_CHECK_URL}"
kibana_http_code=$(curl -s -o /dev/null -w '%{http_code}' --head --max-time 15 "${KIBANA_RPM_CHECK_URL}" 2>/dev/null) || kibana_http_code="000"
if [[ "$kibana_http_code" == "200" ]]; then
kibana_size=$(curl -sI --max-time 15 "${KIBANA_RPM_CHECK_URL}" 2>/dev/null | grep -i 'Content-Length' | awk '{print $2}' | tr -d '\r') || kibana_size=""
if [[ -n "$kibana_size" && "$kibana_size" -gt 0 ]] 2>/dev/null; then
kibana_size_mb=$((kibana_size / 1024 / 1024))
success "Kibana ${TARGET_VERSION} (${ARCH}) is available (${kibana_size_mb} MB)"
else
success "Kibana ${TARGET_VERSION} (${ARCH}) is available"
fi
elif [[ "$kibana_http_code" == "404" ]]; then
error "Kibana ${TARGET_VERSION} (${ARCH}) was ${BOLD}NOT FOUND${NC}${RED} (HTTP 404)${NC}"
error "URL: ${KIBANA_RPM_CHECK_URL}"
error "Verify the version number and architecture are correct."
error "Browse available versions at: https://www.elastic.co/downloads/past-releases"
version_available=false
else
warn "Could not verify Kibana RPM availability (HTTP ${kibana_http_code})."
warn "URL: ${KIBANA_RPM_CHECK_URL}"
warn "This may be a network issue. The download will be attempted later."
fi
fi
if ! $version_available; then
echo ""
error "One or more RPMs are not available for version ${TARGET_VERSION}."
confirm_or_abort "Continue anyway (downloads will fail later)?"
fi
# ==============================================================================
# Detect Elasticsearch connection settings
# This runs for BOTH --es-only and --kibana-only because the Kibana upgrade
# section needs ES API access to check shard allocation.
# Only skipped in --force mode where all API calls are bypassed.
# ==============================================================================
# Detection ladder: plain URL -> 401 (prompt for credentials) ->
# connection failure (retry over HTTPS with -k, then optionally credentials).
if $FORCE_MODE; then
header "Detecting Elasticsearch Connection"
warn "FORCE MODE: Skipping Elasticsearch connection detection."
warn "Cluster health, node analysis, deprecation checks, shard management"
warn "and recovery steps will all be skipped."
else
header "Detecting Elasticsearch Connection"
info "Testing connection to ${ES_URL}..."
# Try plain connection first
response=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "${ES_URL}/" 2>/dev/null) || response="000"
if [[ "$response" == "200" ]]; then
success "Connected to Elasticsearch (no auth required)"
elif [[ "$response" == "401" ]]; then
info "Elasticsearch requires authentication."
echo -en "${BOLD}Enter Elasticsearch username [elastic]: ${NC}"
read -r es_user
es_user="${es_user:-elastic}"
echo -en "${BOLD}Enter Elasticsearch password: ${NC}"
read -rs es_pass
echo ""
# Store credentials in array -- never passed through eval/shell expansion
ES_CURL_AUTH=(-u "${es_user}:${es_pass}")
ES_CURL_INSECURE=true
response=$(test_es_connection)
if [[ "$response" == "200" ]]; then
success "Authenticated successfully."
else
error "Authentication failed (HTTP $response). Please check credentials."
exit 1
fi
# Clear credential variables from memory (array persists for curl calls)
unset es_pass
elif [[ "$response" == "000" ]]; then
# No TCP/HTTP response at all: retry assuming HTTPS with a self-signed cert.
# Try https with -k
ES_URL="${ES_URL/http:/https:}"
ES_CURL_INSECURE=true
response=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 -k "${ES_URL}/" 2>/dev/null) || response="000"
if [[ "$response" == "200" ]]; then
success "Connected to Elasticsearch over HTTPS (self-signed cert)"
elif [[ "$response" == "401" ]]; then
info "Elasticsearch requires authentication (HTTPS)."
echo -en "${BOLD}Enter Elasticsearch username [elastic]: ${NC}"
read -r es_user
es_user="${es_user:-elastic}"
echo -en "${BOLD}Enter Elasticsearch password: ${NC}"
read -rs es_pass
echo ""
ES_CURL_AUTH=(-u "${es_user}:${es_pass}")
response=$(test_es_connection)
if [[ "$response" == "200" ]]; then
success "Authenticated successfully over HTTPS."
else
error "Authentication failed (HTTP $response)."
exit 1
fi
unset es_pass
else
error "Cannot connect to Elasticsearch at ${ES_URL} (HTTP $response)."
error "Make sure Elasticsearch is running and accessible."
error "You can specify a custom URL with: --es-url http://hostname:9200"
exit 1
fi
else
# Any other HTTP code (403, 5xx, proxy responses, ...) is unexpected here.
error "Unexpected response from Elasticsearch (HTTP $response)."
exit 1
fi
fi # end of: if ! $FORCE_MODE (ES connection)
# ==============================================================================
# STEP 0: Pre-flight Checks
# ==============================================================================
header "Pre-flight Checks"
# --- Current versions ---
# Determine what is installed locally; a missing ES is fatal, a missing
# Kibana simply disables the Kibana part of the upgrade.
step "Checking currently installed versions"
if $UPGRADE_ES; then
ES_CURRENT=$(get_rpm_version elasticsearch)
if [[ "$ES_CURRENT" == "not installed" ]]; then
error "Elasticsearch RPM is not installed on this system."
exit 1
fi
info "Elasticsearch installed version: ${BOLD}${ES_CURRENT}${NC}"
fi
if $UPGRADE_KIBANA; then
KIBANA_CURRENT=$(get_rpm_version kibana)
if [[ "$KIBANA_CURRENT" == "not installed" ]]; then
warn "Kibana RPM is not installed on this system. Skipping Kibana upgrade."
UPGRADE_KIBANA=false
else
info "Kibana installed version: ${BOLD}${KIBANA_CURRENT}${NC}"
fi
fi
info "Target version: ${BOLD}${TARGET_VERSION}${NC}"
echo ""
# Already-at-target components are skipped unless the operator insists
# (useful for reinstalls).
if $UPGRADE_ES && [[ "$ES_CURRENT" == "$TARGET_VERSION" ]]; then
warn "Elasticsearch is already at version ${TARGET_VERSION}."
if ! confirm "Continue anyway?" "n"; then
UPGRADE_ES=false
fi
fi
if $UPGRADE_KIBANA && [[ "$KIBANA_CURRENT" == "$TARGET_VERSION" ]]; then
warn "Kibana is already at version ${TARGET_VERSION}."
if ! confirm "Continue anyway?" "n"; then
UPGRADE_KIBANA=false
fi
fi
if ! $UPGRADE_ES && ! $UPGRADE_KIBANA; then
info "Nothing to upgrade."
exit 0
fi
# --- Version jump analysis ---
# Classify the jump (patch / minor / major / downgrade / skipped-major) and
# enforce Elastic's "gateway minor" stepping-stone rules for major upgrades.
if $UPGRADE_ES; then
step "Analyzing version upgrade path"
upgrade_path_ok=true
# Split X.Y.Z into components for both current and target versions.
read -r cur_major cur_minor cur_patch <<< "$(echo "$ES_CURRENT" | awk -F. '{print $1, $2, $3}')"
read -r tgt_major tgt_minor tgt_patch <<< "$(echo "$TARGET_VERSION" | awk -F. '{print $1, $2, $3}')"
if [[ -z "$cur_major" || -z "$tgt_major" ]]; then
warn "Could not parse version numbers. Skipping upgrade path analysis."
else
info "Current: ${BOLD}${ES_CURRENT}${NC} (major=${cur_major}, minor=${cur_minor}, patch=${cur_patch})"
info "Target: ${BOLD}${TARGET_VERSION}${NC} (major=${tgt_major}, minor=${tgt_minor}, patch=${tgt_patch})"
echo ""
# Lexicographic-by-component comparison: target older than current?
if [[ "$tgt_major" -lt "$cur_major" ]] || \
{ [[ "$tgt_major" -eq "$cur_major" && "$tgt_minor" -lt "$cur_minor" ]]; } || \
{ [[ "$tgt_major" -eq "$cur_major" && "$tgt_minor" -eq "$cur_minor" && "$tgt_patch" -lt "$cur_patch" ]]; }; then
warn "Target version ${TARGET_VERSION} is OLDER than current ${ES_CURRENT}."
warn "Elasticsearch does not support downgrades. Nodes cannot be rolled back"
warn "once upgraded. This will install an older RPM but may cause problems."
confirm_or_abort "This looks like a downgrade. Are you sure?"
elif [[ "$tgt_major" -eq "$cur_major" ]]; then
if [[ "$tgt_minor" -eq "$cur_minor" ]]; then
success "Patch upgrade (${ES_CURRENT} -> ${TARGET_VERSION}). No special requirements."
else
success "Minor upgrade (${ES_CURRENT} -> ${TARGET_VERSION}). Rolling upgrade supported."
fi
elif [[ "$tgt_major" -eq $((cur_major + 1)) ]]; then
warn "This is a ${BOLD}MAJOR VERSION${NC}${YELLOW} upgrade (${cur_major}.x -> ${tgt_major}.x).${NC}"
echo ""
# "Gateway" minors: the last minor of each major from which Elastic
# supports jumping to the next major (6.8 -> 7.x, 7.17 -> 8.x, 8.19 -> 9.x).
declare -A gateway_minor
gateway_minor[6]=8
gateway_minor[7]=17
gateway_minor[8]=19
required_minor="${gateway_minor[$cur_major]:-}"
if [[ -n "$required_minor" ]]; then
# Special case: Elastic also supports 8.18.x -> 9.0.x directly.
if [[ "$cur_major" -eq 8 && "$cur_minor" -eq 18 && "$tgt_major" -eq 9 && "$tgt_minor" -eq 0 ]]; then
success "On 8.18.x targeting 9.0.x — this specific path is supported by Elastic."
elif [[ "$cur_minor" -lt "$required_minor" ]]; then
error "Major upgrade from ${cur_major}.x to ${tgt_major}.x requires being on ${cur_major}.${required_minor}.x first."
error ""
error "Current version ${ES_CURRENT} is below the required stepping stone."
error ""
error "Required upgrade path:"
error " 1. First upgrade: ${ES_CURRENT} -> ${cur_major}.${required_minor}.x (minor upgrade)"
error " 2. Then upgrade: ${cur_major}.${required_minor}.x -> ${TARGET_VERSION} (major upgrade)"
error ""
error "Skipping the stepping-stone version may cause data loss or failed startup."
upgrade_path_ok=false
confirm_critical "Override and attempt direct major upgrade anyway? THIS IS DANGEROUS."
elif [[ "$cur_minor" -eq "$required_minor" ]]; then
success "On required gateway version ${cur_major}.${required_minor}.x. Major upgrade path is valid."
else
success "On ${ES_CURRENT}, which is above the required ${cur_major}.${required_minor}.x gateway."
fi
# 8.18.x can only go directly to 9.0.x; 9.1+ requires going via 8.19.x.
if [[ "$cur_major" -eq 8 && "$cur_minor" -eq 18 && "$tgt_minor" -gt 0 ]]; then
warn "Version 8.18.x can only upgrade to 9.0.x directly."
warn "For 9.1.x or later, you must first upgrade to 8.19.x."
error ""
error "Required upgrade path:"
error " 1. First upgrade: ${ES_CURRENT} -> 8.19.x"
error " 2. Then upgrade: 8.19.x -> ${TARGET_VERSION}"
upgrade_path_ok=false
confirm_critical "Override and attempt direct upgrade anyway? THIS IS DANGEROUS."
fi
else
warn "No gateway version defined for ${cur_major}.x -> ${tgt_major}.x in this script."
warn "Check the Elastic upgrade documentation for the correct upgrade path."
fi
echo ""
warn "Major upgrade checklist:"
warn " - Run the Upgrade Assistant in Kibana before upgrading"
warn " - Resolve ALL critical deprecation issues (checked later in this script)"
warn " - Ensure no indices created before ${cur_major}.0 exist (reindex or delete them)"
warn " - Review breaking changes: https://www.elastic.co/guide/en/elasticsearch/reference/${tgt_major}.x/breaking-changes.html"
warn " - Take a full snapshot backup before proceeding"
echo ""
if $upgrade_path_ok; then
confirm_or_abort "Acknowledge major upgrade requirements and continue?"
fi
elif [[ "$tgt_major" -gt $((cur_major + 1)) ]]; then
error "Upgrading from ${cur_major}.x to ${tgt_major}.x skips one or more major versions."
error "Elasticsearch does NOT support skipping major versions."
error ""
error "Required upgrade path:"
# NOTE(review): gateway_minor is only declared in the sibling branch above,
# so it is unset here; the ${...:-last_minor} default guards the lookup
# (safe on bash 4.4+ even under set -u) — confirm intended.
v=$cur_major
while [[ $v -lt $tgt_major ]]; do
next=$((v + 1))
gw="${gateway_minor[$v]:-last_minor}"
if [[ $v -eq $cur_major ]]; then
error " ${v}.x -> ${v}.${gw}.x (get to gateway minor first)"
fi
error " ${v}.${gw}.x -> ${next}.x"
v=$next
done
error ""
error "Each major version boundary must be crossed individually."
confirm_critical "Override and attempt anyway? THIS WILL ALMOST CERTAINLY FAIL."
fi
fi
fi
# --- Cluster health ---
# Query /_cluster/health and gate the upgrade on the status: GREEN proceeds,
# YELLOW needs confirmation, RED needs a critical override.
if $FORCE_MODE; then
if $UPGRADE_ES || $UPGRADE_KIBANA; then
step "Checking cluster health"
warn "FORCE MODE: Skipping cluster health check."
fi
else
step "Checking cluster health"
es_curl GET "/_cluster/health?pretty"
health_body="$ES_CURL_BODY"
health_code="$ES_CURL_HTTP_CODE"
if [[ "$health_code" != "200" ]]; then
error "Failed to get cluster health (HTTP $health_code)."
exit 1
fi
# Extract individual fields with python3 (jq is not a declared dependency).
cluster_status=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('status','unknown'))" 2>/dev/null) || cluster_status="unknown"
cluster_name=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('cluster_name',''))" 2>/dev/null) || cluster_name=""
num_nodes=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('number_of_nodes',0))" 2>/dev/null) || num_nodes="?"
unassigned=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('unassigned_shards',0))" 2>/dev/null) || unassigned="?"
info "Cluster: ${BOLD}${cluster_name}${NC}"
info "Nodes: ${BOLD}${num_nodes}${NC}"
info "Unassigned: ${BOLD}${unassigned}${NC}"
if [[ "$cluster_status" == "green" ]]; then
success "Cluster status: ${GREEN}${BOLD}GREEN${NC}"
elif [[ "$cluster_status" == "yellow" ]]; then
warn "Cluster status: ${YELLOW}${BOLD}YELLOW${NC}"
warn "It is recommended to start with a GREEN cluster."
confirm_or_abort "Continue with YELLOW cluster status?"
else
error "Cluster status: ${RED}${BOLD}RED${NC}"
error "DO NOT upgrade while cluster is RED."
confirm_critical "Are you absolutely sure you want to continue with RED status?"
fi
# --- Node roles, versions & upgrade order advice (ES upgrade only) ---
# Gather node metadata plus every identity this machine answers to
# (short hostname, FQDN, local IPs) so the Python helper below can work
# out which cluster node is "this" node.
if $UPGRADE_ES; then
step "Analyzing cluster nodes and upgrade order"
es_curl GET "/_cat/nodes?h=name,ip,version,master,node.role&format=json"
nodes_body="$ES_CURL_BODY"
nodes_code="$ES_CURL_HTTP_CODE"
local_hostname=$(hostname -s 2>/dev/null) || local_hostname=""
local_fqdn=$(hostname -f 2>/dev/null) || local_fqdn=""
local_ips=$(hostname -I 2>/dev/null | tr ' ' '\n' | grep -v '^$') || local_ips=""
# Loopback addresses are appended so single-node/localhost setups match.
local_ips="${local_ips}
127.0.0.1
::1"
if [[ "$nodes_code" == "200" ]]; then
# The node-analysis logic is a Python heredoc written to a private temp file.
_pyfile=$(mktemp /tmp/elastic-upgrade-nodecheck.XXXXXX.py)
chmod 600 "$_pyfile"
cat > "$_pyfile" <<'PYEOF'
import json, sys, os
nodes = json.load(sys.stdin)
local_hostname = os.environ.get('LOCAL_HOSTNAME', '').lower()
local_fqdn = os.environ.get('LOCAL_FQDN', '').lower()
local_ips = set(ip.strip() for ip in os.environ.get('LOCAL_IPS', '').split('\n') if ip.strip())
target_version = os.environ.get('TARGET_VERSION', '')
# --- Identify local node ---
local_node = None
for n in nodes:
node_name = n.get('name', '').strip()
node_ip = n.get('ip', '').strip()
if (node_name.lower() == local_hostname or
node_name.lower() == local_fqdn or
node_ip in local_ips or
local_hostname and node_name.lower().startswith(local_hostname)):
local_node = node_name
break
# --- Classify nodes ---
tier1_coord = []
tier2_data = []
tier3_master = []
tier4_elected = None
all_entries = []
for n in nodes:
name = n.get('name', '').strip()
version = n.get('version', '').strip()
is_elected_master = n.get('master', '').strip() == '*'
roles = n.get('node.role', '').strip()
has_master_role = 'm' in roles
has_data_role = 'd' in roles
entry = {
'name': name,
'version': version,
'roles': roles,
'is_elected_master': is_elected_master,
'has_master_role': has_master_role,
'has_data_role': has_data_role,
'is_local': name == local_node,
'already_upgraded': version == target_version,
}
all_entries.append(entry)
if is_elected_master:
tier4_elected = entry
elif has_master_role:
tier3_master.append(entry)
elif has_data_role:
tier2_data.append(entry)
else:
tier1_coord.append(entry)
all_combined = all(e['has_master_role'] and e['has_data_role'] for e in all_entries)
# --- Print node table ---
print('NODE_TABLE_START')
for n in nodes:
name = n.get('name', '').strip()
version = n.get('version', '').strip()
roles = n.get('node.role', '').strip()
is_elected = n.get('master', '').strip() == '*'
markers = []
if is_elected:
markers.append('elected master')
if name == local_node:
markers.append('THIS NODE')
marker_str = f' ({", ".join(markers)})' if markers else ''
elected_icon = 'M' if is_elected else ' '
local_icon = '>' if name == local_node else ' '
print(f' {local_icon}{elected_icon} {name:<45} v{version:<12} roles: {roles}{marker_str}')
print('NODE_TABLE_END')
# --- Determine upgrade order ---
# Coordinating/ingest nodes first, then data nodes, then the remaining
# master-eligible nodes, and finally the elected master — each tier
# sorted by node name for a stable listing.
by_name = lambda e: e['name']
upgrade_order = (
    sorted(tier1_coord, key=by_name)
    + sorted(tier2_data, key=by_name)
    + sorted(tier3_master, key=by_name)
)
if tier4_elected:
    upgrade_order.append(tier4_elected)

def tier_label(entry):
    # Human-readable role summary for a node entry, e.g.
    # "master-eligible, data" or "coordinating/ingest".
    labels = []
    if entry['is_elected_master']:
        labels.append('elected master')
    elif entry['has_master_role']:
        labels.append('master-eligible')
    if entry['has_data_role']:
        labels.append('data')
    if not (entry['has_master_role'] or entry['has_data_role']):
        labels.append('coordinating/ingest')
    return ', '.join(labels)
# Emit the recommended order between ORDER_START/ORDER_END markers; the
# bash wrapper extracts this section with sed.
print('ORDER_START')
if all_combined:
    # With a uniform data+master topology the strict tier order adds
    # little; only "elected master last" really matters.
    print(f' NOTE: All nodes have both data and master roles (combined topology).')
    print(f' NOTE: Upgrade any non-elected node first, elected master last.')
    print(f' ---')
for i, n in enumerate(upgrade_order, 1):
    status = 'DONE' if n['already_upgraded'] else 'PENDING'
    local_marker = ' << THIS NODE' if n['is_local'] else ''
    print(f' {i:>2}. {n["name"]:<40} v{n["version"]:<12} [{tier_label(n)}] {status}{local_marker}')
print('ORDER_END')
# --- Generate advice ---
# Emit machine-parseable advice between ADVICE_START/ADVICE_END markers.
# Each line is "<LEVEL>|<message>" with LEVEL in {OK, INFO, WARN}; the
# bash wrapper maps these to its success/info/warn helpers.
print('ADVICE_START')
if local_node is None:
    print('WARN|Could not identify which cluster node corresponds to this machine.')
    print('WARN|Hostname: ' + local_hostname + ', FQDN: ' + local_fqdn)
    print('WARN|Local IPs: ' + ', '.join(sorted(local_ips - {'127.0.0.1', '::1'})))
    print('WARN|This is expected when --es-url points to a remote node.')
    print('WARN|Verify manually that you are upgrading nodes in the correct order.')
    local_entry = None
else:
    # Find this machine's entry and its position in the upgrade order.
    local_entry = None
    local_position = -1
    for i, n in enumerate(upgrade_order):
        if n['is_local']:
            local_entry = n
            local_position = i
            break
    if local_entry is None:
        print('WARN|This node was identified but not found in the upgrade order. This is unexpected.')
    elif local_entry['already_upgraded']:
        print(f'OK|This node ({local_node}) is already at version {target_version}.')
    else:
        # Nodes earlier in the recommended order that are still pending.
        nodes_ahead_pending = [upgrade_order[i] for i in range(local_position) if not upgrade_order[i]['already_upgraded']]
        if len(nodes_ahead_pending) > 0:
            print(f'WARN|There are {len(nodes_ahead_pending)} node(s) that should ideally be upgraded BEFORE this one:')
            for n in nodes_ahead_pending:
                print(f'WARN| - {n["name"]} (v{n["version"]}, {tier_label(n)})')
            if local_entry.get('is_elected_master'):
                print('INFO|')
                print('INFO|This node is the current elected master.')
                print('INFO|The master role will transfer automatically when this node stops.')
            elif all_combined:
                print('INFO|')
                print('INFO|All nodes share both data and master roles.')
                print('INFO|Order among non-elected peers does not strictly matter,')
                print('INFO|but the elected master should be upgraded last.')
        else:
            print(f'OK|This node ({local_node}) is next in the upgrade order. Safe to proceed.')
            if local_entry.get('is_elected_master'):
                # tier3_master holds the OTHER master-eligible nodes (the
                # elected master itself sits in tier4_elected), so this
                # checks whether any failover candidate remains.
                other_masters = [n for n in tier3_master if not n['is_local']]
                if other_masters:
                    print('OK|This is the elected master. A new master will be elected automatically when this node stops.')
                else:
                    print('WARN|This is the ONLY master-eligible node. The cluster will be unavailable during upgrade.')
print('ADVICE_END')
# --- Cluster version mix analysis ---
# Predict which versions would coexist after upgrading THIS node and emit
# OK/INFO/WARN/BLOCK lines between VERSION_MIX_START/END markers. BLOCK
# lines make the bash wrapper require an explicit override.
print('VERSION_MIX_START')
versions_in_cluster = set(e['version'] for e in all_entries)
if local_node is None or local_entry is None:
    from collections import Counter
    version_counts = Counter(e['version'] for e in all_entries)
    print(f'INFO|Versions currently in cluster:')
    for v, count in sorted(version_counts.items()):
        node_names = [e['name'] for e in all_entries if e['version'] == v]
        print(f'INFO| v{v}: {count} node(s) — {", ".join(node_names)}')
    # Without knowing which entry is this machine we cannot simulate
    # the post-upgrade mix.
    print(f'WARN|Local node not found in cluster. Cannot predict version mix after upgrade.')
else:
    # Simulate the upgrade: drop the local node's current version and
    # substitute the target version.
    versions_after = list(e['version'] for e in all_entries if not e.get('is_local'))
    versions_after.append(target_version)
    unique_after = set(versions_after)
    from collections import Counter
    version_counts = Counter(e['version'] for e in all_entries)
    print(f'INFO|Versions currently in cluster:')
    for v, count in sorted(version_counts.items()):
        node_names = [e['name'] for e in all_entries if e['version'] == v]
        print(f'INFO| v{v}: {count} node(s) — {", ".join(node_names)}')
    if len(versions_in_cluster) == 1 and list(versions_in_cluster)[0] == target_version:
        print(f'OK|All nodes are already on {target_version}.')
    elif len(unique_after) <= 2:
        if len(unique_after) == 1:
            print(f'OK|After this upgrade, all nodes will be on {target_version}.')
        else:
            # Exactly two versions remain: the target plus one older one
            # (normal mid-rolling-upgrade state).
            other_version = [v for v in unique_after if v != target_version]
            remaining_old = len([v for v in versions_after if v != target_version])
            print(f'OK|After this upgrade the cluster will have 2 versions (normal during rolling upgrade):')
            print(f'OK| v{target_version} (upgraded) and v{other_version[0]} ({remaining_old} node(s) remaining)')
    elif len(unique_after) >= 3:
        # 3+ concurrent versions is unsupported; emit BLOCK-level advice.
        print(f'BLOCK|')
        print(f'BLOCK|After upgrading this node, the cluster would have {len(unique_after)} different versions:')
        version_counts_after = Counter(versions_after)
        for v, count in sorted(version_counts_after.items()):
            print(f'BLOCK| v{v}: {count} node(s)')
        print(f'BLOCK|')
        print(f'BLOCK|Running 3+ versions simultaneously is NOT supported by Elasticsearch.')
        print(f'BLOCK|This typically means a previous rolling upgrade was not completed.')
        print(f'BLOCK|')
        print(f'BLOCK|Recommended action:')
        print(f'BLOCK| Complete the previous upgrade first — bring ALL nodes to the same')
        print(f'BLOCK| version before starting a new upgrade to {target_version}.')
print('VERSION_MIX_END')
PYEOF
# Run the analysis helper: node-list JSON on stdin, local-identity hints and
# target version via the environment. Failures are swallowed (|| true) so a
# broken analysis never aborts the upgrade itself.
upgrade_advice=$(echo "$nodes_body" | LOCAL_HOSTNAME="$local_hostname" LOCAL_FQDN="$local_fqdn" LOCAL_IPS="$local_ips" TARGET_VERSION="$TARGET_VERSION" python3 "$_pyfile" 2>/dev/null) || true
rm -f "$_pyfile"
if [[ -z "$upgrade_advice" ]]; then
  warn "Could not analyze node roles. Proceeding without upgrade order advice."
else
  # The helper emits sections bracketed by *_START/*_END marker lines;
  # sed extracts a section and grep strips the marker lines themselves.
  echo "$upgrade_advice" | sed -n '/NODE_TABLE_START/,/NODE_TABLE_END/p' | grep -v '_START\|_END'
  echo ""
  info "${BOLD}Legend:${NC} M = elected master, > = this node"
  echo ""
  info "${BOLD}Recommended upgrade order:${NC}"
  echo "$upgrade_advice" | sed -n '/ORDER_START/,/ORDER_END/p' | grep -v '_START\|_END'
  echo ""
  # Replay the ADVICE section ("LEVEL|message" lines) through the script's
  # own log helpers. The here-string (not a pipe) keeps this while-loop in
  # the current shell so has_warn survives past the loop.
  has_warn=false
  while IFS= read -r line; do
    case "$line" in
      WARN\|*)
        has_warn=true
        msg="${line#WARN|}"
        [[ -n "$msg" ]] && warn "$msg"
        ;;
      OK\|*)
        msg="${line#OK|}"
        [[ -n "$msg" ]] && success "$msg"
        ;;
      INFO\|*)
        msg="${line#INFO|}"
        [[ -n "$msg" ]] && info "$msg"
        ;;
    esac
  done <<< "$(echo "$upgrade_advice" | sed -n '/ADVICE_START/,/ADVICE_END/p' | grep -v '_START\|_END')"
  echo ""
  if $has_warn; then
    confirm_or_abort "Acknowledge the upgrade order warnings above and continue?"
  fi
  # --- Version mix analysis ---
  version_mix_section=$(echo "$upgrade_advice" | sed -n '/VERSION_MIX_START/,/VERSION_MIX_END/p' | grep -v '_START\|_END')
  if [[ -n "$version_mix_section" ]]; then
    step "Checking cluster version consistency"
    # BLOCK-level lines mean the upgrade would create an unsupported 3+
    # version mix; remember that so we demand an explicit override below.
    vmix_has_block=false
    while IFS= read -r line; do
      case "$line" in
        BLOCK\|*)
          vmix_has_block=true
          msg="${line#BLOCK|}"
          [[ -n "$msg" ]] && error "$msg"
          ;;
        WARN\|*)
          msg="${line#WARN|}"
          [[ -n "$msg" ]] && warn "$msg"
          ;;
        OK\|*)
          msg="${line#OK|}"
          [[ -n "$msg" ]] && success "$msg"
          ;;
        INFO\|*)
          msg="${line#INFO|}"
          [[ -n "$msg" ]] && info "$msg"
          ;;
      esac
    done <<< "$version_mix_section"
    echo ""
    if $vmix_has_block; then
      error "Upgrading this node would introduce 3+ versions into the cluster."
      confirm_critical "Override version mix warning and continue anyway? THIS IS NOT SUPPORTED."
    fi
  fi
fi
else
warn "Could not retrieve node list (HTTP ${nodes_code}). Skipping upgrade order analysis."
fi
fi # end $UPGRADE_ES node analysis
fi # end of: if ! $FORCE_MODE (cluster health, node analysis, version mix)
# --- Disk space ---
# Warn (but do not hard-fail) when the filesystems used for download,
# installation and data/logs look too small for the upgrade.
step "Checking disk space"

# Print the free space of a mount point in MiB; prints nothing when df
# fails (e.g. path does not exist).
_free_mb() { df -m "$1" 2>/dev/null | awk 'NR==2{print $4}'; }

disk_space_ok=true
free_tmp_mb=$(_free_mb /tmp)
info "Free space in /tmp (downloads): ${free_tmp_mb:-unknown} MB"
if [[ -n "$free_tmp_mb" && "$free_tmp_mb" -lt 1024 ]]; then
  warn "Less than 1 GB free in /tmp. RPM download may fail."
  disk_space_ok=false
fi
free_usr_mb=$(_free_mb /usr)
info "Free space in /usr (installation): ${free_usr_mb:-unknown} MB"
if [[ -n "$free_usr_mb" && "$free_usr_mb" -lt 1024 ]]; then
  warn "Less than 1 GB free in /usr. RPM installation may fail."
  disk_space_ok=false
fi
free_var_mb=$(_free_mb /var)
info "Free space in /var (data/logs): ${free_var_mb:-unknown} MB"
if [[ -n "$free_var_mb" && "$free_var_mb" -lt 512 ]]; then
  warn "Less than 512 MB free in /var. May have issues with logs during upgrade."
  disk_space_ok=false
fi
# Low space is a soft failure: let the operator decide.
if ! $disk_space_ok; then
  confirm_or_abort "Continue despite low disk space warnings?"
fi
# --- glibc version check ---
# The JDK bundled with newer Elasticsearch releases needs a reasonably
# recent glibc; flag known-problematic versions before installing.
if $UPGRADE_ES; then
  step "Checking glibc version"
  info "Newer Elasticsearch versions bundle a JDK that may require a newer glibc."
  info "If glibc is too old (e.g. CentOS 7 ships glibc 2.17), ES may fail to start."
  # First "major.minor" token on the first line of `ldd --version`.
  glibc_version=$(ldd --version 2>&1 | head -1 | grep -oP '[0-9]+\.[0-9]+' | head -1) || glibc_version="unknown"
  info "Detected glibc version: ${BOLD}${glibc_version}${NC}"
  if [[ "$glibc_version" != "unknown" ]]; then
    glibc_major=$(echo "$glibc_version" | cut -d. -f1)
    glibc_minor=$(echo "$glibc_version" | cut -d. -f2)
    # Compare as one combined number so versions order correctly
    # (2.5 < 2.17 < 2.31 < 3.0). The previous major/minor pair test
    # mis-classified any hypothetical 1.x with minor >= 31 as compatible.
    glibc_num=$(( glibc_major * 1000 + glibc_minor ))
    if (( glibc_num < 2017 )); then
      error "glibc ${glibc_version} is very old and likely incompatible with ES ${TARGET_VERSION}."
      confirm_or_abort "Continue despite potential glibc incompatibility?"
    elif (( glibc_num < 2031 )); then
      warn "glibc ${glibc_version} may be too old for the bundled JDK in newer ES releases."
      warn "If ES fails to start after upgrade, this is likely the cause."
      warn "Consider testing this upgrade on one node first before rolling out."
      confirm_or_abort "Acknowledge glibc risk and continue?"
    else
      success "glibc ${glibc_version} should be compatible."
    fi
  else
    warn "Could not detect glibc version. Verify manually that your OS is compatible."
  fi
fi
# --- Deprecation API check ---
# Query GET /_migration/deprecations and count critical vs warning level
# entries. Critical items usually mean removed settings that will stop
# the upgraded node from starting at all.
if $UPGRADE_ES && ! $FORCE_MODE; then
  step "Checking for deprecated settings"
  info "Querying the deprecation API to find settings that may block the upgrade."
  info "Command: GET /_migration/deprecations"
  echo ""
  es_curl GET "/_migration/deprecations"
  deprec_body="$ES_CURL_BODY"
  deprec_code="$ES_CURL_HTTP_CODE"
  if [[ "$deprec_code" == "200" ]]; then
    # Count critical-level deprecations. index_settings is a dict keyed by
    # index name (lists of items); the other categories are flat lists.
    # -1 signals a parse failure.
    crit_count=$(echo "$deprec_body" | python3 -c "
import json, sys
try:
    data = json.load(sys.stdin)
    count = 0
    for category in ['cluster_settings', 'node_settings', 'index_settings', 'ml_settings']:
        items = data.get(category, [])
        if isinstance(items, list):
            for item in items:
                if item.get('level') == 'critical':
                    count += 1
        elif isinstance(items, dict):
            for index_name, idx_items in items.items():
                for item in idx_items:
                    if item.get('level') == 'critical':
                        count += 1
    print(count)
except:
    print(-1)
" 2>/dev/null) || crit_count="-1"
    # Same walk again for warning-level entries.
    warn_count=$(echo "$deprec_body" | python3 -c "
import json, sys
try:
    data = json.load(sys.stdin)
    count = 0
    for category in ['cluster_settings', 'node_settings', 'index_settings', 'ml_settings']:
        items = data.get(category, [])
        if isinstance(items, list):
            for item in items:
                if item.get('level') == 'warning':
                    count += 1
        elif isinstance(items, dict):
            for index_name, idx_items in items.items():
                for item in idx_items:
                    if item.get('level') == 'warning':
                        count += 1
    print(count)
except:
    print(-1)
" 2>/dev/null) || warn_count="-1"
    if [[ "$crit_count" == "-1" ]]; then
      warn "Could not parse deprecation response. Review manually:"
      warn " GET /_migration/deprecations"
    elif [[ "$crit_count" -gt 0 ]]; then
      error "Found ${BOLD}${crit_count} CRITICAL${NC}${RED} deprecation(s) that may prevent startup after upgrade!${NC}"
      # Print each critical item; any failure here is non-fatal (|| true).
      echo "$deprec_body" | python3 -c "
import json, sys
data = json.load(sys.stdin)
for category in ['cluster_settings', 'node_settings', 'index_settings', 'ml_settings']:
    items = data.get(category, [])
    if isinstance(items, list):
        for item in items:
            if item.get('level') == 'critical':
                print(f\" [CRITICAL] [{category}] {item.get('message', 'unknown')}\")
    elif isinstance(items, dict):
        for idx_name, idx_items in items.items():
            for item in idx_items:
                if item.get('level') == 'critical':
                    print(f\" [CRITICAL] [index: {idx_name}] {item.get('message', 'unknown')}\")
" 2>/dev/null || true
      echo ""
      error "Critical deprecations MUST be resolved before upgrading."
      error "Removed settings will prevent Elasticsearch from starting."
      confirm_critical "Continue despite critical deprecation warnings?"
    elif [[ "$warn_count" -gt 0 ]]; then
      warn "Found ${warn_count} deprecation warning(s) (non-critical)."
      info "Review with: GET /_migration/deprecations"
      info "These won't block the upgrade but should be addressed."
    else
      success "No deprecation issues found."
    fi
  elif [[ "$deprec_code" == "404" ]]; then
    warn "Deprecation API not available (older ES version). Skipping check."
    warn "Manually review release notes for removed settings between ${ES_CURRENT} and ${TARGET_VERSION}."
  else
    warn "Deprecation API returned HTTP ${deprec_code}. Skipping check."
  fi
fi
# --- Monitoring exporter config check ---
# Flag legacy self-monitoring settings that are deprecated in 8.x.
if $UPGRADE_ES; then
  step "Checking for legacy monitoring configuration"
  es_config="/etc/elasticsearch/elasticsearch.yml"
  if [[ -f "$es_config" ]] && grep -q "xpack.monitoring.exporters" "$es_config" 2>/dev/null; then
    warn "Found ${BOLD}xpack.monitoring.exporters${NC}${YELLOW} in elasticsearch.yml${NC}"
    warn "Legacy monitoring (internal collection) is deprecated since 8.x."
    warn "Plan to migrate to Elastic Agent or Metricbeat monitoring collection."
    echo ""
  fi
  if [[ -f "$es_config" ]] && grep -q "xpack.monitoring.collection.enabled" "$es_config" 2>/dev/null; then
    warn "Found ${BOLD}xpack.monitoring.collection.enabled${NC}${YELLOW} in elasticsearch.yml${NC}"
    warn "Legacy internal monitoring collection is deprecated."
    echo ""
  fi
fi
# --- Check for open ML jobs (warn about potential disruption) ---
# Open anomaly detection jobs are interrupted when the node stops; warn
# the operator so the pause in results is expected.
if $UPGRADE_ES && ! $FORCE_MODE; then
  step "Checking for running ML jobs"
  # FIX: job *state* is only reported by the _stats endpoint. The plain
  # /_ml/anomaly_detectors/_all listing returns job configurations with
  # no 'state' field, so the previous check always counted 0 open jobs.
  es_curl GET "/_ml/anomaly_detectors/_all/_stats?allow_no_match=true"
  ml_body="$ES_CURL_BODY"
  ml_code="$ES_CURL_HTTP_CODE"
  if [[ "$ml_code" == "200" ]]; then
    # Count jobs whose state is 'opened'; parse failures fall back to 0
    # so this stays a best-effort check.
    open_jobs=$(echo "$ml_body" | python3 -c "
import json, sys
try:
    data = json.load(sys.stdin)
    jobs = data.get('jobs', [])
    opened = [j['job_id'] for j in jobs if j.get('state') == 'opened']
    print(len(opened))
except:
    print(0)
" 2>/dev/null) || open_jobs="0"
    if [[ "$open_jobs" -gt 0 ]]; then
      warn "There are ${BOLD}${open_jobs}${NC}${YELLOW} open ML anomaly detection jobs.${NC}"
      warn "These will be interrupted when Elasticsearch stops."
      warn "They should recover automatically after restart, but may need"
      warn "a few minutes to re-open and resume processing."
      info "To see open jobs: GET /_ml/anomaly_detectors/_all/_stats?allow_no_match=true"
    else
      success "No open ML jobs."
    fi
  fi
fi
# ==============================================================================
# Summary & Confirmation
# ==============================================================================
# Show the full plan and get one final go/no-go before touching anything.
header "Upgrade Plan"
# cluster_name may be unset when the ES-dependent pre-flight checks were
# skipped (e.g. --force / --kibana-only), hence the :- guard.
if [[ -n "${cluster_name:-}" ]]; then
  echo -e " Cluster: ${BOLD}${cluster_name}${NC}"
fi
echo -e " Target version: ${BOLD}${TARGET_VERSION}${NC}"
echo -e " Architecture: ${BOLD}${ARCH}${NC}"
# Each $FLAG && / $FLAG || pair prints exactly one YES or NO line.
$UPGRADE_ES && echo -e " Upgrade ES: ${GREEN}YES${NC} (${ES_CURRENT} -> ${TARGET_VERSION})"
$UPGRADE_ES || echo -e " Upgrade ES: ${YELLOW}NO${NC}"
$UPGRADE_KIBANA && echo -e " Upgrade Kibana: ${GREEN}YES${NC} (${KIBANA_CURRENT} -> ${TARGET_VERSION})"
$UPGRADE_KIBANA || echo -e " Upgrade Kibana: ${YELLOW}NO${NC}"
echo ""
echo -e "${BOLD}The following steps will be performed:${NC}"
echo " Pre-upgrade:"
echo " - Snapshot reminder"
echo " - Remove version locks (if any)"
echo " - Download and verify all RPMs"
echo " - Backup configuration files"
# Quoted 'EOF' delimiters keep the step lists literal (no expansion).
$UPGRADE_ES && cat <<'EOF'
 Elasticsearch:
 1. Disable shard allocation
 2. Flush all indices (best effort)
 3. Stop Elasticsearch
 4. Install Elasticsearch RPM
 4a. Keystore check
 4b. Check for .rpmnew files
 4c. Verify JVM heap settings
 5. Reload systemd daemon
 6. Start Elasticsearch
 7. Wait for node to rejoin and re-enable allocation
 7a. Re-enable shard allocation
 7b. Wait for cluster recovery
EOF
$UPGRADE_KIBANA && cat <<'EOF'
 Kibana:
 K1. Verify shard allocation is enabled
 K2. Stop Kibana
 K3. Install Kibana RPM
 K4. Reload systemd daemon
 K5. Start Kibana and wait for ready
EOF
echo ""
confirm_or_abort "Proceed with the upgrade?"
# ==============================================================================
# Pre-upgrade: Snapshot Reminder
# ==============================================================================
# Snapshots are the only reliable rollback path for Elasticsearch data,
# so make the operator acknowledge the risk explicitly (skipped in force
# mode, where only a warning is printed).
if $UPGRADE_ES; then
  if $FORCE_MODE; then
    warn "FORCE MODE: Skipping snapshot reminder. Ensure you have a backup!"
  else
    header "Snapshot Reminder"
    warn "Before upgrading, you should have a recent snapshot of your data."
    warn "Snapshots are the ONLY reliable way to roll back Elasticsearch data."
    echo ""
    info "To check existing snapshots:"
    info " GET /_snapshot/_all"
    info " GET /_snapshot/<repo>/_all"
    echo ""
    info "To create a snapshot:"
    info " PUT /_snapshot/<repo>/<snapshot_name>?wait_for_completion=true"
    echo ""
    confirm_or_abort "I have a recent snapshot or accept the risk of proceeding without one"
  fi
fi
# ==============================================================================
# Pre-upgrade: Remove version locks
# ==============================================================================
# yum/dnf versionlock entries would pin the old package and silently make
# localinstall a no-op, so remove them for the components being upgraded.
header "Checking Version Locks"
versionlock_removed_es=false
versionlock_removed_kibana=false
# Fetch the lock list exactly once (the old code re-ran `versionlock list`
# per component, and carried a redundant `2>&1` after `&>/dev/null`).
have_versionlock=false
lock_list=""
if command -v "${PKG_MGR}" &>/dev/null && lock_list=$(${PKG_MGR} versionlock list 2>/dev/null); then
  have_versionlock=true
fi
if $have_versionlock; then
  info "Checking for version locks..."
  if $UPGRADE_ES && grep -q "elasticsearch" <<< "$lock_list"; then
    warn "Found version lock for Elasticsearch"
    info "Removing lock: ${PKG_MGR} versionlock delete elasticsearch*"
    # Older plugin versions need the '0:' epoch prefix; try both forms.
    ${PKG_MGR} versionlock delete "elasticsearch*" || ${PKG_MGR} versionlock delete "0:elasticsearch*" || true
    versionlock_removed_es=true
    success "Elasticsearch version lock removed."
  fi
  if $UPGRADE_KIBANA && grep -q "kibana" <<< "$lock_list"; then
    warn "Found version lock for Kibana"
    info "Removing lock: ${PKG_MGR} versionlock delete kibana*"
    ${PKG_MGR} versionlock delete "kibana*" || ${PKG_MGR} versionlock delete "0:kibana*" || true
    versionlock_removed_kibana=true
    success "Kibana version lock removed."
  fi
  if ! $versionlock_removed_es && ! $versionlock_removed_kibana; then
    success "No version locks found."
  fi
else
  info "Version lock plugin not available or not installed. Skipping."
fi
# ==============================================================================
# Pre-upgrade: Download all RPMs upfront
# ==============================================================================
# Fetch everything before stopping any service so a download failure
# cannot strand a half-upgraded node.
header "Downloading RPM Packages"
mkdir -p "$DOWNLOAD_DIR"
chmod 700 "$DOWNLOAD_DIR"
ES_RPM="elasticsearch-${TARGET_VERSION}-${ARCH}.rpm"
ES_RPM_URL="https://artifacts.elastic.co/downloads/elasticsearch/${ES_RPM}"
ES_SHA_URL="${ES_RPM_URL}.sha512"
KIBANA_RPM="kibana-${TARGET_VERSION}-${ARCH}.rpm"
KIBANA_RPM_URL="https://artifacts.elastic.co/downloads/kibana/${KIBANA_RPM}"
KIBANA_SHA_URL="${KIBANA_RPM_URL}.sha512"

# Download one RPM plus its .sha512 and verify the checksum.
# The ES and Kibana flows were verbatim duplicates; this helper is the
# single copy. Exits on download failure; checksum mismatch requires an
# explicit critical confirmation.
#   $1 = product label (e.g. "Elasticsearch")
#   $2 = RPM filename   $3 = RPM URL   $4 = checksum URL
_fetch_and_verify_rpm() {
  local label=$1 rpm=$2 rpm_url=$3 sha_url=$4
  step "Downloading ${label} RPM"
  info "URL: ${rpm_url}"
  if [[ -f "${DOWNLOAD_DIR}/${rpm}" ]]; then
    warn "File already exists: ${DOWNLOAD_DIR}/${rpm}"
    if confirm "Re-download and overwrite?" "n"; then
      rm -f "${DOWNLOAD_DIR}/${rpm}" "${DOWNLOAD_DIR}/${rpm}.sha512"
    else
      info "Using existing file."
    fi
  fi
  if [[ ! -f "${DOWNLOAD_DIR}/${rpm}" ]]; then
    if ! download_file "${rpm_url}" "${DOWNLOAD_DIR}/${rpm}" "${label} ${TARGET_VERSION} RPM"; then
      exit 1
    fi
    success "${label} RPM downloaded."
  fi
  info "Verifying SHA512 checksum..."
  if ! curl -sf --max-time 30 --retry 3 -o "${DOWNLOAD_DIR}/${rpm}.sha512" "${sha_url}"; then
    error "Failed to download ${label} checksum file."
    exit 1
  fi
  # The .sha512 file format is "<hash> <filename>"; compare hash only.
  # (Assigned without 'local' on purpose: the original left these global.)
  expected_hash=$(awk '{print $1}' "${DOWNLOAD_DIR}/${rpm}.sha512")
  actual_hash=$(sha512sum "${DOWNLOAD_DIR}/${rpm}" | awk '{print $1}')
  if [[ "$expected_hash" == "$actual_hash" ]]; then
    success "${label} checksum verified."
  else
    error "${label} checksum mismatch!"
    error "Expected: ${expected_hash}"
    error "Actual:   ${actual_hash}"
    confirm_critical "This is dangerous. Continue despite checksum failure?"
  fi
}

if $UPGRADE_ES; then
  _fetch_and_verify_rpm "Elasticsearch" "$ES_RPM" "$ES_RPM_URL" "$ES_SHA_URL"
fi
if $UPGRADE_KIBANA; then
  _fetch_and_verify_rpm "Kibana" "$KIBANA_RPM" "$KIBANA_RPM_URL" "$KIBANA_SHA_URL"
fi
success "All RPM packages downloaded and verified."
# ==============================================================================
# Pre-upgrade: Backup configuration files
# ==============================================================================
# Copy /etc/<component> and any systemd drop-ins into a timestamped
# directory under /var/backup, restricted to root.
header "Backing Up Configuration Files"
BACKUP_TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_BASE="/var/backup"
BACKUP_DIR="${BACKUP_BASE}/elastic-upgrade-${BACKUP_TIMESTAMP}"
if [[ ! -d "$BACKUP_BASE" ]]; then
  info "Creating backup directory: ${BACKUP_BASE}"
  mkdir -p "$BACKUP_BASE"
  chmod 700 "$BACKUP_BASE"
fi
# Check backup disk space
free_backup_mb=$(df -m "$BACKUP_BASE" 2>/dev/null | awk 'NR==2{print $4}')
if [[ -n "$free_backup_mb" && "$free_backup_mb" -lt 256 ]]; then
  warn "Less than 256 MB free on backup filesystem (${BACKUP_BASE})."
  confirm_or_abort "Continue without sufficient backup space?"
fi
mkdir -p "$BACKUP_DIR" || {
  error "Failed to create backup directory: ${BACKUP_DIR}"
  error "Check permissions on ${BACKUP_BASE}"
  exit 1
}
chmod 700 "$BACKUP_DIR"
if $UPGRADE_ES; then
  if [[ ! -d /etc/elasticsearch ]]; then
    warn "No /etc/elasticsearch directory found to backup."
  else
    step "Backing up Elasticsearch configuration"
    cp -a /etc/elasticsearch "$BACKUP_DIR/elasticsearch"
    success "Elasticsearch config backed up to: ${BACKUP_DIR}/elasticsearch/"
    config_count=$(find "$BACKUP_DIR/elasticsearch" -type f | wc -l)
    info " Backed up ${config_count} file(s)"
  fi
  # Also backup systemd overrides (contains LimitNOFILE, LimitMEMLOCK, etc.)
  if [[ -d /etc/systemd/system/elasticsearch.service.d ]]; then
    step "Backing up Elasticsearch systemd overrides"
    mkdir -p "$BACKUP_DIR/systemd-elasticsearch"
    cp -a /etc/systemd/system/elasticsearch.service.d/* "$BACKUP_DIR/systemd-elasticsearch/" 2>/dev/null || true
    success "Systemd overrides backed up to: ${BACKUP_DIR}/systemd-elasticsearch/"
  fi
fi
if $UPGRADE_KIBANA; then
  if [[ ! -d /etc/kibana ]]; then
    warn "No /etc/kibana directory found to backup."
  else
    step "Backing up Kibana configuration"
    cp -a /etc/kibana "$BACKUP_DIR/kibana"
    success "Kibana config backed up to: ${BACKUP_DIR}/kibana/"
    config_count=$(find "$BACKUP_DIR/kibana" -type f | wc -l)
    info " Backed up ${config_count} file(s)"
  fi
  if [[ -d /etc/systemd/system/kibana.service.d ]]; then
    mkdir -p "$BACKUP_DIR/systemd-kibana"
    cp -a /etc/systemd/system/kibana.service.d/* "$BACKUP_DIR/systemd-kibana/" 2>/dev/null || true
    success "Kibana systemd overrides backed up."
  fi
fi
info "All backups stored in: ${BOLD}${BACKUP_DIR}${NC}"
echo ""
# ==============================================================================
# Elasticsearch Upgrade
# ==============================================================================
if $UPGRADE_ES; then
header "Upgrading Elasticsearch: ${ES_CURRENT} -> ${TARGET_VERSION}"
info "Using pre-downloaded RPM: ${DOWNLOAD_DIR}/${ES_RPM}"
echo ""
# --- Step 1: Disable shard allocation ---
if $FORCE_MODE; then
  # In force mode ES may be down, so the REST-based steps are skipped.
  step "Step 1 - Disable shard allocation"
  warn "FORCE MODE: Skipping (Elasticsearch may not be running)."
  step "Step 2 - Flush all indices"
  warn "FORCE MODE: Skipping (Elasticsearch may not be running)."
else
  step "Step 1 - Disable shard allocation"
  info "This prevents the cluster from rebalancing shards while the node is down."
  info "Command: PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":\"primaries\"}}"
  echo ""
  confirm_or_abort "Disable shard allocation now?"
  # "primaries" is the rolling-upgrade setting: primaries may still
  # allocate, but replicas will not be rebalanced while this node is down.
  if es_curl_retry PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":"primaries"}}'; then
    # NOTE(review): _allocation_disabled is presumably read elsewhere
    # (e.g. a trap/cleanup handler that restores allocation) — confirm.
    _allocation_disabled=true
    success "Shard allocation disabled (set to primaries only)."
  else
    error "Failed to disable shard allocation (HTTP $ES_CURL_HTTP_CODE)."
    confirm_or_abort "Continue despite this failure?"
  fi
  # --- Step 2: Flush ---
  step "Step 2 - Flush all indices"
  info "Flushing ensures all data is written to disk before stopping."
  info "Command: POST /_flush"
  echo ""
  confirm_or_abort "Flush all indices now?"
  # Use longer timeout for flush -- large clusters may take a while
  old_timeout="$API_TIMEOUT"
  API_TIMEOUT=120
  es_curl POST "/_flush"
  flush_code="$ES_CURL_HTTP_CODE"
  API_TIMEOUT="$old_timeout"
  if [[ "$flush_code" == "200" ]]; then
    success "Flush completed."
  else
    # Flush is best-effort; a non-200 response is not treated as fatal.
    warn "Flush returned HTTP $flush_code (this is usually okay to continue)."
  fi
fi # end of: if ! $FORCE_MODE (steps 1-2)
# --- Step 3: Stop Elasticsearch ---
step "Step 3 - Stop Elasticsearch service"
if ! systemctl is-active elasticsearch &>/dev/null; then
  info "Elasticsearch service is not running. Skipping stop."
else
  info "Command: systemctl stop elasticsearch (timeout: ${STOP_TIMEOUT}s)"
  echo ""
  confirm_or_abort "Stop Elasticsearch now?"
  # Bound the stop with timeout(1) so a hung shutdown cannot stall the
  # script indefinitely; offer a SIGKILL escalation if it expires.
  if ! timeout "$STOP_TIMEOUT" systemctl stop elasticsearch; then
    error "Elasticsearch did not stop within ${STOP_TIMEOUT} seconds."
    warn "The service may be stuck. Options:"
    warn " 1. Wait longer: systemctl stop elasticsearch"
    warn " 2. Force kill: systemctl kill -s SIGKILL elasticsearch"
    confirm_or_abort "Force kill the Elasticsearch process?"
    systemctl kill -s SIGKILL elasticsearch
    sleep 2  # give the process a moment to exit before continuing
  fi
  success "Elasticsearch stopped."
fi
# --- Step 4: Install RPM ---
step "Step 4 - Install Elasticsearch RPM"
info "Command: ${PKG_MGR} -y localinstall ${DOWNLOAD_DIR}/${ES_RPM}"
# Modified configs stay in place; new defaults arrive as .rpmnew files
# (checked in step 4b below).
info "Note: Config files in /etc/elasticsearch/ will NOT be overwritten."
echo ""
confirm_or_abort "Install the Elasticsearch RPM now?"
# Deliberately unguarded: under set -e a failed install aborts the script.
${PKG_MGR} -y localinstall "${DOWNLOAD_DIR}/${ES_RPM}"
success "Elasticsearch RPM installed."
# --- Step 4a: Fix keystore permissions and upgrade ---
# After the RPM install, make sure the keystore is readable by the
# service account and migrate it to the new on-disk format if needed.
step "Step 4a - Checking Elasticsearch keystore"
keystore_file="/etc/elasticsearch/elasticsearch.keystore"
keystore_tmp="/etc/elasticsearch/elasticsearch.keystore.tmp"
# Resolve the service account: prefer systemd's --value output, fall back
# to parsing "User=..." on older systemd releases that lack --value.
ES_USER=$(systemctl show elasticsearch -p User --value 2>/dev/null) || \
  ES_USER=$(systemctl show elasticsearch -p User 2>/dev/null | cut -d= -f2) || true
ES_GROUP=$(systemctl show elasticsearch -p Group --value 2>/dev/null) || \
  ES_GROUP=$(systemctl show elasticsearch -p Group 2>/dev/null | cut -d= -f2) || true
# ${var:-default} covers both unset and empty, so one defaulting step is
# enough (the previous extra [[ -z ]] checks were dead code).
ES_USER="${ES_USER:-elasticsearch}"
ES_GROUP="${ES_GROUP:-elasticsearch}"
info "Elasticsearch runs as user: ${ES_USER}, group: ${ES_GROUP}"
if [[ -f "$keystore_file" ]]; then
  info "Keystore exists, checking permissions..."
  # Best-effort permission repair; failures are non-fatal by design.
  chown -R "${ES_USER}:${ES_GROUP}" /etc/elasticsearch 2>/dev/null || true
  chmod 750 /etc/elasticsearch 2>/dev/null || true
  chmod 660 "$keystore_file" 2>/dev/null || true
  if [[ -f "$keystore_tmp" ]]; then
    # A leftover .tmp file from an interrupted earlier upgrade can block
    # the keystore tool.
    warn "Found stale keystore temp file, removing..."
    rm -f "$keystore_tmp"
  fi
  info "Running keystore upgrade as ${ES_USER} user..."
  if [[ -x /usr/share/elasticsearch/bin/elasticsearch-keystore ]]; then
    if sudo -u "$ES_USER" /usr/share/elasticsearch/bin/elasticsearch-keystore upgrade 2>&1; then
      success "Keystore upgraded successfully."
    else
      warn "Keystore upgrade returned non-zero (may already be current format)."
      info "If ES fails to start with keystore errors, check:"
      info " ls -la /etc/elasticsearch/elasticsearch.keystore"
      info " sudo -u ${ES_USER} /usr/share/elasticsearch/bin/elasticsearch-keystore upgrade"
    fi
  else
    warn "elasticsearch-keystore binary not found or not executable."
  fi
else
  info "No existing keystore found (will be created on first start if needed)."
fi
# --- Step 4b: Check for .rpmnew config files ---
step "Step 4b - Checking for .rpmnew configuration files"
info "RPM upgrades may create .rpmnew files when your config has been modified."
echo ""
rpmnew_found=false
# NUL-delimited find/read pairing is safe for odd filenames; process
# substitution (not a pipe) keeps the loop in the current shell so
# rpmnew_found survives past the loop.
while IFS= read -r -d '' rpmnew_file; do
  rpmnew_found=true
  original="${rpmnew_file%.rpmnew}"  # path of the admin's kept config
  warn "Found: ${BOLD}${rpmnew_file}${NC}"
  if [[ -f "$original" ]]; then
    diff_output=$(diff --brief "$original" "$rpmnew_file" 2>/dev/null) || true
    if [[ -n "$diff_output" ]]; then
      warn " -> Differs from current ${original}"
      info " -> Review with: diff ${original} ${rpmnew_file}"
    fi
  fi
done < <(find /etc/elasticsearch -name '*.rpmnew' -print0 2>/dev/null)
if $rpmnew_found; then
  echo ""
  warn "Review the .rpmnew files above and merge any needed changes into your config."
  warn "The .rpmnew files contain the new defaults from version ${TARGET_VERSION}."
  confirm_or_abort "Have you noted the .rpmnew files? Continue?"
else
  success "No .rpmnew config files found (your configs were preserved cleanly)."
fi
# --- Step 4c: Verify JVM heap settings ---
step "Step 4c - Verifying JVM heap settings"
info "Checking that your JVM heap settings (-Xms / -Xmx) are still in place."
echo ""
jvm_opts_file="/etc/elasticsearch/jvm.options"
jvm_opts_d="/etc/elasticsearch/jvm.options.d"
heap_found=false
xms=""
xmx=""
# Scan jvm.options.d/*.options first. Within a file the LAST -Xms/-Xmx
# wins (tail -1); across files, the last file scanned wins.
if [[ -d "$jvm_opts_d" ]]; then
  for f in "$jvm_opts_d"/*.options; do
    if [[ -f "$f" ]]; then  # guards the literal glob when nothing matches
      _xms=$(grep -oP '^\s*-Xms\K\S+' "$f" 2>/dev/null | tail -1) || true
      _xmx=$(grep -oP '^\s*-Xmx\K\S+' "$f" 2>/dev/null | tail -1) || true
      if [[ -n "$_xms" || -n "$_xmx" ]]; then
        heap_found=true
        [[ -n "$_xms" ]] && xms="$_xms"
        [[ -n "$_xmx" ]] && xmx="$_xmx"
        info " Found in ${BOLD}${f}${NC}:"
        [[ -n "$_xms" ]] && info " -Xms${_xms}"
        [[ -n "$_xmx" ]] && info " -Xmx${_xmx}"
      fi
    fi
  done
fi
# The base jvm.options only fills values jvm.options.d did not provide,
# i.e. jvm.options.d takes precedence.
if [[ -f "$jvm_opts_file" ]]; then
  _xms=$(grep -oP '^\s*-Xms\K\S+' "$jvm_opts_file" 2>/dev/null | tail -1) || true
  _xmx=$(grep -oP '^\s*-Xmx\K\S+' "$jvm_opts_file" 2>/dev/null | tail -1) || true
  if [[ -n "$_xms" || -n "$_xmx" ]]; then
    if ! $heap_found; then
      heap_found=true
    fi
    [[ -n "$_xms" && -z "$xms" ]] && xms="$_xms"
    [[ -n "$_xmx" && -z "$xmx" ]] && xmx="$_xmx"
    info " Found in ${BOLD}${jvm_opts_file}${NC}:"
    [[ -n "$_xms" ]] && info " -Xms${_xms}"
    [[ -n "$_xmx" ]] && info " -Xmx${_xmx}"
  fi
fi
if [[ -f "${jvm_opts_file}.rpmnew" ]]; then
  warn "Found ${BOLD}${jvm_opts_file}.rpmnew${NC} — new JVM defaults from ${TARGET_VERSION}."
  info " Review with: diff ${jvm_opts_file} ${jvm_opts_file}.rpmnew"
fi
if $heap_found; then
  echo ""
  if [[ -n "$xms" && -n "$xmx" && "$xms" != "$xmx" ]]; then
    warn "Xms (${xms}) and Xmx (${xmx}) differ. Elastic recommends setting them equal."
  fi
  info "Verify these heap values are correct for this node before starting."
  confirm_or_abort "JVM heap settings look correct? Continue?"
else
  warn "No explicit -Xms/-Xmx found in jvm.options or jvm.options.d/"
  warn "Elasticsearch will use its built-in defaults (typically 50% of RAM, max 31g)."
  warn "The default may have changed between versions. Verify this is acceptable."
  confirm_or_abort "Continue with default heap settings?"
fi
# --- Step 5: Reload systemd ---
step "Step 5 - Reload systemd daemon"
# The RPM may have replaced the unit file; pick up the new definition.
systemctl daemon-reload
success "Systemd daemon reloaded."
# --- Step 6: Start Elasticsearch ---
step "Step 6 - Start Elasticsearch service"
info "Command: systemctl start elasticsearch"
echo ""
confirm_or_abort "Start Elasticsearch now?"
systemctl start elasticsearch
info "Elasticsearch starting... waiting for it to become available."
# Poll the root endpoint once per second until it answers HTTP 200 or the
# STARTUP_WAIT budget (in seconds) is exhausted.
retries=0
while [[ $retries -lt $STARTUP_WAIT ]]; do
  es_curl GET "/" >/dev/null 2>&1
  if [[ "$ES_CURL_HTTP_CODE" == "200" ]]; then
    break
  fi
  retries=$((retries + 1))
  # \r keeps the countdown on a single terminal line.
  echo -ne "\r Waiting for Elasticsearch... (${retries}/${STARTUP_WAIT}s)"
  sleep 1
done
echo ""
# If the poll loop above exhausted STARTUP_WAIT, scan the journal for the
# most common failure signatures and print targeted guidance; otherwise
# confirm the version the node is now running.
if [[ $retries -ge $STARTUP_WAIT ]]; then
  error "Elasticsearch did not start within ${STARTUP_WAIT} seconds."
  error "Check the logs: journalctl -u elasticsearch -f"
  # Best effort — the journal may be unavailable on this host.
  recent_logs=$(journalctl -u elasticsearch --no-pager -n 30 2>/dev/null) || recent_logs=""
  # Signature 1: bundled JDK needs a newer glibc than the host provides.
  if echo "$recent_logs" | grep -qi "GLIBC\|glibc\|libc\.so\|GLIBCXX"; then
    error ""
    error "=== GLIBC INCOMPATIBILITY DETECTED ==="
    error "The Elasticsearch JDK requires a newer glibc than this system provides."
    error "Options:"
    error " 1. Upgrade your OS (e.g. CentOS 7 -> RHEL 8/9, Rocky 8/9, etc.)"
    error " 2. Install a compatible system JDK and configure ES_JAVA_HOME"
    error " 3. Roll back: ${PKG_MGR} -y downgrade elasticsearch-${ES_CURRENT}"
  fi
  # Signature 2: settings removed or renamed in the target version.
  if echo "$recent_logs" | grep -qi "unknown setting\|unsupported setting\|IllegalArgument"; then
    error ""
    error "=== CONFIGURATION ERROR DETECTED ==="
    error "Elasticsearch may have failed due to removed/deprecated settings."
    error "Check elasticsearch.yml for settings that were removed in ${TARGET_VERSION}."
  fi
  # Signature 3: keystore format needs upgrading after the package swap.
  if echo "$recent_logs" | grep -qi "keystore\|KeyStoreException"; then
    error ""
    error "=== KEYSTORE ERROR DETECTED ==="
    error "Try running:"
    error " sudo -u ${ES_USER:-elasticsearch} /usr/share/elasticsearch/bin/elasticsearch-keystore upgrade"
  fi
  error ""
  error "To roll back: ${PKG_MGR} -y downgrade elasticsearch-${ES_CURRENT}"
  error "Config backup: ${BACKUP_DIR}"
  confirm_or_abort "Continue anyway (maybe it needs more time)?"
else
  # Node answered within the timeout — report the version it is running.
  es_curl GET "/"
  version_body="$ES_CURL_BODY"
  new_version=$(echo "$version_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('version',{}).get('number','unknown'))" 2>/dev/null) || new_version="unknown"
  success "Elasticsearch is running. Version: ${BOLD}${new_version}${NC}"
fi
# --- Step 7: Wait for node to rejoin ---
# In force mode (ES was not reachable before the upgrade) all
# cluster-level steps are skipped and the operator is told how to
# perform them manually.
if $FORCE_MODE; then
  step "Step 7 - Wait for node to rejoin cluster"
  warn "FORCE MODE: Skipping cluster rejoin check."
  warn "Verify manually that the node has rejoined: GET /_cat/nodes"
  # Disarm the exit trap: allocation must be handled manually from here on.
  _allocation_disabled=false
  step "Step 7a - Re-enable shard allocation"
  warn "FORCE MODE: Skipping. Re-enable manually if needed:"
  warn " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}"
  step "Step 7b - Wait for cluster recovery"
  warn "FORCE MODE: Skipping. Monitor recovery manually: GET /_cluster/health?pretty"
else
  step "Step 7 - Wait for node to rejoin cluster"
  info "Checking cluster membership..."
  es_curl GET "/_cluster/health?pretty"
  health_body="$ES_CURL_BODY"
  current_nodes=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('number_of_nodes',0))" 2>/dev/null) || current_nodes="?"
  # num_nodes was captured during pre-flight; ${num_nodes:-?} keeps this
  # safe under `set -u` if that step was skipped.
  info "Nodes in cluster: ${current_nodes} (was: ${num_nodes:-?})"
  success "Node has rejoined the cluster."
  # --- Step 7a: Re-enable shard allocation ---
  # Allocation was disabled before the restart; clear both scopes, then
  # read the settings back to confirm the override is really gone.
  step "Step 7a - Re-enable shard allocation"
  info "Clearing both persistent and transient allocation overrides."
  echo ""
  confirm_or_abort "Re-enable shard allocation now?"
  if es_curl_retry PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":null},"transient":{"cluster.routing.allocation.enable":null}}'; then
    # Verify it actually took effect
    info "Verifying allocation setting was cleared..."
    es_curl GET "/_cluster/settings?flat_settings=true"
    verify_body="$ES_CURL_BODY"
    if echo "$verify_body" | grep -q '"cluster\.routing\.allocation\.enable"' 2>/dev/null; then
      error "Allocation setting is STILL present after clearing!"
      error "Fix manually:"
      error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}"
      confirm_or_abort "Continue anyway?"
    else
      # Disarm the exit trap now that allocation is confirmed re-enabled.
      _allocation_disabled=false
      success "Verified: shard allocation overrides cleared."
    fi
  else
    error "Failed to re-enable shard allocation (HTTP $ES_CURL_HTTP_CODE)."
    error "You MUST manually re-enable it:"
    error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}"
  fi
  # --- Step 7b: Wait for cluster recovery ---
  step "Step 7b - Wait for cluster recovery"
  # Temporarily raise recovery concurrency so replicas catch up faster;
  # it is reset after the wait loop (tracked via _recovery_boosted).
  info "Temporarily increasing concurrent incoming recoveries to 10..."
  if es_curl_quiet PUT "/_cluster/settings" '{"transient":{"cluster.routing.allocation.node_concurrent_incoming_recoveries":10}}'; then
    _recovery_boosted=true
    success "Recovery concurrency boosted."
  else
    warn "Could not increase recovery concurrency. Continuing with defaults."
  fi
  info "Monitoring cluster health until green — timeout $((RECOVERY_WAIT / 60)) minutes (Ctrl+C to stop waiting)..."
  echo ""
  # Poll health every 5 s; `retries` counts elapsed seconds, not attempts.
  retries=0
  while [[ $retries -lt $RECOVERY_WAIT ]]; do
    es_curl GET "/_cluster/health"
    health_body="$ES_CURL_BODY"
    status=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('status','unknown'))" 2>/dev/null) || status="unknown"
    init=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('initializing_shards',0))" 2>/dev/null) || init="?"
    reloc=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('relocating_shards',0))" 2>/dev/null) || reloc="?"
    unass=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('unassigned_shards',0))" 2>/dev/null) || unass="?"
    recovery_info=""
    # Enrich the status line with active recovery progress when available.
    es_curl GET "/_cat/recovery?active_only=true&h=index,shard,stage,bytes_percent&format=json"
    recovery_body="$ES_CURL_BODY"
    if [[ "$ES_CURL_HTTP_CODE" == "200" ]]; then
      active_recoveries=$(echo "$recovery_body" | python3 -c "import json,sys; print(len(json.load(sys.stdin)))" 2>/dev/null) || active_recoveries=0
      if [[ "$active_recoveries" -gt 0 ]]; then
        avg_pct=$(echo "$recovery_body" | python3 -c "
import json, sys
data = json.load(sys.stdin)
pcts = [float(r.get('bytes_percent','0').rstrip('%')) for r in data if r.get('bytes_percent')]
print(f'{sum(pcts)/len(pcts):.1f}' if pcts else '0')
" 2>/dev/null) || avg_pct="?"
        recovery_info=" | Recoveries: ${active_recoveries} (avg ${avg_pct}%)"
      fi
    fi
    if [[ "$status" == "green" ]]; then
      echo ""
      success "Cluster is ${GREEN}${BOLD}GREEN${NC}!"
      break
    fi
    elapsed_min=$((retries / 60))
    elapsed_sec=$((retries % 60))
    printf "\r Status: %-6s | Init: %-3s | Reloc: %-3s | Unassigned: %-3s%s | %dm%02ds " \
      "$status" "$init" "$reloc" "$unass" "$recovery_info" "$elapsed_min" "$elapsed_sec"
    retries=$((retries + 5))
    sleep 5
  done
  # Always undo the recovery-concurrency boost, whether green or not.
  if $_recovery_boosted; then
    info "Resetting recovery concurrency to default..."
    if es_curl_quiet PUT "/_cluster/settings" '{"transient":{"cluster.routing.allocation.node_concurrent_incoming_recoveries":null}}'; then
      _recovery_boosted=false
      success "Recovery concurrency reset to default."
    else
      warn "Could not reset recovery concurrency. Reset manually:"
      warn " PUT /_cluster/settings {\"transient\":{\"cluster.routing.allocation.node_concurrent_incoming_recoveries\":null}}"
    fi
  fi
  # Not reaching green here is expected mid-rolling-upgrade; warn only.
  if [[ $retries -ge $RECOVERY_WAIT ]]; then
    echo ""
    warn "Cluster did not reach GREEN within $((RECOVERY_WAIT / 60)) minutes."
    warn "Current status: ${status:-unknown}"
    warn "This may be normal if you are doing a rolling upgrade across multiple nodes."
    info "Monitor with: GET /_cluster/health?pretty"
    info "Active recoveries: GET /_cat/recovery?active_only=true&v"
  fi
fi # end of: if ! $FORCE_MODE (steps 7-7b)
success "Elasticsearch upgrade complete on this node."
fi
# ==============================================================================
# Kibana Upgrade
# ==============================================================================
if $UPGRADE_KIBANA; then
header "Upgrading Kibana: ${KIBANA_CURRENT} -> ${TARGET_VERSION}"
# Kibana migrations require cluster.routing.allocation.enable to be "all" (or unset).
# Kibana saved-object migrations create/relocate indices, which fails if
# shard allocation was left disabled by an earlier (aborted) ES upgrade.
if ! $FORCE_MODE; then
  step "Verifying shard allocation is enabled"
  info "Kibana migrations will fail if cluster.routing.allocation.enable is not 'all'."
  es_curl GET "/_cluster/settings?flat_settings=true&include_defaults=true"
  alloc_body="$ES_CURL_BODY"
  alloc_code="$ES_CURL_HTTP_CODE"
  if [[ "$alloc_code" == "200" ]]; then
    # Read both the persistent and the transient override — either one
    # can block migrations.
    persistent_alloc=$(echo "$alloc_body" | python3 -c "
import json, sys
d = json.load(sys.stdin)
v = d.get('persistent', {}).get('cluster.routing.allocation.enable', '')
print(v)
" 2>/dev/null) || persistent_alloc=""
    transient_alloc=$(echo "$alloc_body" | python3 -c "
import json, sys
d = json.load(sys.stdin)
v = d.get('transient', {}).get('cluster.routing.allocation.enable', '')
print(v)
" 2>/dev/null) || transient_alloc=""
    alloc_blocked=false
    if [[ -n "$persistent_alloc" && "$persistent_alloc" != "all" ]]; then
      warn "Persistent cluster.routing.allocation.enable = '${persistent_alloc}' (must be 'all' or unset)"
      alloc_blocked=true
    fi
    if [[ -n "$transient_alloc" && "$transient_alloc" != "all" ]]; then
      warn "Transient cluster.routing.allocation.enable = '${transient_alloc}' (must be 'all' or unset)"
      alloc_blocked=true
    fi
    if $alloc_blocked; then
      warn ""
      warn "Kibana migrations WILL FAIL with these settings."
      info "Fixing: resetting allocation to default (all)..."
      # Setting the key to null removes the override so the default
      # ('all') applies again.
      if es_curl_retry PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":null},"transient":{"cluster.routing.allocation.enable":null}}'; then
        # Verify
        es_curl GET "/_cluster/settings?flat_settings=true"
        verify_body="$ES_CURL_BODY"
        if echo "$verify_body" | grep -q '"cluster\.routing\.allocation\.enable"' 2>/dev/null; then
          error "Allocation setting is STILL present after clearing!"
          error "Fix manually before starting Kibana."
          confirm_or_abort "Continue anyway? (Kibana will likely fail to start)"
        else
          success "Verified: allocation overrides cleared. Kibana migrations can proceed."
        fi
      else
        error "Failed to reset allocation (HTTP $ES_CURL_HTTP_CODE)."
        error "Fix manually before starting Kibana:"
        error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}"
        confirm_or_abort "Continue anyway? (Kibana will likely fail to start)"
      fi
    else
      success "Shard allocation is set to 'all'. Kibana migrations can proceed."
    fi
  else
    warn "Could not check allocation settings (HTTP $alloc_code). Kibana may fail if allocation is restricted."
  fi
  echo ""
fi
# Point at the RPM fetched during the pre-flight download phase.
info "Using pre-downloaded RPM: ${DOWNLOAD_DIR}/${KIBANA_RPM}"
echo ""
# --- K2: Stop Kibana ---
# Gracefully stop the running service before swapping the package; if the
# stop exceeds STOP_TIMEOUT, offer a SIGKILL fallback.
step "K2 - Stop Kibana service"
if systemctl is-active kibana &>/dev/null; then
  info "Command: systemctl stop kibana"
  echo ""
  confirm_or_abort "Stop Kibana now?"
  timeout "$STOP_TIMEOUT" systemctl stop kibana || {
    error "Kibana did not stop within ${STOP_TIMEOUT} seconds."
    confirm_or_abort "Force kill the Kibana process?"
    systemctl kill -s SIGKILL kibana
    sleep 2
  }
  success "Kibana stopped."
else
  info "Kibana service is not running. Skipping stop."
fi
# --- K3: Install RPM ---
# localinstall performs an in-place upgrade; RPM config protection keeps
# locally modified files under /etc/kibana/ (new defaults land as .rpmnew).
step "K3 - Install Kibana RPM"
info "Command: ${PKG_MGR} -y localinstall ${DOWNLOAD_DIR}/${KIBANA_RPM}"
info "Note: Config files in /etc/kibana/ will NOT be overwritten."
echo ""
confirm_or_abort "Install the Kibana RPM now?"
${PKG_MGR} -y localinstall "${DOWNLOAD_DIR}/${KIBANA_RPM}"
success "Kibana RPM installed."
# --- K4: Reload systemd ---
# Pick up the unit file shipped with the new RPM.
step "K4 - Reload systemd daemon"
systemctl daemon-reload
success "Systemd daemon reloaded."
# --- K5: Start Kibana ---
step "K5 - Start Kibana service"
info "Command: systemctl start kibana"
echo ""
confirm_or_abort "Start Kibana now?"
systemctl start kibana
info "Kibana starting... waiting for it to become ready."
info "Kibana runs migrations after an upgrade, which may take several minutes."
# Detect Kibana listen address from config
# Best-effort parse of kibana.yml with grep/sed/tr (a YAML parser is not
# guaranteed on the host). Defaults below mirror Kibana's own defaults.
kibana_config="/etc/kibana/kibana.yml"
kibana_host="localhost"
kibana_port="5601"
kibana_scheme="http"
if [[ -f "$kibana_config" ]]; then
  # Wildcard binds (0.0.0.0 / ::) are still reachable via localhost,
  # so the default host is kept in that case.
  cfg_host=$(grep -E '^\s*server\.host\s*:' "$kibana_config" 2>/dev/null | sed 's/^[^:]*:\s*//' | tr -d '"'"'" | xargs) || true
  if [[ -n "$cfg_host" && "$cfg_host" != "0.0.0.0" && "$cfg_host" != "::" ]]; then
    kibana_host="$cfg_host"
  fi
  cfg_port=$(grep -E '^\s*server\.port\s*:' "$kibana_config" 2>/dev/null | sed 's/^[^:]*:\s*//' | tr -d '"'"'" | xargs) || true
  if [[ -n "$cfg_port" ]]; then
    kibana_port="$cfg_port"
  fi
  cfg_ssl=$(grep -E '^\s*server\.ssl\.enabled\s*:' "$kibana_config" 2>/dev/null | sed 's/^[^:]*:\s*//' | tr -d '"'"'" | xargs) || true
  if [[ "$cfg_ssl" == "true" ]]; then
    kibana_scheme="https"
  fi
fi
KIBANA_URL="${kibana_scheme}://${kibana_host}:${kibana_port}"
info "Kibana URL: ${KIBANA_URL}"
# Build the status-poll command as an array so additions stay safely
# quoted; -k skips TLS verification (likely self-signed certs).
declare -a kibana_curl_cmd=(curl -s --max-time 10 -o /dev/null -w '%{http_code}')
if [[ "$kibana_scheme" == "https" ]]; then
  kibana_curl_cmd+=(-k)
fi
retries=0
last_code="000"
while [[ $retries -lt $KIBANA_STARTUP_WAIT ]]; do
last_code=$("${kibana_curl_cmd[@]}" "${KIBANA_URL}/api/status" 2>/dev/null) || last_code="000"
if [[ "$last_code" == "200" ]]; then
break
fi
retries=$((retries + 1))
if [[ "$last_code" == "503" ]]; then
echo -ne "\r Kibana is running migrations... (${retries}s, HTTP ${last_code}) "
elif [[ "$last_code" == "000" ]]; then
echo -ne "\r Waiting for Kibana to start listening... (${retries}s) "
else
echo -ne "\r Waiting for Kibana... (${retries}s, HTTP ${last_code}) "
fi
sleep 1
done
echo ""
# Report the final readiness state observed by the polling loop above.
case "$last_code" in
  200)
    success "Kibana is ready and accepting requests."
    ;;
  503)
    warn "Kibana is still running migrations after ${KIBANA_STARTUP_WAIT}s (HTTP 503)."
    warn "This is normal for large deployments. It should become available shortly."
    warn "Monitor with: curl -s ${KIBANA_URL}/api/status | python3 -m json.tool"
    ;;
  000)
    warn "Kibana did not start responding within ${KIBANA_STARTUP_WAIT}s."
    warn "Check logs: journalctl -u kibana -f"
    warn " tail -f /var/log/kibana/kibana.log"
    recent_logs=$(journalctl -u kibana --no-pager -n 30 2>/dev/null) || recent_logs=""
    if echo "$recent_logs" | grep -qi "ECONNREFUSED\|connect.*elasticsearch"; then
      warn ""
      warn "=== ELASTICSEARCH CONNECTION ISSUE ==="
      warn "Kibana cannot connect to Elasticsearch."
      warn "Verify Elasticsearch is running and reachable."
    fi
    ;;
  *)
    warn "Kibana returned unexpected status: HTTP ${last_code}"
    warn "Check logs: journalctl -u kibana -f"
    ;;
esac
KIBANA_NEW=$(get_rpm_version kibana)
success "Kibana upgrade complete. Version: ${BOLD}${KIBANA_NEW}${NC}"
fi
# ==============================================================================
# Post-upgrade: Restore version locks
# ==============================================================================
# Restore any dnf/yum versionlocks removed earlier so the upgraded packages
# are pinned again. Runs only when the versionlock plugin is available and
# at least one lock was actually removed during pre-flight.
# Fix: dropped the redundant "2>&1" after "&>/dev/null" (&> already
# redirects both stdout and stderr).
if command -v "${PKG_MGR}" &>/dev/null && ${PKG_MGR} versionlock list &>/dev/null; then
  if $versionlock_removed_es || $versionlock_removed_kibana; then
    header "Restoring Version Locks"
    # Re-add a version lock for one package and verify the lock is really
    # present afterwards ("versionlock add" can report success without
    # recording a lock).
    #   $1 - package name as known to the package manager
    #   $2 - human-readable display name for log messages
    restore_versionlock() {
      local pkg=$1
      local display=$2
      info "Re-adding version lock for ${display}"
      if ${PKG_MGR} versionlock add "${pkg}" 2>/dev/null; then
        if ${PKG_MGR} versionlock list 2>/dev/null | grep -q "${pkg}"; then
          success "${display} version lock restored."
        else
          warn "Versionlock add returned success but lock not found in list."
          warn "Add manually: ${PKG_MGR} versionlock add ${pkg}"
        fi
      else
        warn "Failed to restore ${display} version lock."
        warn "Add manually: ${PKG_MGR} versionlock add ${pkg}"
      fi
    }
    if $versionlock_removed_es && $UPGRADE_ES; then
      restore_versionlock elasticsearch "Elasticsearch"
    fi
    if $versionlock_removed_kibana && $UPGRADE_KIBANA; then
      restore_versionlock kibana "Kibana"
    fi
  fi
fi
# ==============================================================================
# Cleanup
# ==============================================================================
# Clear trap since we finished successfully
_allocation_disabled=false
_recovery_boosted=false
header "Upgrade Complete"
echo -e " ${GREEN}OK${NC} Upgrade finished successfully on this node."
echo ""
$UPGRADE_ES && echo -e " Elasticsearch: ${ES_CURRENT} -> ${BOLD}${TARGET_VERSION}${NC}"
$UPGRADE_KIBANA && echo -e " Kibana: ${KIBANA_CURRENT} -> ${BOLD}${TARGET_VERSION}${NC}"
echo ""
echo -e " Config backup: ${BOLD}${BACKUP_DIR}${NC}"
echo ""
if confirm "Clean up downloaded RPM files from ${DOWNLOAD_DIR}?" "y"; then
  # Fix: guard the recursive delete — ":?" aborts if DOWNLOAD_DIR is
  # empty/unset (preventing an accidental "rm -rf /"-style expansion),
  # and "--" protects against option-like paths.
  rm -rf -- "${DOWNLOAD_DIR:?DOWNLOAD_DIR is unset}"
  success "Cleaned up download directory."
else
  info "RPM files kept in ${DOWNLOAD_DIR}"
fi
echo ""
info "If this is a multi-node cluster, repeat this process on the next node."
info "Upgrade order: non-master-eligible nodes first, then master-eligible nodes."
echo ""
success "Done!"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment