Created
February 3, 2026 18:03
-
-
Save Oddly/621873ffa5f98c0b044f8c289212dea2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # | |
| # Elasticsearch & Kibana RPM Upgrade Script | |
| # ========================================== | |
| # This script upgrades Elasticsearch and/or Kibana on the local node | |
| # using RPMs downloaded from artifacts.elastic.co. | |
| # | |
| # It follows the official Elastic rolling upgrade procedure: | |
| # 1. Pre-flight checks (current versions, cluster health, disk space) | |
| # 2. Disable shard allocation | |
| # 3. Flush all indices (best effort) | |
| # 4. Stop the service | |
| # 5. Download & install the RPM | |
| # 6. Reload systemd and start the service | |
| # 7. Wait for the node to rejoin the cluster | |
| # 8. Re-enable shard allocation | |
| # 9. Wait for cluster to go green | |
| # | |
| # Usage: | |
| # ./upgrade-elastic.sh <target-version> [--es-only|--kibana-only] [--arch aarch64] | |
| # | |
| # Examples: | |
| # ./upgrade-elastic.sh 8.17.0 # Upgrade both ES and Kibana | |
| # ./upgrade-elastic.sh 8.17.0 --es-only # Upgrade Elasticsearch only | |
| # ./upgrade-elastic.sh 8.17.0 --kibana-only # Upgrade Kibana only | |
| # ./upgrade-elastic.sh 8.17.0 --arch aarch64 # Use aarch64 RPMs | |
| # ./upgrade-elastic.sh 8.17.0 --yes # Auto-accept non-critical prompts | |
| # ./upgrade-elastic.sh 8.17.0 --force # Skip ES-dependent checks (ES not running) | |
| # | |
| # Notes: | |
| # - Run this script on each node individually (rolling upgrade) | |
| # - Upgrade non-master-eligible nodes first, then master-eligible nodes | |
| # - Make sure the cluster is GREEN before starting | |
| # - This script must be run as root or with sudo | |
| # | |
set -euo pipefail
# ==============================================================================
# Configuration & Defaults
# ==============================================================================
ARCH="x86_64"              # RPM architecture: x86_64 or aarch64 (--arch)
UPGRADE_ES=true            # upgrade Elasticsearch unless --kibana-only given
UPGRADE_KIBANA=true        # upgrade Kibana unless --es-only given
TARGET_VERSION=""          # required positional argument (MAJOR.MINOR.PATCH)
ES_URL="https://localhost:9200"   # overridable with --es-url
DOWNLOAD_DIR="/tmp/elastic-upgrade"
AUTO_YES=false             # --yes: auto-accept non-critical prompts
FORCE_MODE=false           # --force: skip all checks that need a running ES
ES_CURRENT=""              # installed ES version, filled in during pre-flight
KIBANA_CURRENT=""          # installed Kibana version, filled in during pre-flight
cluster_name=""            # filled in from /_cluster/health
# Curl auth/TLS options -- populated during connection detection
# Stored as an array to avoid eval and shell injection
declare -a ES_CURL_AUTH=()
ES_CURL_INSECURE=false     # true -> pass -k to curl (self-signed certs)
# Lock file to prevent concurrent runs
LOCK_FILE="/var/run/elastic-upgrade.lock"
LOCK_FD=9
# Timeouts (seconds)
STOP_TIMEOUT=120           # waiting for a systemd service to stop
API_TIMEOUT=60             # per Elasticsearch API call
STARTUP_WAIT=120           # waiting for ES to come back up
KIBANA_STARTUP_WAIT=300    # waiting for Kibana to come back up
RECOVERY_WAIT=1200         # waiting for the cluster to return to green
API_RETRIES=3              # attempts for critical API calls (es_curl_retry)
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NC='\033[0m' # No Color
# Track state for cleanup on unexpected exit (read by cleanup_on_exit)
_allocation_disabled=false
_recovery_boosted=false
| # ============================================================================== | |
| # Helper Functions | |
| # ============================================================================== | |
# ------------------------------------------------------------------------------
# Logging helpers.
# info/success report progress on stdout; warn/error are diagnostics and are
# sent to stderr so they stay visible when stdout is captured or redirected.
# ------------------------------------------------------------------------------
info()    { echo -e "${BLUE}[INFO]${NC} $*"; }
success() { echo -e "${GREEN}[OK]${NC} $*"; }
warn()    { echo -e "${YELLOW}[WARN]${NC} $*" >&2; }
error()   { echo -e "${RED}[ERROR]${NC} $*" >&2; }
# Bold banner around a section title.
header() {
  echo -e "\n${BOLD}═══════════════════════════════════════════════════════════${NC}"
  echo -e "${BOLD} $*${NC}"
  echo -e "${BOLD}═══════════════════════════════════════════════════════════${NC}\n"
}
# Highlighted step marker within a section.
step() { echo -e "\n${YELLOW}▶ STEP: $*${NC}\n"; }
| # Cleanup handler for unexpected exits | |
| cleanup_on_exit() { | |
| local exit_code=$? | |
| # Re-enable shard allocation if we disabled it and didn't re-enable | |
| if $_allocation_disabled; then | |
| echo "" | |
| warn "Script interrupted! Attempting to re-enable shard allocation..." | |
| if es_curl_quiet PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":null},"transient":{"cluster.routing.allocation.enable":null}}'; then | |
| success "Shard allocation re-enabled." | |
| else | |
| error "FAILED to re-enable shard allocation! Run manually:" | |
| error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}" | |
| fi | |
| fi | |
| # Reset recovery concurrency if we boosted it | |
| if $_recovery_boosted; then | |
| es_curl_quiet PUT "/_cluster/settings" '{"transient":{"cluster.routing.allocation.node_concurrent_incoming_recoveries":null}}' || true | |
| fi | |
| # Release lock | |
| release_lock | |
| exit "$exit_code" | |
| } | |
| trap cleanup_on_exit EXIT INT TERM | |
| # Acquire exclusive lock to prevent concurrent runs | |
# Acquire an exclusive lock to prevent concurrent runs.
# Globals: LOCK_FILE (read), LOCK_FD (written: fd number allocated by bash).
# Exits 1 if another instance already holds the lock.
acquire_lock() {
  # Use bash's {varname} fd allocation instead of the previous
  # eval-based redirection -- the script's stated policy is to avoid eval,
  # and this form also keeps $LOCK_FILE safely quoted.
  exec {LOCK_FD}>"$LOCK_FILE"
  if ! flock -n "$LOCK_FD"; then
    error "Another instance of this script is already running (lock: ${LOCK_FILE})."
    error "If you are sure no other instance is running, remove the lock file:"
    error " rm -f ${LOCK_FILE}"
    exit 1
  fi
  # Write PID to lock file for debugging
  echo $$ >&"$LOCK_FD"
}
# Release the lock and remove the lock file (best effort, never fails).
release_lock() {
  flock -u "$LOCK_FD" 2>/dev/null || true
  rm -f "$LOCK_FILE" 2>/dev/null || true
}
# Ask a yes/no question. Returns 0 for yes, 1 for no.
#   $1 - message to display
#   $2 - default answer when the user just presses Enter ("y"/"n", default "n")
# In --yes mode the prompt is skipped and auto-accepted.
confirm() {
  local message="$1"
  local default="${2:-n}"
  local prompt answer   # were leaking into the global namespace before
  if $AUTO_YES; then
    info "(auto-yes) $message -> yes"
    return 0
  fi
  # Capitalize the default choice in the prompt hint
  if [[ "$default" == "y" ]]; then
    prompt="[Y/n]"
  else
    prompt="[y/N]"
  fi
  # Re-prompt until a recognizable answer is given
  while true; do
    echo -en "${BOLD}$message ${prompt}: ${NC}"
    read -r answer
    answer="${answer:-$default}"
    case "${answer,,}" in
      y|yes) return 0 ;;
      n|no) return 1 ;;
      *) echo "Please answer y or n." ;;
    esac
  done
}
# Ask for confirmation (defaulting to yes) and terminate the script with
# exit 0 if the user declines.
confirm_or_abort() {
  if confirm "$1" "${2:-y}"; then
    return 0
  fi
  warn "Aborted by user."
  exit 0
}
| # Critical confirmation: in --yes mode, abort with exit 1 instead of auto-accepting | |
# Critical confirmation: in --yes mode, abort with exit 1 instead of
# auto-accepting (critical issues must never be silently waved through).
confirm_critical() {
  local message="$1"
  if ! $AUTO_YES; then
    confirm_or_abort "$message"
    return
  fi
  error "(auto-yes) CRITICAL: $message"
  error "Cannot auto-accept critical issues. Resolve the problem and re-run."
  exit 1
}
| # Execute curl against the Elasticsearch API. | |
| # Uses arrays instead of eval to avoid shell injection. | |
| # Usage: es_curl <METHOD> <PATH> [JSON_DATA] | |
| # Sets globals: ES_CURL_HTTP_CODE, ES_CURL_BODY | |
| # | |
| # IMPORTANT: Do NOT use body=$(es_curl ...) — command substitution runs in a | |
| # subshell, so the global variables would not propagate back. Instead call | |
| # es_curl directly and read ES_CURL_BODY / ES_CURL_HTTP_CODE afterwards. | |
# Execute curl against the Elasticsearch API.
# Uses arrays instead of eval to avoid shell injection.
# Usage: es_curl <METHOD> <PATH> [JSON_DATA]
# Sets globals: ES_CURL_HTTP_CODE (status from curl's -w output, last line)
# and ES_CURL_BODY (everything before it).
#
# IMPORTANT: never call this through command substitution -- the globals would
# be set inside a subshell and lost. Call it directly, then read the globals.
ES_CURL_HTTP_CODE=""
ES_CURL_BODY=""
es_curl() {
  local method="${1:-GET}" path="${2:-/}" data="${3:-}"
  # Normalize: exactly one slash between base URL and path
  local url="${ES_URL%/}/${path#/}"
  local -a curl_args=(-s -w '\n%{http_code}' --max-time "$API_TIMEOUT")
  # TLS option (self-signed certs)
  $ES_CURL_INSECURE && curl_args+=(-k)
  # Auth options (array-safe, no shell expansion)
  [[ ${#ES_CURL_AUTH[@]} -gt 0 ]] && curl_args+=("${ES_CURL_AUTH[@]}")
  curl_args+=(-X "$method")
  [[ -n "$data" ]] && curl_args+=(-H 'Content-Type: application/json' -d "$data")
  curl_args+=("$url")
  local raw
  raw=$(curl "${curl_args[@]}" 2>/dev/null) || true
  # The status code is the final line appended by -w; the body is the rest.
  ES_CURL_HTTP_CODE=$(echo "$raw" | tail -1)
  ES_CURL_BODY=$(echo "$raw" | sed '$d')
}
# Quiet variant: succeed (status 0) only when the API answered HTTP 200.
# Produces no output of its own.
es_curl_quiet() {
  es_curl "$@"
  [[ "$ES_CURL_HTTP_CODE" == "200" ]] && return 0
  return 1
}
| # Retry wrapper for critical API calls | |
| es_curl_retry() { | |
| local retries="$API_RETRIES" | |
| local attempt=1 | |
| while [[ $attempt -le $retries ]]; do | |
| es_curl "$@" | |
| if [[ "$ES_CURL_HTTP_CODE" == "200" ]]; then | |
| return 0 | |
| fi | |
| if [[ $attempt -lt $retries ]]; then | |
| warn "API call failed (HTTP ${ES_CURL_HTTP_CODE}), retrying (${attempt}/${retries})..." | |
| sleep 2 | |
| fi | |
| attempt=$((attempt + 1)) | |
| done | |
| return 1 | |
| } | |
| # Test ES connectivity. Returns 0 if reachable, 1 otherwise. | |
| test_es_connection() { | |
| local -a cmd=(curl -s -o /dev/null -w '%{http_code}' --max-time 5) | |
| if $ES_CURL_INSECURE; then | |
| cmd+=(-k) | |
| fi | |
| if [[ ${#ES_CURL_AUTH[@]} -gt 0 ]]; then | |
| cmd+=("${ES_CURL_AUTH[@]}") | |
| fi | |
| cmd+=("${ES_URL}/") | |
| local code | |
| code=$("${cmd[@]}" 2>/dev/null) || code="000" | |
| echo "$code" | |
| } | |
| # Check if a systemd service exists | |
| service_exists() { | |
| systemctl list-unit-files "$1.service" &>/dev/null | |
| } | |
| # Get installed RPM version | |
| get_rpm_version() { | |
| local ver | |
| if ver=$(rpm -q --queryformat '%{VERSION}' "$1" 2>/dev/null) && [[ "$ver" != *"not installed"* ]]; then | |
| echo "$ver" | |
| else | |
| echo "not installed" | |
| fi | |
| } | |
| # Download a file with progress, using curl (no wget dependency) | |
| download_file() { | |
| local url="$1" | |
| local dest="$2" | |
| local description="${3:-file}" | |
| info "Downloading ${description}..." | |
| if ! curl --fail --location --retry 3 --retry-delay 5 \ | |
| --connect-timeout 15 --max-time 600 \ | |
| --progress-bar -o "$dest" "$url"; then | |
| error "Failed to download ${description}." | |
| rm -f "$dest" | |
| return 1 | |
| fi | |
| return 0 | |
| } | |
| # ============================================================================== | |
| # Argument Parsing | |
| # ============================================================================== | |
# Print usage information to stdout and exit with status 1.
usage() {
  cat <<EOF
Usage: $0 <target-version> [OPTIONS]

Options:
  --es-only        Only upgrade Elasticsearch
  --kibana-only    Only upgrade Kibana
  --arch ARCH      Architecture: x86_64 (default) or aarch64
  --es-url URL     Elasticsearch URL (default: https://localhost:9200)
  --yes            Auto-accept all non-critical prompts; exit 1 on critical issues
  --force          Skip pre-flight checks that require a running Elasticsearch
                   (cluster health, node analysis, version mix, deprecation API,
                   shard allocation, flush, rejoin wait, recovery wait)
  -h, --help       Show this help

Flags can be combined. --force implies --yes for skipped steps.
EOF
  exit 1
}
# ---- Parse command-line arguments (see usage() for the option reference) ----
while [[ $# -gt 0 ]]; do
  case "$1" in
    --es-only) UPGRADE_KIBANA=false; shift ;;
    --kibana-only) UPGRADE_ES=false; shift ;;
    --arch) ARCH="$2"; shift 2 ;;
    --es-url) ES_URL="$2"; shift 2 ;;
    --yes|-y) AUTO_YES=true; shift ;;
    # --force also turns on auto-yes for the steps that still run
    --force|-f) FORCE_MODE=true; AUTO_YES=true; shift ;;
    -h|--help) usage ;;
    -*) error "Unknown option: $1"; usage ;;
    *)
      # The first bare argument is the target version; reject any extras
      if [[ -z "$TARGET_VERSION" ]]; then
        TARGET_VERSION="$1"
      else
        error "Unexpected argument: $1"
        usage
      fi
      shift
      ;;
  esac
done
if [[ -z "$TARGET_VERSION" ]]; then
  error "Target version is required."
  usage
fi
# Validate version format (strict MAJOR.MINOR.PATCH, digits only)
if ! [[ "$TARGET_VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
  error "Invalid version format: ${TARGET_VERSION}"
  error "Expected format: MAJOR.MINOR.PATCH (e.g., 8.17.0)"
  exit 1
fi
# Show active mode banners (force takes precedence over plain auto-yes)
if $FORCE_MODE; then
  warn "╔══════════════════════════════════════════════════════════╗"
  warn "║ FORCE MODE — skipping checks that require running ES ║"
  warn "║ Shard allocation, flush, rejoin, and recovery steps ║"
  warn "║ will be skipped. You must manage these manually. ║"
  warn "╚══════════════════════════════════════════════════════════╝"
  echo ""
elif $AUTO_YES; then
  info "╔══════════════════════════════════════════════════════════╗"
  info "║ AUTO-YES MODE — non-critical prompts will be accepted ║"
  info "║ Critical issues will cause the script to exit 1. ║"
  info "╚══════════════════════════════════════════════════════════╝"
  echo ""
fi
# ==============================================================================
# Pre-flight: Root check
# ==============================================================================
# RPM installation and systemd control require root privileges.
if [[ $EUID -ne 0 ]]; then
  error "This script must be run as root (or with sudo)."
  exit 1
fi
# ==============================================================================
# Pre-flight: Acquire lock
# ==============================================================================
# Prevents two upgrade runs from racing on the same node.
acquire_lock
# ==============================================================================
# Pre-flight: OS compatibility check
# ==============================================================================
# The script installs RPMs, so the host must be RHEL-compatible.
is_rhel_compatible=false
if [[ -f /etc/redhat-release ]]; then
  is_rhel_compatible=true
elif [[ -f /etc/os-release ]]; then
  # NOTE: sourcing os-release imports its variables (ID, ID_LIKE,
  # PRETTY_NAME, ...) into this shell's namespace.
  source /etc/os-release
  case "${ID:-}" in
    rhel|centos|fedora|rocky|alma|ol|scientific|amzn)
      is_rhel_compatible=true
      ;;
  esac
  # Derivatives advertise their ancestry via ID_LIKE (e.g. "rhel fedora")
  if [[ "${ID_LIKE:-}" == *"rhel"* ]] || [[ "${ID_LIKE:-}" == *"fedora"* ]]; then
    is_rhel_compatible=true
  fi
fi
if ! $is_rhel_compatible; then
  error "This script is designed for RHEL-compatible systems (RHEL, CentOS, Rocky, Alma, Fedora, etc.)"
  error "Detected OS does not appear to be RHEL-compatible."
  if [[ -f /etc/os-release ]]; then
    source /etc/os-release
    error " Detected: ${PRETTY_NAME:-$ID}"
  fi
  error ""
  error "For Debian/Ubuntu systems, use .deb packages instead of RPMs."
  error "For other systems, consider using the tarball distribution."
  exit 1
fi
if [[ -f /etc/os-release ]]; then
  source /etc/os-release
  info "Detected OS: ${PRETTY_NAME:-$ID}"
fi
# ==============================================================================
# Pre-flight: Required commands check
# ==============================================================================
header "Checking Required Commands"
missing_cmds=()
for cmd in curl rpm systemctl sha512sum df grep sed awk python3 diff journalctl ldd; do
  if command -v "$cmd" &>/dev/null; then
    success "Found: $(command -v "$cmd")"
  else
    error "Missing: ${BOLD}${cmd}${NC}"
    missing_cmds+=("$cmd")
  fi
done
# Detect package manager: prefer dnf, fall back to yum
if command -v dnf &>/dev/null; then
  PKG_MGR="dnf"
  success "Found: $(command -v dnf) (package manager)"
elif command -v yum &>/dev/null; then
  PKG_MGR="yum"
  success "Found: $(command -v yum) (package manager)"
else
  error "Missing: ${BOLD}yum/dnf${NC} — no package manager found"
  missing_cmds+=("yum/dnf")
fi
if [[ ${#missing_cmds[@]} -gt 0 ]]; then
  echo ""
  error "The following required commands are missing: ${missing_cmds[*]}"
  error "Install them before running this script."
  # Install hints for commands that live in non-obviously-named packages
  for cmd in "${missing_cmds[@]}"; do
    case "$cmd" in
      python3) error " -> ${PKG_MGR:-yum} install python3" ;;
      sha512sum) error " -> Part of coreutils: ${PKG_MGR:-yum} install coreutils" ;;
      journalctl) error " -> Part of systemd: ${PKG_MGR:-yum} install systemd" ;;
      ldd) error " -> Part of glibc: ${PKG_MGR:-yum} install glibc-common" ;;
      diff) error " -> Part of diffutils: ${PKG_MGR:-yum} install diffutils" ;;
    esac
  done
  exit 1
fi
# ==============================================================================
# Pre-flight: Verify target version is available for download
# ==============================================================================
header "Verifying Target Version Availability"
info "Checking if version ${BOLD}${TARGET_VERSION}${NC} is available on artifacts.elastic.co..."
echo ""
version_available=true
if $UPGRADE_ES; then
  ES_RPM_CHECK_URL="https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${TARGET_VERSION}-${ARCH}.rpm"
  info "Checking Elasticsearch RPM: ${ES_RPM_CHECK_URL}"
  # HEAD request: only the status code is needed, not the payload
  es_http_code=$(curl -s -o /dev/null -w '%{http_code}' --head --max-time 15 "${ES_RPM_CHECK_URL}" 2>/dev/null) || es_http_code="000"
  if [[ "$es_http_code" == "200" ]]; then
    # Best effort: report the download size from the Content-Length header
    es_size=$(curl -sI --max-time 15 "${ES_RPM_CHECK_URL}" 2>/dev/null | grep -i 'Content-Length' | awk '{print $2}' | tr -d '\r') || es_size=""
    if [[ -n "$es_size" && "$es_size" -gt 0 ]] 2>/dev/null; then
      es_size_mb=$((es_size / 1024 / 1024))
      success "Elasticsearch ${TARGET_VERSION} (${ARCH}) is available (${es_size_mb} MB)"
    else
      success "Elasticsearch ${TARGET_VERSION} (${ARCH}) is available"
    fi
  elif [[ "$es_http_code" == "404" ]]; then
    error "Elasticsearch ${TARGET_VERSION} (${ARCH}) was ${BOLD}NOT FOUND${NC}${RED} (HTTP 404)${NC}"
    error "URL: ${ES_RPM_CHECK_URL}"
    error "Verify the version number and architecture are correct."
    error "Browse available versions at: https://www.elastic.co/downloads/past-releases"
    version_available=false
  else
    # Any other code (timeout, proxy, 5xx) is treated as inconclusive
    warn "Could not verify Elasticsearch RPM availability (HTTP ${es_http_code})."
    warn "URL: ${ES_RPM_CHECK_URL}"
    warn "This may be a network issue. The download will be attempted later."
  fi
fi
if $UPGRADE_KIBANA; then
  # Same availability probe as above, for the Kibana RPM
  KIBANA_RPM_CHECK_URL="https://artifacts.elastic.co/downloads/kibana/kibana-${TARGET_VERSION}-${ARCH}.rpm"
  info "Checking Kibana RPM: ${KIBANA_RPM_CHECK_URL}"
  kibana_http_code=$(curl -s -o /dev/null -w '%{http_code}' --head --max-time 15 "${KIBANA_RPM_CHECK_URL}" 2>/dev/null) || kibana_http_code="000"
  if [[ "$kibana_http_code" == "200" ]]; then
    kibana_size=$(curl -sI --max-time 15 "${KIBANA_RPM_CHECK_URL}" 2>/dev/null | grep -i 'Content-Length' | awk '{print $2}' | tr -d '\r') || kibana_size=""
    if [[ -n "$kibana_size" && "$kibana_size" -gt 0 ]] 2>/dev/null; then
      kibana_size_mb=$((kibana_size / 1024 / 1024))
      success "Kibana ${TARGET_VERSION} (${ARCH}) is available (${kibana_size_mb} MB)"
    else
      success "Kibana ${TARGET_VERSION} (${ARCH}) is available"
    fi
  elif [[ "$kibana_http_code" == "404" ]]; then
    error "Kibana ${TARGET_VERSION} (${ARCH}) was ${BOLD}NOT FOUND${NC}${RED} (HTTP 404)${NC}"
    error "URL: ${KIBANA_RPM_CHECK_URL}"
    error "Verify the version number and architecture are correct."
    error "Browse available versions at: https://www.elastic.co/downloads/past-releases"
    version_available=false
  else
    warn "Could not verify Kibana RPM availability (HTTP ${kibana_http_code})."
    warn "URL: ${KIBANA_RPM_CHECK_URL}"
    warn "This may be a network issue. The download will be attempted later."
  fi
fi
if ! $version_available; then
  echo ""
  error "One or more RPMs are not available for version ${TARGET_VERSION}."
  confirm_or_abort "Continue anyway (downloads will fail later)?"
fi
# ==============================================================================
# Detect Elasticsearch connection settings
# This runs for BOTH --es-only and --kibana-only because the Kibana upgrade
# section needs ES API access to check shard allocation.
# Only skipped in --force mode where all API calls are bypassed.
# ==============================================================================
if $FORCE_MODE; then
  header "Detecting Elasticsearch Connection"
  warn "FORCE MODE: Skipping Elasticsearch connection detection."
  warn "Cluster health, node analysis, deprecation checks, shard management"
  warn "and recovery steps will all be skipped."
else
  header "Detecting Elasticsearch Connection"
  info "Testing connection to ${ES_URL}..."
  # Try plain connection first
  response=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "${ES_URL}/" 2>/dev/null) || response="000"
  if [[ "$response" == "200" ]]; then
    success "Connected to Elasticsearch (no auth required)"
  elif [[ "$response" == "401" ]]; then
    # Security is enabled: prompt for credentials interactively
    info "Elasticsearch requires authentication."
    echo -en "${BOLD}Enter Elasticsearch username [elastic]: ${NC}"
    read -r es_user
    es_user="${es_user:-elastic}"
    echo -en "${BOLD}Enter Elasticsearch password: ${NC}"
    read -rs es_pass   # -s: do not echo the password to the terminal
    echo ""
    # Store credentials in array -- never passed through eval/shell expansion
    ES_CURL_AUTH=(-u "${es_user}:${es_pass}")
    ES_CURL_INSECURE=true
    response=$(test_es_connection)
    if [[ "$response" == "200" ]]; then
      success "Authenticated successfully."
    else
      error "Authentication failed (HTTP $response). Please check credentials."
      exit 1
    fi
    # Clear credential variables from memory (array persists for curl calls)
    unset es_pass
  elif [[ "$response" == "000" ]]; then
    # No HTTP response at all -- retry over https, accepting self-signed certs
    ES_URL="${ES_URL/http:/https:}"
    ES_CURL_INSECURE=true
    response=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 -k "${ES_URL}/" 2>/dev/null) || response="000"
    if [[ "$response" == "200" ]]; then
      success "Connected to Elasticsearch over HTTPS (self-signed cert)"
    elif [[ "$response" == "401" ]]; then
      # Same credential prompt as above, but for the HTTPS endpoint
      info "Elasticsearch requires authentication (HTTPS)."
      echo -en "${BOLD}Enter Elasticsearch username [elastic]: ${NC}"
      read -r es_user
      es_user="${es_user:-elastic}"
      echo -en "${BOLD}Enter Elasticsearch password: ${NC}"
      read -rs es_pass
      echo ""
      ES_CURL_AUTH=(-u "${es_user}:${es_pass}")
      response=$(test_es_connection)
      if [[ "$response" == "200" ]]; then
        success "Authenticated successfully over HTTPS."
      else
        error "Authentication failed (HTTP $response)."
        exit 1
      fi
      unset es_pass
    else
      error "Cannot connect to Elasticsearch at ${ES_URL} (HTTP $response)."
      error "Make sure Elasticsearch is running and accessible."
      error "You can specify a custom URL with: --es-url http://hostname:9200"
      exit 1
    fi
  else
    error "Unexpected response from Elasticsearch (HTTP $response)."
    exit 1
  fi
fi # end of: if ! $FORCE_MODE (ES connection)
# ==============================================================================
# STEP 0: Pre-flight Checks
# ==============================================================================
header "Pre-flight Checks"
# --- Current versions ---
step "Checking currently installed versions"
if $UPGRADE_ES; then
  ES_CURRENT=$(get_rpm_version elasticsearch)
  # ES must already be installed; this script upgrades, it does not install
  if [[ "$ES_CURRENT" == "not installed" ]]; then
    error "Elasticsearch RPM is not installed on this system."
    exit 1
  fi
  info "Elasticsearch installed version: ${BOLD}${ES_CURRENT}${NC}"
fi
if $UPGRADE_KIBANA; then
  KIBANA_CURRENT=$(get_rpm_version kibana)
  # A missing Kibana is not fatal -- just drop it from this run
  if [[ "$KIBANA_CURRENT" == "not installed" ]]; then
    warn "Kibana RPM is not installed on this system. Skipping Kibana upgrade."
    UPGRADE_KIBANA=false
  else
    info "Kibana installed version: ${BOLD}${KIBANA_CURRENT}${NC}"
  fi
fi
info "Target version: ${BOLD}${TARGET_VERSION}${NC}"
echo ""
# Already at the target version? Offer to skip each component individually.
if $UPGRADE_ES && [[ "$ES_CURRENT" == "$TARGET_VERSION" ]]; then
  warn "Elasticsearch is already at version ${TARGET_VERSION}."
  if ! confirm "Continue anyway?" "n"; then
    UPGRADE_ES=false
  fi
fi
if $UPGRADE_KIBANA && [[ "$KIBANA_CURRENT" == "$TARGET_VERSION" ]]; then
  warn "Kibana is already at version ${TARGET_VERSION}."
  if ! confirm "Continue anyway?" "n"; then
    UPGRADE_KIBANA=false
  fi
fi
if ! $UPGRADE_ES && ! $UPGRADE_KIBANA; then
  info "Nothing to upgrade."
  exit 0
fi
# --- Version jump analysis ---
# Classifies the requested jump (downgrade / patch / minor / next-major /
# skipped-major) and enforces Elastic's documented stepping-stone rules.
if $UPGRADE_ES; then
  step "Analyzing version upgrade path"
  upgrade_path_ok=true
  # Split MAJOR.MINOR.PATCH into separate fields for both versions
  read -r cur_major cur_minor cur_patch <<< "$(echo "$ES_CURRENT" | awk -F. '{print $1, $2, $3}')"
  read -r tgt_major tgt_minor tgt_patch <<< "$(echo "$TARGET_VERSION" | awk -F. '{print $1, $2, $3}')"
  if [[ -z "$cur_major" || -z "$tgt_major" ]]; then
    warn "Could not parse version numbers. Skipping upgrade path analysis."
  else
    info "Current: ${BOLD}${ES_CURRENT}${NC} (major=${cur_major}, minor=${cur_minor}, patch=${cur_patch})"
    info "Target: ${BOLD}${TARGET_VERSION}${NC} (major=${tgt_major}, minor=${tgt_minor}, patch=${tgt_patch})"
    echo ""
    # Case 1: target older than current -> downgrade (unsupported by ES)
    if [[ "$tgt_major" -lt "$cur_major" ]] || \
       { [[ "$tgt_major" -eq "$cur_major" && "$tgt_minor" -lt "$cur_minor" ]]; } || \
       { [[ "$tgt_major" -eq "$cur_major" && "$tgt_minor" -eq "$cur_minor" && "$tgt_patch" -lt "$cur_patch" ]]; }; then
      warn "Target version ${TARGET_VERSION} is OLDER than current ${ES_CURRENT}."
      warn "Elasticsearch does not support downgrades. Nodes cannot be rolled back"
      warn "once upgraded. This will install an older RPM but may cause problems."
      confirm_or_abort "This looks like a downgrade. Are you sure?"
    # Case 2: same major -> patch or minor upgrade
    elif [[ "$tgt_major" -eq "$cur_major" ]]; then
      if [[ "$tgt_minor" -eq "$cur_minor" ]]; then
        success "Patch upgrade (${ES_CURRENT} -> ${TARGET_VERSION}). No special requirements."
      else
        success "Minor upgrade (${ES_CURRENT} -> ${TARGET_VERSION}). Rolling upgrade supported."
      fi
    # Case 3: next major -> allowed only from a specific "gateway" minor
    elif [[ "$tgt_major" -eq $((cur_major + 1)) ]]; then
      warn "This is a ${BOLD}MAJOR VERSION${NC}${YELLOW} upgrade (${cur_major}.x -> ${tgt_major}.x).${NC}"
      echo ""
      # Map: last minor of each major that is a valid stepping stone upward
      declare -A gateway_minor
      gateway_minor[6]=8
      gateway_minor[7]=17
      gateway_minor[8]=19
      required_minor="${gateway_minor[$cur_major]:-}"
      if [[ -n "$required_minor" ]]; then
        if [[ "$cur_major" -eq 8 && "$cur_minor" -eq 18 && "$tgt_major" -eq 9 && "$tgt_minor" -eq 0 ]]; then
          success "On 8.18.x targeting 9.0.x — this specific path is supported by Elastic."
        elif [[ "$cur_minor" -lt "$required_minor" ]]; then
          error "Major upgrade from ${cur_major}.x to ${tgt_major}.x requires being on ${cur_major}.${required_minor}.x first."
          error ""
          error "Current version ${ES_CURRENT} is below the required stepping stone."
          error ""
          error "Required upgrade path:"
          error " 1. First upgrade: ${ES_CURRENT} -> ${cur_major}.${required_minor}.x (minor upgrade)"
          error " 2. Then upgrade: ${cur_major}.${required_minor}.x -> ${TARGET_VERSION} (major upgrade)"
          error ""
          error "Skipping the stepping-stone version may cause data loss or failed startup."
          upgrade_path_ok=false
          confirm_critical "Override and attempt direct major upgrade anyway? THIS IS DANGEROUS."
        elif [[ "$cur_minor" -eq "$required_minor" ]]; then
          success "On required gateway version ${cur_major}.${required_minor}.x. Major upgrade path is valid."
        else
          success "On ${ES_CURRENT}, which is above the required ${cur_major}.${required_minor}.x gateway."
        fi
        # Special case: 8.18.x may only jump directly to 9.0.x, not 9.1+
        if [[ "$cur_major" -eq 8 && "$cur_minor" -eq 18 && "$tgt_minor" -gt 0 ]]; then
          warn "Version 8.18.x can only upgrade to 9.0.x directly."
          warn "For 9.1.x or later, you must first upgrade to 8.19.x."
          error ""
          error "Required upgrade path:"
          error " 1. First upgrade: ${ES_CURRENT} -> 8.19.x"
          error " 2. Then upgrade: 8.19.x -> ${TARGET_VERSION}"
          upgrade_path_ok=false
          confirm_critical "Override and attempt direct upgrade anyway? THIS IS DANGEROUS."
        fi
      else
        warn "No gateway version defined for ${cur_major}.x -> ${tgt_major}.x in this script."
        warn "Check the Elastic upgrade documentation for the correct upgrade path."
      fi
      echo ""
      warn "Major upgrade checklist:"
      warn " - Run the Upgrade Assistant in Kibana before upgrading"
      warn " - Resolve ALL critical deprecation issues (checked later in this script)"
      warn " - Ensure no indices created before ${cur_major}.0 exist (reindex or delete them)"
      warn " - Review breaking changes: https://www.elastic.co/guide/en/elasticsearch/reference/${tgt_major}.x/breaking-changes.html"
      warn " - Take a full snapshot backup before proceeding"
      echo ""
      if $upgrade_path_ok; then
        confirm_or_abort "Acknowledge major upgrade requirements and continue?"
      fi
    # Case 4: skipping one or more majors -> not supported at all
    elif [[ "$tgt_major" -gt $((cur_major + 1)) ]]; then
      error "Upgrading from ${cur_major}.x to ${tgt_major}.x skips one or more major versions."
      error "Elasticsearch does NOT support skipping major versions."
      error ""
      error "Required upgrade path:"
      # NOTE(review): gateway_minor is only declared in the previous (elif)
      # branch, which cannot have run in this execution, so the lookup below
      # always falls back to the literal "last_minor" placeholder.
      v=$cur_major
      while [[ $v -lt $tgt_major ]]; do
        next=$((v + 1))
        gw="${gateway_minor[$v]:-last_minor}"
        if [[ $v -eq $cur_major ]]; then
          error " ${v}.x -> ${v}.${gw}.x (get to gateway minor first)"
        fi
        error " ${v}.${gw}.x -> ${next}.x"
        v=$next
      done
      error ""
      error "Each major version boundary must be crossed individually."
      confirm_critical "Override and attempt anyway? THIS WILL ALMOST CERTAINLY FAIL."
    fi
  fi
fi
# --- Cluster health ---
# In force mode this is skipped entirely; otherwise the upgrade only proceeds
# freely on GREEN, asks on YELLOW, and requires a critical override on RED.
if $FORCE_MODE; then
  if $UPGRADE_ES || $UPGRADE_KIBANA; then
    step "Checking cluster health"
    warn "FORCE MODE: Skipping cluster health check."
  fi
else
  step "Checking cluster health"
  es_curl GET "/_cluster/health?pretty"
  health_body="$ES_CURL_BODY"
  health_code="$ES_CURL_HTTP_CODE"
  if [[ "$health_code" != "200" ]]; then
    error "Failed to get cluster health (HTTP $health_code)."
    exit 1
  fi
  # Extract individual fields with python3 (part of the required commands;
  # jq is not), falling back to placeholders if parsing fails
  cluster_status=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('status','unknown'))" 2>/dev/null) || cluster_status="unknown"
  cluster_name=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('cluster_name',''))" 2>/dev/null) || cluster_name=""
  num_nodes=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('number_of_nodes',0))" 2>/dev/null) || num_nodes="?"
  unassigned=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('unassigned_shards',0))" 2>/dev/null) || unassigned="?"
  info "Cluster: ${BOLD}${cluster_name}${NC}"
  info "Nodes: ${BOLD}${num_nodes}${NC}"
  info "Unassigned: ${BOLD}${unassigned}${NC}"
  if [[ "$cluster_status" == "green" ]]; then
    success "Cluster status: ${GREEN}${BOLD}GREEN${NC}"
  elif [[ "$cluster_status" == "yellow" ]]; then
    warn "Cluster status: ${YELLOW}${BOLD}YELLOW${NC}"
    warn "It is recommended to start with a GREEN cluster."
    confirm_or_abort "Continue with YELLOW cluster status?"
  else
    # Covers "red" and any unparsable/unknown status
    error "Cluster status: ${RED}${BOLD}RED${NC}"
    error "DO NOT upgrade while cluster is RED."
    confirm_critical "Are you absolutely sure you want to continue with RED status?"
  fi
| # --- Node roles, versions & upgrade order advice (ES upgrade only) --- | |
| if $UPGRADE_ES; then | |
| step "Analyzing cluster nodes and upgrade order" | |
| es_curl GET "/_cat/nodes?h=name,ip,version,master,node.role&format=json" | |
| nodes_body="$ES_CURL_BODY" | |
| nodes_code="$ES_CURL_HTTP_CODE" | |
| local_hostname=$(hostname -s 2>/dev/null) || local_hostname="" | |
| local_fqdn=$(hostname -f 2>/dev/null) || local_fqdn="" | |
| local_ips=$(hostname -I 2>/dev/null | tr ' ' '\n' | grep -v '^$') || local_ips="" | |
| local_ips="${local_ips} | |
| 127.0.0.1 | |
| ::1" | |
| if [[ "$nodes_code" == "200" ]]; then | |
| _pyfile=$(mktemp /tmp/elastic-upgrade-nodecheck.XXXXXX.py) | |
| chmod 600 "$_pyfile" | |
| cat > "$_pyfile" <<'PYEOF' | |
| import json, sys, os | |
| nodes = json.load(sys.stdin) | |
| local_hostname = os.environ.get('LOCAL_HOSTNAME', '').lower() | |
| local_fqdn = os.environ.get('LOCAL_FQDN', '').lower() | |
| local_ips = set(ip.strip() for ip in os.environ.get('LOCAL_IPS', '').split('\n') if ip.strip()) | |
| target_version = os.environ.get('TARGET_VERSION', '') | |
| # --- Identify local node --- | |
| local_node = None | |
| for n in nodes: | |
| node_name = n.get('name', '').strip() | |
| node_ip = n.get('ip', '').strip() | |
| if (node_name.lower() == local_hostname or | |
| node_name.lower() == local_fqdn or | |
| node_ip in local_ips or | |
| local_hostname and node_name.lower().startswith(local_hostname)): | |
| local_node = node_name | |
| break | |
| # --- Classify nodes --- | |
| tier1_coord = [] | |
| tier2_data = [] | |
| tier3_master = [] | |
| tier4_elected = None | |
| all_entries = [] | |
| for n in nodes: | |
| name = n.get('name', '').strip() | |
| version = n.get('version', '').strip() | |
| is_elected_master = n.get('master', '').strip() == '*' | |
| roles = n.get('node.role', '').strip() | |
| has_master_role = 'm' in roles | |
| has_data_role = 'd' in roles | |
| entry = { | |
| 'name': name, | |
| 'version': version, | |
| 'roles': roles, | |
| 'is_elected_master': is_elected_master, | |
| 'has_master_role': has_master_role, | |
| 'has_data_role': has_data_role, | |
| 'is_local': name == local_node, | |
| 'already_upgraded': version == target_version, | |
| } | |
| all_entries.append(entry) | |
| if is_elected_master: | |
| tier4_elected = entry | |
| elif has_master_role: | |
| tier3_master.append(entry) | |
| elif has_data_role: | |
| tier2_data.append(entry) | |
| else: | |
| tier1_coord.append(entry) | |
| all_combined = all(e['has_master_role'] and e['has_data_role'] for e in all_entries) | |
| # --- Print node table --- | |
| print('NODE_TABLE_START') | |
| for n in nodes: | |
| name = n.get('name', '').strip() | |
| version = n.get('version', '').strip() | |
| roles = n.get('node.role', '').strip() | |
| is_elected = n.get('master', '').strip() == '*' | |
| markers = [] | |
| if is_elected: | |
| markers.append('elected master') | |
| if name == local_node: | |
| markers.append('THIS NODE') | |
| marker_str = f' ({", ".join(markers)})' if markers else '' | |
| elected_icon = 'M' if is_elected else ' ' | |
| local_icon = '>' if name == local_node else ' ' | |
| print(f' {local_icon}{elected_icon} {name:<45} v{version:<12} roles: {roles}{marker_str}') | |
| print('NODE_TABLE_END') | |
| # --- Determine upgrade order --- | |
| upgrade_order = [] | |
| upgrade_order += sorted(tier1_coord, key=lambda x: x['name']) | |
| upgrade_order += sorted(tier2_data, key=lambda x: x['name']) | |
| upgrade_order += sorted(tier3_master, key=lambda x: x['name']) | |
| if tier4_elected: | |
| upgrade_order.append(tier4_elected) | |
| def tier_label(entry): | |
| parts = [] | |
| if entry['is_elected_master']: | |
| parts.append('elected master') | |
| elif entry['has_master_role']: | |
| parts.append('master-eligible') | |
| if entry['has_data_role']: | |
| parts.append('data') | |
| if not entry['has_master_role'] and not entry['has_data_role']: | |
| parts.append('coordinating/ingest') | |
| return ', '.join(parts) | |
| print('ORDER_START') | |
| if all_combined: | |
| print(f' NOTE: All nodes have both data and master roles (combined topology).') | |
| print(f' NOTE: Upgrade any non-elected node first, elected master last.') | |
| print(f' ---') | |
| for i, n in enumerate(upgrade_order, 1): | |
| status = 'DONE' if n['already_upgraded'] else 'PENDING' | |
| local_marker = ' << THIS NODE' if n['is_local'] else '' | |
| print(f' {i:>2}. {n["name"]:<40} v{n["version"]:<12} [{tier_label(n)}] {status}{local_marker}') | |
| print('ORDER_END') | |
| # --- Generate advice --- | |
| print('ADVICE_START') | |
| if local_node is None: | |
| print('WARN|Could not identify which cluster node corresponds to this machine.') | |
| print('WARN|Hostname: ' + local_hostname + ', FQDN: ' + local_fqdn) | |
| print('WARN|Local IPs: ' + ', '.join(sorted(local_ips - {'127.0.0.1', '::1'}))) | |
| print('WARN|This is expected when --es-url points to a remote node.') | |
| print('WARN|Verify manually that you are upgrading nodes in the correct order.') | |
| local_entry = None | |
| else: | |
| local_entry = None | |
| local_position = -1 | |
| for i, n in enumerate(upgrade_order): | |
| if n['is_local']: | |
| local_entry = n | |
| local_position = i | |
| break | |
| if local_entry is None: | |
| print('WARN|This node was identified but not found in the upgrade order. This is unexpected.') | |
| elif local_entry['already_upgraded']: | |
| print(f'OK|This node ({local_node}) is already at version {target_version}.') | |
| else: | |
| nodes_ahead_pending = [upgrade_order[i] for i in range(local_position) if not upgrade_order[i]['already_upgraded']] | |
| if len(nodes_ahead_pending) > 0: | |
| print(f'WARN|There are {len(nodes_ahead_pending)} node(s) that should ideally be upgraded BEFORE this one:') | |
| for n in nodes_ahead_pending: | |
| print(f'WARN| - {n["name"]} (v{n["version"]}, {tier_label(n)})') | |
| if local_entry.get('is_elected_master'): | |
| print('INFO|') | |
| print('INFO|This node is the current elected master.') | |
| print('INFO|The master role will transfer automatically when this node stops.') | |
| elif all_combined: | |
| print('INFO|') | |
| print('INFO|All nodes share both data and master roles.') | |
| print('INFO|Order among non-elected peers does not strictly matter,') | |
| print('INFO|but the elected master should be upgraded last.') | |
| else: | |
| print(f'OK|This node ({local_node}) is next in the upgrade order. Safe to proceed.') | |
| if local_entry.get('is_elected_master'): | |
| other_masters = [n for n in tier3_master if not n['is_local']] | |
| if other_masters: | |
| print('OK|This is the elected master. A new master will be elected automatically when this node stops.') | |
| else: | |
| print('WARN|This is the ONLY master-eligible node. The cluster will be unavailable during upgrade.') | |
| print('ADVICE_END') | |
| # --- Cluster version mix analysis --- | |
| print('VERSION_MIX_START') | |
| versions_in_cluster = set(e['version'] for e in all_entries) | |
| if local_node is None or local_entry is None: | |
| from collections import Counter | |
| version_counts = Counter(e['version'] for e in all_entries) | |
| print(f'INFO|Versions currently in cluster:') | |
| for v, count in sorted(version_counts.items()): | |
| node_names = [e['name'] for e in all_entries if e['version'] == v] | |
| print(f'INFO| v{v}: {count} node(s) — {", ".join(node_names)}') | |
| print(f'WARN|Local node not found in cluster. Cannot predict version mix after upgrade.') | |
| else: | |
| versions_after = list(e['version'] for e in all_entries if not e.get('is_local')) | |
| versions_after.append(target_version) | |
| unique_after = set(versions_after) | |
| from collections import Counter | |
| version_counts = Counter(e['version'] for e in all_entries) | |
| print(f'INFO|Versions currently in cluster:') | |
| for v, count in sorted(version_counts.items()): | |
| node_names = [e['name'] for e in all_entries if e['version'] == v] | |
| print(f'INFO| v{v}: {count} node(s) — {", ".join(node_names)}') | |
| if len(versions_in_cluster) == 1 and list(versions_in_cluster)[0] == target_version: | |
| print(f'OK|All nodes are already on {target_version}.') | |
| elif len(unique_after) <= 2: | |
| if len(unique_after) == 1: | |
| print(f'OK|After this upgrade, all nodes will be on {target_version}.') | |
| else: | |
| other_version = [v for v in unique_after if v != target_version] | |
| remaining_old = len([v for v in versions_after if v != target_version]) | |
| print(f'OK|After this upgrade the cluster will have 2 versions (normal during rolling upgrade):') | |
| print(f'OK| v{target_version} (upgraded) and v{other_version[0]} ({remaining_old} node(s) remaining)') | |
| elif len(unique_after) >= 3: | |
| print(f'BLOCK|') | |
| print(f'BLOCK|After upgrading this node, the cluster would have {len(unique_after)} different versions:') | |
| version_counts_after = Counter(versions_after) | |
| for v, count in sorted(version_counts_after.items()): | |
| print(f'BLOCK| v{v}: {count} node(s)') | |
| print(f'BLOCK|') | |
| print(f'BLOCK|Running 3+ versions simultaneously is NOT supported by Elasticsearch.') | |
| print(f'BLOCK|This typically means a previous rolling upgrade was not completed.') | |
| print(f'BLOCK|') | |
| print(f'BLOCK|Recommended action:') | |
| print(f'BLOCK| Complete the previous upgrade first — bring ALL nodes to the same') | |
| print(f'BLOCK| version before starting a new upgrade to {target_version}.') | |
| print('VERSION_MIX_END') | |
| PYEOF | |
| upgrade_advice=$(echo "$nodes_body" | LOCAL_HOSTNAME="$local_hostname" LOCAL_FQDN="$local_fqdn" LOCAL_IPS="$local_ips" TARGET_VERSION="$TARGET_VERSION" python3 "$_pyfile" 2>/dev/null) || true | |
| rm -f "$_pyfile" | |
| if [[ -z "$upgrade_advice" ]]; then | |
| warn "Could not analyze node roles. Proceeding without upgrade order advice." | |
| else | |
| echo "$upgrade_advice" | sed -n '/NODE_TABLE_START/,/NODE_TABLE_END/p' | grep -v '_START\|_END' | |
| echo "" | |
| info "${BOLD}Legend:${NC} M = elected master, > = this node" | |
| echo "" | |
| info "${BOLD}Recommended upgrade order:${NC}" | |
| echo "$upgrade_advice" | sed -n '/ORDER_START/,/ORDER_END/p' | grep -v '_START\|_END' | |
| echo "" | |
| has_warn=false | |
| while IFS= read -r line; do | |
| case "$line" in | |
| WARN\|*) | |
| has_warn=true | |
| msg="${line#WARN|}" | |
| [[ -n "$msg" ]] && warn "$msg" | |
| ;; | |
| OK\|*) | |
| msg="${line#OK|}" | |
| [[ -n "$msg" ]] && success "$msg" | |
| ;; | |
| INFO\|*) | |
| msg="${line#INFO|}" | |
| [[ -n "$msg" ]] && info "$msg" | |
| ;; | |
| esac | |
| done <<< "$(echo "$upgrade_advice" | sed -n '/ADVICE_START/,/ADVICE_END/p' | grep -v '_START\|_END')" | |
| echo "" | |
| if $has_warn; then | |
| confirm_or_abort "Acknowledge the upgrade order warnings above and continue?" | |
| fi | |
| # --- Version mix analysis --- | |
| version_mix_section=$(echo "$upgrade_advice" | sed -n '/VERSION_MIX_START/,/VERSION_MIX_END/p' | grep -v '_START\|_END') | |
| if [[ -n "$version_mix_section" ]]; then | |
| step "Checking cluster version consistency" | |
| vmix_has_block=false | |
| while IFS= read -r line; do | |
| case "$line" in | |
| BLOCK\|*) | |
| vmix_has_block=true | |
| msg="${line#BLOCK|}" | |
| [[ -n "$msg" ]] && error "$msg" | |
| ;; | |
| WARN\|*) | |
| msg="${line#WARN|}" | |
| [[ -n "$msg" ]] && warn "$msg" | |
| ;; | |
| OK\|*) | |
| msg="${line#OK|}" | |
| [[ -n "$msg" ]] && success "$msg" | |
| ;; | |
| INFO\|*) | |
| msg="${line#INFO|}" | |
| [[ -n "$msg" ]] && info "$msg" | |
| ;; | |
| esac | |
| done <<< "$version_mix_section" | |
| echo "" | |
| if $vmix_has_block; then | |
| error "Upgrading this node would introduce 3+ versions into the cluster." | |
| confirm_critical "Override version mix warning and continue anyway? THIS IS NOT SUPPORTED." | |
| fi | |
| fi | |
| fi | |
| else | |
| warn "Could not retrieve node list (HTTP ${nodes_code}). Skipping upgrade order analysis." | |
| fi | |
| fi # end $UPGRADE_ES node analysis | |
| fi # end of: if ! $FORCE_MODE (cluster health, node analysis, version mix) | |
# --- Disk space ---
# Verify free space on every filesystem the upgrade touches.  Shortfalls only
# produce warnings; the operator gets one confirmation prompt at the end.
step "Checking disk space"
disk_space_ok=true
# One record per filesystem, fields separated by '|':
#   mount point | purpose | minimum MB | human-readable minimum | consequence
for fs_spec in \
    "/tmp|downloads|1024|1 GB|RPM download may fail." \
    "/usr|installation|1024|1 GB|RPM installation may fail." \
    "/var|data/logs|512|512 MB|May have issues with logs during upgrade."; do
    IFS='|' read -r fs_mount fs_purpose fs_min_mb fs_min_human fs_consequence <<< "$fs_spec"
    # `df -m`: row 2, column 4 is the available space in MB; the variable
    # stays empty (-> "unknown") when df fails for that path.
    fs_free_mb=$(df -m "$fs_mount" 2>/dev/null | awk 'NR==2{print $4}')
    info "Free space in ${fs_mount} (${fs_purpose}): ${fs_free_mb:-unknown} MB"
    if [[ -n "$fs_free_mb" && "$fs_free_mb" -lt "$fs_min_mb" ]]; then
        warn "Less than ${fs_min_human} free in ${fs_mount}. ${fs_consequence}"
        disk_space_ok=false
    fi
done
if ! $disk_space_ok; then
    confirm_or_abort "Continue despite low disk space warnings?"
fi
# --- glibc version check ---
# The JDK bundled with newer Elasticsearch releases needs a reasonably recent
# glibc; on very old distributions (e.g. CentOS 7 / glibc 2.17) the RPM
# installs fine but the node then fails to start.
if $UPGRADE_ES; then
    step "Checking glibc version"
    info "Newer Elasticsearch versions bundle a JDK that may require a newer glibc."
    info "If glibc is too old (e.g. CentOS 7 ships glibc 2.17), ES may fail to start."
    # First X.Y token on the first line of `ldd --version` is the glibc version.
    glibc_version=$(ldd --version 2>&1 | head -1 | grep -oP '[0-9]+\.[0-9]+' | head -1) || glibc_version="unknown"
    info "Detected glibc version: ${BOLD}${glibc_version}${NC}"
    if [[ "$glibc_version" != "unknown" ]]; then
        glibc_major=$(echo "$glibc_version" | cut -d. -f1)
        glibc_minor=$(echo "$glibc_version" | cut -d. -f2)
        # Proper (major, minor) tuple comparison.  The previous form
        # "major <= 2 && minor < N" misclassified any hypothetical 1.x
        # release with a high minor (e.g. 1.99) as recent enough.
        if (( glibc_major < 2 || (glibc_major == 2 && glibc_minor < 17) )); then
            error "glibc ${glibc_version} is very old and likely incompatible with ES ${TARGET_VERSION}."
            confirm_or_abort "Continue despite potential glibc incompatibility?"
        elif (( glibc_major < 2 || (glibc_major == 2 && glibc_minor < 31) )); then
            warn "glibc ${glibc_version} may be too old for the bundled JDK in newer ES releases."
            warn "If ES fails to start after upgrade, this is likely the cause."
            warn "Consider testing this upgrade on one node first before rolling out."
            confirm_or_abort "Acknowledge glibc risk and continue?"
        else
            success "glibc ${glibc_version} should be compatible."
        fi
    else
        warn "Could not detect glibc version. Verify manually that your OS is compatible."
    fi
fi
# --- Deprecation API check ---
# Ask Elasticsearch for known deprecations before upgrading.  "critical"
# entries refer to removed settings/features that can prevent the node from
# starting after the upgrade; "warning" entries are advisory only.
if $UPGRADE_ES && ! $FORCE_MODE; then
    step "Checking for deprecated settings"
    info "Querying the deprecation API to find settings that may block the upgrade."
    info "Command: GET /_migration/deprecations"
    echo ""
    es_curl GET "/_migration/deprecations"
    deprec_body="$ES_CURL_BODY"
    deprec_code="$ES_CURL_HTTP_CODE"
    if [[ "$deprec_code" == "200" ]]; then
        # Count critical and warning deprecations in a single pass (the
        # previous implementation ran two identical python3 parsers).  The
        # response mixes two shapes: most categories map to a list of issues,
        # while index_settings maps index-name -> list of issues.
        # Prints "<critical> <warning>", or "-1 -1" when parsing fails.
        deprec_counts=$(echo "$deprec_body" | python3 -c "
import json, sys
try:
    data = json.load(sys.stdin)
    crit = warn = 0
    for category in ['cluster_settings', 'node_settings', 'index_settings', 'ml_settings']:
        items = data.get(category, [])
        if isinstance(items, dict):
            flat = [i for idx_items in items.values() for i in idx_items]
        elif isinstance(items, list):
            flat = items
        else:
            flat = []
        for item in flat:
            level = item.get('level')
            if level == 'critical':
                crit += 1
            elif level == 'warning':
                warn += 1
    print(crit, warn)
except Exception:
    print(-1, -1)
" 2>/dev/null) || deprec_counts="-1 -1"
        read -r crit_count warn_count <<< "$deprec_counts"
        crit_count="${crit_count:--1}"
        warn_count="${warn_count:--1}"
        if [[ "$crit_count" == "-1" ]]; then
            warn "Could not parse deprecation response. Review manually:"
            warn " GET /_migration/deprecations"
        elif [[ "$crit_count" -gt 0 ]]; then
            error "Found ${BOLD}${crit_count} CRITICAL${NC}${RED} deprecation(s) that may prevent startup after upgrade!${NC}"
            # List every critical item so the operator can act on it.
            echo "$deprec_body" | python3 -c "
import json, sys
data = json.load(sys.stdin)
for category in ['cluster_settings', 'node_settings', 'index_settings', 'ml_settings']:
    items = data.get(category, [])
    if isinstance(items, list):
        for item in items:
            if item.get('level') == 'critical':
                print(f\" [CRITICAL] [{category}] {item.get('message', 'unknown')}\")
    elif isinstance(items, dict):
        for idx_name, idx_items in items.items():
            for item in idx_items:
                if item.get('level') == 'critical':
                    print(f\" [CRITICAL] [index: {idx_name}] {item.get('message', 'unknown')}\")
" 2>/dev/null || true
            echo ""
            error "Critical deprecations MUST be resolved before upgrading."
            error "Removed settings will prevent Elasticsearch from starting."
            confirm_critical "Continue despite critical deprecation warnings?"
        elif [[ "$warn_count" -gt 0 ]]; then
            warn "Found ${warn_count} deprecation warning(s) (non-critical)."
            info "Review with: GET /_migration/deprecations"
            info "These won't block the upgrade but should be addressed."
        else
            success "No deprecation issues found."
        fi
    elif [[ "$deprec_code" == "404" ]]; then
        warn "Deprecation API not available (older ES version). Skipping check."
        warn "Manually review release notes for removed settings between ${ES_CURRENT} and ${TARGET_VERSION}."
    else
        warn "Deprecation API returned HTTP ${deprec_code}. Skipping check."
    fi
fi
# --- Monitoring exporter config check ---
# Legacy self-monitoring settings are deprecated in 8.x; flag them so the
# operator can plan a migration to Elastic Agent / Metricbeat collection.
if $UPGRADE_ES; then
    step "Checking for legacy monitoring configuration"
    es_config="/etc/elasticsearch/elasticsearch.yml"
    # A missing config file means nothing to flag; grep's stderr is silenced
    # so the check degrades quietly in that case.
    if grep -q "xpack.monitoring.exporters" "$es_config" 2>/dev/null; then
        warn "Found ${BOLD}xpack.monitoring.exporters${NC}${YELLOW} in elasticsearch.yml${NC}"
        warn "Legacy monitoring (internal collection) is deprecated since 8.x."
        warn "Plan to migrate to Elastic Agent or Metricbeat monitoring collection."
        echo ""
    fi
    if grep -q "xpack.monitoring.collection.enabled" "$es_config" 2>/dev/null; then
        warn "Found ${BOLD}xpack.monitoring.collection.enabled${NC}${YELLOW} in elasticsearch.yml${NC}"
        warn "Legacy internal monitoring collection is deprecated."
        echo ""
    fi
fi
# --- Check for open ML jobs (warn about potential disruption) ---
# Open anomaly-detection jobs are interrupted while the node restarts; this
# is informational only and never blocks the upgrade.
if $UPGRADE_ES && ! $FORCE_MODE; then
    step "Checking for running ML jobs"
    es_curl GET "/_ml/anomaly_detectors/_all?allow_no_match=true"
    ml_body="$ES_CURL_BODY"
    ml_code="$ES_CURL_HTTP_CODE"
    if [[ "$ml_code" == "200" ]]; then
        # Count jobs whose reported state is "opened"; any parse failure
        # counts as zero open jobs (best effort).
        open_jobs=$(printf '%s' "$ml_body" | python3 -c '
import json, sys
try:
    payload = json.load(sys.stdin)
    running = [job["job_id"] for job in payload.get("jobs", []) if job.get("state") == "opened"]
    print(len(running))
except Exception:
    print(0)
' 2>/dev/null) || open_jobs="0"
        if [[ "$open_jobs" -gt 0 ]]; then
            warn "There are ${BOLD}${open_jobs}${NC}${YELLOW} open ML anomaly detection jobs.${NC}"
            warn "These will be interrupted when Elasticsearch stops."
            warn "They should recover automatically after restart, but may need"
            warn "a few minutes to re-open and resume processing."
            info "To see open jobs: GET /_ml/anomaly_detectors/_all?allow_no_match=true"
        else
            success "No open ML jobs."
        fi
    fi
fi
| # ============================================================================== | |
| # Summary & Confirmation | |
| # ============================================================================== | |
| header "Upgrade Plan" | |
| if [[ -n "${cluster_name:-}" ]]; then | |
| echo -e " Cluster: ${BOLD}${cluster_name}${NC}" | |
| fi | |
| echo -e " Target version: ${BOLD}${TARGET_VERSION}${NC}" | |
| echo -e " Architecture: ${BOLD}${ARCH}${NC}" | |
| $UPGRADE_ES && echo -e " Upgrade ES: ${GREEN}YES${NC} (${ES_CURRENT} -> ${TARGET_VERSION})" | |
| $UPGRADE_ES || echo -e " Upgrade ES: ${YELLOW}NO${NC}" | |
| $UPGRADE_KIBANA && echo -e " Upgrade Kibana: ${GREEN}YES${NC} (${KIBANA_CURRENT} -> ${TARGET_VERSION})" | |
| $UPGRADE_KIBANA || echo -e " Upgrade Kibana: ${YELLOW}NO${NC}" | |
| echo "" | |
| echo -e "${BOLD}The following steps will be performed:${NC}" | |
| echo " Pre-upgrade:" | |
| echo " - Snapshot reminder" | |
| echo " - Remove version locks (if any)" | |
| echo " - Download and verify all RPMs" | |
| echo " - Backup configuration files" | |
| $UPGRADE_ES && cat <<'EOF' | |
| Elasticsearch: | |
| 1. Disable shard allocation | |
| 2. Flush all indices (best effort) | |
| 3. Stop Elasticsearch | |
| 4. Install Elasticsearch RPM | |
| 4a. Keystore check | |
| 4b. Check for .rpmnew files | |
| 4c. Verify JVM heap settings | |
| 5. Reload systemd daemon | |
| 6. Start Elasticsearch | |
| 7. Wait for node to rejoin and re-enable allocation | |
| 7a. Re-enable shard allocation | |
| 7b. Wait for cluster recovery | |
| EOF | |
| $UPGRADE_KIBANA && cat <<'EOF' | |
| Kibana: | |
| K1. Verify shard allocation is enabled | |
| K2. Stop Kibana | |
| K3. Install Kibana RPM | |
| K4. Reload systemd daemon | |
| K5. Start Kibana and wait for ready | |
| EOF | |
| echo "" | |
| confirm_or_abort "Proceed with the upgrade?" | |
| # ============================================================================== | |
| # Pre-upgrade: Snapshot Reminder | |
| # ============================================================================== | |
| if $UPGRADE_ES && ! $FORCE_MODE; then | |
| header "Snapshot Reminder" | |
| warn "Before upgrading, you should have a recent snapshot of your data." | |
| warn "Snapshots are the ONLY reliable way to roll back Elasticsearch data." | |
| echo "" | |
| info "To check existing snapshots:" | |
| info " GET /_snapshot/_all" | |
| info " GET /_snapshot/<repo>/_all" | |
| echo "" | |
| info "To create a snapshot:" | |
| info " PUT /_snapshot/<repo>/<snapshot_name>?wait_for_completion=true" | |
| echo "" | |
| confirm_or_abort "I have a recent snapshot or accept the risk of proceeding without one" | |
| elif $UPGRADE_ES && $FORCE_MODE; then | |
| warn "FORCE MODE: Skipping snapshot reminder. Ensure you have a backup!" | |
| fi | |
| # ============================================================================== | |
| # Pre-upgrade: Remove version locks | |
| # ============================================================================== | |
| header "Checking Version Locks" | |
| versionlock_removed_es=false | |
| versionlock_removed_kibana=false | |
| if command -v "${PKG_MGR}" &>/dev/null && ${PKG_MGR} versionlock list &>/dev/null 2>&1; then | |
| info "Checking for version locks..." | |
| if $UPGRADE_ES; then | |
| if ${PKG_MGR} versionlock list 2>/dev/null | grep -q "elasticsearch"; then | |
| warn "Found version lock for Elasticsearch" | |
| info "Removing lock: ${PKG_MGR} versionlock delete elasticsearch*" | |
| ${PKG_MGR} versionlock delete "elasticsearch*" || ${PKG_MGR} versionlock delete "0:elasticsearch*" || true | |
| versionlock_removed_es=true | |
| success "Elasticsearch version lock removed." | |
| fi | |
| fi | |
| if $UPGRADE_KIBANA; then | |
| if ${PKG_MGR} versionlock list 2>/dev/null | grep -q "kibana"; then | |
| warn "Found version lock for Kibana" | |
| info "Removing lock: ${PKG_MGR} versionlock delete kibana*" | |
| ${PKG_MGR} versionlock delete "kibana*" || ${PKG_MGR} versionlock delete "0:kibana*" || true | |
| versionlock_removed_kibana=true | |
| success "Kibana version lock removed." | |
| fi | |
| fi | |
| if ! $versionlock_removed_es && ! $versionlock_removed_kibana; then | |
| success "No version locks found." | |
| fi | |
| else | |
| info "Version lock plugin not available or not installed. Skipping." | |
| fi | |
| # ============================================================================== | |
| # Pre-upgrade: Download all RPMs upfront | |
| # ============================================================================== | |
| header "Downloading RPM Packages" | |
| mkdir -p "$DOWNLOAD_DIR" | |
| chmod 700 "$DOWNLOAD_DIR" | |
| ES_RPM="elasticsearch-${TARGET_VERSION}-${ARCH}.rpm" | |
| ES_RPM_URL="https://artifacts.elastic.co/downloads/elasticsearch/${ES_RPM}" | |
| ES_SHA_URL="${ES_RPM_URL}.sha512" | |
| KIBANA_RPM="kibana-${TARGET_VERSION}-${ARCH}.rpm" | |
| KIBANA_RPM_URL="https://artifacts.elastic.co/downloads/kibana/${KIBANA_RPM}" | |
| KIBANA_SHA_URL="${KIBANA_RPM_URL}.sha512" | |
| if $UPGRADE_ES; then | |
| step "Downloading Elasticsearch RPM" | |
| info "URL: ${ES_RPM_URL}" | |
| if [[ -f "${DOWNLOAD_DIR}/${ES_RPM}" ]]; then | |
| warn "File already exists: ${DOWNLOAD_DIR}/${ES_RPM}" | |
| if confirm "Re-download and overwrite?" "n"; then | |
| rm -f "${DOWNLOAD_DIR}/${ES_RPM}" "${DOWNLOAD_DIR}/${ES_RPM}.sha512" | |
| else | |
| info "Using existing file." | |
| fi | |
| fi | |
| if [[ ! -f "${DOWNLOAD_DIR}/${ES_RPM}" ]]; then | |
| if ! download_file "${ES_RPM_URL}" "${DOWNLOAD_DIR}/${ES_RPM}" "Elasticsearch ${TARGET_VERSION} RPM"; then | |
| exit 1 | |
| fi | |
| success "Elasticsearch RPM downloaded." | |
| fi | |
| info "Verifying SHA512 checksum..." | |
| if ! curl -sf --max-time 30 --retry 3 -o "${DOWNLOAD_DIR}/${ES_RPM}.sha512" "${ES_SHA_URL}"; then | |
| error "Failed to download Elasticsearch checksum file." | |
| exit 1 | |
| fi | |
| expected_hash=$(awk '{print $1}' "${DOWNLOAD_DIR}/${ES_RPM}.sha512") | |
| actual_hash=$(sha512sum "${DOWNLOAD_DIR}/${ES_RPM}" | awk '{print $1}') | |
| if [[ "$expected_hash" == "$actual_hash" ]]; then | |
| success "Elasticsearch checksum verified." | |
| else | |
| error "Elasticsearch checksum mismatch!" | |
| error "Expected: ${expected_hash}" | |
| error "Actual: ${actual_hash}" | |
| confirm_critical "This is dangerous. Continue despite checksum failure?" | |
| fi | |
| fi | |
| if $UPGRADE_KIBANA; then | |
| step "Downloading Kibana RPM" | |
| info "URL: ${KIBANA_RPM_URL}" | |
| if [[ -f "${DOWNLOAD_DIR}/${KIBANA_RPM}" ]]; then | |
| warn "File already exists: ${DOWNLOAD_DIR}/${KIBANA_RPM}" | |
| if confirm "Re-download and overwrite?" "n"; then | |
| rm -f "${DOWNLOAD_DIR}/${KIBANA_RPM}" "${DOWNLOAD_DIR}/${KIBANA_RPM}.sha512" | |
| else | |
| info "Using existing file." | |
| fi | |
| fi | |
| if [[ ! -f "${DOWNLOAD_DIR}/${KIBANA_RPM}" ]]; then | |
| if ! download_file "${KIBANA_RPM_URL}" "${DOWNLOAD_DIR}/${KIBANA_RPM}" "Kibana ${TARGET_VERSION} RPM"; then | |
| exit 1 | |
| fi | |
| success "Kibana RPM downloaded." | |
| fi | |
| info "Verifying SHA512 checksum..." | |
| if ! curl -sf --max-time 30 --retry 3 -o "${DOWNLOAD_DIR}/${KIBANA_RPM}.sha512" "${KIBANA_SHA_URL}"; then | |
| error "Failed to download Kibana checksum file." | |
| exit 1 | |
| fi | |
| expected_hash=$(awk '{print $1}' "${DOWNLOAD_DIR}/${KIBANA_RPM}.sha512") | |
| actual_hash=$(sha512sum "${DOWNLOAD_DIR}/${KIBANA_RPM}" | awk '{print $1}') | |
| if [[ "$expected_hash" == "$actual_hash" ]]; then | |
| success "Kibana checksum verified." | |
| else | |
| error "Kibana checksum mismatch!" | |
| error "Expected: ${expected_hash}" | |
| error "Actual: ${actual_hash}" | |
| confirm_critical "This is dangerous. Continue despite checksum failure?" | |
| fi | |
| fi | |
| success "All RPM packages downloaded and verified." | |
| # ============================================================================== | |
| # Pre-upgrade: Backup configuration files | |
| # ============================================================================== | |
| header "Backing Up Configuration Files" | |
| BACKUP_TIMESTAMP=$(date +%Y%m%d_%H%M%S) | |
| BACKUP_BASE="/var/backup" | |
| BACKUP_DIR="${BACKUP_BASE}/elastic-upgrade-${BACKUP_TIMESTAMP}" | |
| if [[ ! -d "$BACKUP_BASE" ]]; then | |
| info "Creating backup directory: ${BACKUP_BASE}" | |
| mkdir -p "$BACKUP_BASE" | |
| chmod 700 "$BACKUP_BASE" | |
| fi | |
| # Check backup disk space | |
| free_backup_mb=$(df -m "$BACKUP_BASE" 2>/dev/null | awk 'NR==2{print $4}') | |
| if [[ -n "$free_backup_mb" && "$free_backup_mb" -lt 256 ]]; then | |
| warn "Less than 256 MB free on backup filesystem (${BACKUP_BASE})." | |
| confirm_or_abort "Continue without sufficient backup space?" | |
| fi | |
| if ! mkdir -p "$BACKUP_DIR"; then | |
| error "Failed to create backup directory: ${BACKUP_DIR}" | |
| error "Check permissions on ${BACKUP_BASE}" | |
| exit 1 | |
| fi | |
| chmod 700 "$BACKUP_DIR" | |
| if $UPGRADE_ES; then | |
| if [[ -d /etc/elasticsearch ]]; then | |
| step "Backing up Elasticsearch configuration" | |
| cp -a /etc/elasticsearch "$BACKUP_DIR/elasticsearch" | |
| success "Elasticsearch config backed up to: ${BACKUP_DIR}/elasticsearch/" | |
| config_count=$(find "$BACKUP_DIR/elasticsearch" -type f | wc -l) | |
| info " Backed up ${config_count} file(s)" | |
| else | |
| warn "No /etc/elasticsearch directory found to backup." | |
| fi | |
| # Also backup systemd overrides (contains LimitNOFILE, LimitMEMLOCK, etc.) | |
| if [[ -d /etc/systemd/system/elasticsearch.service.d ]]; then | |
| step "Backing up Elasticsearch systemd overrides" | |
| mkdir -p "$BACKUP_DIR/systemd-elasticsearch" | |
| cp -a /etc/systemd/system/elasticsearch.service.d/* "$BACKUP_DIR/systemd-elasticsearch/" 2>/dev/null || true | |
| success "Systemd overrides backed up to: ${BACKUP_DIR}/systemd-elasticsearch/" | |
| fi | |
| fi | |
| if $UPGRADE_KIBANA; then | |
| if [[ -d /etc/kibana ]]; then | |
| step "Backing up Kibana configuration" | |
| cp -a /etc/kibana "$BACKUP_DIR/kibana" | |
| success "Kibana config backed up to: ${BACKUP_DIR}/kibana/" | |
| config_count=$(find "$BACKUP_DIR/kibana" -type f | wc -l) | |
| info " Backed up ${config_count} file(s)" | |
| else | |
| warn "No /etc/kibana directory found to backup." | |
| fi | |
| if [[ -d /etc/systemd/system/kibana.service.d ]]; then | |
| mkdir -p "$BACKUP_DIR/systemd-kibana" | |
| cp -a /etc/systemd/system/kibana.service.d/* "$BACKUP_DIR/systemd-kibana/" 2>/dev/null || true | |
| success "Kibana systemd overrides backed up." | |
| fi | |
| fi | |
| info "All backups stored in: ${BOLD}${BACKUP_DIR}${NC}" | |
| echo "" | |
| # ============================================================================== | |
| # Elasticsearch Upgrade | |
| # ============================================================================== | |
| if $UPGRADE_ES; then | |
| header "Upgrading Elasticsearch: ${ES_CURRENT} -> ${TARGET_VERSION}" | |
| info "Using pre-downloaded RPM: ${DOWNLOAD_DIR}/${ES_RPM}" | |
| echo "" | |
| # --- Step 1: Disable shard allocation --- | |
| if $FORCE_MODE; then | |
| step "Step 1 - Disable shard allocation" | |
| warn "FORCE MODE: Skipping (Elasticsearch may not be running)." | |
| step "Step 2 - Flush all indices" | |
| warn "FORCE MODE: Skipping (Elasticsearch may not be running)." | |
| else | |
| step "Step 1 - Disable shard allocation" | |
| info "This prevents the cluster from rebalancing shards while the node is down." | |
| info "Command: PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":\"primaries\"}}" | |
| echo "" | |
| confirm_or_abort "Disable shard allocation now?" | |
| if es_curl_retry PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":"primaries"}}'; then | |
| _allocation_disabled=true | |
| success "Shard allocation disabled (set to primaries only)." | |
| else | |
| error "Failed to disable shard allocation (HTTP $ES_CURL_HTTP_CODE)." | |
| confirm_or_abort "Continue despite this failure?" | |
| fi | |
| # --- Step 2: Flush --- | |
| step "Step 2 - Flush all indices" | |
| info "Flushing ensures all data is written to disk before stopping." | |
| info "Command: POST /_flush" | |
| echo "" | |
| confirm_or_abort "Flush all indices now?" | |
| # Use longer timeout for flush -- large clusters may take a while | |
| old_timeout="$API_TIMEOUT" | |
| API_TIMEOUT=120 | |
| es_curl POST "/_flush" | |
| flush_code="$ES_CURL_HTTP_CODE" | |
| API_TIMEOUT="$old_timeout" | |
| if [[ "$flush_code" == "200" ]]; then | |
| success "Flush completed." | |
| else | |
| warn "Flush returned HTTP $flush_code (this is usually okay to continue)." | |
| fi | |
| fi # end of: if ! $FORCE_MODE (steps 1-2) | |
| # --- Step 3: Stop Elasticsearch --- | |
| step "Step 3 - Stop Elasticsearch service" | |
| if ! systemctl is-active elasticsearch &>/dev/null; then | |
| info "Elasticsearch service is not running. Skipping stop." | |
| else | |
| info "Command: systemctl stop elasticsearch (timeout: ${STOP_TIMEOUT}s)" | |
| echo "" | |
| confirm_or_abort "Stop Elasticsearch now?" | |
| if ! timeout "$STOP_TIMEOUT" systemctl stop elasticsearch; then | |
| error "Elasticsearch did not stop within ${STOP_TIMEOUT} seconds." | |
| warn "The service may be stuck. Options:" | |
| warn " 1. Wait longer: systemctl stop elasticsearch" | |
| warn " 2. Force kill: systemctl kill -s SIGKILL elasticsearch" | |
| confirm_or_abort "Force kill the Elasticsearch process?" | |
| systemctl kill -s SIGKILL elasticsearch | |
| sleep 2 | |
| fi | |
| success "Elasticsearch stopped." | |
| fi | |
| # --- Step 4: Install RPM --- | |
| step "Step 4 - Install Elasticsearch RPM" | |
| info "Command: ${PKG_MGR} -y localinstall ${DOWNLOAD_DIR}/${ES_RPM}" | |
| info "Note: Config files in /etc/elasticsearch/ will NOT be overwritten." | |
| echo "" | |
| confirm_or_abort "Install the Elasticsearch RPM now?" | |
| ${PKG_MGR} -y localinstall "${DOWNLOAD_DIR}/${ES_RPM}" | |
| success "Elasticsearch RPM installed." | |
| # --- Step 4a: Fix keystore permissions and upgrade --- | |
| step "Step 4a - Checking Elasticsearch keystore" | |
| keystore_file="/etc/elasticsearch/elasticsearch.keystore" | |
| keystore_tmp="/etc/elasticsearch/elasticsearch.keystore.tmp" | |
| ES_USER=$(systemctl show elasticsearch -p User --value 2>/dev/null) || \ | |
| ES_USER=$(systemctl show elasticsearch -p User 2>/dev/null | cut -d= -f2) || true | |
| ES_GROUP=$(systemctl show elasticsearch -p Group --value 2>/dev/null) || \ | |
| ES_GROUP=$(systemctl show elasticsearch -p Group 2>/dev/null | cut -d= -f2) || true | |
| ES_USER="${ES_USER:-elasticsearch}" | |
| ES_GROUP="${ES_GROUP:-elasticsearch}" | |
| [[ -z "$ES_USER" ]] && ES_USER="elasticsearch" | |
| [[ -z "$ES_GROUP" ]] && ES_GROUP="elasticsearch" | |
| info "Elasticsearch runs as user: ${ES_USER}, group: ${ES_GROUP}" | |
| if [[ -f "$keystore_file" ]]; then | |
| info "Keystore exists, checking permissions..." | |
| chown -R "${ES_USER}:${ES_GROUP}" /etc/elasticsearch 2>/dev/null || true | |
| chmod 750 /etc/elasticsearch 2>/dev/null || true | |
| chmod 660 "$keystore_file" 2>/dev/null || true | |
| if [[ -f "$keystore_tmp" ]]; then | |
| warn "Found stale keystore temp file, removing..." | |
| rm -f "$keystore_tmp" | |
| fi | |
| info "Running keystore upgrade as ${ES_USER} user..." | |
| if [[ -x /usr/share/elasticsearch/bin/elasticsearch-keystore ]]; then | |
| if sudo -u "$ES_USER" /usr/share/elasticsearch/bin/elasticsearch-keystore upgrade 2>&1; then | |
| success "Keystore upgraded successfully." | |
| else | |
| warn "Keystore upgrade returned non-zero (may already be current format)." | |
| info "If ES fails to start with keystore errors, check:" | |
| info " ls -la /etc/elasticsearch/elasticsearch.keystore" | |
| info " sudo -u ${ES_USER} /usr/share/elasticsearch/bin/elasticsearch-keystore upgrade" | |
| fi | |
| else | |
| warn "elasticsearch-keystore binary not found or not executable." | |
| fi | |
| else | |
| info "No existing keystore found (will be created on first start if needed)." | |
| fi | |
| # --- Step 4b: Check for .rpmnew config files --- | |
| step "Step 4b - Checking for .rpmnew configuration files" | |
| info "RPM upgrades may create .rpmnew files when your config has been modified." | |
| echo "" | |
| rpmnew_found=false | |
| while IFS= read -r -d '' rpmnew_file; do | |
| rpmnew_found=true | |
| original="${rpmnew_file%.rpmnew}" | |
| warn "Found: ${BOLD}${rpmnew_file}${NC}" | |
| if [[ -f "$original" ]]; then | |
| diff_output=$(diff --brief "$original" "$rpmnew_file" 2>/dev/null) || true | |
| if [[ -n "$diff_output" ]]; then | |
| warn " -> Differs from current ${original}" | |
| info " -> Review with: diff ${original} ${rpmnew_file}" | |
| fi | |
| fi | |
| done < <(find /etc/elasticsearch -name '*.rpmnew' -print0 2>/dev/null) | |
| if $rpmnew_found; then | |
| echo "" | |
| warn "Review the .rpmnew files above and merge any needed changes into your config." | |
| warn "The .rpmnew files contain the new defaults from version ${TARGET_VERSION}." | |
| confirm_or_abort "Have you noted the .rpmnew files? Continue?" | |
| else | |
| success "No .rpmnew config files found (your configs were preserved cleanly)." | |
| fi | |
| # --- Step 4c: Verify JVM heap settings --- | |
| step "Step 4c - Verifying JVM heap settings" | |
| info "Checking that your JVM heap settings (-Xms / -Xmx) are still in place." | |
| echo "" | |
| jvm_opts_file="/etc/elasticsearch/jvm.options" | |
| jvm_opts_d="/etc/elasticsearch/jvm.options.d" | |
| heap_found=false | |
| xms="" | |
| xmx="" | |
| if [[ -d "$jvm_opts_d" ]]; then | |
| for f in "$jvm_opts_d"/*.options; do | |
| if [[ -f "$f" ]]; then | |
| _xms=$(grep -oP '^\s*-Xms\K\S+' "$f" 2>/dev/null | tail -1) || true | |
| _xmx=$(grep -oP '^\s*-Xmx\K\S+' "$f" 2>/dev/null | tail -1) || true | |
| if [[ -n "$_xms" || -n "$_xmx" ]]; then | |
| heap_found=true | |
| [[ -n "$_xms" ]] && xms="$_xms" | |
| [[ -n "$_xmx" ]] && xmx="$_xmx" | |
| info " Found in ${BOLD}${f}${NC}:" | |
| [[ -n "$_xms" ]] && info " -Xms${_xms}" | |
| [[ -n "$_xmx" ]] && info " -Xmx${_xmx}" | |
| fi | |
| fi | |
| done | |
| fi | |
| if [[ -f "$jvm_opts_file" ]]; then | |
| _xms=$(grep -oP '^\s*-Xms\K\S+' "$jvm_opts_file" 2>/dev/null | tail -1) || true | |
| _xmx=$(grep -oP '^\s*-Xmx\K\S+' "$jvm_opts_file" 2>/dev/null | tail -1) || true | |
| if [[ -n "$_xms" || -n "$_xmx" ]]; then | |
| if ! $heap_found; then | |
| heap_found=true | |
| fi | |
| [[ -n "$_xms" && -z "$xms" ]] && xms="$_xms" | |
| [[ -n "$_xmx" && -z "$xmx" ]] && xmx="$_xmx" | |
| info " Found in ${BOLD}${jvm_opts_file}${NC}:" | |
| [[ -n "$_xms" ]] && info " -Xms${_xms}" | |
| [[ -n "$_xmx" ]] && info " -Xmx${_xmx}" | |
| fi | |
| fi | |
| if [[ -f "${jvm_opts_file}.rpmnew" ]]; then | |
| warn "Found ${BOLD}${jvm_opts_file}.rpmnew${NC} — new JVM defaults from ${TARGET_VERSION}." | |
| info " Review with: diff ${jvm_opts_file} ${jvm_opts_file}.rpmnew" | |
| fi | |
| if $heap_found; then | |
| echo "" | |
| if [[ -n "$xms" && -n "$xmx" && "$xms" != "$xmx" ]]; then | |
| warn "Xms (${xms}) and Xmx (${xmx}) differ. Elastic recommends setting them equal." | |
| fi | |
| info "Verify these heap values are correct for this node before starting." | |
| confirm_or_abort "JVM heap settings look correct? Continue?" | |
| else | |
| warn "No explicit -Xms/-Xmx found in jvm.options or jvm.options.d/" | |
| warn "Elasticsearch will use its built-in defaults (typically 50% of RAM, max 31g)." | |
| warn "The default may have changed between versions. Verify this is acceptable." | |
| confirm_or_abort "Continue with default heap settings?" | |
| fi | |
| # --- Step 5: Reload systemd --- | |
| step "Step 5 - Reload systemd daemon" | |
| systemctl daemon-reload | |
| success "Systemd daemon reloaded." | |
| # --- Step 6: Start Elasticsearch --- | |
| step "Step 6 - Start Elasticsearch service" | |
| info "Command: systemctl start elasticsearch" | |
| echo "" | |
| confirm_or_abort "Start Elasticsearch now?" | |
| systemctl start elasticsearch | |
| info "Elasticsearch starting... waiting for it to become available." | |
| retries=0 | |
| while [[ $retries -lt $STARTUP_WAIT ]]; do | |
| es_curl GET "/" >/dev/null 2>&1 | |
| if [[ "$ES_CURL_HTTP_CODE" == "200" ]]; then | |
| break | |
| fi | |
| retries=$((retries + 1)) | |
| echo -ne "\r Waiting for Elasticsearch... (${retries}/${STARTUP_WAIT}s)" | |
| sleep 1 | |
| done | |
| echo "" | |
| if [[ $retries -ge $STARTUP_WAIT ]]; then | |
| error "Elasticsearch did not start within ${STARTUP_WAIT} seconds." | |
| error "Check the logs: journalctl -u elasticsearch -f" | |
| recent_logs=$(journalctl -u elasticsearch --no-pager -n 30 2>/dev/null) || recent_logs="" | |
| if echo "$recent_logs" | grep -qi "GLIBC\|glibc\|libc\.so\|GLIBCXX"; then | |
| error "" | |
| error "=== GLIBC INCOMPATIBILITY DETECTED ===" | |
| error "The Elasticsearch JDK requires a newer glibc than this system provides." | |
| error "Options:" | |
| error " 1. Upgrade your OS (e.g. CentOS 7 -> RHEL 8/9, Rocky 8/9, etc.)" | |
| error " 2. Install a compatible system JDK and configure ES_JAVA_HOME" | |
| error " 3. Roll back: ${PKG_MGR} -y downgrade elasticsearch-${ES_CURRENT}" | |
| fi | |
| if echo "$recent_logs" | grep -qi "unknown setting\|unsupported setting\|IllegalArgument"; then | |
| error "" | |
| error "=== CONFIGURATION ERROR DETECTED ===" | |
| error "Elasticsearch may have failed due to removed/deprecated settings." | |
| error "Check elasticsearch.yml for settings that were removed in ${TARGET_VERSION}." | |
| fi | |
| if echo "$recent_logs" | grep -qi "keystore\|KeyStoreException"; then | |
| error "" | |
| error "=== KEYSTORE ERROR DETECTED ===" | |
| error "Try running:" | |
| error " sudo -u ${ES_USER:-elasticsearch} /usr/share/elasticsearch/bin/elasticsearch-keystore upgrade" | |
| fi | |
| error "" | |
| error "To roll back: ${PKG_MGR} -y downgrade elasticsearch-${ES_CURRENT}" | |
| error "Config backup: ${BACKUP_DIR}" | |
| confirm_or_abort "Continue anyway (maybe it needs more time)?" | |
| else | |
| es_curl GET "/" | |
| version_body="$ES_CURL_BODY" | |
| new_version=$(echo "$version_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('version',{}).get('number','unknown'))" 2>/dev/null) || new_version="unknown" | |
| success "Elasticsearch is running. Version: ${BOLD}${new_version}${NC}" | |
| fi | |
| # --- Step 7: Wait for node to rejoin --- | |
| if $FORCE_MODE; then | |
| step "Step 7 - Wait for node to rejoin cluster" | |
| warn "FORCE MODE: Skipping cluster rejoin check." | |
| warn "Verify manually that the node has rejoined: GET /_cat/nodes" | |
| _allocation_disabled=false | |
| step "Step 7a - Re-enable shard allocation" | |
| warn "FORCE MODE: Skipping. Re-enable manually if needed:" | |
| warn " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}" | |
| step "Step 7b - Wait for cluster recovery" | |
| warn "FORCE MODE: Skipping. Monitor recovery manually: GET /_cluster/health?pretty" | |
| else | |
| step "Step 7 - Wait for node to rejoin cluster" | |
| info "Checking cluster membership..." | |
| es_curl GET "/_cluster/health?pretty" | |
| health_body="$ES_CURL_BODY" | |
| current_nodes=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('number_of_nodes',0))" 2>/dev/null) || current_nodes="?" | |
| info "Nodes in cluster: ${current_nodes} (was: ${num_nodes:-?})" | |
| success "Node has rejoined the cluster." | |
| # --- Step 7a: Re-enable shard allocation --- | |
| step "Step 7a - Re-enable shard allocation" | |
| info "Clearing both persistent and transient allocation overrides." | |
| echo "" | |
| confirm_or_abort "Re-enable shard allocation now?" | |
| if es_curl_retry PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":null},"transient":{"cluster.routing.allocation.enable":null}}'; then | |
| # Verify it actually took effect | |
| info "Verifying allocation setting was cleared..." | |
| es_curl GET "/_cluster/settings?flat_settings=true" | |
| verify_body="$ES_CURL_BODY" | |
| if echo "$verify_body" | grep -q '"cluster\.routing\.allocation\.enable"' 2>/dev/null; then | |
| error "Allocation setting is STILL present after clearing!" | |
| error "Fix manually:" | |
| error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}" | |
| confirm_or_abort "Continue anyway?" | |
| else | |
| _allocation_disabled=false | |
| success "Verified: shard allocation overrides cleared." | |
| fi | |
| else | |
| error "Failed to re-enable shard allocation (HTTP $ES_CURL_HTTP_CODE)." | |
| error "You MUST manually re-enable it:" | |
| error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}" | |
| fi | |
| # --- Step 7b: Wait for cluster recovery --- | |
| step "Step 7b - Wait for cluster recovery" | |
| info "Temporarily increasing concurrent incoming recoveries to 10..." | |
| if es_curl_quiet PUT "/_cluster/settings" '{"transient":{"cluster.routing.allocation.node_concurrent_incoming_recoveries":10}}'; then | |
| _recovery_boosted=true | |
| success "Recovery concurrency boosted." | |
| else | |
| warn "Could not increase recovery concurrency. Continuing with defaults." | |
| fi | |
| info "Monitoring cluster health until green — timeout $((RECOVERY_WAIT / 60)) minutes (Ctrl+C to stop waiting)..." | |
| echo "" | |
| retries=0 | |
| while [[ $retries -lt $RECOVERY_WAIT ]]; do | |
| es_curl GET "/_cluster/health" | |
| health_body="$ES_CURL_BODY" | |
| status=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('status','unknown'))" 2>/dev/null) || status="unknown" | |
| init=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('initializing_shards',0))" 2>/dev/null) || init="?" | |
| reloc=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('relocating_shards',0))" 2>/dev/null) || reloc="?" | |
| unass=$(echo "$health_body" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('unassigned_shards',0))" 2>/dev/null) || unass="?" | |
| recovery_info="" | |
| es_curl GET "/_cat/recovery?active_only=true&h=index,shard,stage,bytes_percent&format=json" | |
| recovery_body="$ES_CURL_BODY" | |
| if [[ "$ES_CURL_HTTP_CODE" == "200" ]]; then | |
| active_recoveries=$(echo "$recovery_body" | python3 -c "import json,sys; print(len(json.load(sys.stdin)))" 2>/dev/null) || active_recoveries=0 | |
| if [[ "$active_recoveries" -gt 0 ]]; then | |
| avg_pct=$(echo "$recovery_body" | python3 -c " | |
| import json, sys | |
| data = json.load(sys.stdin) | |
| pcts = [float(r.get('bytes_percent','0').rstrip('%')) for r in data if r.get('bytes_percent')] | |
| print(f'{sum(pcts)/len(pcts):.1f}' if pcts else '0') | |
| " 2>/dev/null) || avg_pct="?" | |
| recovery_info=" | Recoveries: ${active_recoveries} (avg ${avg_pct}%)" | |
| fi | |
| fi | |
| if [[ "$status" == "green" ]]; then | |
| echo "" | |
| success "Cluster is ${GREEN}${BOLD}GREEN${NC}!" | |
| break | |
| fi | |
| elapsed_min=$((retries / 60)) | |
| elapsed_sec=$((retries % 60)) | |
| printf "\r Status: %-6s | Init: %-3s | Reloc: %-3s | Unassigned: %-3s%s | %dm%02ds " \ | |
| "$status" "$init" "$reloc" "$unass" "$recovery_info" "$elapsed_min" "$elapsed_sec" | |
| retries=$((retries + 5)) | |
| sleep 5 | |
| done | |
| if $_recovery_boosted; then | |
| info "Resetting recovery concurrency to default..." | |
| if es_curl_quiet PUT "/_cluster/settings" '{"transient":{"cluster.routing.allocation.node_concurrent_incoming_recoveries":null}}'; then | |
| _recovery_boosted=false | |
| success "Recovery concurrency reset to default." | |
| else | |
| warn "Could not reset recovery concurrency. Reset manually:" | |
| warn " PUT /_cluster/settings {\"transient\":{\"cluster.routing.allocation.node_concurrent_incoming_recoveries\":null}}" | |
| fi | |
| fi | |
| if [[ $retries -ge $RECOVERY_WAIT ]]; then | |
| echo "" | |
| warn "Cluster did not reach GREEN within $((RECOVERY_WAIT / 60)) minutes." | |
| warn "Current status: ${status:-unknown}" | |
| warn "This may be normal if you are doing a rolling upgrade across multiple nodes." | |
| info "Monitor with: GET /_cluster/health?pretty" | |
| info "Active recoveries: GET /_cat/recovery?active_only=true&v" | |
| fi | |
| fi # end of: if ! $FORCE_MODE (steps 7-7b) | |
| success "Elasticsearch upgrade complete on this node." | |
| fi | |
| # ============================================================================== | |
| # Kibana Upgrade | |
| # ============================================================================== | |
| if $UPGRADE_KIBANA; then | |
| header "Upgrading Kibana: ${KIBANA_CURRENT} -> ${TARGET_VERSION}" | |
| # Kibana migrations require cluster.routing.allocation.enable to be "all" (or unset). | |
| if ! $FORCE_MODE; then | |
| step "Verifying shard allocation is enabled" | |
| info "Kibana migrations will fail if cluster.routing.allocation.enable is not 'all'." | |
| es_curl GET "/_cluster/settings?flat_settings=true&include_defaults=true" | |
| alloc_body="$ES_CURL_BODY" | |
| alloc_code="$ES_CURL_HTTP_CODE" | |
| if [[ "$alloc_code" == "200" ]]; then | |
| persistent_alloc=$(echo "$alloc_body" | python3 -c " | |
| import json, sys | |
| d = json.load(sys.stdin) | |
| v = d.get('persistent', {}).get('cluster.routing.allocation.enable', '') | |
| print(v) | |
| " 2>/dev/null) || persistent_alloc="" | |
| transient_alloc=$(echo "$alloc_body" | python3 -c " | |
| import json, sys | |
| d = json.load(sys.stdin) | |
| v = d.get('transient', {}).get('cluster.routing.allocation.enable', '') | |
| print(v) | |
| " 2>/dev/null) || transient_alloc="" | |
| alloc_blocked=false | |
| if [[ -n "$persistent_alloc" && "$persistent_alloc" != "all" ]]; then | |
| warn "Persistent cluster.routing.allocation.enable = '${persistent_alloc}' (must be 'all' or unset)" | |
| alloc_blocked=true | |
| fi | |
| if [[ -n "$transient_alloc" && "$transient_alloc" != "all" ]]; then | |
| warn "Transient cluster.routing.allocation.enable = '${transient_alloc}' (must be 'all' or unset)" | |
| alloc_blocked=true | |
| fi | |
| if $alloc_blocked; then | |
| warn "" | |
| warn "Kibana migrations WILL FAIL with these settings." | |
| info "Fixing: resetting allocation to default (all)..." | |
| if es_curl_retry PUT "/_cluster/settings" '{"persistent":{"cluster.routing.allocation.enable":null},"transient":{"cluster.routing.allocation.enable":null}}'; then | |
| # Verify | |
| es_curl GET "/_cluster/settings?flat_settings=true" | |
| verify_body="$ES_CURL_BODY" | |
| if echo "$verify_body" | grep -q '"cluster\.routing\.allocation\.enable"' 2>/dev/null; then | |
| error "Allocation setting is STILL present after clearing!" | |
| error "Fix manually before starting Kibana." | |
| confirm_or_abort "Continue anyway? (Kibana will likely fail to start)" | |
| else | |
| success "Verified: allocation overrides cleared. Kibana migrations can proceed." | |
| fi | |
| else | |
| error "Failed to reset allocation (HTTP $ES_CURL_HTTP_CODE)." | |
| error "Fix manually before starting Kibana:" | |
| error " PUT /_cluster/settings {\"persistent\":{\"cluster.routing.allocation.enable\":null}}" | |
| confirm_or_abort "Continue anyway? (Kibana will likely fail to start)" | |
| fi | |
| else | |
| success "Shard allocation is set to 'all'. Kibana migrations can proceed." | |
| fi | |
| else | |
| warn "Could not check allocation settings (HTTP $alloc_code). Kibana may fail if allocation is restricted." | |
| fi | |
| echo "" | |
| fi | |
| info "Using pre-downloaded RPM: ${DOWNLOAD_DIR}/${KIBANA_RPM}" | |
| echo "" | |
| # --- K2: Stop Kibana --- | |
| step "K2 - Stop Kibana service" | |
| if ! systemctl is-active kibana &>/dev/null; then | |
| info "Kibana service is not running. Skipping stop." | |
| else | |
| info "Command: systemctl stop kibana" | |
| echo "" | |
| confirm_or_abort "Stop Kibana now?" | |
| if ! timeout "$STOP_TIMEOUT" systemctl stop kibana; then | |
| error "Kibana did not stop within ${STOP_TIMEOUT} seconds." | |
| confirm_or_abort "Force kill the Kibana process?" | |
| systemctl kill -s SIGKILL kibana | |
| sleep 2 | |
| fi | |
| success "Kibana stopped." | |
| fi | |
| # --- K3: Install RPM --- | |
| step "K3 - Install Kibana RPM" | |
| info "Command: ${PKG_MGR} -y localinstall ${DOWNLOAD_DIR}/${KIBANA_RPM}" | |
| info "Note: Config files in /etc/kibana/ will NOT be overwritten." | |
| echo "" | |
| confirm_or_abort "Install the Kibana RPM now?" | |
| ${PKG_MGR} -y localinstall "${DOWNLOAD_DIR}/${KIBANA_RPM}" | |
| success "Kibana RPM installed." | |
| # --- K4: Reload systemd --- | |
| step "K4 - Reload systemd daemon" | |
| systemctl daemon-reload | |
| success "Systemd daemon reloaded." | |
| # --- K5: Start Kibana --- | |
| step "K5 - Start Kibana service" | |
| info "Command: systemctl start kibana" | |
| echo "" | |
| confirm_or_abort "Start Kibana now?" | |
| systemctl start kibana | |
| info "Kibana starting... waiting for it to become ready." | |
| info "Kibana runs migrations after an upgrade, which may take several minutes." | |
| # Detect Kibana listen address from config | |
| kibana_config="/etc/kibana/kibana.yml" | |
| kibana_host="localhost" | |
| kibana_port="5601" | |
| kibana_scheme="http" | |
| if [[ -f "$kibana_config" ]]; then | |
| cfg_host=$(grep -E '^\s*server\.host\s*:' "$kibana_config" 2>/dev/null | sed 's/^[^:]*:\s*//' | tr -d '"'"'" | xargs) || true | |
| if [[ -n "$cfg_host" && "$cfg_host" != "0.0.0.0" && "$cfg_host" != "::" ]]; then | |
| kibana_host="$cfg_host" | |
| fi | |
| cfg_port=$(grep -E '^\s*server\.port\s*:' "$kibana_config" 2>/dev/null | sed 's/^[^:]*:\s*//' | tr -d '"'"'" | xargs) || true | |
| if [[ -n "$cfg_port" ]]; then | |
| kibana_port="$cfg_port" | |
| fi | |
| cfg_ssl=$(grep -E '^\s*server\.ssl\.enabled\s*:' "$kibana_config" 2>/dev/null | sed 's/^[^:]*:\s*//' | tr -d '"'"'" | xargs) || true | |
| if [[ "$cfg_ssl" == "true" ]]; then | |
| kibana_scheme="https" | |
| fi | |
| fi | |
| KIBANA_URL="${kibana_scheme}://${kibana_host}:${kibana_port}" | |
| info "Kibana URL: ${KIBANA_URL}" | |
| declare -a kibana_curl_cmd=(curl -s --max-time 10 -o /dev/null -w '%{http_code}') | |
| if [[ "$kibana_scheme" == "https" ]]; then | |
| kibana_curl_cmd+=(-k) | |
| fi | |
| retries=0 | |
| last_code="000" | |
| while [[ $retries -lt $KIBANA_STARTUP_WAIT ]]; do | |
| last_code=$("${kibana_curl_cmd[@]}" "${KIBANA_URL}/api/status" 2>/dev/null) || last_code="000" | |
| if [[ "$last_code" == "200" ]]; then | |
| break | |
| fi | |
| retries=$((retries + 1)) | |
| if [[ "$last_code" == "503" ]]; then | |
| echo -ne "\r Kibana is running migrations... (${retries}s, HTTP ${last_code}) " | |
| elif [[ "$last_code" == "000" ]]; then | |
| echo -ne "\r Waiting for Kibana to start listening... (${retries}s) " | |
| else | |
| echo -ne "\r Waiting for Kibana... (${retries}s, HTTP ${last_code}) " | |
| fi | |
| sleep 1 | |
| done | |
| echo "" | |
| if [[ "$last_code" == "200" ]]; then | |
| success "Kibana is ready and accepting requests." | |
| elif [[ "$last_code" == "503" ]]; then | |
| warn "Kibana is still running migrations after ${KIBANA_STARTUP_WAIT}s (HTTP 503)." | |
| warn "This is normal for large deployments. It should become available shortly." | |
| warn "Monitor with: curl -s ${KIBANA_URL}/api/status | python3 -m json.tool" | |
| elif [[ "$last_code" == "000" ]]; then | |
| warn "Kibana did not start responding within ${KIBANA_STARTUP_WAIT}s." | |
| warn "Check logs: journalctl -u kibana -f" | |
| warn " tail -f /var/log/kibana/kibana.log" | |
| recent_logs=$(journalctl -u kibana --no-pager -n 30 2>/dev/null) || recent_logs="" | |
| if echo "$recent_logs" | grep -qi "ECONNREFUSED\|connect.*elasticsearch"; then | |
| warn "" | |
| warn "=== ELASTICSEARCH CONNECTION ISSUE ===" | |
| warn "Kibana cannot connect to Elasticsearch." | |
| warn "Verify Elasticsearch is running and reachable." | |
| fi | |
| else | |
| warn "Kibana returned unexpected status: HTTP ${last_code}" | |
| warn "Check logs: journalctl -u kibana -f" | |
| fi | |
| KIBANA_NEW=$(get_rpm_version kibana) | |
| success "Kibana upgrade complete. Version: ${BOLD}${KIBANA_NEW}${NC}" | |
| fi | |
| # ============================================================================== | |
| # Post-upgrade: Restore version locks | |
| # ============================================================================== | |
| if command -v "${PKG_MGR}" &>/dev/null && ${PKG_MGR} versionlock list &>/dev/null 2>&1; then | |
| if $versionlock_removed_es || $versionlock_removed_kibana; then | |
| header "Restoring Version Locks" | |
| if $versionlock_removed_es && $UPGRADE_ES; then | |
| info "Re-adding version lock for Elasticsearch" | |
| if ${PKG_MGR} versionlock add elasticsearch 2>/dev/null; then | |
| if ${PKG_MGR} versionlock list 2>/dev/null | grep -q "elasticsearch"; then | |
| success "Elasticsearch version lock restored." | |
| else | |
| warn "Versionlock add returned success but lock not found in list." | |
| warn "Add manually: ${PKG_MGR} versionlock add elasticsearch" | |
| fi | |
| else | |
| warn "Failed to restore Elasticsearch version lock." | |
| warn "Add manually: ${PKG_MGR} versionlock add elasticsearch" | |
| fi | |
| fi | |
| if $versionlock_removed_kibana && $UPGRADE_KIBANA; then | |
| info "Re-adding version lock for Kibana" | |
| if ${PKG_MGR} versionlock add kibana 2>/dev/null; then | |
| if ${PKG_MGR} versionlock list 2>/dev/null | grep -q "kibana"; then | |
| success "Kibana version lock restored." | |
| else | |
| warn "Versionlock add returned success but lock not found in list." | |
| warn "Add manually: ${PKG_MGR} versionlock add kibana" | |
| fi | |
| else | |
| warn "Failed to restore Kibana version lock." | |
| warn "Add manually: ${PKG_MGR} versionlock add kibana" | |
| fi | |
| fi | |
| fi | |
| fi | |
| # ============================================================================== | |
| # Cleanup | |
| # ============================================================================== | |
| # Clear trap since we finished successfully | |
| _allocation_disabled=false | |
| _recovery_boosted=false | |
| header "Upgrade Complete" | |
| echo -e " ${GREEN}OK${NC} Upgrade finished successfully on this node." | |
| echo "" | |
| $UPGRADE_ES && echo -e " Elasticsearch: ${ES_CURRENT} -> ${BOLD}${TARGET_VERSION}${NC}" | |
| $UPGRADE_KIBANA && echo -e " Kibana: ${KIBANA_CURRENT} -> ${BOLD}${TARGET_VERSION}${NC}" | |
| echo "" | |
| echo -e " Config backup: ${BOLD}${BACKUP_DIR}${NC}" | |
| echo "" | |
| if confirm "Clean up downloaded RPM files from ${DOWNLOAD_DIR}?" "y"; then | |
| rm -rf "${DOWNLOAD_DIR}" | |
| success "Cleaned up download directory." | |
| else | |
| info "RPM files kept in ${DOWNLOAD_DIR}" | |
| fi | |
| echo "" | |
| info "If this is a multi-node cluster, repeat this process on the next node." | |
| info "Upgrade order: non-master-eligible nodes first, then master-eligible nodes." | |
| echo "" | |
| success "Done!" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment