From fd5987e06ede8abad83dc8129cedb295f92a4b1c Mon Sep 17 00:00:00 2001 From: RomanNum3ral Date: Fri, 27 Mar 2026 22:09:38 +0000 Subject: [PATCH] Update master_node_install.sh --- master_node_install.sh | 163 +++++++++++++++++++++++++++++++++++------ 1 file changed, 141 insertions(+), 22 deletions(-) diff --git a/master_node_install.sh b/master_node_install.sh index 06bfd62..ef9e8a4 100644 --- a/master_node_install.sh +++ b/master_node_install.sh @@ -4,14 +4,32 @@ set -Eeuo pipefail ######################################## # Arch Linux Rancher Management Cluster # Single-node RKE2 server + Rancher +# +# What this script does: +# - Disables swap +# - Installs required Arch packages +# - Configures kernel modules and sysctl for Kubernetes +# - Configures NetworkManager to ignore CNI interfaces +# - Installs RKE2 server +# - Waits for Kubernetes and bundled RKE2 addons to become healthy +# - Installs cert-manager +# - Installs Rancher +# +# Optional environment variables: +# RKE2_VERSION=v1.34.5+rke2r1 +# RANCHER_HOSTNAME=rancher.example.com +# BOOTSTRAP_PASSWORD=changeme +# RKE2_TOKEN=my-shared-secret +# INSTALL_RANCHER=true ######################################## RKE2_VERSION="${RKE2_VERSION:-v1.34.5+rke2r1}" INSTALL_RANCHER="${INSTALL_RANCHER:-true}" -BOOTSTRAP_PASSWORD="${BOOTSTRAP_PASSWORD:-admin}" +BOOTSTRAP_PASSWORD="${BOOTSTRAP_PASSWORD:-adminadminadmin}" RANCHER_HOSTNAME="${RANCHER_HOSTNAME:-}" RKE2_CONFIG_DIR="/etc/rancher/rke2" RKE2_CONFIG_FILE="${RKE2_CONFIG_DIR}/config.yaml" +RKE2_TOKEN_FILE="${RKE2_CONFIG_DIR}/server-token" KUBECONFIG_FILE="/etc/rancher/rke2/rke2.yaml" RANCHER_REPO_NAME="rancher-stable" RANCHER_REPO_URL="https://releases.rancher.com/server-charts/stable" @@ -37,6 +55,7 @@ die() { on_error() { local exit_code=$? local line_no=$1 + warn "Script failed on line ${line_no} with exit code ${exit_code}" warn "Useful diagnostics:" echo " sudo systemctl status rke2-server -l --no-pager" @@ -59,9 +78,25 @@ helm_rke2() { helm --kubeconfig "${KUBECONFIG_FILE}" "$@" } +wait_for_file() { + local file="$1" + local timeout="${2:-300}" + local waited=0 + + until [[ -f "${file}" ]]; do + sleep 2 + waited=$((waited + 2)) + if (( waited >= timeout )); then + die "Timed out waiting for file: ${file}" + fi + done +} + disable_swap() { log "Disabling swap" + swapoff -a || true + if [[ -f /etc/fstab ]]; then cp /etc/fstab "/etc/fstab.bak.$(date +%Y%m%d%H%M%S)" sed -Ei '/^[^#].+\s+swap\s+/ s/^/# disabled-by-rancher-script /' /etc/fstab @@ -70,6 +105,7 @@ disable_swap() { install_packages() { log "Installing required Arch packages" + pacman -Sy --noconfirm archlinux-keyring if pacman -Q iptables >/dev/null 2>&1; then @@ -89,7 +125,6 @@ install_packages() { iproute2 \ iptables-nft \ jq \ - nftables \ nfs-utils \ open-iscsi \ openssl \ @@ -142,8 +177,14 @@ EOF enable_support_services() { log "Enabling support services" + systemctl enable --now iscsid.service || true - systemctl enable --now nftables.service || true + + # Do NOT enable nftables.service here. + # On this Arch + RKE2 setup it broke service routing for the cluster IP range. + systemctl stop nftables.service >/dev/null 2>&1 || true + systemctl disable nftables.service >/dev/null 2>&1 || true + nft flush ruleset >/dev/null 2>&1 || true } install_rke2() { @@ -151,7 +192,19 @@ install_rke2() { mkdir -p "${RKE2_CONFIG_DIR}" + if [[ -n "${RKE2_TOKEN:-}" ]]; then + printf '%s\n' "${RKE2_TOKEN}" > "${RKE2_TOKEN_FILE}" + chmod 600 "${RKE2_TOKEN_FILE}" + elif [[ ! -f "${RKE2_TOKEN_FILE}" ]]; then + openssl rand -hex 24 > "${RKE2_TOKEN_FILE}" + chmod 600 "${RKE2_TOKEN_FILE}" + fi + + local token + token="$(<"${RKE2_TOKEN_FILE}")" + cat >"${RKE2_CONFIG_FILE}" <= 600 )); then - journalctl -u rke2-server -n 200 --no-pager || true - die "Timed out waiting for ${KUBECONFIG_FILE}" - fi - done + wait_for_file "${KUBECONFIG_FILE}" 600 + wait_for_file "/var/lib/rancher/rke2/bin/kubectl" 600 - waited=0 - until /var/lib/rancher/rke2/bin/kubectl --kubeconfig "${KUBECONFIG_FILE}" get --raw=/readyz >/dev/null 2>&1; do + local waited=0 + until kubectl_rke2 get --raw=/readyz >/dev/null 2>&1; do sleep 5 waited=$((waited + 5)) + if (( waited % 30 == 0 )); then - warn "Kubernetes API not ready yet; latest rke2-server logs:" + warn "Kubernetes API not ready yet; recent rke2-server logs:" journalctl -u rke2-server -n 40 --no-pager || true fi + if (( waited >= 900 )); then journalctl -u rke2-server -n 200 --no-pager || true die "Timed out waiting for Kubernetes API readiness" fi done +} +wait_for_ready_node() { log "Waiting for any node to become Ready" - waited=0 + local waited=0 until kubectl_rke2 get nodes -o json 2>/dev/null | jq -e ' .items | length > 0 and any(.[]; any(.status.conditions[]?; .type=="Ready" and .status=="True")) ' >/dev/null; do sleep 5 waited=$((waited + 5)) + if (( waited % 30 == 0 )); then warn "No Ready node yet; current status:" kubectl_rke2 get nodes -o wide || true kubectl_rke2 get pods -A || true - journalctl -u rke2-server -n 60 --no-pager || true fi + if (( waited >= 1200 )); then kubectl_rke2 get nodes -o wide || true kubectl_rke2 get pods -A || true @@ -218,11 +269,64 @@ wait_for_rke2() { die "Timed out waiting for a Ready node" fi done +} + +wait_for_system_pods() { + log "Waiting for core system pods" + + local waited=0 + until kubectl_rke2 -n kube-system get pod -l k8s-app=kube-dns >/dev/null 2>&1; do + sleep 5 + waited=$((waited + 5)) + if (( waited >= 600 )); then + kubectl_rke2 get pods -A || true + die "Timed out waiting for kube-system pods to appear" + fi + done - log "Waiting for system pods to settle" kubectl_rke2 get nodes -o wide || true kubectl_rke2 get pods -A || true - kubectl_rke2 -n kube-system wait --for=condition=Ready pods --all --timeout=20m || true + + # Wait for Canal first because service routing depends on it. + waited=0 + until kubectl_rke2 -n kube-system get pods -l app=rke2-canal -o json 2>/dev/null | jq -e ' + .items | length > 0 and all(.[]; .status.phase=="Running") + ' >/dev/null; do + sleep 5 + waited=$((waited + 5)) + if (( waited % 30 == 0 )); then + warn "Canal not fully ready yet" + kubectl_rke2 -n kube-system get pods -o wide || true + fi + if (( waited >= 900 )); then + kubectl_rke2 -n kube-system get pods -o wide || true + die "Timed out waiting for Canal" + fi + done + + # Give kube-proxy and service routing a moment to settle. + sleep 20 + + # Wait for bundled addons that Rancher depends on. + waited=0 + until kubectl_rke2 -n kube-system get deploy rke2-ingress-nginx-controller rke2-metrics-server rke2-snapshot-controller >/dev/null 2>&1; do + sleep 5 + waited=$((waited + 5)) + if (( waited >= 900 )); then + kubectl_rke2 -n kube-system get pods -o wide || true + die "Timed out waiting for bundled RKE2 addon deployments" + fi + done + + kubectl_rke2 -n kube-system rollout status deploy/rke2-ingress-nginx-controller --timeout=20m + kubectl_rke2 -n kube-system rollout status deploy/rke2-metrics-server --timeout=20m + kubectl_rke2 -n kube-system rollout status deploy/rke2-snapshot-controller --timeout=20m + + # CoreDNS can be slightly slower; wait for it too. + kubectl_rke2 -n kube-system rollout status deploy/rke2-coredns-rke2-coredns --timeout=20m || true + + log "System pods are settled" + kubectl_rke2 get pods -A || true } resolve_hostname() { @@ -232,6 +336,7 @@ resolve_hostname() { local detected_ip="" detected_ip="$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i=1; i<=NF; i++) if ($i=="src") {print $(i+1); exit}}')" + [[ -n "${detected_ip}" ]] || die "Could not auto-detect server IP. Set RANCHER_HOSTNAME manually." RANCHER_HOSTNAME="${detected_ip}.sslip.io" @@ -263,7 +368,10 @@ install_cert_manager() { } install_rancher() { - [[ "${INSTALL_RANCHER}" == "true" ]] || return + if [[ "${INSTALL_RANCHER}" != "true" ]]; then + warn "INSTALL_RANCHER=false, skipping Rancher install" + return + fi resolve_hostname @@ -292,16 +400,25 @@ print_summary() { if [[ -f /var/lib/rancher/rke2/server/node-token ]]; then node_token="$(