diff --git a/master_node_install.sh b/master_node_install.sh index 4d90256..3296bf5 100644 --- a/master_node_install.sh +++ b/master_node_install.sh @@ -4,33 +4,14 @@ set -Eeuo pipefail ######################################## # Arch Linux Rancher Management Cluster # Single-node RKE2 server + Rancher -# -# What this script does: -# - Disables swap -# - Installs required Arch packages -# - Ensures kernel modules/sysctl are set for Kubernetes -# - Configures NetworkManager to ignore CNI interfaces -# - Installs RKE2 server pinned to a Rancher-friendly 1.34 release -# - Waits for Kubernetes to become healthy -# - Installs cert-manager -# - Installs Rancher via Helm -# - Prints the Rancher URL and bootstrap password -# -# Optional environment variables: -# RKE2_VERSION=v1.34.5+rke2r1 -# RANCHER_HOSTNAME=rancher.example.com -# BOOTSTRAP_PASSWORD=changeme -# RKE2_TOKEN=my-shared-secret -# INSTALL_RANCHER=true ######################################## RKE2_VERSION="${RKE2_VERSION:-v1.34.5+rke2r1}" INSTALL_RANCHER="${INSTALL_RANCHER:-true}" -BOOTSTRAP_PASSWORD="${BOOTSTRAP_PASSWORD:-adminadminadmin}" -RANCHER_HOSTNAME="${RANCHER_HOSTNAME:-}" +BOOTSTRAP_PASSWORD="${BOOTSTRAP_PASSWORD:-V1P4F7uaqpAFHsVzLX6M}" +RANCHER_HOSTNAME="${RANCHER_HOSTNAME:-rancher.fortis-scientia.com}" RKE2_CONFIG_DIR="/etc/rancher/rke2" RKE2_CONFIG_FILE="${RKE2_CONFIG_DIR}/config.yaml" -RKE2_TOKEN_FILE="${RKE2_CONFIG_DIR}/server-token" KUBECONFIG_FILE="/etc/rancher/rke2/rke2.yaml" RANCHER_REPO_NAME="rancher-stable" RANCHER_REPO_URL="https://releases.rancher.com/server-charts/stable" @@ -56,14 +37,12 @@ die() { on_error() { local exit_code=$? local line_no=$1 - warn "Script failed on line ${line_no} with exit code ${exit_code}" warn "Useful diagnostics:" echo " sudo systemctl status rke2-server -l --no-pager" echo " sudo journalctl -u rke2-server -n 200 --no-pager" - echo " sudo tail -n 200 /var/lib/rancher/rke2/agent/logs/kubelet.log" - echo " sudo /var/lib/rancher/rke2/bin/crictl --runtime-endpoint unix:///run/k3s/containerd/containerd.sock ps -a" - echo " sudo /var/lib/rancher/rke2/bin/crictl --runtime-endpoint unix:///run/k3s/containerd/containerd.sock pods" + echo " sudo /var/lib/rancher/rke2/bin/kubectl --kubeconfig ${KUBECONFIG_FILE} get nodes -o wide" + echo " sudo /var/lib/rancher/rke2/bin/kubectl --kubeconfig ${KUBECONFIG_FILE} get pods -A" exit "${exit_code}" } trap 'on_error $LINENO' ERR @@ -72,35 +51,6 @@ require_root() { [[ "${EUID}" -eq 0 ]] || die "Run this script as root: sudo $0" } -wait_for_file() { - local file="$1" - local timeout="${2:-300}" - local waited=0 - - until [[ -f "${file}" ]]; do - sleep 2 - waited=$((waited + 2)) - if (( waited >= timeout )); then - die "Timed out waiting for file: ${file}" - fi - done -} - -wait_for_cmd() { - local cmd="$1" - local timeout="${2:-600}" - local interval="${3:-5}" - local waited=0 - - until eval "${cmd}" >/dev/null 2>&1; do - sleep "${interval}" - waited=$((waited + interval)) - if (( waited >= timeout )); then - die "Timed out waiting for command to succeed: ${cmd}" - fi - done -} - kubectl_rke2() { /var/lib/rancher/rke2/bin/kubectl --kubeconfig "${KUBECONFIG_FILE}" "$@" } @@ -111,9 +61,7 @@ helm_rke2() { disable_swap() { log "Disabling swap" - swapoff -a || true - if [[ -f /etc/fstab ]]; then cp /etc/fstab "/etc/fstab.bak.$(date +%Y%m%d%H%M%S)" sed -Ei '/^[^#].+\s+swap\s+/ s/^/# disabled-by-rancher-script /' /etc/fstab @@ -122,7 +70,6 @@ disable_swap() { install_packages() { log "Installing required Arch packages" - pacman -Sy --noconfirm archlinux-keyring if pacman -Q iptables >/dev/null 2>&1; then @@ -184,11 +131,17 @@ EOF systemctl restart NetworkManager fi + + if systemctl list-unit-files | grep -q '^nm-cloud-setup.service'; then + systemctl disable --now nm-cloud-setup.service || true + fi + if systemctl list-unit-files | grep -q '^nm-cloud-setup.timer'; then + systemctl disable --now nm-cloud-setup.timer || true + fi } enable_support_services() { log "Enabling support services" - systemctl enable --now iscsid.service || true systemctl enable --now nftables.service || true } @@ -198,19 +151,7 @@ install_rke2() { mkdir -p "${RKE2_CONFIG_DIR}" - if [[ -n "${RKE2_TOKEN:-}" ]]; then - printf '%s\n' "${RKE2_TOKEN}" > "${RKE2_TOKEN_FILE}" - chmod 600 "${RKE2_TOKEN_FILE}" - elif [[ ! -f "${RKE2_TOKEN_FILE}" ]]; then - openssl rand -hex 24 > "${RKE2_TOKEN_FILE}" - chmod 600 "${RKE2_TOKEN_FILE}" - fi - - local token - token="$(<"${RKE2_TOKEN_FILE}")" - cat >"${RKE2_CONFIG_FILE}" <= 600 )); then + journalctl -u rke2-server -n 200 --no-pager || true + die "Timed out waiting for ${KUBECONFIG_FILE}" + fi + done - wait_for_cmd "kubectl_rke2 get --raw=/readyz" 900 5 + waited=0 + until /var/lib/rancher/rke2/bin/kubectl --kubeconfig "${KUBECONFIG_FILE}" get --raw=/readyz >/dev/null 2>&1; do + sleep 5 + waited=$((waited + 5)) + if (( waited % 30 == 0 )); then + warn "Kubernetes API not ready yet; latest rke2-server logs:" + journalctl -u rke2-server -n 40 --no-pager || true + fi + if (( waited >= 900 )); then + journalctl -u rke2-server -n 200 --no-pager || true + die "Timed out waiting for Kubernetes API readiness" + fi + done - log "Waiting for local node to become Ready" - wait_for_cmd '[[ "$(kubectl_rke2 get node "$(hostname -s)" -o jsonpath="{.status.conditions[?(@.type==\"Ready\")].status}" 2>/dev/null || true)" == "True" ]]' 1200 5 + log "Waiting for any node to become Ready" + + waited=0 + until kubectl_rke2 get nodes -o json 2>/dev/null | jq -e ' + .items | length > 0 and any(.[]; any(.status.conditions[]?; .type=="Ready" and .status=="True")) + ' >/dev/null; do + sleep 5 + waited=$((waited + 5)) + if (( waited % 30 == 0 )); then + warn "No Ready node yet; current status:" + kubectl_rke2 get nodes -o wide || true + kubectl_rke2 get pods -A || true + journalctl -u rke2-server -n 60 --no-pager || true + fi + if (( waited >= 1200 )); then + kubectl_rke2 get nodes -o wide || true + kubectl_rke2 get pods -A || true + journalctl -u rke2-server -n 200 --no-pager || true + die "Timed out waiting for a Ready node" + fi + done log "Waiting for system pods to settle" - wait_for_cmd 'kubectl_rke2 -n kube-system get pods >/dev/null 2>&1' 300 5 + kubectl_rke2 get nodes -o wide || true + kubectl_rke2 get pods -A || true kubectl_rke2 -n kube-system wait --for=condition=Ready pods --all --timeout=20m || true } @@ -252,7 +232,6 @@ resolve_hostname() { local detected_ip="" detected_ip="$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i=1; i<=NF; i++) if ($i=="src") {print $(i+1); exit}}')" - [[ -n "${detected_ip}" ]] || die "Could not auto-detect server IP. Set RANCHER_HOSTNAME manually." RANCHER_HOSTNAME="${detected_ip}.sslip.io" @@ -270,7 +249,6 @@ install_cert_manager() { [[ -n "${cert_manager_app_version}" ]] || die "Could not determine cert-manager appVersion" kubectl_rke2 create namespace cert-manager --dry-run=client -o yaml | kubectl_rke2 apply -f - - kubectl_rke2 apply -f "https://github.com/cert-manager/cert-manager/releases/download/${cert_manager_app_version}/cert-manager.crds.yaml" helm_rke2 upgrade --install cert-manager jetstack/cert-manager \ @@ -285,10 +263,7 @@ install_cert_manager() { } install_rancher() { - if [[ "${INSTALL_RANCHER}" != "true" ]]; then - warn "INSTALL_RANCHER=false, skipping Rancher install" - return - fi + [[ "${INSTALL_RANCHER}" == "true" ]] || return resolve_hostname @@ -317,25 +292,16 @@ print_summary() { if [[ -f /var/lib/rancher/rke2/server/node-token ]]; then node_token="$(