#!/usr/bin/env bash set -Eeuo pipefail ######################################## # Arch Linux Rancher Management Cluster # Single-node RKE2 server + Rancher # # What this script does: # - Disables swap # - Installs required Arch packages # - Configures kernel modules and sysctl for Kubernetes # - Configures NetworkManager to ignore CNI interfaces # - Disables host nftables service to avoid breaking RKE2 service routing # - Installs RKE2 server # - Waits for Kubernetes and bundled RKE2 addons to become healthy # - Installs cert-manager # - Installs Rancher # # Optional environment variables: # RKE2_VERSION=v1.34.5+rke2r1 # RANCHER_HOSTNAME=rancher.example.com # BOOTSTRAP_PASSWORD=changeme # RKE2_TOKEN=my-shared-secret # INSTALL_RANCHER=true ######################################## RKE2_VERSION="${RKE2_VERSION:-v1.34.5+rke2r1}" INSTALL_RANCHER="${INSTALL_RANCHER:-true}" BOOTSTRAP_PASSWORD="${BOOTSTRAP_PASSWORD:-adminadminadmin}" RANCHER_HOSTNAME="${RANCHER_HOSTNAME:-}" RKE2_CONFIG_DIR="/etc/rancher/rke2" RKE2_CONFIG_FILE="${RKE2_CONFIG_DIR}/config.yaml" RKE2_TOKEN_FILE="${RKE2_CONFIG_DIR}/server-token" KUBECONFIG_FILE="/etc/rancher/rke2/rke2.yaml" RANCHER_REPO_NAME="rancher-stable" RANCHER_REPO_URL="https://releases.rancher.com/server-charts/stable" log() { echo echo "============================================================" echo "[INFO] $*" echo "============================================================" } warn() { echo echo "[WARN] $*" >&2 } die() { echo echo "[ERROR] $*" >&2 exit 1 } on_error() { local exit_code=$? local line_no=$1 warn "Script failed on line ${line_no} with exit code ${exit_code}" warn "Useful diagnostics:" echo " sudo systemctl status rke2-server -l --no-pager" echo " sudo journalctl -u rke2-server -n 200 --no-pager" echo " sudo /var/lib/rancher/rke2/bin/kubectl --kubeconfig ${KUBECONFIG_FILE} get nodes -o wide" echo " sudo /var/lib/rancher/rke2/bin/kubectl --kubeconfig ${KUBECONFIG_FILE} get pods -A" echo " sudo /var/lib/rancher/rke2/bin/kubectl --kubeconfig ${KUBECONFIG_FILE} -n kube-system get deploy,ds" exit "${exit_code}" } trap 'on_error $LINENO' ERR require_root() { [[ "${EUID}" -eq 0 ]] || die "Run this script as root: sudo $0" } kubectl_rke2() { /var/lib/rancher/rke2/bin/kubectl --kubeconfig "${KUBECONFIG_FILE}" "$@" } helm_rke2() { helm --kubeconfig "${KUBECONFIG_FILE}" "$@" } wait_for_file() { local file="$1" local timeout="${2:-300}" local waited=0 until [[ -f "${file}" ]]; do sleep 2 waited=$((waited + 2)) if (( waited >= timeout )); then die "Timed out waiting for file: ${file}" fi done } disable_swap() { log "Disabling swap" swapoff -a || true if [[ -f /etc/fstab ]]; then cp /etc/fstab "/etc/fstab.bak.$(date +%Y%m%d%H%M%S)" sed -Ei '/^[^#].+\s+swap\s+/ s/^/# disabled-by-rancher-script /' /etc/fstab fi } install_packages() { log "Installing required Arch packages" pacman -Sy --noconfirm archlinux-keyring if pacman -Q iptables >/dev/null 2>&1; then pacman -Rdd --noconfirm iptables || true fi pacman -Syu --noconfirm pacman -S --needed --noconfirm \ bash-completion \ ca-certificates \ cni-plugins \ conntrack-tools \ curl \ ethtool \ gzip \ helm \ iproute2 \ iptables-nft \ jq \ nfs-utils \ open-iscsi \ openssl \ socat \ tar \ unzip \ wget } configure_kernel() { log "Configuring kernel modules and sysctl" cat >/etc/modules-load.d/k8s.conf <<'EOF' overlay br_netfilter EOF modprobe overlay modprobe br_netfilter cat >/etc/sysctl.d/90-kubernetes.conf <<'EOF' net.bridge.bridge-nf-call-iptables = 1 net.bridge.bridge-nf-call-ip6tables = 1 net.ipv4.ip_forward = 1 EOF sysctl --system >/dev/null } configure_networkmanager() { if systemctl is-enabled NetworkManager >/dev/null 2>&1 || systemctl is-active NetworkManager >/dev/null 2>&1; then log "Configuring NetworkManager to ignore CNI interfaces" mkdir -p /etc/NetworkManager/conf.d cat >/etc/NetworkManager/conf.d/rke2-cni.conf <<'EOF' [keyfile] unmanaged-devices=interface-name:cali*;interface-name:flannel*;interface-name:cni*;interface-name:vxlan.calico;interface-name:kube-ipvs0;interface-name:nodelocaldns;interface-name:tunl* EOF systemctl restart NetworkManager fi if systemctl list-unit-files | grep -q '^nm-cloud-setup.service'; then systemctl disable --now nm-cloud-setup.service || true fi if systemctl list-unit-files | grep -q '^nm-cloud-setup.timer'; then systemctl disable --now nm-cloud-setup.timer || true fi } enable_support_services() { log "Enabling support services" systemctl enable --now iscsid.service || true # Do NOT enable nftables.service here. # On this Arch + RKE2 setup it can break service routing for cluster IPs. systemctl stop nftables.service >/dev/null 2>&1 || true systemctl disable nftables.service >/dev/null 2>&1 || true nft flush ruleset >/dev/null 2>&1 || true } install_rke2() { log "Installing RKE2 server ${RKE2_VERSION}" mkdir -p "${RKE2_CONFIG_DIR}" if [[ -n "${RKE2_TOKEN:-}" ]]; then printf '%s\n' "${RKE2_TOKEN}" > "${RKE2_TOKEN_FILE}" chmod 600 "${RKE2_TOKEN_FILE}" elif [[ ! -f "${RKE2_TOKEN_FILE}" ]]; then openssl rand -hex 24 > "${RKE2_TOKEN_FILE}" chmod 600 "${RKE2_TOKEN_FILE}" fi local token token="$(<"${RKE2_TOKEN_FILE}")" cat >"${RKE2_CONFIG_FILE}" </etc/profile.d/rke2-path.sh <<'EOF' export PATH=$PATH:/var/lib/rancher/rke2/bin:/usr/local/bin export KUBECONFIG=/etc/rancher/rke2/rke2.yaml EOF systemctl daemon-reload systemctl enable rke2-server.service systemctl restart rke2-server.service } wait_for_api() { log "Waiting for RKE2 and Kubernetes API" wait_for_file "${KUBECONFIG_FILE}" 600 wait_for_file "/var/lib/rancher/rke2/bin/kubectl" 600 local waited=0 until kubectl_rke2 get --raw=/readyz >/dev/null 2>&1; do sleep 5 waited=$((waited + 5)) if (( waited % 30 == 0 )); then warn "Kubernetes API not ready yet; recent rke2-server logs:" journalctl -u rke2-server -n 40 --no-pager || true fi if (( waited >= 900 )); then journalctl -u rke2-server -n 200 --no-pager || true die "Timed out waiting for Kubernetes API readiness" fi done } wait_for_ready_node() { log "Waiting for any node to become Ready" local waited=0 until kubectl_rke2 get nodes -o json 2>/dev/null | jq -e ' .items | length > 0 and any(.[]; any(.status.conditions[]?; .type=="Ready" and .status=="True")) ' >/dev/null; do sleep 5 waited=$((waited + 5)) if (( waited % 30 == 0 )); then warn "No Ready node yet; current status:" kubectl_rke2 get nodes -o wide || true kubectl_rke2 get pods -A || true fi if (( waited >= 1200 )); then kubectl_rke2 get nodes -o wide || true kubectl_rke2 get pods -A || true journalctl -u rke2-server -n 200 --no-pager || true die "Timed out waiting for a Ready node" fi done } wait_for_system_pods() { log "Waiting for core system pods" local waited=0 until kubectl_rke2 -n kube-system get pod -l k8s-app=kube-dns >/dev/null 2>&1; do sleep 5 waited=$((waited + 5)) if (( waited >= 600 )); then kubectl_rke2 get pods -A || true die "Timed out waiting for kube-system pods to appear" fi done kubectl_rke2 get nodes -o wide || true kubectl_rke2 get pods -A || true log "Waiting for Canal (CNI) DaemonSet to exist" waited=0 until kubectl_rke2 -n kube-system get daemonset rke2-canal >/dev/null 2>&1; do sleep 5 waited=$((waited + 5)) if (( waited % 30 == 0 )); then warn "rke2-canal DaemonSet not present yet" kubectl_rke2 -n kube-system get daemonsets || true kubectl_rke2 -n kube-system get pods -o wide || true fi if (( waited >= 900 )); then kubectl_rke2 -n kube-system get daemonsets || true kubectl_rke2 -n kube-system get pods -o wide || true die "Timed out waiting for rke2-canal DaemonSet to appear" fi done log "Waiting for Canal (CNI) to be fully rolled out" kubectl_rke2 -n kube-system rollout status daemonset/rke2-canal --timeout=20m sleep 20 log "Waiting for bundled RKE2 addon objects to exist" waited=0 until kubectl_rke2 -n kube-system get deployment rke2-coredns-rke2-coredns >/dev/null 2>&1 \ && kubectl_rke2 -n kube-system get deployment rke2-coredns-rke2-coredns-autoscaler >/dev/null 2>&1 \ && kubectl_rke2 -n kube-system get deployment rke2-metrics-server >/dev/null 2>&1 \ && kubectl_rke2 -n kube-system get deployment rke2-snapshot-controller >/dev/null 2>&1 \ && kubectl_rke2 -n kube-system get daemonset rke2-ingress-nginx-controller >/dev/null 2>&1; do sleep 5 waited=$((waited + 5)) if (( waited % 30 == 0 )); then warn "Bundled addon objects are not all present yet" kubectl_rke2 -n kube-system get deploy,ds || true kubectl_rke2 -n kube-system get pods -o wide || true fi if (( waited >= 900 )); then kubectl_rke2 -n kube-system get deploy,ds || true kubectl_rke2 -n kube-system get pods -o wide || true die "Timed out waiting for bundled RKE2 addon objects" fi done log "Waiting for bundled RKE2 addons to roll out" kubectl_rke2 -n kube-system rollout status deployment/rke2-coredns-rke2-coredns --timeout=20m kubectl_rke2 -n kube-system rollout status deployment/rke2-coredns-rke2-coredns-autoscaler --timeout=20m kubectl_rke2 -n kube-system rollout status deployment/rke2-metrics-server --timeout=20m kubectl_rke2 -n kube-system rollout status deployment/rke2-snapshot-controller --timeout=20m kubectl_rke2 -n kube-system rollout status daemonset/rke2-ingress-nginx-controller --timeout=20m log "System pods are settled" kubectl_rke2 get pods -A || true } resolve_hostname() { if [[ -n "${RANCHER_HOSTNAME}" ]]; then return fi local detected_ip="" detected_ip="$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i=1; i<=NF; i++) if ($i=="src") {print $(i+1); exit}}')" [[ -n "${detected_ip}" ]] || die "Could not auto-detect server IP. Set RANCHER_HOSTNAME manually." RANCHER_HOSTNAME="${detected_ip}.sslip.io" log "Auto-detected Rancher hostname: ${RANCHER_HOSTNAME}" } install_cert_manager() { log "Installing cert-manager" helm repo add jetstack https://charts.jetstack.io >/dev/null 2>&1 || true helm repo update >/dev/null local cert_manager_app_version="" cert_manager_app_version="$(helm show chart jetstack/cert-manager | awk '/^appVersion:/ {print $2; exit}')" [[ -n "${cert_manager_app_version}" ]] || die "Could not determine cert-manager appVersion" kubectl_rke2 create namespace cert-manager --dry-run=client -o yaml | kubectl_rke2 apply -f - kubectl_rke2 apply -f "https://github.com/cert-manager/cert-manager/releases/download/${cert_manager_app_version}/cert-manager.crds.yaml" helm_rke2 upgrade --install cert-manager jetstack/cert-manager \ --namespace cert-manager \ --create-namespace \ --wait \ --timeout 20m kubectl_rke2 -n cert-manager rollout status deploy/cert-manager --timeout=20m kubectl_rke2 -n cert-manager rollout status deploy/cert-manager-webhook --timeout=20m kubectl_rke2 -n cert-manager rollout status deploy/cert-manager-cainjector --timeout=20m } install_rancher() { if [[ "${INSTALL_RANCHER}" != "true" ]]; then warn "INSTALL_RANCHER=false, skipping Rancher install" return fi resolve_hostname log "Installing Rancher" helm repo add "${RANCHER_REPO_NAME}" "${RANCHER_REPO_URL}" >/dev/null 2>&1 || true helm repo update >/dev/null kubectl_rke2 create namespace cattle-system --dry-run=client -o yaml | kubectl_rke2 apply -f - helm_rke2 upgrade --install rancher "${RANCHER_REPO_NAME}/rancher" \ --namespace cattle-system \ --set hostname="${RANCHER_HOSTNAME}" \ --set bootstrapPassword="${BOOTSTRAP_PASSWORD}" \ --set replicas=1 \ --set ingress.tls.source=rancher \ --wait \ --timeout 30m kubectl_rke2 -n cattle-system rollout status deploy/rancher --timeout=30m } print_summary() { local node_token="" local bootstrap_secret_password="" if [[ -f /var/lib/rancher/rke2/server/node-token ]]; then node_token="$(