#!/usr/bin/env bash
# Strict mode: abort on errors (-e), on unset variables (-u) and on failures
# anywhere in a pipeline (pipefail); -E lets ERR traps fire inside functions.
set -Eeuo pipefail

########################################
# Arch Linux Rancher Management Cluster
#   RKE2 + Rancher + optional Longhorn
#   + fixed RKE2 config generation
#   + cloudflared-friendly Rancher hostname handling
#   + bootstrap cleanup on failed first start
########################################

# ---------- Config ----------
# Each setting keeps a value already present in the environment and only
# falls back to the default shown here when it is unset or empty.

# RKE2
: "${RKE2_CHANNEL:=stable}"
: "${RKE2_VERSION:=v1.34.5+rke2r1}"
: "${RKE2_TOKEN:=}"
: "${CLUSTER_CIDR:=192.168.0.0/16}"
: "${SERVICE_CIDR:=10.43.0.0/16}"
: "${CLUSTER_DNS:=10.43.0.10}"
: "${CNI_PLUGIN:=canal}" # canal | calico | cilium | flannel
: "${DISABLE_RKE2_INGRESS:=true}"
: "${ALLOW_SCHEDULING_ON_SERVER:=true}"
: "${RESET_FAILED_BOOTSTRAP:=true}"

# Rancher
: "${INSTALL_RANCHER:=true}"
: "${RANCHER_HOSTNAME:=}" # REQUIRED, e.g. rancher.example.com
: "${RANCHER_NAMESPACE:=cattle-system}"
: "${RANCHER_REPO_CHANNEL:=stable}" # stable | latest | alpha
: "${RANCHER_CHART_VERSION:=2.13.4}"
: "${RANCHER_BOOTSTRAP_PASSWORD:=}"
: "${RANCHER_REPLICAS:=1}"
: "${RANCHER_TLS_SOURCE:=rancher}" # rancher | letsEncrypt | secret
: "${RANCHER_PRIVATE_CA:=false}"
: "${LETSENCRYPT_EMAIL:=}"

# ingress-nginx
: "${INSTALL_INGRESS_NGINX:=true}"
: "${INGRESS_NAMESPACE:=ingress-nginx}"
: "${INGRESS_CLASS_NAME:=nginx}"

# cert-manager
: "${INSTALL_CERT_MANAGER:=true}"
: "${CERT_MANAGER_NAMESPACE:=cert-manager}"
: "${CERT_MANAGER_CHART_VERSION:=v1.18.3}"

# Longhorn
: "${INSTALL_LONGHORN:=false}"
: "${LONGHORN_NAMESPACE:=longhorn-system}"
LONGHORN_CHART_VERSION="${LONGHORN_CHART_VERSION:-1.11.0}" LONGHORN_DEFAULT_REPLICA_COUNT="${LONGHORN_DEFAULT_REPLICA_COUNT:-1}" # Helm INSTALL_HELM="${INSTALL_HELM:-true}" HELM_VERSION="${HELM_VERSION:-v3.18.4}" # cloudflared helper file only; does not install cloudflared WRITE_CLOUDFLARED_EXAMPLE="${WRITE_CLOUDFLARED_EXAMPLE:-true}" CLOUDFLARED_SERVICE_TARGET="${CLOUDFLARED_SERVICE_TARGET:-https://127.0.0.1}" # User detection REAL_USER="${SUDO_USER:-root}" REAL_HOME="$(getent passwd "${REAL_USER}" | cut -d: -f6 || true)" REAL_HOME="${REAL_HOME:-/root}" REAL_KUBECONFIG_DIR="${REAL_HOME}/.kube" # ---------- Logging ---------- log() { echo echo "============================================================" echo "[INFO] $*" echo "============================================================" } warn() { echo echo "[WARN] $*" >&2 } die() { echo echo "[ERROR] $*" >&2 exit 1 } # ---------- Helpers ---------- require_root() { [[ "${EUID}" -eq 0 ]] || die "Run as root: sudo ./master_node_install.sh" } require_cmd() { command -v "$1" >/dev/null 2>&1 || die "Required command not found: $1" } retry() { local attempts="$1" local sleep_seconds="$2" shift 2 local n=1 until "$@"; do if (( n >= attempts )); then return 1 fi warn "Command failed (attempt ${n}/${attempts}): $*" sleep "${sleep_seconds}" ((n++)) done } helm_repo_add_force() { local name="$1" local url="$2" if helm repo list 2>/dev/null | awk 'NR>1 {print $1}' | grep -qx "${name}"; then helm repo add "${name}" "${url}" --force-update >/dev/null else helm repo add "${name}" "${url}" >/dev/null fi } kubectl_ns_apply() { local ns="$1" "${KUBECTL_BIN}" create namespace "${ns}" --dry-run=client -o yaml | "${KUBECTL_BIN}" apply -f - } write_file_if_changed() { local path="$1" local tmp tmp="$(mktemp)" cat > "${tmp}" if [[ -f "${path}" ]] && cmp -s "${tmp}" "${path}"; then rm -f "${tmp}" return 0 fi install -D -m 0644 "${tmp}" "${path}" rm -f "${tmp}" } # ---------- Tool paths ---------- 
RKE2_BIN_DIR="/var/lib/rancher/rke2/bin"
KUBECTL_BIN="${RKE2_BIN_DIR}/kubectl"
CRICTL_BIN="${RKE2_BIN_DIR}/crictl"
KUBECONFIG_SYSTEM="/etc/rancher/rke2/rke2.yaml"
RKE2_CONFIG="/etc/rancher/rke2/config.yaml"
RKE2_SERVER_STATE_DIR="/var/lib/rancher/rke2/server"
RKE2_AGENT_LOG_DIR="/var/lib/rancher/rke2/agent/logs"
CRICTL_RUNTIME_ENDPOINT="unix:///run/k3s/containerd/containerd.sock"
CRICTL_IMAGE_ENDPOINT="unix:///run/k3s/containerd/containerd.sock"

# ---------- Validation ----------

#######################################
# Check required settings and generate any secrets that were not supplied.
# Globals:   RKE2_TOKEN, RANCHER_BOOTSTRAP_PASSWORD (written when empty);
#            INSTALL_RANCHER, RANCHER_HOSTNAME, RANCHER_TLS_SOURCE,
#            LETSENCRYPT_EMAIL, RANCHER_REPO_CHANNEL (read)
# Returns:   exits via die on an invalid or missing setting
#######################################
validate_inputs() {
  if [[ "${INSTALL_RANCHER}" == "true" && -z "${RANCHER_HOSTNAME}" ]]; then
    die "RANCHER_HOSTNAME must be set, for example: export RANCHER_HOSTNAME=rancher.example.com"
  fi

  if [[ -z "${RKE2_TOKEN}" ]]; then
    RKE2_TOKEN="$(openssl rand -hex 32)"
  fi

  if [[ "${INSTALL_RANCHER}" == "true" && -z "${RANCHER_BOOTSTRAP_PASSWORD}" ]]; then
    # base64 output with '/' and '+' mapped to letters, trimmed to 20 chars.
    RANCHER_BOOTSTRAP_PASSWORD="$(openssl rand -base64 24 | tr -d '\n' | tr '/+' 'AB' | cut -c1-20)"
  fi

  if [[ "${RANCHER_TLS_SOURCE}" == "letsEncrypt" && -z "${LETSENCRYPT_EMAIL}" ]]; then
    die "LETSENCRYPT_EMAIL must be set when RANCHER_TLS_SOURCE=letsEncrypt"
  fi

  case "${RANCHER_REPO_CHANNEL}" in
    stable|latest|alpha) ;;
    *) die "Invalid RANCHER_REPO_CHANNEL=${RANCHER_REPO_CHANNEL}. Use stable, latest, or alpha." ;;
  esac

  # Reject a mistyped TLS source up front instead of handing it to the chart.
  if [[ "${INSTALL_RANCHER}" == "true" ]]; then
    case "${RANCHER_TLS_SOURCE}" in
      rancher|letsEncrypt|secret) ;;
      *) die "Invalid RANCHER_TLS_SOURCE=${RANCHER_TLS_SOURCE}. Use rancher, letsEncrypt, or secret." ;;
    esac
  fi
}

# ---------- Error trap ----------

#######################################
# ERR trap handler: report the failing line and exit status, list commands
# useful for diagnosing RKE2, then exit with the original status.
# Arguments: $1 - line number supplied by the trap
#######################################
on_error() {
  local exit_code=$?
  warn "Script failed on line $1 with exit code ${exit_code}"
  warn "Useful diagnostics:"
  echo " sudo systemctl status rke2-server -l --no-pager"
  echo " sudo journalctl -u rke2-server -n 200 --no-pager"
  echo " sudo tail -n 200 ${RKE2_AGENT_LOG_DIR}/kubelet.log"
  echo " sudo ${CRICTL_BIN} --runtime-endpoint ${CRICTL_RUNTIME_ENDPOINT} ps -a"
  echo " sudo ${CRICTL_BIN} --runtime-endpoint ${CRICTL_RUNTIME_ENDPOINT} pods"
  exit "${exit_code}"
}
trap 'on_error $LINENO' ERR

# ---------- Step 1: Install base packages ----------

#######################################
# Install the packages this script relies on and start iscsid (best effort).
#######################################
install_base_packages() {
  log "Installing required Arch packages"
  # NOTE(review): `-Sy` refreshes the package database without upgrading the
  # system, which Arch documents as an unsupported partial upgrade. Switching
  # to `-Syu` may upgrade the running kernel, after which the modprobe calls
  # in configure_kernel_networking can fail until reboot — decide deliberately.
  pacman -Sy --noconfirm --needed \
    ca-certificates \
    curl \
    tar \
    gzip \
    jq \
    openssl \
    unzip \
    wget \
    iptables-nft \
    nftables \
    conntrack-tools \
    socat \
    ethtool \
    iproute2 \
    bash-completion \
    open-iscsi \
    nfs-utils \
    cni-plugins

  systemctl enable --now iscsid || true
}

# ---------- Step 2: Disable swap ----------

#######################################
# Turn swap off now and comment out active swap entries in /etc/fstab.
# Idempotent: lines that are already comments are left alone, and a backup of
# fstab is taken only when there is an active entry to disable.
#######################################
disable_swap() {
  log "Disabling swap"
  swapoff -a || true

  [[ -f /etc/fstab ]] || return 0

  if awk '!/^[ \t]*#/ && /[ \t]swap[ \t]/ { found = 1 } END { exit !found }' /etc/fstab; then
    cp /etc/fstab "/etc/fstab.bak.$(date +%Y%m%d%H%M%S)"
    sed -ri '/^\s*#/!{/\sswap\s/s/^/# DISABLED FOR KUBERNETES: /}' /etc/fstab
  fi
}

# ---------- Step 3: Kernel modules and sysctl ----------

#######################################
# Persist and load the kernel modules and sysctl settings written below.
#######################################
configure_kernel_networking() {
  log "Configuring kernel modules and sysctl"

  write_file_if_changed /etc/modules-load.d/rke2.conf <<'EOF'
overlay
br_netfilter
nf_conntrack
EOF

  modprobe overlay
  modprobe br_netfilter
  modprobe nf_conntrack || true

  write_file_if_changed /etc/sysctl.d/90-rke2.conf <<'EOF'
net.ipv4.ip_forward = 1
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
fs.inotify.max_user_instances = 8192
fs.inotify.max_user_watches = 1048576
vm.max_map_count = 262144
EOF

  sysctl --system
}

# ---------- Step 4: firewalld ----------

#######################################
# Disable firewalld when the unit exists and is enabled or active.
#######################################
handle_firewall() {
  log "Checking for firewalld"
  if systemctl list-unit-files 2>/dev/null | grep -q '^firewalld\.service'; then
    if systemctl is-enabled firewalld >/dev/null 2>&1 \
      || systemctl is-active firewalld >/dev/null 2>&1; then
      warn "firewalld is active or enabled; disabling it for RKE2 compatibility"
      systemctl disable --now firewalld || true
    fi
  fi
}

# ---------- Step 5: Install Helm ----------

#######################################
# Download the Helm release tarball for this architecture and install the
# binary to /usr/local/bin/helm. Skipped unless INSTALL_HELM=true.
# The scratch directory is removed on success and on failure.
#######################################
install_helm() {
  [[ "${INSTALL_HELM}" == "true" ]] || return 0

  log "Installing Helm ${HELM_VERSION}"

  local tmpdir arch
  arch="$(uname -m)"
  case "${arch}" in
    x86_64) arch="amd64" ;;
    aarch64) arch="arm64" ;;
    *) die "Unsupported architecture for Helm: ${arch}" ;;
  esac

  # Created only after the architecture check so nothing is left behind
  # when the architecture is unsupported.
  tmpdir="$(mktemp -d)"

  if ! {
    curl -fsSL -o "${tmpdir}/helm.tar.gz" \
      "https://get.helm.sh/helm-${HELM_VERSION}-linux-${arch}.tar.gz" \
      && tar -xzf "${tmpdir}/helm.tar.gz" -C "${tmpdir}" \
      && install -m 0755 "${tmpdir}/linux-${arch}/helm" /usr/local/bin/helm
  }; then
    rm -rf -- "${tmpdir}"
    die "Failed to download or install Helm ${HELM_VERSION}"
  fi

  rm -rf -- "${tmpdir}"
  helm version
}

# ---------- Step 6: Install RKE2 ----------

#######################################
# Run the get.rke2.io installer with channel, method and version exported
# through the INSTALL_RKE2_* variables.
#######################################
install_rke2() {
  log "Installing RKE2 ${RKE2_VERSION}"
  export INSTALL_RKE2_CHANNEL="${RKE2_CHANNEL}"
  export INSTALL_RKE2_METHOD="tar"
  export INSTALL_RKE2_VERSION="${RKE2_VERSION}"
  curl -sfL https://get.rke2.io | sh -
}

# ---------- Step 7: Detect node info ----------

#######################################
# Print the node name: the static hostname, else `uname -n`.
# Outputs:   node name on stdout
#######################################
detect_node_name() {
  local name=""
  name="$(hostnamectl --static 2>/dev/null || true)"
  [[ -n "${name}" ]] || name="$(uname -n)"
  [[ -n "${name}" ]] || die "Failed to determine node name"
  printf '%s\n' "${name}"
}

#######################################
# Print the node's IPv4 address: the source address of the route towards
# 1.1.1.1, else the first address reported by `hostname -I`.
# Outputs:   IP address on stdout
#######################################
detect_node_ip() {
  local ip=""
  ip="$(ip -4 route get 1.1.1.1 2>/dev/null | awk '{for(i=1;i<=NF;i++) if ($i=="src") {print $(i+1); exit}}')"
  [[ -n "${ip}" ]] || ip="$(hostname -I 2>/dev/null | awk '{print $1}')"
  [[ -n "${ip}" ]] || die "Failed to determine node IP"
  printf '%s\n' "${ip}"
}

# ---------- Step 8: Configure RKE2 ----------

#######################################
# Generate the RKE2 server configuration file.
# Globals:   RKE2_CONFIG, RKE2_TOKEN, CLUSTER_CIDR, SERVICE_CIDR, CLUSTER_DNS,
#            CNI_PLUGIN, DISABLE_RKE2_INGRESS (read)
# Notes:     the file holds the cluster token, so it is created with mode 0600
#            before any content is written.
#######################################
configure_rke2() {
  log "Writing ${RKE2_CONFIG}"
  mkdir -p /etc/rancher/rke2

  local node_ip node_name
  node_ip="$(detect_node_ip)"
  node_name="$(detect_node_name)"

  # Pre-create (or reset) the file as 0600; the redirection below truncates
  # it in place and therefore keeps that mode.
  install -m 0600 /dev/null "${RKE2_CONFIG}"

  {
    echo "token: ${RKE2_TOKEN}"
    echo 'write-kubeconfig-mode: "0644"'
    echo "node-name: ${node_name}"
    echo "tls-san:"
    echo " - ${node_ip}"
    echo " - 127.0.0.1"
    echo "cluster-cidr: ${CLUSTER_CIDR}"
    echo "service-cidr: ${SERVICE_CIDR}"
    echo "cluster-dns: ${CLUSTER_DNS}"
    echo "cni: ${CNI_PLUGIN}"
    echo "etcd-expose-metrics: true"
    if [[ "${DISABLE_RKE2_INGRESS}" == "true" ]]; then
      echo "disable:"
      echo " - rke2-ingress-nginx"
    fi
  } >"${RKE2_CONFIG}"
}

# ---------- Step 9: Write crictl config ----------

# NOTE(review): the text of this function appears truncated. The here-doc that
# should populate /etc/crictl.yaml is missing, and what follows the redirection
# looks like the tail of an API-readiness wait. The definitions of
# reset_failed_rke2_bootstrap and start_rke2 (both invoked in the step list
# near the end of this file) are not present either. The statement is left
# exactly as found; restore the missing text from the original script before
# running it.
configure_crictl() {
  log "Writing /etc/crictl.yaml"
  cat >/etc/crictl.yaml </dev/null 2>&1 || {
    journalctl -u rke2-server --no-pager -n 200 || true
    [[ -f "${RKE2_AGENT_LOG_DIR}/kubelet.log" ]] && tail -n 200 "${RKE2_AGENT_LOG_DIR}/kubelet.log" || true
    die "Kubernetes API did not become ready"
  }
}

# ---------- Step 12: Configure kubeconfig ----------

#######################################
# Copy the RKE2 kubeconfig to root's ~/.kube/config and to the detected
# user's ~/.kube/config, each with mode 600.
# Globals:   KUBECONFIG_SYSTEM, REAL_USER, REAL_HOME, REAL_KUBECONFIG_DIR (read)
#######################################
configure_kubeconfig() {
  log "Configuring kubeconfig for root and user"

  mkdir -p /root/.kube
  cp -f "${KUBECONFIG_SYSTEM}" /root/.kube/config
  chmod 600 /root/.kube/config

  if [[ -n "${REAL_HOME}" && -d "${REAL_HOME}" ]]; then
    mkdir -p "${REAL_KUBECONFIG_DIR}"
    cp -f "${KUBECONFIG_SYSTEM}" "${REAL_KUBECONFIG_DIR}/config"
    # "user:" with an empty group selects the user's login group, so this
    # works even when no group is named after the user.
    chown -R "${REAL_USER}:" "${REAL_KUBECONFIG_DIR}"
    chmod 600 "${REAL_KUBECONFIG_DIR}/config"
  fi
}

# ---------- Step 13: Allow scheduling on server ----------

#######################################
# Remove the control-plane/master NoSchedule taints from all nodes when
# ALLOW_SCHEDULING_ON_SERVER=true. Failures are ignored.
#######################################
allow_server_scheduling() {
  if [[ "${ALLOW_SCHEDULING_ON_SERVER}" == "true" ]]; then
    log "Removing control-plane taints for single-node scheduling"
    "${KUBECTL_BIN}" taint nodes --all node-role.kubernetes.io/control-plane- || true
    "${KUBECTL_BIN}" taint nodes --all node-role.kubernetes.io/master- || true
  fi
}

# ---------- Step 14: Wait for core system ----------

#######################################
# Wait until `kubectl get nodes` and `kubectl -n kube-system get pods`
# succeed (up to 90 attempts, 5 s apart, each), then print nodes and pods.
#######################################
wait_for_core_system() {
  log "Waiting for core system pods"

  retry 90 5 "${KUBECTL_BIN}" get nodes >/dev/null 2>&1
  retry 90 5 "${KUBECTL_BIN}" -n kube-system get pods >/dev/null 2>&1

  "${KUBECTL_BIN}" get nodes -o wide
  echo
  "${KUBECTL_BIN}" get pods -A
}

# ---------- Step 15: Helm repos ----------

#######################################
# Register the chart repositories and select the Rancher chart.
# Globals:   RANCHER_REPO_CHANNEL, INSTALL_HELM (read); RANCHER_CHART (written)
# Notes:     RANCHER_CHART is always set so install_rancher never reads an
#            unset variable under `set -u`. INSTALL_HELM only controls whether
#            this script installs the helm binary; a helm already on PATH is
#            configured regardless. Repository setup is skipped only when helm
#            is neither installed by this script nor found on PATH.
#######################################
configure_helm_repos() {
  local rancher_repo_name rancher_repo_url

  case "${RANCHER_REPO_CHANNEL}" in
    stable|latest|alpha)
      rancher_repo_name="rancher-${RANCHER_REPO_CHANNEL}"
      rancher_repo_url="https://releases.rancher.com/server-charts/${RANCHER_REPO_CHANNEL}"
      ;;
    *)
      die "Invalid RANCHER_REPO_CHANNEL=${RANCHER_REPO_CHANNEL}. Use stable, latest, or alpha."
      ;;
  esac
  RANCHER_CHART="${rancher_repo_name}/rancher"

  if [[ "${INSTALL_HELM}" != "true" ]] && ! command -v helm >/dev/null 2>&1; then
    warn "helm not found and INSTALL_HELM=${INSTALL_HELM}; skipping Helm repository configuration"
    return 0
  fi

  log "Configuring Helm repositories"

  helm_repo_add_force ingress-nginx https://kubernetes.github.io/ingress-nginx
  helm_repo_add_force jetstack https://charts.jetstack.io
  helm_repo_add_force longhorn https://charts.longhorn.io
  helm_repo_add_force "${rancher_repo_name}" "${rancher_repo_url}"

  helm repo update
}

# ---------- Step 16: Install ingress-nginx ----------

#######################################
# Install or upgrade ingress-nginx as a host-network DaemonSet and wait for
# its rollout. Skipped unless INSTALL_INGRESS_NGINX=true.
#######################################
install_ingress_nginx() {
  [[ "${INSTALL_INGRESS_NGINX}" == "true" ]] || return 0

  log "Installing ingress-nginx"
  kubectl_ns_apply "${INGRESS_NAMESPACE}"

  helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \
    --namespace "${INGRESS_NAMESPACE}" \
    --create-namespace \
    --set controller.kind=DaemonSet \
    --set controller.hostNetwork=true \
    --set controller.dnsPolicy=ClusterFirstWithHostNet \
    --set controller.service.type=ClusterIP \
    --set controller.ingressClass="${INGRESS_CLASS_NAME}" \
    --set controller.ingressClassResource.name="${INGRESS_CLASS_NAME}" \
    --set controller.ingressClassResource.default=true \
    --set controller.watchIngressWithoutClass=true \
    --set controller.reportNodeInternalIp=true \
    --wait \
    --timeout 20m

  "${KUBECTL_BIN}" -n "${INGRESS_NAMESPACE}" rollout status daemonset/ingress-nginx-controller --timeout=20m
}

# ---------- Step 17: Install cert-manager ----------

#######################################
# Install or upgrade cert-manager and wait for its three deployments.
# Skipped unless INSTALL_CERT_MANAGER=true and INSTALL_RANCHER=true, and
# skipped when RANCHER_TLS_SOURCE=secret.
#######################################
install_cert_manager() {
  [[ "${INSTALL_CERT_MANAGER}" == "true" ]] || return 0
  [[ "${INSTALL_RANCHER}" == "true" ]] || return 0
  [[ "${RANCHER_TLS_SOURCE}" != "secret" ]] || return 0

  log "Installing cert-manager"
  kubectl_ns_apply "${CERT_MANAGER_NAMESPACE}"

  helm upgrade --install cert-manager jetstack/cert-manager \
    --namespace "${CERT_MANAGER_NAMESPACE}" \
    --create-namespace \
    --version "${CERT_MANAGER_CHART_VERSION}" \
    --set crds.enabled=true \
    --wait \
    --timeout 20m

  "${KUBECTL_BIN}" -n "${CERT_MANAGER_NAMESPACE}" rollout status deployment/cert-manager --timeout=20m
  "${KUBECTL_BIN}" -n "${CERT_MANAGER_NAMESPACE}" rollout status deployment/cert-manager-cainjector --timeout=20m
  "${KUBECTL_BIN}" -n "${CERT_MANAGER_NAMESPACE}" rollout status deployment/cert-manager-webhook --timeout=20m
}

# ---------- Step 18: Install Rancher ----------

#######################################
# Save the bootstrap password to /root/rancher-bootstrap-password.txt, install
# or upgrade the Rancher chart, then wait (best effort) for the rancher,
# rancher-webhook and cattle-cluster-agent deployments.
# Globals:   RANCHER_CHART (read; set by configure_helm_repos) plus RANCHER_*,
#            INGRESS_CLASS_NAME, LETSENCRYPT_EMAIL (read)
#######################################
install_rancher() {
  [[ "${INSTALL_RANCHER}" == "true" ]] || return 0

  log "Installing Rancher"

  # Create the file as 0600 first so the password is never readable by
  # other users, then write with printf so the value is emitted verbatim.
  install -m 0600 /dev/null /root/rancher-bootstrap-password.txt
  printf '%s\n' "${RANCHER_BOOTSTRAP_PASSWORD}" >/root/rancher-bootstrap-password.txt

  kubectl_ns_apply "${RANCHER_NAMESPACE}"

  local -a rancher_args=(
    --namespace "${RANCHER_NAMESPACE}"
    --create-namespace
    --version "${RANCHER_CHART_VERSION}"
    --set hostname="${RANCHER_HOSTNAME}"
    --set bootstrapPassword="${RANCHER_BOOTSTRAP_PASSWORD}"
    --set replicas="${RANCHER_REPLICAS}"
    --set ingress.ingressClassName="${INGRESS_CLASS_NAME}"
    --set ingress.tls.source="${RANCHER_TLS_SOURCE}"
    --wait
    --timeout 30m
  )

  if [[ "${RANCHER_PRIVATE_CA}" == "true" ]]; then
    rancher_args+=(--set privateCA=true)
  fi

  if [[ "${RANCHER_TLS_SOURCE}" == "letsEncrypt" ]]; then
    rancher_args+=(--set letsEncrypt.email="${LETSENCRYPT_EMAIL}")
  fi

  helm upgrade --install rancher "${RANCHER_CHART}" "${rancher_args[@]}"

  "${KUBECTL_BIN}" -n "${RANCHER_NAMESPACE}" rollout status deployment/rancher --timeout=30m || true

  if "${KUBECTL_BIN}" -n "${RANCHER_NAMESPACE}" get deployment rancher-webhook >/dev/null 2>&1; then
    "${KUBECTL_BIN}" -n "${RANCHER_NAMESPACE}" rollout status deployment/rancher-webhook --timeout=30m || true
  fi

  if "${KUBECTL_BIN}" -n "${RANCHER_NAMESPACE}" get deployment cattle-cluster-agent >/dev/null 2>&1; then
    "${KUBECTL_BIN}" -n "${RANCHER_NAMESPACE}" rollout status deployment/cattle-cluster-agent --timeout=30m || true
  fi
}

# ---------- Step 19: Install Longhorn ----------

#######################################
# Install or upgrade Longhorn and list its pods.
# Skipped unless INSTALL_LONGHORN=true.
#######################################
install_longhorn() {
  [[ "${INSTALL_LONGHORN}" == "true" ]] || return 0

  log "Installing Longhorn"
  systemctl enable --now iscsid || true
  kubectl_ns_apply "${LONGHORN_NAMESPACE}"

  helm upgrade --install longhorn longhorn/longhorn \
    --namespace "${LONGHORN_NAMESPACE}" \
    --create-namespace \
    --version "${LONGHORN_CHART_VERSION}" \
    --set defaultSettings.defaultReplicaCount="${LONGHORN_DEFAULT_REPLICA_COUNT}" \
    --wait \
    --timeout 30m

  "${KUBECTL_BIN}" -n "${LONGHORN_NAMESPACE}" get pods
}

# ---------- Step 20: Write cloudflared example ----------

# NOTE(review): the text from the `cat` statement below onwards appears
# truncated. The here-doc for the cloudflared example is missing, as are the
# definitions of validate_install and save_cluster_info and the opening of
# main(). As written, the step list and closing messages sit inside
# write_cloudflared_example and `main` is never defined. The statements are
# left exactly as found; restore the missing text from the original script
# before running it.
write_cloudflared_example() {
  [[ "${WRITE_CLOUDFLARED_EXAMPLE}" == "true" ]] || return 0
  [[ -n "${RANCHER_HOSTNAME}" ]] || return 0

  log "Writing example cloudflared ingress file"
  mkdir -p /root/rancher-install-artifacts

  cat >/root/rancher-install-artifacts/cloudflared-config-example.yml </root/rancher-install-artifacts/README.txt </dev/null 2>&1 || command -v b2sum >/dev/null 2>&1 || die "Neither sha256sum nor b2sum is installed"

  validate_inputs
  install_base_packages
  disable_swap
  configure_kernel_networking
  handle_firewall
  install_helm
  install_rke2
  configure_rke2
  configure_crictl
  reset_failed_rke2_bootstrap
  start_rke2
  configure_kubeconfig
  allow_server_scheduling
  wait_for_core_system
  configure_helm_repos
  install_ingress_nginx
  install_cert_manager
  install_rancher
  install_longhorn
  write_cloudflared_example
  validate_install
  save_cluster_info

  echo
  echo "Installation complete."
  echo
  echo "Rancher URL:"
  echo " https://${RANCHER_HOSTNAME}"
  echo
  echo "Bootstrap password file:"
  echo " /root/rancher-bootstrap-password.txt"
  echo
  echo "Cloudflared example file:"
  echo " /root/rancher-install-artifacts/cloudflared-config-example.yml"
  echo
  echo "Next:"
  echo " 1. Point your Cloudflare Tunnel hostname at Rancher."
  echo " 2. Log into Rancher."
  echo " 3. Create or import downstream clusters from Rancher."
  echo
}

main "$@"