kubernetes-arch-install/master_node_install.sh

453 lines
13 KiB
Bash

#!/usr/bin/env bash
# Strict mode:
#   -E           the ERR trap is inherited by functions, command substitutions
#                and subshells
#   -e           exit on an unhandled non-zero status
#   -u           expanding an unset variable is an error
#   -o pipefail  a pipeline fails if any of its stages fails
set -Eeuo pipefail
########################################
# Arch Linux Rancher Management Cluster
# Single-node RKE2 server + Rancher
#
# What this script does:
# - Disables swap
# - Installs required Arch packages
# - Configures kernel modules and sysctl for Kubernetes
# - Configures NetworkManager to ignore CNI interfaces
# - Disables host nftables service to avoid breaking RKE2 service routing
# - Installs RKE2 server
# - Waits for Kubernetes and bundled RKE2 addons to become healthy
# - Installs cert-manager
# - Installs Rancher
#
# Optional environment variables:
# RKE2_VERSION=v1.34.5+rke2r1
# RANCHER_HOSTNAME=rancher.example.com
# BOOTSTRAP_PASSWORD=changeme
# RKE2_TOKEN=my-shared-secret
# INSTALL_RANCHER=true
########################################
# --- Tunables (each may be overridden from the environment) -----------------
# RKE2 release passed to the installer as INSTALL_RKE2_VERSION.
RKE2_VERSION="${RKE2_VERSION:-v1.34.5+rke2r1}"
# Rancher is installed only when this is exactly the string "true".
INSTALL_RANCHER="${INSTALL_RANCHER:-true}"
# Passed to the Rancher chart as bootstrapPassword.
BOOTSTRAP_PASSWORD="${BOOTSTRAP_PASSWORD:-adminadminadmin}"
# Empty by default; resolve_hostname() then derives "<detected-ip>.sslip.io".
RANCHER_HOSTNAME="${RANCHER_HOSTNAME:-}"
# --- Fixed paths and chart repository ---------------------------------------
RKE2_CONFIG_DIR="/etc/rancher/rke2"
# config.yaml is (re)written by install_rke2().
RKE2_CONFIG_FILE="${RKE2_CONFIG_DIR}/config.yaml"
# Holds the cluster token (from RKE2_TOKEN or generated by install_rke2()).
RKE2_TOKEN_FILE="${RKE2_CONFIG_DIR}/server-token"
# Kubeconfig used by the kubectl_rke2 / helm_rke2 wrappers.
KUBECONFIG_FILE="/etc/rancher/rke2/rke2.yaml"
RANCHER_REPO_NAME="rancher-stable"
RANCHER_REPO_URL="https://releases.rancher.com/server-charts/stable"
# Print an [INFO] banner on stdout: a blank line, a rule, the message built
# from all arguments, and a closing rule.
log() {
  local rule="============================================================"
  printf '\n%s\n[INFO] %s\n%s\n' "${rule}" "$*" "${rule}"
}
# Emit a blank line on stdout, then "[WARN] <arguments>" on stderr.
warn() {
  printf '\n'
  printf '[WARN] %s\n' "$*" >&2
}
# Emit a blank line on stdout, "[ERROR] <arguments>" on stderr, then terminate
# the shell with exit status 1.
die() {
  printf '\n'
  printf '[ERROR] %s\n' "$*" >&2
  exit 1
}
# ERR trap handler: report where the script failed, print diagnostic command
# hints, and exit with the status of the command that failed.
# Arguments: $1 - line number of the failing command (optional)
# Outputs:   everything is written to stderr, so nothing leaks into stdout
#            (this trap is also inherited by command substitutions via set -E)
on_error() {
  # Must be the first statement: $? still holds the failing command's status.
  local exit_code=$?
  local line_no="${1:-unknown}"
  warn "Script failed on line ${line_no} with exit code ${exit_code}"
  warn "Useful diagnostics:"
  {
    echo " sudo systemctl status rke2-server -l --no-pager"
    echo " sudo journalctl -u rke2-server -n 200 --no-pager"
    echo " sudo /var/lib/rancher/rke2/bin/kubectl --kubeconfig ${KUBECONFIG_FILE} get nodes -o wide"
    echo " sudo /var/lib/rancher/rke2/bin/kubectl --kubeconfig ${KUBECONFIG_FILE} get pods -A"
  } >&2
  exit "${exit_code}"
}
trap 'on_error "${LINENO}"' ERR
# Abort via die() unless the effective user id is 0.
require_root() {
  if (( EUID != 0 )); then
    die "Run this script as root: sudo $0"
  fi
}
# Run the kubectl binary under /var/lib/rancher/rke2/bin against
# KUBECONFIG_FILE, forwarding all arguments unchanged.
kubectl_rke2() {
  local kubectl_bin="/var/lib/rancher/rke2/bin/kubectl"
  "${kubectl_bin}" --kubeconfig "${KUBECONFIG_FILE}" "$@"
}
# Run helm against KUBECONFIG_FILE, forwarding all arguments unchanged.
helm_rke2() {
  local -a kubeconfig_flag=(--kubeconfig "${KUBECONFIG_FILE}")
  helm "${kubeconfig_flag[@]}" "$@"
}
# Poll every 2 seconds until a regular file exists.
# Arguments: $1 - path to wait for
#            $2 - timeout in seconds (default 300)
# Exits via die() once the accumulated wait reaches the timeout.
wait_for_file() {
  local path="$1"
  local limit="${2:-300}"
  local elapsed=0
  while [[ ! -f "${path}" ]]; do
    sleep 2
    elapsed=$((elapsed + 2))
    (( elapsed < limit )) || die "Timed out waiting for file: ${path}"
  done
}
# Turn swap off now and comment out swap entries in /etc/fstab.
# A timestamped copy of /etc/fstab is written before it is edited.
disable_swap() {
  log "Disabling swap"
  # Best effort: a swapoff failure does not abort the script.
  swapoff -a || true
  [[ -f /etc/fstab ]] || return 0
  cp /etc/fstab "/etc/fstab.bak.$(date +%Y%m%d%H%M%S)"
  # Prefix every non-comment line that has a whitespace-delimited "swap" field.
  sed -Ei '/^[^#].+\s+swap\s+/ s/^/# disabled-by-rancher-script /' /etc/fstab
}
# Refresh the keyring, run a full system upgrade and install the host packages
# this script relies on.
install_packages() {
  log "Installing required Arch packages"
  pacman -Sy --noconfirm archlinux-keyring

  # Drop the plain iptables package (ignoring dependency checks) when present;
  # presumably so that iptables-nft, installed below, can take its place.
  if pacman -Q iptables >/dev/null 2>&1; then
    pacman -Rdd --noconfirm iptables || true
  fi

  pacman -Syu --noconfirm

  local -a packages=(
    bash-completion
    ca-certificates
    cni-plugins
    conntrack-tools
    curl
    ethtool
    gzip
    helm
    iproute2
    iptables-nft
    jq
    nfs-utils
    open-iscsi
    openssl
    socat
    tar
    unzip
    wget
  )
  pacman -S --needed --noconfirm "${packages[@]}"
}
# Persist and load the overlay / br_netfilter modules, then write and apply
# the bridge-netfilter and IPv4 forwarding sysctls.
configure_kernel() {
  log "Configuring kernel modules and sysctl"

  local -a modules=(overlay br_netfilter)
  printf '%s\n' "${modules[@]}" >/etc/modules-load.d/k8s.conf
  local module
  for module in "${modules[@]}"; do
    modprobe "${module}"
  done

  printf '%s\n' \
    'net.bridge.bridge-nf-call-iptables = 1' \
    'net.bridge.bridge-nf-call-ip6tables = 1' \
    'net.ipv4.ip_forward = 1' \
    >/etc/sysctl.d/90-kubernetes.conf
  # Reload every sysctl configuration file; the listing output is discarded.
  sysctl --system >/dev/null
}
# Keep NetworkManager away from CNI-created interfaces and switch off the
# nm-cloud-setup units when they are installed.
# - The unmanaged-devices drop-in is written (and NetworkManager restarted)
#   only when NetworkManager is enabled or currently active.
# - Unit detection reads `systemctl list-unit-files` once into a variable
#   instead of piping it into `grep -q`: under `pipefail`, grep exiting early
#   can kill systemctl with SIGPIPE and make the check report false.
configure_networkmanager() {
  if systemctl is-enabled NetworkManager >/dev/null 2>&1 \
    || systemctl is-active NetworkManager >/dev/null 2>&1; then
    log "Configuring NetworkManager to ignore CNI interfaces"
    mkdir -p /etc/NetworkManager/conf.d
    cat >/etc/NetworkManager/conf.d/rke2-cni.conf <<'EOF'
[keyfile]
unmanaged-devices=interface-name:cali*;interface-name:flannel*;interface-name:cni*;interface-name:vxlan.calico;interface-name:kube-ipvs0;interface-name:nodelocaldns;interface-name:tunl*
EOF
    systemctl restart NetworkManager
  fi

  local unit_files=""
  local unit
  # A listing failure is treated as "no units installed".
  unit_files="$(systemctl list-unit-files 2>/dev/null || true)"
  for unit in nm-cloud-setup.service nm-cloud-setup.timer; do
    # Exact match on the first column (the unit file name).
    if awk -v unit="${unit}" '$1 == unit { found = 1 } END { exit !found }' \
      <<<"${unit_files}"; then
      systemctl disable --now "${unit}" || true
    fi
  done
}
# Start iscsid and make sure the host nftables service is off with an empty
# ruleset. Every step is best effort and never aborts the script.
enable_support_services() {
  log "Enabling support services"
  systemctl enable --now iscsid.service || true

  # Do NOT enable nftables.service here.
  # On this Arch + RKE2 setup it can break service routing for cluster IPs.
  local action
  for action in stop disable; do
    systemctl "${action}" nftables.service >/dev/null 2>&1 || true
  done
  nft flush ruleset >/dev/null 2>&1 || true
}
# Write the RKE2 server configuration, run the upstream installer and
# (re)start rke2-server.
# Globals read: RKE2_VERSION, RKE2_CONFIG_DIR, RKE2_CONFIG_FILE,
#               RKE2_TOKEN_FILE, RKE2_TOKEN (optional), RANCHER_HOSTNAME
# Token handling:
# - RKE2_TOKEN, when set, overwrites the token file; otherwise a non-empty
#   existing token file is reused, else a random token is generated.
# - The token file and config.yaml both contain the cluster secret, so they
#   are created under umask 077 and forced to mode 600.
# - The token is written as a single-quoted YAML scalar so characters such as
#   '#' or ': ', or an all-numeric value, cannot change how it is parsed.
install_rke2() {
  log "Installing RKE2 server ${RKE2_VERSION}"
  # Created before tightening the umask so the directory keeps default perms.
  mkdir -p "${RKE2_CONFIG_DIR}"

  local previous_umask
  previous_umask="$(umask)"
  umask 077

  if [[ -n "${RKE2_TOKEN:-}" ]]; then
    printf '%s\n' "${RKE2_TOKEN}" >"${RKE2_TOKEN_FILE}"
  elif [[ ! -s "${RKE2_TOKEN_FILE}" ]]; then
    openssl rand -hex 24 >"${RKE2_TOKEN_FILE}"
  fi
  # Also fixes the mode of a token file left behind by an earlier run.
  chmod 600 "${RKE2_TOKEN_FILE}"

  local token
  token="$(<"${RKE2_TOKEN_FILE}")"
  [[ -n "${token}" ]] || die "RKE2 token file is empty: ${RKE2_TOKEN_FILE}"

  # YAML single-quoted scalar: the only escape is doubling embedded quotes.
  local sq="'"
  local yaml_token="${sq}${token//${sq}/${sq}${sq}}${sq}"

  cat >"${RKE2_CONFIG_FILE}" <<EOF
token: ${yaml_token}
write-kubeconfig-mode: "0644"
tls-san:
- ${RANCHER_HOSTNAME:-127.0.0.1}
EOF
  chmod 600 "${RKE2_CONFIG_FILE}"
  umask "${previous_umask}"

  curl -sfL https://get.rke2.io \
    | INSTALL_RKE2_TYPE=server INSTALL_RKE2_VERSION="${RKE2_VERSION}" sh -

  mkdir -p /etc/profile.d
  cat >/etc/profile.d/rke2-path.sh <<'EOF'
export PATH=$PATH:/var/lib/rancher/rke2/bin:/usr/local/bin
export KUBECONFIG=/etc/rancher/rke2/rke2.yaml
EOF

  systemctl daemon-reload
  systemctl enable rke2-server.service
  systemctl restart rke2-server.service
}
# Block until the kubeconfig and the bundled kubectl exist and the API server
# answers /readyz. Polls every 5 seconds, dumps recent rke2-server logs every
# 30 seconds and gives up via die() after 900 seconds.
wait_for_api() {
  log "Waiting for RKE2 and Kubernetes API"
  wait_for_file "${KUBECONFIG_FILE}" 600
  wait_for_file "/var/lib/rancher/rke2/bin/kubectl" 600

  local elapsed=0
  while ! kubectl_rke2 get --raw=/readyz >/dev/null 2>&1; do
    sleep 5
    elapsed=$((elapsed + 5))
    # Periodic progress report comes first, then the hard timeout.
    if (( elapsed % 30 == 0 )); then
      warn "Kubernetes API not ready yet; recent rke2-server logs:"
      journalctl -u rke2-server -n 40 --no-pager || true
    fi
    if (( elapsed >= 900 )); then
      journalctl -u rke2-server -n 200 --no-pager || true
      die "Timed out waiting for Kubernetes API readiness"
    fi
  done
}
# Block until at least one node reports condition Ready=True. Polls every
# 5 seconds, prints node/pod status every 30 seconds and gives up via die()
# after 1200 seconds.
wait_for_ready_node() {
  log "Waiting for any node to become Ready"

  # jq filter: true when the node list is non-empty and some node has a
  # condition of type "Ready" with status "True".
  local ready_filter='
.items | length > 0 and any(.[]; any(.status.conditions[]?; .type=="Ready" and .status=="True"))
'
  local elapsed=0
  while ! kubectl_rke2 get nodes -o json 2>/dev/null \
    | jq -e "${ready_filter}" >/dev/null; do
    sleep 5
    elapsed=$((elapsed + 5))
    if (( elapsed % 30 == 0 )); then
      warn "No Ready node yet; current status:"
      kubectl_rke2 get nodes -o wide || true
      kubectl_rke2 get pods -A || true
    fi
    if (( elapsed >= 1200 )); then
      kubectl_rke2 get nodes -o wide || true
      kubectl_rke2 get pods -A || true
      journalctl -u rke2-server -n 200 --no-pager || true
      die "Timed out waiting for a Ready node"
    fi
  done
}
# Helper: find a kube-system workload by name, trying Deployment first and
# DaemonSet second, and store "<kind>/<name>" in the caller's variable.
# Arguments: $1 - workload name
#            $2 - name of the variable that receives the reference
# Returns:   0 when found, 1 otherwise. Call it from a condition so a miss
#            does not trip `set -e` or the ERR trap.
_kube_system_workload_ref() {
  local name="$1"
  local out_var="$2"
  local kind
  for kind in deployment daemonset; do
    if kubectl_rke2 -n kube-system get "${kind}" "${name}" >/dev/null 2>&1; then
      printf -v "${out_var}" '%s/%s' "${kind}" "${name}"
      return 0
    fi
  done
  return 1
}

# Wait for the cluster's system workloads to settle:
#   1. a kube-dns pod exists (up to 600 s)
#   2. the rke2-canal DaemonSet exists (up to 900 s) and is rolled out
#   3. the bundled addons exist (up to 900 s) and are rolled out
# Exits via die() on any timeout.
wait_for_system_pods() {
  log "Waiting for core system pods"
  local waited=0
  # `kubectl get` with a label selector exits 0 even when nothing matches, so
  # test for non-empty output. The inner `|| true` stops a kubectl failure
  # from firing the inherited ERR trap inside the command substitution.
  until [[ -n "$(kubectl_rke2 -n kube-system get pod -l k8s-app=kube-dns -o name 2>/dev/null || true)" ]]; do
    sleep 5
    waited=$((waited + 5))
    if (( waited >= 600 )); then
      kubectl_rke2 get pods -A || true
      die "Timed out waiting for kube-system pods to appear"
    fi
  done
  kubectl_rke2 get nodes -o wide || true
  kubectl_rke2 get pods -A || true

  log "Waiting for Canal (CNI) DaemonSet to exist"
  waited=0
  until kubectl_rke2 -n kube-system get daemonset rke2-canal >/dev/null 2>&1; do
    sleep 5
    waited=$((waited + 5))
    if (( waited % 30 == 0 )); then
      warn "rke2-canal DaemonSet not present yet"
      kubectl_rke2 -n kube-system get daemonsets || true
      kubectl_rke2 -n kube-system get pods -o wide || true
    fi
    if (( waited >= 900 )); then
      kubectl_rke2 -n kube-system get daemonsets || true
      kubectl_rke2 -n kube-system get pods -o wide || true
      die "Timed out waiting for rke2-canal DaemonSet to appear"
    fi
  done

  log "Waiting for Canal (CNI) to be fully rolled out"
  kubectl_rke2 -n kube-system rollout status daemonset/rke2-canal --timeout=20m

  # Give kube-proxy and service routing a moment to settle.
  sleep 20

  log "Waiting for bundled RKE2 addon deployments"
  # Each addon is accepted as a Deployment or a DaemonSet. Some RKE2 releases
  # appear to ship the ingress controller as a DaemonSet (verify for the
  # pinned version); a Deployment-only check would then never succeed.
  local -a addons=(
    rke2-ingress-nginx-controller
    rke2-metrics-server
    rke2-snapshot-controller
    rke2-coredns-rke2-coredns
  )
  local -a addon_refs=()
  local addon=""
  local addon_ref=""
  waited=0
  while true; do
    addon_refs=()
    for addon in "${addons[@]}"; do
      if _kube_system_workload_ref "${addon}" addon_ref; then
        addon_refs+=("${addon_ref}")
      fi
    done
    if (( ${#addon_refs[@]} == ${#addons[@]} )); then
      break
    fi
    sleep 5
    waited=$((waited + 5))
    if (( waited % 30 == 0 )); then
      warn "Bundled addon deployments are not all present yet"
      kubectl_rke2 -n kube-system get deploy,daemonset || true
      kubectl_rke2 -n kube-system get pods -o wide || true
    fi
    if (( waited >= 900 )); then
      kubectl_rke2 -n kube-system get pods -o wide || true
      die "Timed out waiting for bundled RKE2 addon deployments"
    fi
  done

  for addon_ref in "${addon_refs[@]}"; do
    kubectl_rke2 -n kube-system rollout status "${addon_ref}" --timeout=20m
  done

  log "System pods are settled"
  kubectl_rke2 get pods -A || true
}
# Ensure RANCHER_HOSTNAME is set. When it is empty, derive
# "<source-ip>.sslip.io" from the source address the kernel would use to
# reach 1.1.1.1.
# Globals: RANCHER_HOSTNAME (read, written when empty)
# Exits via die() when no source address can be determined.
resolve_hostname() {
  if [[ -n "${RANCHER_HOSTNAME}" ]]; then
    return 0
  fi

  local detected_ip=""
  # `|| true` keeps a failing `ip route get` (e.g. no default route) from
  # aborting through pipefail/set -e before the explicit check below can
  # print its actionable message.
  detected_ip="$(
    { ip route get 1.1.1.1 2>/dev/null || true; } \
      | awk '{for (i=1; i<=NF; i++) if ($i=="src") {print $(i+1); exit}}'
  )"
  [[ -n "${detected_ip}" ]] || die "Could not auto-detect server IP. Set RANCHER_HOSTNAME manually."

  RANCHER_HOSTNAME="${detected_ip}.sslip.io"
  log "Auto-detected Rancher hostname: ${RANCHER_HOSTNAME}"
}
# Install cert-manager from the jetstack chart repository: apply the CRD
# manifest matching the chart's appVersion, install/upgrade the chart, then
# wait for its three deployments to roll out.
install_cert_manager() {
  log "Installing cert-manager"
  # Adding the repo is best effort; the update that follows is not.
  helm repo add jetstack https://charts.jetstack.io >/dev/null 2>&1 || true
  helm repo update >/dev/null

  # The chart's appVersion selects which release's CRD manifest to download.
  local crd_version=""
  crd_version="$(helm show chart jetstack/cert-manager | awk '/^appVersion:/ {print $2; exit}')"
  if [[ -z "${crd_version}" ]]; then
    die "Could not determine cert-manager appVersion"
  fi
  local crds_url="https://github.com/cert-manager/cert-manager/releases/download/${crd_version}/cert-manager.crds.yaml"

  kubectl_rke2 create namespace cert-manager --dry-run=client -o yaml | kubectl_rke2 apply -f -
  kubectl_rke2 apply -f "${crds_url}"

  local -a chart_args=(
    upgrade --install cert-manager jetstack/cert-manager
    --namespace cert-manager
    --create-namespace
    --wait
    --timeout 20m
  )
  helm_rke2 "${chart_args[@]}"

  local component
  for component in cert-manager cert-manager-webhook cert-manager-cainjector; do
    kubectl_rke2 -n cert-manager rollout status "deploy/${component}" --timeout=20m
  done
}
# Install Rancher from the configured chart repository into cattle-system and
# wait for its deployment. Does nothing except warn unless INSTALL_RANCHER is
# exactly "true".
install_rancher() {
  [[ "${INSTALL_RANCHER}" == "true" ]] || {
    warn "INSTALL_RANCHER=false, skipping Rancher install"
    return
  }

  # Fills in RANCHER_HOSTNAME when the caller did not provide one.
  resolve_hostname
  log "Installing Rancher"

  helm repo add "${RANCHER_REPO_NAME}" "${RANCHER_REPO_URL}" >/dev/null 2>&1 || true
  helm repo update >/dev/null
  kubectl_rke2 create namespace cattle-system --dry-run=client -o yaml | kubectl_rke2 apply -f -

  local -a chart_args=(
    upgrade --install rancher "${RANCHER_REPO_NAME}/rancher"
    --namespace cattle-system
    --set hostname="${RANCHER_HOSTNAME}"
    --set bootstrapPassword="${BOOTSTRAP_PASSWORD}"
    --set replicas=1
    --set ingress.tls.source=rancher
    --wait
    --timeout 30m
  )
  helm_rke2 "${chart_args[@]}"

  kubectl_rke2 -n cattle-system rollout status deploy/rancher --timeout=30m
}
# Print the post-install summary: RKE2 version, kubeconfig path, node token
# and a few handy commands. The Rancher URL and bootstrap password are shown
# only when Rancher was installed (INSTALL_RANCHER=true); otherwise an
# empty "https://" URL and an unused password would be misleading.
# NOTE: this deliberately prints secrets (node token, bootstrap password).
print_summary() {
  local node_token=""
  local bootstrap_secret_password=""
  local node_token_file="/var/lib/rancher/rke2/server/node-token"

  # Use the server's node-token file when it exists, else the configured token.
  if [[ -f "${node_token_file}" ]]; then
    node_token="$(<"${node_token_file}")"
  else
    node_token="$(<"${RKE2_TOKEN_FILE}")"
  fi

  log "Installation complete"
  echo "RKE2 version: ${RKE2_VERSION}"
  echo "Kubeconfig: ${KUBECONFIG_FILE}"
  echo "Node token: ${node_token}"

  if [[ "${INSTALL_RANCHER:-true}" == "true" ]]; then
    # Best effort: fall back to BOOTSTRAP_PASSWORD if the secret is unreadable.
    bootstrap_secret_password="$(kubectl_rke2 get secret -n cattle-system bootstrap-secret -o go-template='{{ .data.bootstrapPassword|base64decode }}' 2>/dev/null || true)"
    echo "Rancher URL: https://${RANCHER_HOSTNAME}"
    echo "Bootstrap pw: ${bootstrap_secret_password:-${BOOTSTRAP_PASSWORD}}"
  else
    echo "Rancher: not installed (INSTALL_RANCHER=${INSTALL_RANCHER})"
  fi

  echo
  echo "Useful commands:"
  echo " export KUBECONFIG=${KUBECONFIG_FILE}"
  echo " /var/lib/rancher/rke2/bin/kubectl get nodes -o wide"
  echo " /var/lib/rancher/rke2/bin/kubectl get pods -A"
  echo " sudo systemctl status rke2-server --no-pager"
}
# Entry point: verify root privileges, then run every provisioning step in
# order. Under `set -e` the first failing step aborts the run.
main() {
  require_root

  local -a steps=(
    disable_swap
    install_packages
    configure_kernel
    configure_networkmanager
    enable_support_services
    install_rke2
    wait_for_api
    wait_for_ready_node
    wait_for_system_pods
    install_cert_manager
    install_rancher
    print_summary
  )
  local step
  for step in "${steps[@]}"; do
    "${step}"
  done
}
main "$@"