kubernetes-arch-install/master_node_install.sh

630 lines
17 KiB
Bash

#!/usr/bin/env bash
# Strict mode:
#   -e           exit on an unhandled non-zero status
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if any of its stages fails
#   -E           ERR traps are inherited by functions and subshells
set -Eeuo pipefail
########################################
# Arch Linux Rancher Management Cluster
# RKE2 + Rancher + optional Longhorn
#
# PURPOSE
# - Replaces kubeadm with RKE2 so Rancher sits on a Rancher-native distro
# - Creates a dedicated management cluster
# - Installs Rancher in a way that fits Rancher lifecycle management better
# - Optionally installs Longhorn for persistent volumes
#
# IMPORTANT
# - Use a REAL DNS name for RANCHER_HOSTNAME
# - This script is for the Rancher management cluster itself
# - Create/import downstream clusters from Rancher afterward
# - Avoid running general app workloads on this local Rancher cluster
########################################
# ---------- User-configurable defaults ----------
# Each `: "${VAR:=value}"` keeps a non-empty value inherited from the
# environment and otherwise assigns the default shown.

# RKE2 / Kubernetes
: "${RKE2_CHANNEL:=stable}"
: "${RKE2_VERSION:=v1.34.5+rke2r1}"
: "${RKE2_TOKEN:=}" # empty: validate_inputs generates a random token
# NOTE(review): 192.168.0.0/16 overlaps many home/office LANs - confirm it
# does not collide with the network this node is attached to.
: "${CLUSTER_CIDR:=192.168.0.0/16}"
: "${SERVICE_CIDR:=10.43.0.0/16}"
: "${CLUSTER_DNS:=10.43.0.10}"
: "${CNI_PLUGIN:=canal}" # canal | calico | cilium | flannel
: "${DISABLE_RKE2_INGRESS:=true}"

# Rancher
: "${INSTALL_RANCHER:=true}"
: "${RANCHER_HOSTNAME:=}" # REQUIRED, e.g. rancher.example.com
: "${RANCHER_NAMESPACE:=cattle-system}"
: "${RANCHER_CHART_VERSION:=2.13.4}"
: "${RANCHER_BOOTSTRAP_PASSWORD:=}" # empty: validate_inputs generates one
: "${RANCHER_REPLICAS:=1}"
: "${RANCHER_TLS_SOURCE:=rancher}" # rancher | letsEncrypt | secret
: "${RANCHER_PRIVATE_CA:=false}"
: "${LETSENCRYPT_EMAIL:=}"

# ingress-nginx
: "${INSTALL_INGRESS_NGINX:=true}"
: "${INGRESS_NAMESPACE:=ingress-nginx}"
: "${INGRESS_CLASS_NAME:=nginx}"

# cert-manager
: "${INSTALL_CERT_MANAGER:=true}"
: "${CERT_MANAGER_NAMESPACE:=cert-manager}"
: "${CERT_MANAGER_CHART_VERSION:=v1.18.3}"

# Longhorn
: "${INSTALL_LONGHORN:=true}"
: "${LONGHORN_NAMESPACE:=longhorn-system}"
: "${LONGHORN_CHART_VERSION:=1.11.0}"
: "${LONGHORN_DEFAULT_REPLICA_COUNT:=1}"

# Helm
: "${INSTALL_HELM:=true}"
: "${HELM_VERSION:=v3.18.4}"

# Misc
: "${ALLOW_SCHEDULING_ON_SERVER:=true}"
# The user who invoked sudo (root when run directly) also gets a kubeconfig.
REAL_USER="${SUDO_USER:-root}"
# Home directory from the passwd database; a failed lookup falls back to /root.
REAL_HOME="$(getent passwd "${REAL_USER}" | cut -d: -f6 || true)"
: "${REAL_HOME:=/root}"
REAL_KUBECONFIG_DIR="${REAL_HOME}/.kube"
# ---------- Logging ----------
log() {
  # Print an informational banner on stdout: a blank line, a rule, the
  # message (all arguments joined by spaces), and a closing rule.
  local rule="============================================================"
  printf '\n%s\n[INFO] %s\n%s\n' "${rule}" "$*" "${rule}"
}
warn() {
  # Print a warning (all arguments joined by spaces) preceded by a blank line.
  # Both lines go to stderr so that nothing leaks into a caller's captured or
  # piped stdout.
  printf '\n[WARN] %s\n' "$*" >&2
}
die() {
  # Print an error (all arguments joined by spaces) preceded by a blank line,
  # entirely on stderr, then terminate the current shell with status 1.
  printf '\n[ERROR] %s\n' "$*" >&2
  exit 1
}
# ---------- Helpers ----------
require_root() {
  # Abort unless the script runs with effective UID 0. The hint shows the
  # path the script was actually invoked with ($0) rather than a hard-coded
  # file name that can drift from the real one.
  if [[ "${EUID}" -ne 0 ]]; then
    die "Run as root: sudo ${0}"
  fi
}
require_cmd() {
  # Abort via die unless $1 resolves to something the shell can run
  # (executable on PATH, builtin, function or alias).
  local tool="$1"
  if ! command -v "${tool}" >/dev/null 2>&1; then
    die "Required command not found: ${tool}"
  fi
}
retry() {
  # Run a command until it succeeds or the attempt budget is spent.
  # Arguments: $1 - maximum number of attempts (default 10)
  #            $2 - seconds to sleep between attempts (default 5)
  #            $3... - the command and its arguments
  # Returns:   0 as soon as the command succeeds, 1 once the last attempt
  #            has failed. A warning is emitted after every failed attempt
  #            except the last.
  local max_tries="${1:-10}"
  local pause="${2:-5}"
  shift 2 || true
  local attempt
  for ((attempt = 1; ; attempt++)); do
    if "$@"; then
      return 0
    fi
    if ((attempt >= max_tries)); then
      return 1
    fi
    warn "Command failed (attempt ${attempt}/${max_tries}): $*"
    sleep "${pause}"
  done
}
helm_repo_add_force() {
  # Register Helm repository $1 at URL $2, overwriting any existing entry of
  # the same name. `--force-update` covers both the "new" and the "already
  # present" case, so no `helm repo list | grep` probe is needed (that probe
  # matched the name as a regex and could misfire under pipefail).
  # Helm's stdout is discarded; errors still reach stderr.
  local name="$1"
  local url="$2"
  helm repo add "${name}" "${url}" --force-update >/dev/null
}
kubectl_ns_apply() {
  # Ensure namespace $1 exists without failing when it already does: render
  # the Namespace manifest client-side and pipe it into `kubectl apply`.
  local namespace="$1"
  kubectl create namespace "${namespace}" --dry-run=client -o yaml \
    | kubectl apply -f -
}
write_file_if_changed() {
  # Write stdin to $1 (mode 0644, parent directories created) only when the
  # content differs from what is already there.
  # Arguments: $1 - destination path
  # Returns:   0 when the file is up to date or was written; non-zero when
  #            staging or installing the file failed.
  # The staging file is removed on every path, and an `install` failure is
  # reported through the return status instead of being hidden by the
  # cleanup `rm`.
  local path="$1"
  local tmp rc=0
  tmp="$(mktemp)" || return 1
  if ! cat >"${tmp}"; then
    rm -f -- "${tmp}"
    return 1
  fi
  if [[ -f "${path}" ]] && cmp -s "${tmp}" "${path}"; then
    rm -f -- "${tmp}"
    return 0
  fi
  install -D -m 0644 "${tmp}" "${path}" || rc=$?
  rm -f -- "${tmp}"
  return "${rc}"
}
# ---------- Validation ----------
validate_inputs() {
  # Check the user-supplied settings and fill in generated secrets.
  # Globals read:    INSTALL_RANCHER, RANCHER_HOSTNAME, RANCHER_TLS_SOURCE,
  #                  LETSENCRYPT_EMAIL
  # Globals written: RANCHER_BOOTSTRAP_PASSWORD, RKE2_TOKEN (only when empty)
  # Exits via die on an invalid or missing setting.
  if [[ "${INSTALL_RANCHER}" == "true" ]]; then
    [[ -n "${RANCHER_HOSTNAME}" ]] \
      || die "RANCHER_HOSTNAME must be set to a real DNS name, e.g. rancher.example.com"

    # Reject unknown TLS sources up front; a typo such as "letsencrypt" would
    # otherwise skip the e-mail check and only fail at Helm install time.
    case "${RANCHER_TLS_SOURCE}" in
      rancher | secret) ;;
      letsEncrypt)
        [[ -n "${LETSENCRYPT_EMAIL}" ]] \
          || die "LETSENCRYPT_EMAIL must be set when RANCHER_TLS_SOURCE=letsEncrypt"
        ;;
      *)
        die "RANCHER_TLS_SOURCE must be one of: rancher, letsEncrypt, secret (got '${RANCHER_TLS_SOURCE}')"
        ;;
    esac

    if [[ -z "${RANCHER_BOOTSTRAP_PASSWORD}" ]]; then
      # 20 characters of base64 with '/' and '+' mapped to letters.
      RANCHER_BOOTSTRAP_PASSWORD="$(openssl rand -base64 24 | tr -d '\n' | tr '/+' 'AB' | cut -c1-20)"
      [[ -n "${RANCHER_BOOTSTRAP_PASSWORD}" ]] \
        || die "Failed to generate a Rancher bootstrap password"
    fi
  fi

  if [[ -z "${RKE2_TOKEN}" ]]; then
    RKE2_TOKEN="$(openssl rand -hex 32)"
    [[ -n "${RKE2_TOKEN}" ]] || die "Failed to generate an RKE2 token"
  fi
}
# ---------- Step 1: base packages ----------
install_base_packages() {
  # Install the host packages the rest of the script relies on and start the
  # iSCSI daemon (best effort).
  # NOTE(review): `pacman -Sy` refreshes the package databases without a full
  # upgrade; confirm that this partial-upgrade behavior is acceptable here.
  log "Installing required Arch packages"
  local -a packages=(
    curl tar gzip jq openssl unzip wget
    iptables nftables conntrack-tools socat ethtool
    iproute2 ca-certificates gnupg bash-completion
    open-iscsi nfs-utils cni-plugins
  )
  pacman -Sy --noconfirm --needed "${packages[@]}"
  systemctl enable --now iscsid || true
}
# ---------- Step 2: disable swap ----------
disable_swap() {
  # Turn swap off now and comment out active swap entries in fstab so it
  # stays off after a reboot.
  # Arguments: $1 - fstab path (optional, default /etc/fstab)
  # Idempotent: lines that are already commented are left alone, and the
  # timestamped backup is only taken when there is something to change.
  local fstab="${1:-/etc/fstab}"
  # An uncommented line that has "swap" as a whitespace-delimited field.
  local active_swap_re='^[[:space:]]*[^#[:space:]].*[[:space:]]swap[[:space:]]'

  log "Disabling swap"
  swapoff -a || true

  [[ -f "${fstab}" ]] || return 0
  if grep -Eq -- "${active_swap_re}" "${fstab}"; then
    cp -- "${fstab}" "${fstab}.bak.$(date +%Y%m%d%H%M%S)"
    sed -ri "/${active_swap_re}/s/^/# /" "${fstab}"
  fi
}
# ---------- Step 3: kernel modules / sysctl ----------
configure_kernel_networking() {
  # Persist and load the kernel modules, then persist and apply the sysctl
  # settings written below.
  local module
  log "Configuring kernel modules and sysctl"

  # Modules loaded at boot via modules-load.d.
  write_file_if_changed /etc/modules-load.d/rke2.conf <<'EOF'
overlay
br_netfilter
nf_conntrack
EOF

  # Load them now; a failure to load nf_conntrack is tolerated.
  for module in overlay br_netfilter; do
    modprobe "${module}"
  done
  modprobe nf_conntrack || true

  write_file_if_changed /etc/sysctl.d/90-rke2.conf <<'EOF'
net.ipv4.ip_forward = 1
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
fs.inotify.max_user_instances = 8192
fs.inotify.max_user_watches = 1048576
vm.max_map_count = 262144
EOF

  sysctl --system
}
# ---------- Step 4: firewall note ----------
handle_firewall() {
  # Disable firewalld when it is enabled or running.
  # `systemctl is-enabled` / `is-active` already fail for a unit that is not
  # installed, so no `list-unit-files | grep -q` pre-check is needed. That
  # pipeline could also report failure under pipefail when grep exits early,
  # which silently skipped the whole check.
  log "Checking for firewalld"
  if systemctl is-enabled firewalld >/dev/null 2>&1 \
    || systemctl is-active firewalld >/dev/null 2>&1; then
    warn "firewalld appears enabled/active. RKE2 docs warn that firewalld conflicts with the default Canal networking stack."
    warn "Disabling firewalld on this node."
    # Best effort: a failure to disable must not abort the installation.
    systemctl disable --now firewalld || true
  fi
}
# ---------- Step 5: install Helm ----------
install_helm() {
  # Download the Helm release tarball for this CPU architecture, verify it
  # against the published SHA-256 digest, and install the binary to
  # /usr/local/bin/helm. Skipped unless INSTALL_HELM is "true".
  # Globals read: INSTALL_HELM, HELM_VERSION
  # Exits via die on an unsupported architecture or when the download,
  # checksum verification, extraction or installation fails. The scratch
  # directory is removed on every path.
  [[ "${INSTALL_HELM}" == "true" ]] || return 0
  log "Installing Helm ${HELM_VERSION}"

  local arch tarball url tmpdir expected actual installed=false
  arch="$(uname -m)"
  case "${arch}" in
    x86_64) arch="amd64" ;;
    aarch64) arch="arm64" ;;
    *) die "Unsupported architecture for Helm: ${arch}" ;;
  esac

  tarball="helm-${HELM_VERSION}-linux-${arch}.tar.gz"
  url="https://get.helm.sh/${tarball}"
  tmpdir="$(mktemp -d)"

  # Every step is checked explicitly so the cleanup below always runs, even
  # under `set -e`.
  if curl -fsSL -o "${tmpdir}/${tarball}" "${url}" \
    && curl -fsSL -o "${tmpdir}/${tarball}.sha256" "${url}.sha256"; then
    # The digest is the first field of the checksum file.
    expected="$(awk 'NR == 1 { print $1 }' "${tmpdir}/${tarball}.sha256" || true)"
    actual="$(sha256sum "${tmpdir}/${tarball}" | awk '{ print $1 }' || true)"
    if [[ -z "${expected}" || "${expected}" != "${actual}" ]]; then
      warn "Helm checksum mismatch (expected '${expected}', got '${actual}')"
    elif tar -xzf "${tmpdir}/${tarball}" -C "${tmpdir}" \
      && install -m 0755 "${tmpdir}/linux-${arch}/helm" /usr/local/bin/helm; then
      installed=true
    fi
  fi

  rm -rf -- "${tmpdir}"
  [[ "${installed}" == "true" ]] \
    || die "Failed to download, verify, or install Helm ${HELM_VERSION}"
  helm version
}
# ---------- Step 6: install RKE2 ----------
install_rke2() {
  # Run the upstream RKE2 installer script, selecting channel, install method
  # and version through the INSTALL_RKE2_* environment variables it reads.
  log "Installing RKE2 ${RKE2_VERSION}"
  export INSTALL_RKE2_CHANNEL="${RKE2_CHANNEL}" \
    INSTALL_RKE2_METHOD="tar" \
    INSTALL_RKE2_VERSION="${RKE2_VERSION}"
  curl -sfL https://get.rke2.io | sh -
}
# ---------- Step 7: configure RKE2 ----------
configure_rke2() {
  # Generate the RKE2 server configuration file.
  # Arguments: $1 - config path (optional, default /etc/rancher/rke2/config.yaml)
  # Globals read: RKE2_TOKEN, CLUSTER_CIDR, SERVICE_CIDR, CLUSTER_DNS,
  #               CNI_PLUGIN, RANCHER_HOSTNAME, DISABLE_RKE2_INGRESS
  # Exits via die when neither a node address nor a node name can be found.
  #
  # Fixes over the previous version:
  # - RANCHER_HOSTNAME is written inside the tls-san list. It used to be
  #   appended to the end of the file, where it became an entry of the last
  #   list (kubelet-arg) instead.
  # - Address lookups tolerate failure so the fallbacks actually run under
  #   `set -e -o pipefail`.
  # - The file holds the cluster join token, so it is created with mode 0600.
  local config_path="${1:-/etc/rancher/rke2/config.yaml}"
  log "Writing ${config_path}"
  mkdir -p "$(dirname -- "${config_path}")"

  local node_ip node_name
  # Preferred: source address of the default IPv4 route.
  node_ip="$(ip -4 route get 1.1.1.1 2>/dev/null \
    | awk '{for(i=1;i<=NF;i++) if ($i=="src") {print $(i+1); exit}}' || true)"
  # Fallback: first address reported by `hostname -I`.
  [[ -n "${node_ip}" ]] \
    || node_ip="$(hostname -I 2>/dev/null | awk '{print $1}' || true)"
  # Last resort: the static hostname, used as the tls-san entry.
  [[ -n "${node_ip}" ]] || node_ip="$(hostnamectl --static 2>/dev/null || true)"

  node_name="$(hostnamectl --static 2>/dev/null || true)"
  [[ -n "${node_name}" ]] || node_name="$(uname -n)"

  [[ -n "${node_ip}" ]] || die "Failed to determine node IP"
  [[ -n "${node_name}" ]] || die "Failed to determine node name"

  local -a tls_sans=("${node_ip}")
  [[ -z "${RANCHER_HOSTNAME}" ]] || tls_sans+=("${RANCHER_HOSTNAME}")

  # Create/truncate and restrict permissions before any secret is written.
  : >"${config_path}"
  chmod 600 "${config_path}"
  {
    printf 'token: %s\n' "${RKE2_TOKEN}"
    printf 'write-kubeconfig-mode: "0644"\n'
    printf 'node-name: %s\n' "${node_name}"
    printf 'tls-san:\n'
    printf '  - %s\n' "${tls_sans[@]}"
    printf 'cluster-cidr: %s\n' "${CLUSTER_CIDR}"
    printf 'service-cidr: %s\n' "${SERVICE_CIDR}"
    printf 'cluster-dns: %s\n' "${CLUSTER_DNS}"
    printf 'cni: %s\n' "${CNI_PLUGIN}"
    printf 'etcd-expose-metrics: true\n'
    printf 'kube-apiserver-arg:\n'
    printf '  - anonymous-auth=false\n'
    printf 'kubelet-arg:\n'
    printf '  - protect-kernel-defaults=false\n'
    if [[ "${DISABLE_RKE2_INGRESS}" == "true" ]]; then
      printf 'disable:\n'
      printf '  - rke2-ingress-nginx\n'
    fi
  } >>"${config_path}"
}
# ---------- Step 8: start RKE2 ----------
start_rke2() {
  # Enable and start rke2-server, then wait first for the unit to become
  # active and then for the Kubernetes API to answer. On either timeout,
  # diagnostics are dumped and the script exits via die.
  # Side effects: prepends the RKE2 bin directory to PATH and exports
  # KUBECONFIG for all later steps.
  log "Starting rke2-server"
  systemctl daemon-reload
  systemctl enable --now rke2-server

  log "Waiting for RKE2 server to become active"
  if ! retry 60 5 systemctl is-active --quiet rke2-server; then
    journalctl -u rke2-server --no-pager -n 200 || true
    die "rke2-server did not start successfully"
  fi

  export PATH="/var/lib/rancher/rke2/bin:${PATH}" \
    KUBECONFIG=/etc/rancher/rke2/rke2.yaml

  log "Waiting for Kubernetes API"
  if ! retry 60 5 kubectl get nodes >/dev/null 2>&1; then
    kubectl get pods -A || true
    journalctl -u rke2-server --no-pager -n 200 || true
    die "Kubernetes API did not become ready"
  fi
}
# ---------- Step 9: configure kubectl for root and real user ----------
configure_kubeconfig() {
  # Copy the RKE2 admin kubeconfig to /root/.kube/config and, when the
  # invoking user's home directory exists, to that user's ~/.kube/config.
  # Globals read: REAL_USER, REAL_HOME, REAL_KUBECONFIG_DIR
  #
  # - `install -m 0600` creates each copy with its final permissions, so the
  #   credentials are never briefly readable by other users.
  # - `chown USER:` hands the directory to the user and the user's login
  #   group; the group is not assumed to share the user's name.
  local source_kubeconfig=/etc/rancher/rke2/rke2.yaml
  log "Configuring kubeconfig"

  mkdir -p /root/.kube
  install -m 0600 "${source_kubeconfig}" /root/.kube/config

  if [[ -n "${REAL_HOME}" && -d "${REAL_HOME}" ]]; then
    mkdir -p "${REAL_KUBECONFIG_DIR}"
    install -m 0600 "${source_kubeconfig}" "${REAL_KUBECONFIG_DIR}/config"
    chown -R "${REAL_USER}:" "${REAL_KUBECONFIG_DIR}"
  else
    warn "Could not determine invoking user's home directory; skipping user kubeconfig setup"
  fi
}
# ---------- Step 10: allow scheduling on single-node server if requested ----------
allow_server_scheduling() {
  # When ALLOW_SCHEDULING_ON_SERVER is "true", remove the control-plane and
  # legacy master NoSchedule taints from every node. A failing kubectl call
  # is ignored.
  [[ "${ALLOW_SCHEDULING_ON_SERVER}" == "true" ]] || return 0

  log "Removing control-plane scheduling taints for single-node use"
  local role
  for role in control-plane master; do
    kubectl taint nodes --all "node-role.kubernetes.io/${role}-" || true
  done
}
# ---------- Step 11: wait for core system ----------
wait_for_core_system() {
  # Wait for cluster DNS to roll out, then print node and pod status.
  # RKE2 deploys CoreDNS through its rke2-coredns chart, so the Deployment is
  # named "rke2-coredns-rke2-coredns"; waiting on "deployment/coredns" never
  # succeeds and only burns the whole retry budget.
  # A timeout is reported as a warning and does not abort the installation.
  log "Waiting for core system pods"
  if ! retry 60 5 kubectl -n kube-system rollout status \
    deployment/rke2-coredns-rke2-coredns --timeout=15s; then
    warn "CoreDNS deployment did not become ready in time; continuing"
  fi
  kubectl get nodes -o wide
  echo
  kubectl get pods -A
}
# ---------- Step 12: install ingress-nginx ----------
install_ingress_nginx() {
  # Install or upgrade the ingress-nginx chart as a host-network DaemonSet
  # and wait for its rollout. Skipped unless INSTALL_INGRESS_NGINX is "true".
  # Globals read: INSTALL_INGRESS_NGINX, INGRESS_NAMESPACE, INGRESS_CLASS_NAME
  if [[ "${INSTALL_INGRESS_NGINX}" != "true" ]]; then
    return 0
  fi

  log "Installing ingress-nginx"
  helm_repo_add_force ingress-nginx https://kubernetes.github.io/ingress-nginx
  helm repo update
  kubectl_ns_apply "${INGRESS_NAMESPACE}"

  local -a chart_args=(
    --namespace "${INGRESS_NAMESPACE}"
    --create-namespace
    # Run the controller on the host network.
    --set controller.kind=DaemonSet
    --set controller.hostNetwork=true
    --set controller.dnsPolicy=ClusterFirstWithHostNet
    --set controller.service.type=ClusterIP
    # Make this the default ingress class.
    --set controller.ingressClass="${INGRESS_CLASS_NAME}"
    --set controller.ingressClassResource.name="${INGRESS_CLASS_NAME}"
    --set controller.ingressClassResource.default=true
    --set controller.watchIngressWithoutClass=true
    --set controller.reportNodeInternalIp=true
    --wait
    --timeout 20m
  )
  helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx "${chart_args[@]}"

  kubectl -n "${INGRESS_NAMESPACE}" rollout status \
    daemonset/ingress-nginx-controller --timeout=20m
}
# ---------- Step 13: install cert-manager ----------
install_cert_manager() {
  # Install or upgrade cert-manager (with CRDs) and wait for its three
  # deployments. Skipped when cert-manager or Rancher installation is turned
  # off, or when Rancher's TLS source is "secret".
  # Globals read: INSTALL_CERT_MANAGER, INSTALL_RANCHER, RANCHER_TLS_SOURCE,
  #               CERT_MANAGER_NAMESPACE, CERT_MANAGER_CHART_VERSION
  if [[ "${INSTALL_CERT_MANAGER}" != "true" \
    || "${INSTALL_RANCHER}" != "true" \
    || "${RANCHER_TLS_SOURCE}" == "secret" ]]; then
    return 0
  fi

  log "Installing cert-manager"
  helm_repo_add_force jetstack https://charts.jetstack.io
  helm repo update
  kubectl_ns_apply "${CERT_MANAGER_NAMESPACE}"

  local -a chart_args=(
    --namespace "${CERT_MANAGER_NAMESPACE}"
    --create-namespace
    --version "${CERT_MANAGER_CHART_VERSION}"
    --set crds.enabled=true
    --wait
    --timeout 20m
  )
  helm upgrade --install cert-manager jetstack/cert-manager "${chart_args[@]}"

  local deployment
  for deployment in cert-manager cert-manager-cainjector cert-manager-webhook; do
    kubectl -n "${CERT_MANAGER_NAMESPACE}" rollout status \
      "deployment/${deployment}" --timeout=20m
  done
}
# ---------- Step 14: install Rancher ----------
install_rancher() {
  # Install or upgrade the Rancher chart and wait for its workloads.
  # Skipped unless INSTALL_RANCHER is "true".
  # Arguments: $1 - file that receives the bootstrap password
  #                 (optional, default /root/rancher-bootstrap-password.txt)
  # Globals read: INSTALL_RANCHER, RANCHER_* settings, INGRESS_CLASS_NAME,
  #               LETSENCRYPT_EMAIL
  [[ "${INSTALL_RANCHER}" == "true" ]] || return 0
  local password_file="${1:-/root/rancher-bootstrap-password.txt}"

  log "Installing Rancher"

  # Create the secret file under a restrictive umask so it is never readable
  # by other users, not even briefly. The chmod covers a file left over from
  # an earlier run. printf avoids echo treating a password such as "-n" as
  # an option.
  (
    umask 077
    printf '%s\n' "${RANCHER_BOOTSTRAP_PASSWORD}" >"${password_file}"
  )
  chmod 600 "${password_file}"

  helm_repo_add_force rancher-stable https://releases.rancher.com/server-charts/stable
  helm repo update
  kubectl_ns_apply "${RANCHER_NAMESPACE}"

  # NOTE(review): the bootstrap password is passed on helm's command line and
  # is visible in the process list while helm runs - consider a values file.
  local -a rancher_args=(
    --namespace "${RANCHER_NAMESPACE}"
    --create-namespace
    --version "${RANCHER_CHART_VERSION}"
    --set hostname="${RANCHER_HOSTNAME}"
    --set bootstrapPassword="${RANCHER_BOOTSTRAP_PASSWORD}"
    --set replicas="${RANCHER_REPLICAS}"
    --set ingress.ingressClassName="${INGRESS_CLASS_NAME}"
    --set ingress.tls.source="${RANCHER_TLS_SOURCE}"
    --wait
    --timeout 30m
  )
  if [[ "${RANCHER_PRIVATE_CA}" == "true" ]]; then
    rancher_args+=(--set privateCA=true)
  fi
  if [[ "${RANCHER_TLS_SOURCE}" == "letsEncrypt" ]]; then
    rancher_args+=(--set letsEncrypt.email="${LETSENCRYPT_EMAIL}")
  fi

  helm upgrade --install rancher rancher-stable/rancher "${rancher_args[@]}"
  kubectl -n "${RANCHER_NAMESPACE}" rollout status deployment/rancher --timeout=30m

  # These deployments may not exist; wait for each only when it is present.
  if kubectl -n "${RANCHER_NAMESPACE}" get deployment rancher-webhook >/dev/null 2>&1; then
    kubectl -n "${RANCHER_NAMESPACE}" rollout status deployment/rancher-webhook --timeout=30m
  fi
  if kubectl -n "${RANCHER_NAMESPACE}" get deployment cattle-cluster-agent >/dev/null 2>&1; then
    # Best effort: a slow cluster agent does not fail the installation.
    kubectl -n "${RANCHER_NAMESPACE}" rollout status deployment/cattle-cluster-agent --timeout=30m || true
  fi
}
# ---------- Step 15: install Longhorn ----------
install_longhorn() {
  # Install or upgrade the Longhorn chart and list its pods.
  # Skipped unless INSTALL_LONGHORN is "true".
  # Globals read: INSTALL_LONGHORN, LONGHORN_NAMESPACE,
  #               LONGHORN_CHART_VERSION, LONGHORN_DEFAULT_REPLICA_COUNT
  if [[ "${INSTALL_LONGHORN}" != "true" ]]; then
    return 0
  fi

  log "Installing Longhorn"
  # Start the iSCSI daemon (best effort) before installing the chart.
  systemctl enable --now iscsid || true

  helm_repo_add_force longhorn https://charts.longhorn.io
  helm repo update
  kubectl_ns_apply "${LONGHORN_NAMESPACE}"

  local -a chart_args=(
    --namespace "${LONGHORN_NAMESPACE}"
    --create-namespace
    --version "${LONGHORN_CHART_VERSION}"
    --set defaultSettings.defaultReplicaCount="${LONGHORN_DEFAULT_REPLICA_COUNT}"
    --wait
    --timeout 30m
  )
  helm upgrade --install longhorn longhorn/longhorn "${chart_args[@]}"

  kubectl -n "${LONGHORN_NAMESPACE}" get pods
}
# ---------- Step 16: validation ----------
validate_install() {
  # Print a post-install overview: nodes, all pods, ingresses and storage
  # classes, followed by every resource in the Rancher and Longhorn
  # namespaces when those components were installed. The ingress, storage
  # class and per-namespace listings are best effort.
  log "Validation"
  kubectl get nodes -o wide
  printf '\n'
  kubectl get pods -A
  printf '\n'
  kubectl get ingress -A || true
  printf '\n'
  kubectl get sc || true

  # Namespaces whose contents are dumped, in display order.
  local -a namespaces=()
  [[ "${INSTALL_RANCHER}" != "true" ]] || namespaces+=("${RANCHER_NAMESPACE}")
  [[ "${INSTALL_LONGHORN}" != "true" ]] || namespaces+=("${LONGHORN_NAMESPACE}")

  local ns
  # The ${arr[@]+...} form keeps an empty array safe under `set -u`.
  for ns in ${namespaces[@]+"${namespaces[@]}"}; do
    printf '\n'
    kubectl -n "${ns}" get all || true
  done
}
# ---------- Step 17: save useful info ----------
save_cluster_info() {
  # Write a README listing where the token, kubeconfigs and bootstrap
  # password live, plus the Rancher URL and handy service commands, to
  # /root/rancher-install-artifacts/README.txt.
  # Globals read: REAL_KUBECONFIG_DIR, RANCHER_HOSTNAME
  local artifacts_dir=/root/rancher-install-artifacts

  log "Saving useful cluster information"
  mkdir -p "${artifacts_dir}"
  printf '%s\n' \
    "RKE2 server token:" \
    "/var/lib/rancher/rke2/server/token" \
    "Root kubeconfig:" \
    "/root/.kube/config" \
    "User kubeconfig:" \
    "${REAL_KUBECONFIG_DIR}/config" \
    "Rancher bootstrap password:" \
    "/root/rancher-bootstrap-password.txt" \
    "Rancher URL:" \
    "https://${RANCHER_HOSTNAME}" \
    "Systemd service:" \
    "systemctl status rke2-server --no-pager" \
    "Logs:" \
    "journalctl -u rke2-server -f" \
    "Kubectl bundled with RKE2:" \
    "/var/lib/rancher/rke2/bin/kubectl" \
    >"${artifacts_dir}/README.txt"
}
# ---------- Main ----------
main() {
  # Entry point: check prerequisites, run every installation step in order,
  # then print a summary of where things ended up.
  local tool step

  require_root
  for tool in curl sed awk openssl sha256sum; do
    require_cmd "${tool}"
  done

  # Installation steps, executed strictly in this order.
  local -a steps=(
    validate_inputs
    install_base_packages
    disable_swap
    configure_kernel_networking
    handle_firewall
    install_helm
    install_rke2
    configure_rke2
    start_rke2
    configure_kubeconfig
    allow_server_scheduling
    wait_for_core_system
    install_ingress_nginx
    install_cert_manager
    install_rancher
    install_longhorn
    validate_install
    save_cluster_info
  )
  for step in "${steps[@]}"; do
    "${step}"
  done

  printf '%s\n' \
    "" \
    "RKE2 management cluster installation is complete." \
    "" \
    "RKE2 version:" \
    " ${RKE2_VERSION}" \
    "" \
    "Rancher URL:" \
    " https://${RANCHER_HOSTNAME}" \
    "" \
    "Bootstrap password file:" \
    " /root/rancher-bootstrap-password.txt" \
    "" \
    "RKE2 server token:" \
    " /var/lib/rancher/rke2/server/token" \
    "" \
    "kubectl configured for:" \
    " root: /root/.kube/config" \
    " ${REAL_USER}: ${REAL_KUBECONFIG_DIR}/config" \
    ""
  if [[ "${INSTALL_LONGHORN}" == "true" ]]; then
    printf '%s\n' "Longhorn is installed for persistent volumes." ""
  fi
  printf '%s\n' \
    "Next step:" \
    " Log into Rancher and create/import downstream clusters there." \
    ""
}
# Script entry point: forward all command-line arguments to main.
main "$@"