# kubernetes-arch-install/rancher_no_work_master_node...
#
# 541 lines
# 17 KiB
# Bash
#
#!/usr/bin/env bash
# Strict mode: -E lets the ERR trap fire inside functions and subshells,
# -e aborts on an unhandled failing command, -u rejects unset variables and
# pipefail makes a pipeline fail when any of its stages fails.
set -Eeuo pipefail
########################################
# Arch Linux Kubernetes Control Plane
# Fully automated master node installer
# + Official Kubernetes binaries pinned to 1.34.x
# + Helm
# + ingress-nginx
# + cert-manager
# + Rancher
#
# Usage: run as root (the script refuses to continue otherwise), e.g. via sudo.
# Most settings in the Config section (for example K8S_VERSION, POD_CIDR,
# INSTALL_RANCHER, RANCHER_HOSTNAME) take their value from an environment
# variable of the same name when one is set.
########################################
# ---------- Config ----------
# Overridable settings keep whatever the environment provides and otherwise
# fall back to the default given after ":=".
: "${POD_CIDR:=192.168.0.0/16}"
: "${CALICO_VERSION:=v3.31.4}"
# Rancher-compatible Kubernetes version
: "${K8S_VERSION:=v1.34.6}"
K8S_SERIES_REGEX='^v1\.34\.[0-9]+$'
: "${K8S_ARCH:=amd64}"
KUBECONFIG_DIR_ROOT="/root/.kube"
JOIN_COMMAND_FILE="/root/kubeadm-join-command.sh"
: "${INSTALL_HELM:=true}"
: "${INSTALL_RANCHER:=true}"
: "${ALLOW_WORKLOADS_ON_CONTROL_PLANE:=true}"

# Rancher settings
: "${RANCHER_REPO_CHANNEL:=stable}"   # stable | latest | alpha
: "${RANCHER_BOOTSTRAP_PASSWORD:=}"
: "${RANCHER_HOSTNAME:=}"             # auto -> rancher.<NODE_IP>.sslip.io
: "${RANCHER_REPLICAS:=1}"
: "${RANCHER_NAMESPACE:=cattle-system}"

# ingress-nginx settings
: "${INGRESS_NAMESPACE:=ingress-nginx}"
: "${INGRESS_CLASS_NAME:=nginx}"

# cert-manager settings
: "${CERT_MANAGER_NAMESPACE:=cert-manager}"

# Binary locations
KUBEADM_BIN="/usr/local/bin/kubeadm"
KUBECTL_BIN="/usr/local/bin/kubectl"
KUBELET_BIN="/usr/local/bin/kubelet"

# Detect the real invoking user when run with sudo; the home directory comes
# from the passwd database and defaults to /root when the lookup yields nothing.
REAL_USER="${SUDO_USER:-root}"
REAL_HOME="$(getent passwd "${REAL_USER}" | cut -d: -f6 || true)"
if [[ -z "${REAL_HOME}" ]]; then
  REAL_HOME="/root"
fi
REAL_KUBECONFIG_DIR="${REAL_HOME}/.kube"
# ---------- Logging ----------
#######################################
# Print an informational banner on stdout: a blank line, then the message
# (all arguments joined by spaces) framed by two rule lines.
# Arguments: $@ - message words
#######################################
log() {
  local rule="============================================================"
  printf '\n%s\n[INFO] %s\n%s\n' "${rule}" "$*" "${rule}"
}
#######################################
# Print a warning on stderr, preceded by a blank separator line.
# Both lines go to stderr so that redirecting or capturing stdout (as done for
# commands wrapped by retry) neither loses nor splits the warning.
# Arguments: $@ - message words, joined by spaces
#######################################
warn() {
  printf '\n[WARN] %s\n' "$*" >&2
}
#######################################
# Print an error on stderr (preceded by a blank separator line, also on
# stderr) and terminate the script with exit status 1.
# Arguments: $@ - message words, joined by spaces
#######################################
die() {
  printf '\n[ERROR] %s\n' "$*" >&2
  exit 1
}
# ---------- Helpers ----------
#######################################
# Abort via die unless the given command name or path resolves to something
# executable according to `command -v`.
# Arguments: $1 - command name or path
#######################################
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "Required command not found: $1"
  fi
}
#######################################
# Run a command until it succeeds or the attempt budget is used up.
# Arguments: $1 - maximum number of attempts (default 10)
#            $2 - seconds to sleep between attempts (default 5)
#            $3.. - command and its arguments
# Returns:   0 as soon as the command succeeds, 1 once the last attempt failed
#######################################
retry() {
  local max_tries="${1:-10}"
  local pause="${2:-5}"
  shift 2 || true

  local attempt
  for (( attempt = 1; ; attempt++ )); do
    if "$@"; then
      return 0
    fi
    # No warning and no sleep after the final failed attempt.
    if (( attempt >= max_tries )); then
      return 1
    fi
    warn "Command failed (attempt ${attempt}/${max_tries}): $*"
    sleep "${pause}"
  done
}
#######################################
# Register a Helm chart repository, overwriting an existing entry of the same
# name.
#
# `helm repo add --force-update` handles both the "new" and the "already
# present" case, so no prior `helm repo list` lookup is needed. The previous
# lookup piped into `grep -qx` treated the name as a regular expression and,
# under pipefail, could misreport an existing repo as missing.
# Arguments: $1 - repository name
#            $2 - repository URL
# Returns:   exit status of `helm repo add`
#######################################
helm_repo_add_force() {
  local name="$1"
  local url="$2"
  helm repo add "${name}" "${url}" --force-update >/dev/null
}
#######################################
# Create a namespace if it is missing: render the Namespace manifest
# client-side and feed it to `kubectl apply`.
# Globals:   KUBECTL_BIN (read)
# Arguments: $1 - namespace name
#######################################
kubectl_ns_apply() {
  local namespace="$1"
  local -a render_cmd=("${KUBECTL_BIN}" create namespace "${namespace}" --dry-run=client -o yaml)
  local -a apply_cmd=("${KUBECTL_BIN}" apply -f -)

  "${render_cmd[@]}" | "${apply_cmd[@]}"
}
#######################################
# Download one official Kubernetes binary, verify it against the published
# SHA-256 digest and install it into /usr/local/bin.
# Globals:   K8S_VERSION, K8S_ARCH (read)
# Arguments: $1 - binary name (the script passes kubeadm, kubectl, kubelet)
# Returns:   0 on success; otherwise calls die with a message naming the
#            step that failed. The temporary download directory is removed on
#            every path, including failures.
#######################################
download_k8s_binary() {
  local name="$1"
  local base_url="https://dl.k8s.io/release/${K8S_VERSION}/bin/linux/${K8S_ARCH}"
  local tmpdir
  local failure=""

  tmpdir="$(mktemp -d)" || die "Could not create a temporary directory for ${name}"

  # Each step is tested explicitly so that a failure is recorded instead of
  # aborting immediately; that way the cleanup below always runs.
  if ! curl -fsSL -o "${tmpdir}/${name}" "${base_url}/${name}"; then
    failure="Download failed for ${name} ${K8S_VERSION}"
  elif ! curl -fsSL -o "${tmpdir}/${name}.sha256" "${base_url}/${name}.sha256"; then
    failure="Checksum download failed for ${name} ${K8S_VERSION}"
  elif ! (
    # sha256sum resolves the file name relative to the current directory.
    # "<digest><two spaces><name>" is the canonical checksum-line format.
    cd "${tmpdir}" \
      && printf '%s  %s\n' "$(<"${name}.sha256")" "${name}" | sha256sum --check --status
  ); then
    failure="Checksum verification failed for ${name} ${K8S_VERSION}"
  elif ! install -o root -g root -m 0755 "${tmpdir}/${name}" "/usr/local/bin/${name}"; then
    failure="Could not install ${name} into /usr/local/bin"
  fi

  rm -rf -- "${tmpdir}"

  if [[ -n "${failure}" ]]; then
    die "${failure}"
  fi
}
#######################################
# Write the kubelet systemd unit and its kubeadm drop-in.
# Creates the drop-in directory and an (initially empty) /etc/default/kubelet,
# then overwrites kubelet.service and kubelet.service.d/10-kubeadm.conf.
# The caller is expected to run `systemctl daemon-reload` afterwards.
#######################################
install_kubelet_service() {
  local unit_file="/etc/systemd/system/kubelet.service"
  local dropin_dir="${unit_file}.d"

  log "Installing kubelet systemd service"
  mkdir -p "${dropin_dir}"
  touch /etc/default/kubelet

  # Base unit: a plain kubelet that is tied to containerd and restarts forever.
  cat >"${unit_file}" <<'EOF'
[Unit]
Description=kubelet: The Kubernetes Node Agent
Documentation=https://kubernetes.io/docs/
After=containerd.service network-online.target
Wants=network-online.target
Requires=containerd.service
[Service]
ExecStart=/usr/local/bin/kubelet
Restart=always
StartLimitInterval=0
RestartSec=10
[Install]
WantedBy=multi-user.target
EOF

  # Drop-in: clears ExecStart and re-declares it with the kubeconfig/config
  # arguments plus whatever the two optional environment files provide.
  cat >"${dropin_dir}/10-kubeadm.conf" <<'EOF'
[Service]
Environment="KUBELET_KUBECONFIG_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf"
Environment="KUBELET_CONFIG_ARGS=--config=/var/lib/kubelet/config.yaml"
EnvironmentFile=-/var/lib/kubelet/kubeadm-flags.env
EnvironmentFile=-/etc/default/kubelet
ExecStart=
ExecStart=/usr/local/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_CONFIG_ARGS $KUBELET_KUBEADM_ARGS $KUBELET_EXTRA_ARGS
EOF
}
#######################################
# Print the gitVersion of the API server described by
# /etc/kubernetes/admin.conf, or nothing when that file does not exist or the
# server version is absent from kubectl's JSON output.
# Globals:   KUBECTL_BIN (read)
# Outputs:   version string on stdout (possibly empty)
#######################################
existing_cluster_version() {
  local admin_conf="/etc/kubernetes/admin.conf"

  [[ -f "${admin_conf}" ]] || return 0

  "${KUBECTL_BIN}" --kubeconfig="${admin_conf}" version -o json 2>/dev/null \
    | jq -r '.serverVersion.gitVersion // empty'
}
#######################################
# Abort via die unless K8S_VERSION matches the supported release series.
# Globals:   K8S_VERSION, K8S_SERIES_REGEX (read)
#######################################
ensure_rancher_supported_k8s() {
  if [[ ! "${K8S_VERSION}" =~ ${K8S_SERIES_REGEX} ]]; then
    die "Rancher is enabled, but K8S_VERSION=${K8S_VERSION} is not a 1.34.x release. Set K8S_VERSION to a supported 1.34.x patch release."
  fi
}
# ---------- Root check ----------
# Everything that follows changes system state, so refuse to run unprivileged.
(( EUID == 0 )) || die "Run this script as root, for example: sudo ./master_node_install.sh"
# ---------- Cleanup on error ----------
#######################################
# ERR trap handler: report where the script failed, list commands that help
# with diagnosis and exit with the status of the command that failed.
# The whole report goes to stderr so it stays together when stdout is
# redirected or captured.
# Globals:   KUBECTL_BIN (read)
# Arguments: $1 - line number on which the failure happened
#######################################
on_error() {
  # Must be the first statement: any other command would overwrite $?.
  local exit_code=$?
  warn "Script failed on line $1 with exit code ${exit_code}"
  warn "Useful diagnostics:"
  {
    echo " journalctl -u containerd -u kubelet -b --no-pager | tail -n 200"
    echo " systemctl status containerd kubelet --no-pager"
    echo " ${KUBECTL_BIN} get nodes -o wide"
    echo " ${KUBECTL_BIN} get pods -A"
  } >&2
  exit "${exit_code}"
}
trap 'on_error $LINENO' ERR
# Fail fast, before any system change, when Rancher is requested together
# with a Kubernetes version outside the supported series.
[[ "${INSTALL_RANCHER}" != "true" ]] || ensure_rancher_supported_k8s
# ---------- Step 1: Disable swap ----------
log "Disabling swap immediately"
# Best effort: a failure here (e.g. no swap configured) must not stop the run.
swapoff -a || true
log "Disabling swap persistently in /etc/fstab"
if [[ -f /etc/fstab ]]; then
  # Keep a timestamped backup before editing in place.
  cp /etc/fstab "/etc/fstab.bak.$(date +%Y%m%d%H%M%S)"
  # Comment out swap entries. Lines that are already comments are skipped so
  # that re-running the script does not stack another
  # "# DISABLED FOR KUBERNETES: " prefix onto them.
  sed -ri '/^\s*#/!{/\sswap\s/s/^/# DISABLED FOR KUBERNETES: /}' /etc/fstab
fi
# ---------- Step 2: Update system ----------
log "Updating package databases and system packages"
pacman --sync --refresh --sysupgrade --noconfirm

# ---------- Step 3: Resolve iptables conflict automatically ----------
log "Resolving iptables backend for Kubernetes"
if pacman --query iptables >/dev/null 2>&1; then
  log "Removing legacy iptables package so iptables-nft can be installed"
  # Dependency checks are skipped on purpose; iptables-nft is installed next.
  pacman --remove --nodeps --nodeps --noconfirm iptables || true
fi

# ---------- Step 4: Install required Arch packages ----------
log "Installing runtime and support packages from Arch"
ARCH_PACKAGES=(
  ca-certificates
  curl
  containerd
  cni-plugins
  crictl
  ethtool
  iptables-nft
  conntrack-tools
  socat
  tar
  gzip
  jq
  openssl
  helm
)
pacman --sync --needed --noconfirm "${ARCH_PACKAGES[@]}"

# ---------- Step 5: Remove Arch Kubernetes packages if present ----------
# The pinned upstream binaries installed later replace these packages.
log "Removing Arch-provided kubeadm/kubectl/kubelet if present"
for pkg in kubeadm kubectl kubelet; do
  pacman --query "${pkg}" >/dev/null 2>&1 || continue
  pacman --remove --nodeps --nodeps --noconfirm "${pkg}" || true
done
# ---------- Step 6: Install pinned Kubernetes binaries ----------
log "Installing Kubernetes binaries ${K8S_VERSION}"
for k8s_binary in kubeadm kubectl kubelet; do
  download_k8s_binary "${k8s_binary}"
done
# Confirm every binary is resolvable at its configured path before going on.
for k8s_binary_path in "${KUBEADM_BIN}" "${KUBECTL_BIN}" "${KUBELET_BIN}"; do
  require_cmd "${k8s_binary_path}"
done
# ---------- Step 7: Kernel modules ----------
log "Configuring required kernel modules"
# Persist the module list for future boots, then load the modules right away.
printf '%s\n' overlay br_netfilter >/etc/modules-load.d/k8s.conf
for kernel_module in overlay br_netfilter; do
  modprobe "${kernel_module}"
done

# ---------- Step 8: Sysctl ----------
log "Configuring Kubernetes sysctl settings"
printf '%s\n' \
  'net.bridge.bridge-nf-call-iptables = 1' \
  'net.bridge.bridge-nf-call-ip6tables = 1' \
  'net.ipv4.ip_forward = 1' \
  >/etc/sysctl.d/99-kubernetes-cri.conf
# Re-read all sysctl configuration files so the new values apply immediately.
sysctl --system
# ---------- Step 9: containerd config ----------
log "Configuring containerd"
mkdir -p /etc/containerd
if [[ ! -f /etc/containerd/config.toml ]]; then
  # No config yet: start from containerd's built-in defaults.
  containerd config default >/etc/containerd/config.toml
else
  # Existing config: keep a timestamped backup before editing it in place.
  cp /etc/containerd/config.toml "/etc/containerd/config.toml.bak.$(date +%Y%m%d%H%M%S)"
fi
# Switch the runc cgroup driver to systemd; the capture group preserves the
# line's original indentation.
sed -ri 's/^(\s*)SystemdCgroup = false/\1SystemdCgroup = true/' /etc/containerd/config.toml
# The substitution is a silent no-op when the key is missing, so say so
# instead of continuing with a cgroup driver that may not match the kubelet.
if ! grep -Eq '^\s*SystemdCgroup = true' /etc/containerd/config.toml; then
  warn "SystemdCgroup = true is not set in /etc/containerd/config.toml; verify the containerd cgroup driver manually"
fi
# ---------- Step 10: kubelet service ----------
install_kubelet_service

# ---------- Step 11: Enable services ----------
log "Enabling and starting containerd and kubelet"
systemctl daemon-reload
systemctl enable containerd
# Restart instead of "enable --now": an already-running containerd would
# otherwise keep its old configuration and ignore the edits made to
# /etc/containerd/config.toml earlier in this script.
systemctl restart containerd
systemctl enable --now kubelet

# ---------- Step 12: Wait for containerd ----------
log "Waiting for containerd to become active"
# Poll once per second, for at most 20 seconds.
for _ in {1..20}; do
  if systemctl is-active --quiet containerd; then
    break
  fi
  sleep 1
done
systemctl is-active --quiet containerd || die "containerd did not start successfully"
# ---------- Step 13: Handle existing cluster ----------
# An empty result means no cluster was found (or its version was unreadable).
EXISTING_CLUSTER_VERSION="$(existing_cluster_version || true)"
if [[ -n "${EXISTING_CLUSTER_VERSION}" ]]; then
  log "Detected existing Kubernetes cluster: ${EXISTING_CLUSTER_VERSION}"
  [[ "${EXISTING_CLUSTER_VERSION}" == "${K8S_VERSION}" ]] \
    || die "Existing cluster version is ${EXISTING_CLUSTER_VERSION}, but this script is pinned to ${K8S_VERSION}. Reset/rebuild the cluster before rerunning."
fi

# ---------- Step 14: Pre-pull Kubernetes images ----------
log "Pulling Kubernetes control-plane images"
"${KUBEADM_BIN}" config images pull --kubernetes-version="${K8S_VERSION}"

# ---------- Step 15: Initialize cluster ----------
# The presence of admin.conf is taken as "kubeadm init already ran".
if [[ ! -f /etc/kubernetes/admin.conf ]]; then
  log "Initializing Kubernetes control plane"
  KUBEADM_INIT_ARGS=(
    --kubernetes-version="${K8S_VERSION}"
    --pod-network-cidr="${POD_CIDR}"
  )
  "${KUBEADM_BIN}" init "${KUBEADM_INIT_ARGS[@]}"
else
  warn "/etc/kubernetes/admin.conf already exists; skipping kubeadm init"
fi
# ---------- Step 16: Configure kubectl for root ----------
log "Configuring kubectl for root"
mkdir -p "${KUBECONFIG_DIR_ROOT}"
cp -f /etc/kubernetes/admin.conf "${KUBECONFIG_DIR_ROOT}/config"
chmod 600 "${KUBECONFIG_DIR_ROOT}/config"
# The rest of this script talks to the cluster with the admin credentials.
export KUBECONFIG=/etc/kubernetes/admin.conf

# ---------- Step 17: Configure kubectl for invoking user ----------
if [[ -n "${REAL_HOME}" && -d "${REAL_HOME}" ]]; then
  log "Configuring kubectl for user ${REAL_USER}"
  mkdir -p "${REAL_KUBECONFIG_DIR}"
  cp -f /etc/kubernetes/admin.conf "${REAL_KUBECONFIG_DIR}/config"
  # Use the user's actual primary group; a group named after the user is not
  # guaranteed to exist.
  REAL_GROUP="$(id -gn "${REAL_USER}")"
  chown -R "${REAL_USER}:${REAL_GROUP}" "${REAL_KUBECONFIG_DIR}"
  chmod 600 "${REAL_KUBECONFIG_DIR}/config"
else
  warn "Could not determine invoking user's home directory; skipping user kubeconfig setup"
fi
# ---------- Step 18: Wait for API ----------
# Runs before the version check: querying the server version only makes sense
# once the API answers (it may still be starting, e.g. on a re-run where
# kubeadm init was skipped).
log "Waiting for Kubernetes API to become responsive"
retry 60 5 "${KUBECTL_BIN}" version --request-timeout=10s >/dev/null

# ---------- Step 19: Verify cluster version ----------
log "Verifying Kubernetes server version"
SERVER_VERSION="$("${KUBECTL_BIN}" version -o json | jq -r '.serverVersion.gitVersion')"
if [[ ! "${SERVER_VERSION}" =~ ${K8S_SERIES_REGEX} ]]; then
  # The series restriction exists for Rancher, so it is fatal only when
  # Rancher is going to be installed (matching the up-front K8S_VERSION check).
  if [[ "${INSTALL_RANCHER}" == "true" ]]; then
    die "Cluster server version ${SERVER_VERSION} is not a supported 1.34.x release for this Rancher workflow."
  fi
  warn "Cluster server version ${SERVER_VERSION} is outside the 1.34.x series; continuing because Rancher is not being installed."
fi
# ---------- Step 20: Optionally allow workloads on control-plane ----------
if [[ "${ALLOW_WORKLOADS_ON_CONTROL_PLANE}" == "true" ]]; then
  log "Allowing workloads on the control-plane node (single-node/lab mode)"
  # Removing a taint that is not present fails; that is expected and ignored.
  for taint_key in node-role.kubernetes.io/control-plane node-role.kubernetes.io/master; do
    "${KUBECTL_BIN}" taint nodes --all "${taint_key}-" >/dev/null 2>&1 || true
  done
fi
# ---------- Step 21: Install Calico ----------
log "Installing Calico networking"
CALICO_MANIFEST_URL="https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/calico.yaml"
"${KUBECTL_BIN}" apply -f "${CALICO_MANIFEST_URL}"

# ---------- Step 22: Wait for node readiness ----------
log "Waiting for node(s) to become Ready"
"${KUBECTL_BIN}" wait --for=condition=Ready node --all --timeout=10m

# ---------- Step 23: Wait for Calico ----------
log "Waiting for Calico components"
# Best effort: a slow Calico rollout is reported by kubectl but not fatal.
for calico_workload in daemonset/calico-node deployment/calico-kube-controllers; do
  "${KUBECTL_BIN}" -n kube-system rollout status "${calico_workload}" --timeout=10m || true
done
# ---------- Step 24: Save worker join command ----------
log "Saving worker join command"
# The join command contains a bootstrap token. Create the file with a
# restrictive umask (after removing any older copy) so it is never readable by
# other users, not even briefly before the chmod below.
(
  umask 077
  rm -f -- "${JOIN_COMMAND_FILE}"
  "${KUBEADM_BIN}" token create --print-join-command >"${JOIN_COMMAND_FILE}"
)
chmod 700 "${JOIN_COMMAND_FILE}"
# ---------- Step 25: Determine node info ----------
log "Determining control-plane node information"
# The first node in the list is used as the control-plane node.
NODE_NAME="$("${KUBECTL_BIN}" get nodes -o jsonpath='{.items[0].metadata.name}')"
NODE_IP="$("${KUBECTL_BIN}" get node "${NODE_NAME}" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')"
# A node with several InternalIP entries (e.g. dual-stack) yields them
# separated by spaces; keep only the first so the derived hostname stays valid.
NODE_IP="${NODE_IP%% *}"
if [[ -z "${NODE_NAME}" || -z "${NODE_IP}" ]]; then
  die "Failed to determine node name or node IP"
fi
# Defaults that depend on runtime information.
if [[ -z "${RANCHER_HOSTNAME}" ]]; then
  RANCHER_HOSTNAME="rancher.${NODE_IP}.sslip.io"
fi
if [[ -z "${RANCHER_BOOTSTRAP_PASSWORD}" ]]; then
  # 20 characters from base64 output, with '/' and '+' replaced by letters.
  RANCHER_BOOTSTRAP_PASSWORD="$(openssl rand -base64 24 | tr -d '\n' | tr '/+' 'AB' | cut -c1-20)"
fi
# Write the secret under a restrictive umask so the file is never readable by
# other users; printf (unlike echo) also copes with option-like passwords.
(
  umask 077
  rm -f -- /root/rancher-bootstrap-password.txt
  printf '%s\n' "${RANCHER_BOOTSTRAP_PASSWORD}" >/root/rancher-bootstrap-password.txt
)
chmod 600 /root/rancher-bootstrap-password.txt
# ---------- Step 26: Install Helm repos ----------
# The Rancher stack is installed with Helm and needs RANCHER_CHART, so the
# repositories are configured whenever either flag is enabled. Previously
# INSTALL_HELM=false with INSTALL_RANCHER=true left RANCHER_CHART unset, which
# aborted the Rancher install with an "unbound variable" error.
if [[ "${INSTALL_HELM}" == "true" || "${INSTALL_RANCHER}" == "true" ]]; then
  if [[ "${INSTALL_HELM}" != "true" ]]; then
    warn "INSTALL_HELM=${INSTALL_HELM} is ignored because INSTALL_RANCHER=true requires the Helm repositories"
  fi
  log "Configuring Helm repositories"
  helm_repo_add_force ingress-nginx https://kubernetes.github.io/ingress-nginx
  helm_repo_add_force jetstack https://charts.jetstack.io
  # Repository name, URL and chart reference all derive from the channel.
  case "${RANCHER_REPO_CHANNEL}" in
    stable | latest | alpha)
      helm_repo_add_force "rancher-${RANCHER_REPO_CHANNEL}" \
        "https://releases.rancher.com/server-charts/${RANCHER_REPO_CHANNEL}"
      RANCHER_CHART="rancher-${RANCHER_REPO_CHANNEL}/rancher"
      ;;
    *)
      die "Invalid RANCHER_REPO_CHANNEL: ${RANCHER_REPO_CHANNEL} (expected: stable, latest, alpha)"
      ;;
  esac
  helm repo update
fi
# ---------- Step 27: Install ingress-nginx ----------
if [[ "${INSTALL_RANCHER}" == "true" ]]; then
  log "Installing ingress-nginx"
  kubectl_ns_apply "${INGRESS_NAMESPACE}"
  # Controller runs as a host-network DaemonSet behind a ClusterIP service and
  # registers the configured class as the default IngressClass.
  INGRESS_HELM_ARGS=(
    --namespace "${INGRESS_NAMESPACE}"
    --create-namespace
    --set controller.kind=DaemonSet
    --set controller.hostNetwork=true
    --set controller.dnsPolicy=ClusterFirstWithHostNet
    --set controller.service.type=ClusterIP
    --set controller.ingressClass="${INGRESS_CLASS_NAME}"
    --set controller.ingressClassResource.name="${INGRESS_CLASS_NAME}"
    --set controller.ingressClassResource.default=true
    --set controller.watchIngressWithoutClass=true
    --set controller.reportNodeInternalIp=true
    --wait
    --timeout 15m
  )
  helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx "${INGRESS_HELM_ARGS[@]}"
  log "Waiting for ingress-nginx controller"
  "${KUBECTL_BIN}" -n "${INGRESS_NAMESPACE}" rollout status daemonset/ingress-nginx-controller --timeout=15m
fi
# ---------- Step 28: Install cert-manager ----------
if [[ "${INSTALL_RANCHER}" == "true" ]]; then
  log "Installing cert-manager"
  kubectl_ns_apply "${CERT_MANAGER_NAMESPACE}"
  CERT_MANAGER_HELM_ARGS=(
    --namespace "${CERT_MANAGER_NAMESPACE}"
    --create-namespace
    --set crds.enabled=true
    --wait
    --timeout 15m
  )
  helm upgrade --install cert-manager jetstack/cert-manager "${CERT_MANAGER_HELM_ARGS[@]}"
  log "Waiting for cert-manager deployments"
  # All three deployments must finish rolling out; a failure here is fatal.
  for cert_manager_deployment in cert-manager cert-manager-cainjector cert-manager-webhook; do
    "${KUBECTL_BIN}" -n "${CERT_MANAGER_NAMESPACE}" rollout status "deployment/${cert_manager_deployment}" --timeout=15m
  done
fi
# ---------- Step 29: Install Rancher ----------
if [[ "${INSTALL_RANCHER}" == "true" ]]; then
  log "Installing Rancher"
  kubectl_ns_apply "${RANCHER_NAMESPACE}"
  RANCHER_HELM_ARGS=(
    --namespace "${RANCHER_NAMESPACE}"
    --create-namespace
    --set hostname="${RANCHER_HOSTNAME}"
    --set bootstrapPassword="${RANCHER_BOOTSTRAP_PASSWORD}"
    --set replicas="${RANCHER_REPLICAS}"
    --set ingress.ingressClassName="${INGRESS_CLASS_NAME}"
    --set ingress.tls.source=rancher
    --wait
    --timeout 20m
  )
  helm upgrade --install rancher "${RANCHER_CHART}" "${RANCHER_HELM_ARGS[@]}"
  log "Waiting for Rancher rollout"
  # Best effort: helm already waited, so a slow rollout here is not fatal.
  "${KUBECTL_BIN}" -n "${RANCHER_NAMESPACE}" rollout status deployment/rancher --timeout=20m || true
fi
# ---------- Step 30: Show cluster status ----------
log "Cluster status"
# Purely informational: none of these queries may fail the run.
"${KUBECTL_BIN}" get nodes -o wide || true
echo
"${KUBECTL_BIN}" get pods -A || true
echo
"${KUBECTL_BIN}" get ingress -A || true

# ---------- Final output ----------
cat <<EOF

Kubernetes control plane installation is complete.

Pinned Kubernetes version:
 ${K8S_VERSION}

kubectl configured for:
 root: ${KUBECONFIG_DIR_ROOT}/config
 ${REAL_USER}: ${REAL_KUBECONFIG_DIR}/config

Worker join command saved to:
 ${JOIN_COMMAND_FILE}

To view it:
 sudo cat ${JOIN_COMMAND_FILE}

EOF
if [[ "${INSTALL_RANCHER}" == "true" ]]; then
  cat <<EOF
Rancher install completed.

Rancher URL:
 https://${RANCHER_HOSTNAME}

Rancher bootstrap password saved to:
 /root/rancher-bootstrap-password.txt

To view it:
 sudo cat /root/rancher-bootstrap-password.txt

Notes:
 - Rancher is using a 1.34.x Kubernetes control plane on purpose for compatibility.
 - ingress-nginx is using host networking, so access Rancher directly on this node's IP over 443.
 - If a local firewall is enabled, ensure ports 80 and 443 are allowed.
 - Rancher-generated TLS will usually produce a browser warning until you trust the cert.

EOF
fi