diff --git a/master_node_install.sh b/master_node_install.sh index ef9e8a4..4747807 100644 --- a/master_node_install.sh +++ b/master_node_install.sh @@ -10,6 +10,7 @@ set -Eeuo pipefail # - Installs required Arch packages # - Configures kernel modules and sysctl for Kubernetes # - Configures NetworkManager to ignore CNI interfaces +# - Disables host nftables service to avoid breaking RKE2 service routing # - Installs RKE2 server # - Waits for Kubernetes and bundled RKE2 addons to become healthy # - Installs cert-manager @@ -181,7 +182,7 @@ enable_support_services() { systemctl enable --now iscsid.service || true # Do NOT enable nftables.service here. - # On this Arch + RKE2 setup it broke service routing for the cluster IP range. + # On this Arch + RKE2 setup it can break service routing for cluster IPs. systemctl stop nftables.service >/dev/null 2>&1 || true systemctl disable nftables.service >/dev/null 2>&1 || true nft flush ruleset >/dev/null 2>&1 || true @@ -287,15 +288,24 @@ wait_for_system_pods() { kubectl_rke2 get nodes -o wide || true kubectl_rke2 get pods -A || true - # Wait for Canal first because service routing depends on it. + log "Waiting for Canal (CNI) to be fully ready" + waited=0 until kubectl_rke2 -n kube-system get pods -l app=rke2-canal -o json 2>/dev/null | jq -e ' - .items | length > 0 and all(.[]; .status.phase=="Running") + .items + | length > 0 + and all( + .[]; + ( + (.status.containerStatuses // []) | length + ) > 0 + and all(.status.containerStatuses[]; .ready == true) + ) ' >/dev/null; do sleep 5 waited=$((waited + 5)) if (( waited % 30 == 0 )); then - warn "Canal not fully ready yet" + warn "Canal is not fully ready yet" kubectl_rke2 -n kube-system get pods -o wide || true fi if (( waited >= 900 )); then @@ -307,11 +317,21 @@ wait_for_system_pods() { # Give kube-proxy and service routing a moment to settle. sleep 20 - # Wait for bundled addons that Rancher depends on. + log "Waiting for bundled RKE2 addon deployments" + waited=0 - until kubectl_rke2 -n kube-system get deploy rke2-ingress-nginx-controller rke2-metrics-server rke2-snapshot-controller >/dev/null 2>&1; do + until kubectl_rke2 -n kube-system get deploy \ + rke2-ingress-nginx-controller \ + rke2-metrics-server \ + rke2-snapshot-controller \ + rke2-coredns-rke2-coredns >/dev/null 2>&1; do sleep 5 waited=$((waited + 5)) + if (( waited % 30 == 0 )); then + warn "Bundled addon deployments are not all present yet" + kubectl_rke2 -n kube-system get deploy || true + kubectl_rke2 -n kube-system get pods -o wide || true + fi if (( waited >= 900 )); then kubectl_rke2 -n kube-system get pods -o wide || true die "Timed out waiting for bundled RKE2 addon deployments" @@ -321,9 +341,7 @@ wait_for_system_pods() { kubectl_rke2 -n kube-system rollout status deploy/rke2-ingress-nginx-controller --timeout=20m kubectl_rke2 -n kube-system rollout status deploy/rke2-metrics-server --timeout=20m kubectl_rke2 -n kube-system rollout status deploy/rke2-snapshot-controller --timeout=20m - - # CoreDNS can be slightly slower; wait for it too. - kubectl_rke2 -n kube-system rollout status deploy/rke2-coredns-rke2-coredns --timeout=20m || true + kubectl_rke2 -n kube-system rollout status deploy/rke2-coredns-rke2-coredns --timeout=20m log "System pods are settled" kubectl_rke2 get pods -A || true