阅读量:1
kubernetes/k8s 重启实体服务器后集群启动异常:The connection to the server xxx:6443 was refused - did you specify the right host or port? 故障排错
重启服务器后集群未能正常启动的原因排查
问题描述
重启服务器后发现 k8s 集群无法正常启动,kubectl 命令无法连接 API Server
[root@master01 ~]# kubectl get nodes E0123 13:17:10.504376 2675 memcache.go:265] couldn't get current server API group list: Get "https://192.168.26.222:16443/api?timeout=32s": proxyconnect tcp: dial tcp 192.168.0.112:4780: connect: connection refused E0123 13:17:12.507567 2675 memcache.go:265] couldn't get current server API group list: Get "https://192.168.26.222:16443/api?timeout=32s": proxyconnect tcp: dial tcp 192.168.0.112:4780: connect: connection refused E0123 13:17:14.510613 2675 memcache.go:265] couldn't get current server API group list: Get "https://192.168.26.222:16443/api?timeout=32s": proxyconnect tcp: dial tcp 192.168.0.112:4780: connect: connection refused E0123 13:17:16.513368 2675 memcache.go:265] couldn't get current server API group list: Get "https://192.168.26.222:16443/api?timeout=32s": proxyconnect tcp: dial tcp 192.168.0.112:4780: connect: connection refused E0123 13:17:18.517088 2675 memcache.go:265] couldn't get current server API group list: Get "https://192.168.26.222:16443/api?timeout=32s": proxyconnect tcp: dial tcp 192.168.0.112:4780: connect: connection refused The connection to the server 192.168.26.222:16443 was refused - did you specify the right host or port? #2024/1/23追加更新:proxyconnect tcp: dial tcp 192.168.0.112:4780: connect: connection refused 表示代理连接失败,需要先启动代理服务器,或清除环境变量中的代理配置(如 http_proxy/https_proxy),再进行其他 troubleshooting 操作。有概率代理问题解决后集群即恢复正常。
#查看kubelet运行状态,状态正常 [root@master01 ~]# systemctl status kubelet ● kubelet.service - kubelet: The Kubernetes Node Agent Loaded: loaded (/usr/lib/systemd/system/kubelet.service; enabled; preset: disabled) Drop-In: /usr/lib/systemd/system/kubelet.service.d └─10-kubeadm.conf Active: active (running) since Sun 2024-01-21 16:52:39 CST; 5min ago Docs: https://kubernetes.io/docs/ Main PID: 20089 (kubelet) Tasks: 15 (limit: 36168) Memory: 44.0M CPU: 49.245s CGroup: /system.slice/kubelet.service └─20089 /usr/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf --config=/var/lib/kubelet/config.yaml --container-runtime-endpoint=> Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.313771 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"policysync\" (UniqueName: \"kubernetes.io> Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.314001 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"etcd-certs\" (UniqueName: \"kubernetes.io> Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315248 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"kubeconfig\" (UniqueName: \"kubernetes.io> Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315365 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"ca-certs\" (UniqueName: \"kubernetes.io/h> Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315454 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"cni-bin-dir\" (UniqueName: \"kubernetes.i> Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315568 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"etcd-data\" (UniqueName: \"kubernetes.io/> Jan 21 16:52:50 master01 kubelet[20089]: I0121 
16:52:50.315638 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"k8s-certs\" (UniqueName: \"kubernetes.io/> Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315699 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"lib-modules\" (UniqueName: \"kubernetes.i> Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315763 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"sys-fs\" (UniqueName: \"kubernetes.io/hos> Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315827 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"cni-log-dir\" (UniqueName: \"kubernetes.i> --- #重启后查看kubelet的log,未发现异常 [root@master01 ~]# journalctl -fu kubelet Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.313771 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"policysync\" (UniqueName: \"kubernetes.io/host-path/09898e3e-86cd-4e17-b65b-d9316456b924-policysync\") pod \"calico-node-g82pm\" (UID: \"09898e3e-86cd-4e17-b65b-d9316456b924\") " pod="kube-system/calico-node-g82pm" Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.314001 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"etcd-certs\" (UniqueName: \"kubernetes.io/host-path/d84a5bdd5adeecdea8774912b5490e5a-etcd-certs\") pod \"etcd-master01\" (UID: \"d84a5bdd5adeecdea8774912b5490e5a\") " pod="kube-system/etcd-master01" Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315248 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"kubeconfig\" (UniqueName: \"kubernetes.io/host-path/74f278afe29fec416b9250c4a8bd9891-kubeconfig\") pod \"kube-controller-manager-master01\" (UID: \"74f278afe29fec416b9250c4a8bd9891\") " 
pod="kube-system/kube-controller-manager-master01" Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315365 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"ca-certs\" (UniqueName: \"kubernetes.io/host-path/dba9e9d8eaf89a013e528eca0851754b-ca-certs\") pod \"kube-apiserver-master01\" (UID: \"dba9e9d8eaf89a013e528eca0851754b\") " pod="kube-system/kube-apiserver-master01" Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315454 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"cni-bin-dir\" (UniqueName: \"kubernetes.io/host-path/09898e3e-86cd-4e17-b65b-d9316456b924-cni-bin-dir\") pod \"calico-node-g82pm\" (UID: \"09898e3e-86cd-4e17-b65b-d9316456b924\") " pod="kube-system/calico-node-g82pm" Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315568 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"etcd-data\" (UniqueName: \"kubernetes.io/host-path/d84a5bdd5adeecdea8774912b5490e5a-etcd-data\") pod \"etcd-master01\" (UID: \"d84a5bdd5adeecdea8774912b5490e5a\") " pod="kube-system/etcd-master01" Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315638 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"k8s-certs\" (UniqueName: \"kubernetes.io/host-path/74f278afe29fec416b9250c4a8bd9891-k8s-certs\") pod \"kube-controller-manager-master01\" (UID: \"74f278afe29fec416b9250c4a8bd9891\") " pod="kube-system/kube-controller-manager-master01" Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315699 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"lib-modules\" (UniqueName: \"kubernetes.io/host-path/c429d334-211c-4183-b759-1244d054050a-lib-modules\") pod \"kube-proxy-xfgcw\" (UID: \"c429d334-211c-4183-b759-1244d054050a\") " pod="kube-system/kube-proxy-xfgcw" Jan 21 16:52:50 master01 
kubelet[20089]: I0121 16:52:50.315763 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"sys-fs\" (UniqueName: \"kubernetes.io/host-path/09898e3e-86cd-4e17-b65b-d9316456b924-sys-fs\") pod \"calico-node-g82pm\" (UID: \"09898e3e-86cd-4e17-b65b-d9316456b924\") " pod="kube-system/calico-node-g82pm" Jan 21 16:52:50 master01 kubelet[20089]: I0121 16:52:50.315827 20089 reconciler_common.go:258] "operationExecutor.VerifyControllerAttachedVolume started for volume \"cni-log-dir\" (UniqueName: \"kubernetes.io/host-path/09898e3e-86cd-4e17-b65b-d9316456b924-cni-log-dir\") pod \"calico-node-g82pm\" (UID: \"09898e3e-86cd-4e17-b65b-d9316456b924\") " pod="kube-system/calico-node-g82pm" --- #查看容器状态,同样未发现异常 [root@master01 ~]# systemctl status containerd.service ● containerd.service - containerd container runtime Loaded: loaded (/etc/systemd/system/containerd.service; enabled; preset: disabled) Active: active (running) since Sun 2024-01-21 16:59:09 CST; 19min ago Docs: https://containerd.io Process: 25997 ExecStartPre=/sbin/modprobe overlay (code=exited, status=0/SUCCESS) Main PID: 25998 (containerd) Tasks: 133 Memory: 253.1M CPU: 2min 5.685s CGroup: /system.slice/containerd.service ├─ 1384 /usr/local/bin/containerd-shim-runc-v2 -namespace k8s.io -id 18119b7321c954ba451f8ebd312f7b87bf71505a8bb6a94869e7826c343e9bc8 -address /run/containerd/containerd.sock ├─ 1385 /usr/local/bin/containerd-shim-runc-v2 -namespace k8s.io -id 97965e2a881156f0f90d4991b47f1706eaee9ce9ecc62953c508b1a179cd0bb8 -address /run/containerd/containerd.sock ├─ 1386 /usr/local/bin/containerd-shim-runc-v2 -namespace k8s.io -id ac5d837f4fe59bab7870a90f0d2df3601330893e5d72f05f02e06007a76a83bd -address /run/containerd/containerd.sock ├─ 1387 /usr/local/bin/containerd-shim-runc-v2 -namespace k8s.io -id ebb860638e85623530c06c2cde62ca1e697f5c0aa3adc4e0fd847bb737b76845 -address /run/containerd/containerd.sock ├─ 2025 /usr/local/bin/containerd-shim-runc-v2 -namespace k8s.io 
-id c70518f72a23c97a69328658ab2023a3ee37b9323e107bd4db225effb7fa3209 -address /run/containerd/containerd.sock ├─ 2053 /usr/local/bin/containerd-shim-runc-v2 -namespace k8s.io -id 9d6b6d456185811f25a848c1919cb540af22120b0594fc325d24d78ffbab9f85 -address /run/containerd/containerd.sock ├─ 2977 /usr/local/bin/containerd-shim-runc-v2 -namespace k8s.io -id 4b8c6466ef3c91987df68c99d526687f77c61571e03fae6ddc3cf93fbc5478fb -address /run/containerd/containerd.sock ├─ 3028 /usr/local/bin/containerd-shim-runc-v2 -namespace k8s.io -id c311156c2ab839567fbc7fdc6a42a125eb470e034533728f975920b42eb995e4 -address /run/containerd/containerd.sock ├─ 3260 /usr/local/bin/containerd-shim-runc-v2 -namespace k8s.io -id dbe43b5f3ea7b5b91aa155368d4a22a234fa6c8a8378219594834c484e2a926f -address /run/containerd/containerd.sock └─25998 /usr/local/bin/containerd Jan 21 17:08:44 master01 containerd[25998]: time="2024-01-21T17:08:44.127760021+08:00" level=info msg="Forcibly stopping sandbox \"f101febe101ffb3ab6096780d50fe82ee3bb65ea0718238ca1d4a8ff87a35299\"" Jan 21 17:08:44 master01 containerd[25998]: time="2024-01-21T17:08:44.127830505+08:00" level=info msg="TearDown network for sandbox \"f101febe101ffb3ab6096780d50fe82ee3bb65ea0718238ca1d4a8ff87a35299\" successf> Jan 21 17:08:44 master01 containerd[25998]: time="2024-01-21T17:08:44.131351745+08:00" level=info msg="RemovePodSandbox \"f101febe101ffb3ab6096780d50fe82ee3bb65ea0718238ca1d4a8ff87a35299\" returns successfully" Jan 21 17:08:44 master01 containerd[25998]: time="2024-01-21T17:08:44.131814988+08:00" level=info msg="StopPodSandbox for \"19f6f66af3bdb85221d012f87f588028052105222dcd6a092ca2e41f1a07c149\"" Jan 21 17:08:44 master01 containerd[25998]: time="2024-01-21T17:08:44.131903009+08:00" level=info msg="TearDown network for sandbox \"19f6f66af3bdb85221d012f87f588028052105222dcd6a092ca2e41f1a07c149\" successf> Jan 21 17:08:44 master01 containerd[25998]: time="2024-01-21T17:08:44.131964250+08:00" level=info msg="StopPodSandbox for 
\"19f6f66af3bdb85221d012f87f588028052105222dcd6a092ca2e41f1a07c149\" returns successful> Jan 21 17:08:44 master01 containerd[25998]: time="2024-01-21T17:08:44.132344600+08:00" level=info msg="RemovePodSandbox for \"19f6f66af3bdb85221d012f87f588028052105222dcd6a092ca2e41f1a07c149\"" Jan 21 17:08:44 master01 containerd[25998]: time="2024-01-21T17:08:44.132387551+08:00" level=info msg="Forcibly stopping sandbox \"19f6f66af3bdb85221d012f87f588028052105222dcd6a092ca2e41f1a07c149\"" Jan 21 17:08:44 master01 containerd[25998]: time="2024-01-21T17:08:44.132465665+08:00" level=info msg="TearDown network for sandbox \"19f6f66af3bdb85221d012f87f588028052105222dcd6a092ca2e41f1a07c149\" successf> Jan 21 17:08:44 master01 containerd[25998]: time="2024-01-21T17:08:44.136367532+08:00" level=info msg="RemovePodSandbox \"19f6f66af3bdb85221d012f87f588028052105222dcd6a092ca2e41f1a07c149\" returns successfully
#查看api-server是否处于监听状态,发现api-server有处于Exited状态的容器 [root@master01 ~]# netstat -pnlt | grep 6443 tcp 0 0 0.0.0.0:16443 0.0.0.0:* LISTEN 808/haproxy tcp6 0 0 :::6443 :::* LISTEN 1596/kube-apiserver [root@master01 ~]# crictl ps -a| grep kube-apiserver cb8cfe6f4d7bd bb5e0dde9054c 13 minutes ago Running kube-apiserver 12 ebb860638e856 kube-apiserver-master01 d4aee3d9d0659 bb5e0dde9054c 25 hours ago Exited kube-apiserver 11 90e52cd1a57e4 kube-apiserver-master01 #查看所有容器的状态,发现所有服务均有处于Exited状态的 [root@master01 ~]# crictl ps -a CONTAINER IMAGE CREATED STATE NAME ATTEMPT POD ID POD 8995520026e8e ead0a4a53df89 23 minutes ago Running coredns 4 dbe43b5f3ea7b coredns-66f779496c-l7l6d 8e696a0669d42 1919f2787fa70 24 minutes ago Running calico-kube-controllers 7 c311156c2ab83 calico-kube-controllers-7ddc4f45bc-wtjnn d88c194cccd24 ead0a4a53df89 24 minutes ago Running coredns 4 4b8c6466ef3c9 coredns-66f779496c-7v6qk 166565dbbd5c6 8065b798a4d67 24 minutes ago Running calico-node 5 9d6b6d4561858 calico-node-g82pm 43c3e857538d5 8065b798a4d67 24 minutes ago Exited mount-bpffs 0 9d6b6d4561858 calico-node-g82pm 78ee636a32117 9dee260ef7f59 24 minutes ago Exited install-cni 0 9d6b6d4561858 calico-node-g82pm b8ef480c04570 9dee260ef7f59 24 minutes ago Exited upgrade-ipam 2 9d6b6d4561858 calico-node-g82pm 498460e788935 ea1030da44aa1 24 minutes ago Running kube-proxy 5 c70518f72a23c kube-proxy-xfgcw a5ba4845dd312 4be79c38a4bab 25 minutes ago Running kube-controller-manager 9 97965e2a88115 kube-controller-manager-master01 efa185f872020 f6f496300a2ae 25 minutes ago Running kube-scheduler 8 18119b7321c95 kube-scheduler-master01 cb8cfe6f4d7bd bb5e0dde9054c 25 minutes ago Running kube-apiserver 12 ebb860638e856 kube-apiserver-master01 8bf8270e4c3c0 73deb9a3f7025 25 minutes ago Running etcd 9 ac5d837f4fe59 etcd-master01 99f87ee9a51ae 8065b798a4d67 25 hours ago Exited calico-node 4 19f6f66af3bdb calico-node-g82pm 8d00ede2e2793 ead0a4a53df89 25 hours ago Exited coredns 3 1deba785ea1d1 
coredns-66f779496c-l7l6d b97bfdec0a863 ead0a4a53df89 25 hours ago Exited coredns 3 772b653ef2b49 coredns-66f779496c-7v6qk 0f9b3e5d9f704 1919f2787fa70 25 hours ago Exited calico-kube-controllers 6 edecedd968a36 calico-kube-controllers-7ddc4f45bc-wtjnn dd028e5b4aa67 ea1030da44aa1 25 hours ago Exited kube-proxy 4 d26d7cb550015 kube-proxy-xfgcw d4aee3d9d0659 bb5e0dde9054c 25 hours ago Exited kube-apiserver 11 90e52cd1a57e4 kube-apiserver-master01 e9f18a382f3e6 4be79c38a4bab 25 hours ago Exited kube-controller-manager 8 f101febe101ff kube-controller-manager-master01 c7f75086e9c20 73deb9a3f7025 25 hours ago Exited etcd 8 5043a22f3f2da etcd-master01 41ef67cc035e0 f6f496300a2ae 25 hours ago Exited kube-scheduler 7 db6aef6293e51 kube-scheduler-master01 #筛选出退出状态的容器并删除 [root@master01 ~]# crictl rm `crictl ps -a |grep Exited|awk '{print $1}'` 43c3e857538d5 78ee636a32117 b8ef480c04570 99f87ee9a51ae 8d00ede2e2793 b97bfdec0a863 0f9b3e5d9f704 dd028e5b4aa67 d4aee3d9d0659 e9f18a382f3e6 c7f75086e9c20 41ef67cc035e0 #再次查看所有容器,退出状态的容器已删除 [root@master01 ~]# crictl ps -a CONTAINER IMAGE CREATED STATE NAME ATTEMPT POD ID POD 8995520026e8e ead0a4a53df89 32 minutes ago Running coredns 4 dbe43b5f3ea7b coredns-66f779496c-l7l6d 8e696a0669d42 1919f2787fa70 32 minutes ago Running calico-kube-controllers 7 c311156c2ab83 calico-kube-controllers-7ddc4f45bc-wtjnn d88c194cccd24 ead0a4a53df89 32 minutes ago Running coredns 4 4b8c6466ef3c9 coredns-66f779496c-7v6qk 166565dbbd5c6 8065b798a4d67 32 minutes ago Running calico-node 5 9d6b6d4561858 calico-node-g82pm 498460e788935 ea1030da44aa1 33 minutes ago Running kube-proxy 5 c70518f72a23c kube-proxy-xfgcw a5ba4845dd312 4be79c38a4bab 34 minutes ago Running kube-controller-manager 9 97965e2a88115 kube-controller-manager-master01 efa185f872020 f6f496300a2ae 34 minutes ago Running kube-scheduler 8 18119b7321c95 kube-scheduler-master01 cb8cfe6f4d7bd bb5e0dde9054c 34 minutes ago Running kube-apiserver 12 ebb860638e856 kube-apiserver-master01 8bf8270e4c3c0 
73deb9a3f7025 34 minutes ago Running etcd 9 ac5d837f4fe59 etcd-master01 #再次运行kubectl命令查看节点信息,发现已正常 [root@master01 ~]# kubectl get nodes NAME STATUS ROLES AGE VERSION master01 Ready control-plane 3d15h v1.28.5 master02 Ready control-plane 3d15h v1.28.5 master03 Ready control-plane 2d1h v1.28.6 worker01 Ready worker 3d15h v1.28.5 worker02 Ready worker 3d15h v1.28.5