# 系统巡检脚本 (host inspection script)
#!/bin/bash
# system-check-scripts
# Daily host inspection.
#
# NOTE: the management-network NIC defaults to eth0; export CHECK_NIC=<nic>
#       (or edit the default below) to match the target environment.
# NOTE: the system inspection also invokes the k8s cluster inspection script;
#       on hosts with a k8s container environment (e.g. UC) the standalone
#       k8s-cluster_check_dandu.sh can be run instead.
# NOTE: when inspecting systems other than UC, comment out the
#       k8s-cluster_check call.

# cron starts jobs with a minimal PATH, so set it before the first external
# command is looked up (ifconfig/ip normally live in /sbin or /usr/sbin).
export PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/root/bin
source /etc/profile

# Most probes below (dmidecode, lastlog, ...) need root.
if [[ "$(id -u)" -ne 0 ]]; then
  echo "请用root用户执行此脚本!" >&2
  exit 1
fi

# Management NIC whose IPv4 address identifies this host in the report name.
CHECK_NIC=${CHECK_NIC:-eth0}
# Accept both ifconfig layouts: "inet 10.0.0.1 ..." and "inet addr:10.0.0.1 ...".
IP=$(ifconfig "$CHECK_NIC" 2>/dev/null \
  | awk '$1 == "inet" { sub(/^addr:/, "", $2); print $2; exit }')
# net-tools is not installed everywhere; fall back to iproute2.
if [[ -z "$IP" ]]; then
  IP=$(ip -4 -o addr show dev "$CHECK_NIC" 2>/dev/null \
    | awk '{ split($4, cidr, "/"); print cidr[1]; exit }')
fi

# Kept for compatibility; /etc/redhat-release may not exist on every distro.
centosVersion=$(awk '{print $(NF-1)}' /etc/redhat-release 2>/dev/null)
VERSION="V1.0.1"

# Report location: a "report" directory next to the script.
PROGPATH=$(dirname -- "$0")
LOGPATH="$PROGPATH/report"
mkdir -p -- "$LOGPATH"
RESULTFILE="$LOGPATH/System-Check-$IP-$(date +%Y%m%d).txt"
# Welcome banner: prints the greeting box, including the script version
# taken from the global VERSION.
function Welcome() {
  local -a banner_rows=(
    '################################'
    '| 欢迎使用日常巡检脚本! |'
    '| ^^ ^^ |'
    "| () 脚本版本为:${VERSION} () |"
    '| |'
    '###############################'
  )
  printf '%s\n' "${banner_rows[@]}"
}
# Collects basic host identity and uptime facts and prints the
# "system information" section of the report.
# Globals:  IP (read); OS_* (written, left global as in the original).
# Outputs:  report section on stdout. Probes that are unavailable are
#           reported as 获取信息失败 instead of leaking tool errors.
function OS_INFO() {
  # Host name
  OS_NAME=$(uname -n)
  # OS release: the Kylin dist_id entry from /etc/.kyinfo
  OS_VERSION=$(grep dist_id /etc/.kyinfo 2>/dev/null | grep -oE "Kylin.*")
  OS_VERSION=${OS_VERSION:-获取信息失败}
  # OS type
  OS_TYPE=$(uname)
  # Hardware serial number (first "Serial Number" entry of the system table)
  OS_NUM=$(dmidecode -t system 2>/dev/null | awk '/Serial Number/ {print $3; exit}')
  # Kernel release
  OS_KERNEL=$(uname -r)
  # Machine code: no probe is implemented, so the report shows the fallback
  OS_CODE=""
  # Locale
  OS_LANG=${LANG:-}
  # Current time
  OS_DATE=$(date +"%Y-%m-%d %H:%M:%S")
  # Uptime: text between "up " and the first comma of `uptime`
  OS_UPTIME=$(uptime 2>/dev/null | awk -F',' '{sub(/.*up /,"",$1); print $1}')
  OS_UPTIME=${OS_UPTIME:-获取信息失败}
  # Clock offset against the management IP; skipped when clockdiff or the IP
  # is missing so no usage text ends up in the report
  OS_CLOCK=""
  if [[ -n "${IP:-}" ]] && command -v clockdiff >/dev/null 2>&1; then
    OS_CLOCK=$(clockdiff -o1 "$IP" 2>/dev/null)
  fi
  # Last reboot / last shutdown as recorded by `last`
  OS_LAST_REBOOT=$(last reboot 2>/dev/null | awk 'NR == 1 {print $5,$6,$7,$8,$10}')
  OS_LAST_SHUTDOWN=$(last -x 2>/dev/null | awk '/shutdown/ {print $5,$6,$7,$8,$10; exit}')

  echo "################################# [ 系统信息巡检区 ] ######################################"
  echo "主机名:$OS_NAME"
  echo "主机类型:$OS_TYPE"
  echo "主机序列号:${OS_NUM:-获取信息失败}"
  echo "系统版本:$OS_VERSION"
  echo "系统内核版本:$OS_KERNEL"
  echo "系统机器码:${OS_CODE:-获取信息失败}"
  echo "系统语言环境:${OS_LANG}"
  # The label used ';' where every other line uses ':'.
  echo "系统时间:$OS_DATE"
  echo "系统时区情况:${OS_CLOCK:-获取信息失败}"
  echo "系统已运行时间:$OS_UPTIME"
  echo "系统上次重启时间:${OS_LAST_REBOOT:-获取信息失败}"
  echo "系统上次关机时间:${OS_LAST_SHUTDOWN:-获取信息失败}"
}
# Prints /proc/meminfo field $1 (e.g. MemTotal) converted from kB to GB with
# two decimals; prints 0.00 when the field or the file is unavailable.
_os_hdware_meminfo_gb() {
  local gb
  gb=$(awk -v key="$1:" '$1 == key { printf "%.2f", $2 / 1048576; exit }' \
    /proc/meminfo 2>/dev/null)
  printf '%s' "${gb:-0.00}"
}

# Prints the used share of $1 (total) given $2 (free) as "NN.N%".
# A zero total (e.g. no swap configured) yields 0.0% instead of dividing by 0.
_os_hdware_used_pct() {
  awk -v total="$1" -v free="$2" 'BEGIN {
    if (total + 0 == 0) { print "0.0%"; exit }
    printf "%.1f%%\n", (total - free) / total * 100
  }'
}

# Lists every interface in /proc/net/dev except the loopback device.
_os_hdware_list_nics() {
  awk 'NR > 2 { sub(/:.*/, "", $1); if ($1 != "" && $1 != "lo") print $1 }' \
    /proc/net/dev 2>/dev/null
}

# Collects CPU, memory, swap and NIC facts and prints the "hardware" section.
# Globals:  CPU_*, ME_*, NET_DEVICE (written, left global as in the original).
# Outputs:  report section on stdout, one line per NIC at the end.
function OS_HDWARE() {
  local nic link_state ring_info rx_ring tx_ring

  # CPU architecture / model / socket count / cores per socket / frequency
  CPU_ARCH=$(uname -m)
  CPU_TYPE=$(awk -F':' '/model name/ {sub(/ /,"",$2); print $2}' /proc/cpuinfo 2>/dev/null | uniq)
  CPU_NUM=$(grep "physical id" /proc/cpuinfo 2>/dev/null | sort -u | wc -l)
  CPU_CORE=$(awk -F':' '/cores/ {sub(/ /,"",$2); print $2}' /proc/cpuinfo 2>/dev/null | uniq)
  CPU_HZ=$(awk -F':' '/cpu MHz/ {sub(/ /,"",$2); printf "%s MHz\n",$2}' /proc/cpuinfo 2>/dev/null | uniq)

  # Memory and swap sizes in GB, computed with awk so bc is not required
  ME_SIZE=$(_os_hdware_meminfo_gb MemTotal)
  ME_FREE=$(_os_hdware_meminfo_gb MemFree)
  ME_FREEE=$(_os_hdware_meminfo_gb MemAvailable)
  ME_USE=$(_os_hdware_used_pct "$ME_SIZE" "$ME_FREEE")
  ME_SWAP_SIZE=$(_os_hdware_meminfo_gb SwapTotal)
  ME_SWAP_FREE=$(_os_hdware_meminfo_gb SwapFree)
  ME_SWAP_USE=$(_os_hdware_used_pct "$ME_SWAP_SIZE" "$ME_SWAP_FREE")
  # Buffer and page-cache sizes, reported in KB as read from /proc/meminfo
  ME_BUF=$(awk '/^Buffers:/ {printf "%s KB",$2}' /proc/meminfo 2>/dev/null)
  ME_CACHE=$(awk '/^Cached:/ {printf "%s KB",$2}' /proc/meminfo 2>/dev/null)

  # All non-loopback network interfaces
  NET_DEVICE=()
  while IFS= read -r nic; do
    [[ -n "$nic" ]] && NET_DEVICE+=("$nic")
  done < <(_os_hdware_list_nics)

  echo "################################# [ 系统硬件巡检区 ] ######################################"
  echo "CPU型号:${CPU_TYPE:-获取信息失败}"
  echo "CPU架构:$CPU_ARCH"
  echo "CPU个数:$CPU_NUM"
  echo "CPU核数: ${CPU_CORE:-获取信息失败}"
  echo "CPU频率:${CPU_HZ:-获取信息失败}"
  echo "内存容量:${ME_SIZE} GB"
  echo "内存空闲:${ME_FREE} GB"
  echo "内存可用:${ME_FREEE} GB"
  echo "内存使用率:${ME_USE}"
  echo "SWAP容量:$ME_SWAP_SIZE GB"
  echo "SWAP可用容量:$ME_SWAP_FREE GB"
  echo "SWAP使用率:$ME_SWAP_USE"
  echo "内存Buffer大小:${ME_BUF}"
  echo "内存Cache大小:${ME_CACHE}"
  for nic in "${NET_DEVICE[@]}"; do
    # Query the interface being reported (the original always queried eth0).
    link_state=$(ip link show "$nic" 2>/dev/null | awk 'NR==1{print $9}')
    ring_info=$(ethtool -g "$nic" 2>/dev/null)
    # ethtool -g lists maximums first and current settings last; keep the last.
    rx_ring=$(awk '/RX:/ {v = $2} END {print v}' <<<"$ring_info")
    tx_ring=$(awk '/TX:/ {v = $2} END {print v}' <<<"$ring_info")
    echo "网卡:$nic 状态: $link_state RX: $rx_ring TX: $tx_ring"
  done
}
# Prints the "network" section: management IP, default gateway, DNS servers
# and whether the gateway answers ping.
# Globals:  IP (read); GATEWAY, DNS (written, left global as in the original).
# Outputs:  report section on stdout.
function OS_NETWORK() {
  local resolver gw_status
  local -r no_gateway='未设置默认网关'

  # First default route only: several default routes used to produce a
  # multi-line value that broke both the report line and the ping call.
  GATEWAY=$(ip route 2>/dev/null | awk '$1 == "default" {print $3; exit}')
  GATEWAY=${GATEWAY:-$no_gateway}

  # Active nameserver entries only (commented-out lines are skipped),
  # de-duplicated while keeping file order.
  DNS=()
  while IFS= read -r resolver; do
    [[ -n "$resolver" ]] && DNS+=("$resolver")
  done < <(awk '$1 == "nameserver" && !seen[$2]++ {print $2}' /etc/resolv.conf 2>/dev/null)

  echo "################################# [ 系统网络巡检区 ] ######################################"
  echo "IP地址:$IP"
  echo "网关地址:$GATEWAY"
  echo "DNS地址:${DNS[*]}"

  # Do not ping the placeholder text when there is no gateway.
  # NOTE(review): with iputils ping, -t sets the TTL; 1 fits a directly
  # attached gateway - confirm against the ping flavour on target hosts.
  gw_status='无法通信'
  if [[ "$GATEWAY" != "$no_gateway" ]] \
    && ping -t 1 -i 1 -c 5 -W 1 "$GATEWAY" &>/dev/null; then
    gw_status='正常通信'
  fi
  echo "网关[$GATEWAY]连接情况: $gw_status"
}
# Prints the "resources" section: CPU usage, top processes, disk I/O, disk
# serials and usage warnings, inode usage, process counts, fd and socket stats.
# Globals:  DISK_LIST, DISK_PER, CPU_* (written, left global as in the original).
# Outputs:  report section on stdout.
function OS_RESOURCE() {
  local top_snapshot disk_name disk_serial usage iotop_out netstat_out disk_status
  local -a over_limit=()

  # Whole disks (no partitions), optical drives excluded
  DISK_LIST=()
  while IFS= read -r disk_name; do
    if [[ -n "$disk_name" && "$disk_name" != sr* ]]; then
      DISK_LIST+=("$disk_name")
    fi
  done < <(lsblk -dn -o NAME 2>/dev/null)

  # Usage percentage per mounted filesystem. -P keeps every filesystem on one
  # line; non-numeric values (e.g. "-") are dropped so comparisons cannot fail.
  DISK_PER=()
  while IFS= read -r usage; do
    [[ "$usage" =~ ^[0-9]+$ ]] && DISK_PER+=("$usage")
  done < <(df -hP 2>/dev/null \
    | awk 'NR > 1 && $1 !~ /^\/dev\/sr[0-9]/ {gsub(/%/, "", $5); print $5}')

  # One top sample feeds every figure below, so the numbers agree with each
  # other and top runs once instead of six times.
  top_snapshot=$(top -d 1 -n 1 -b 2>/dev/null)
  # Idle share: matched by pattern because top prints "ni,100.0 id" without a
  # space at 100% idle, which shifts whitespace-separated fields.
  CPU_FREE=$(awk 'NR == 3 && match($0, /[0-9.]+[ %]*id/) {
    v = substr($0, RSTART, RLENGTH); sub(/[ %]*id$/, "", v); print v
  }' <<<"$top_snapshot")
  if [[ -n "$CPU_FREE" ]]; then
    CPU_USE=$(awk -v idle="$CPU_FREE" 'BEGIN { printf "%.1f%%\n", 100 - idle }')
  else
    CPU_USE='获取信息失败'
  fi
  # First process rows of the sample (kept for compatibility; not printed here)
  CPU_TOP_TEN=$(column -t <<<"$top_snapshot" 2>/dev/null | awk 'NR>=7 && NR<=15')
  # Task summary line: total / running / sleeping / stopped / zombie.
  # The stopped count used to be the literal 8 (print 8 instead of print $8).
  read -r CPU_PROCESSORS CPU_RUN_PROCESSORS CPU_SL_PROCESSORS \
    CPU_STOP_PROCESSORS CPU_ZOM_PROCESSORS \
    < <(awk 'NR == 2 {print $2, $4, $6, $8, $10}' <<<"$top_snapshot")

  echo "################################# [ 系统资源巡检区 ] ######################################"
  echo "CPU使用率:$CPU_USE"
  echo "CPU使用率前十进程信息:"
  ps -eo user,pid,pcpu,pmem,args --sort=-pcpu | head -n 10
  echo "内存使用率前十进程信息:"
  ps -eo user,pid,pcpu,pmem,args --sort=-pmem | head -n 10

  if iotop_out=$(iotop -bon 1 2>/dev/null); then
    echo "磁盘IO信息:"
    head -n 13 <<<"$iotop_out"
  else
    echo "磁盘IO信息:io top 未安装信息获取失败"
    echo ""
  fi

  # The status line now reflects the measured usage instead of always 正常.
  for usage in "${DISK_PER[@]}"; do
    if (( 10#$usage > 80 )); then
      over_limit+=("$usage")
    fi
  done
  if (( ${#over_limit[@]} == 0 )); then
    disk_status='正常'
  else
    disk_status='异常'
  fi
  echo "磁盘分区使用率是否正常:$disk_status"

  for disk_name in "${DISK_LIST[@]}"; do
    disk_serial=$(lsblk --nodeps -no serial "/dev/$disk_name" 2>/dev/null)
    echo "磁盘:$disk_name 磁盘序列号:${disk_serial:-获取信息失败}"
  done
  for usage in "${over_limit[@]}"; do
    echo "某分区磁盘使用率为:$usage% > 80% 请及时扩容"
  done

  # printf emits the blank separator line; echo without -e printed a literal \n
  printf '\n%s\n' "系统磁盘分区inode使用情况:"
  df -Thi
  printf '\n%s\n' "系统当前进程数:$CPU_PROCESSORS"
  echo "系统当前进程运行数:$CPU_RUN_PROCESSORS"
  echo "系统当前休眠进程数:$CPU_SL_PROCESSORS"
  echo "系统当前停止进程数:$CPU_STOP_PROCESSORS"
  echo "系统当前僵尸进程数:$CPU_ZOM_PROCESSORS"
  echo "系统当前允许最大fd数量:$(awk '{print $3}' /proc/sys/fs/file-nr 2>/dev/null)"
  echo "系统当前已打开fd数量:$(awk '{print $1}' /proc/sys/fs/file-nr 2>/dev/null)"
  echo "系统单个进程运行打开fd数量:$(ulimit -n)"

  # One netstat run feeds all socket figures.
  if netstat_out=$(netstat -anp 2>/dev/null); then
    echo "系统当前socket连接数:$(wc -l <<<"$netstat_out")"
    echo "系统 established socket数量: $(grep -c "ESTABLISHED" <<<"$netstat_out")"
    echo "系统 sync socket数量:$(grep -c "SYN" <<<"$netstat_out")"
    echo "系统当前已建立socket如下:"
    awk '/ESTABLISHED/ {printf " 本地:%-20s <=> 外部:%-22s\n",$4,$5}' <<<"$netstat_out"
  else
    echo "系统当前socket连接数:net-tools 未安装,获取信息失败"
    echo "系统 established socket数量: net-tools 未安装,获取信息失败"
    echo "系统 sync socket数量:net-tools 未安装,获取信息失败"
    echo "系统当前已建立socket如下:"
    echo ""
  fi
}
# Prints the "security" section: firewall and SELinux state, accounts that
# can log in, their last login, and the sessions currently open.
# Globals:  OS_USER, OS_SELINUX, OS_FIREWALLD (written, left global).
# Outputs:  report section on stdout.
function OS_SECURITY() {
  local account

  # Accounts whose shell allows login (nologin/false/sync/shutdown/halt excluded)
  OS_USER=()
  while IFS= read -r account; do
    [[ -n "$account" ]] && OS_USER+=("$account")
  done < <(awk -F':' '$NF !~ /nologin|false|sync|shutdown|halt/ {print $1}' /etc/passwd 2>/dev/null)

  # SELinux mode
  OS_SELINUX=$(getenforce 2>/dev/null)

  # Firewall state. Only stderr is discarded: the original sent stdout to
  # /dev/null before the pipe, so grep never saw input and this was always off.
  if service firewalld status 2>/dev/null | grep -q "running"; then
    OS_FIREWALLD=on
  else
    OS_FIREWALLD=off
  fi

  echo "################################# [ 系统安全巡检区 ] ######################################"
  echo "防火墙状态: $OS_FIREWALLD"
  # printf emits the blank separator line; echo without -e printed a literal \n
  printf '%s\n\n' "Selinux状态:${OS_SELINUX:-获取信息失败}"
  echo "系统可登录用户数:${#OS_USER[@]}"
  echo "系统可登录用户:${OS_USER[*]}"
  for account in "${OS_USER[@]}"; do
    echo "用户 $account 最后1次登录信息: $(lastlog -u "$account" 2>/dev/null | awk 'NR==2')"
  done
  echo "系统当前登录用户:"
  who | sed 's#[()]##g' \
    | awk '{printf " 用户: %10s 终端: %7s 登录时间: %7s %7s 登录IP: %7s\n",$1,$2,$3,$4,$5}'
}
# Placeholder for site-specific service checks: prints the section header
# followed by a reminder line.
function OS_SERVICE() {
  printf '%s\n' \
    "################################# [ 系统服务巡检区 ] ######################################" \
    "自行添加"
}
# Runs the external k8s cluster inspection script.
# Globals:  k8s_check_scripts_file (written).
# Exits the whole script with status 1 when the inspection script is missing;
# otherwise returns the status of that script.
function k8s-cluster_check() {
  k8s_check_scripts_file="/root/monitor/check/k8s-cluster_check.sh"

  # Happy path first: hand over to the cluster inspection script.
  if [ -f "${k8s_check_scripts_file}" ]; then
    bash "${k8s_check_scripts_file}"
    return
  fi

  # Script not found: report it and stop.
  echo 'k8s集群巡检脚本不存在,巡检未执行!'
  exit 1
}
# Refuse to continue unless the invoking user is root. The original called an
# undefined ERROR command here, which only produced "command not found" and
# let the script carry on.
if [ "$(id -u -n)" != "root" ]; then
  echo "请以ROOT用户运行这个脚本" >&2
  exit 1
fi
# Runs every inspection section in report order.
# pod_check and node_check are intentionally not part of the sequence (they
# were commented out in the original list).
function check() {
  local section
  for section in \
    Welcome \
    OS_INFO \
    OS_HDWARE \
    OS_NETWORK \
    OS_RESOURCE \
    OS_SECURITY \
    k8s-cluster_check; do
    "$section"
  done
}
# Run the inspection and save the report. The target is quoted so a path with
# spaces or a multi-line IP cannot cause an "ambiguous redirect".
check > "$RESULTFILE"
# K8s巡检脚本 (k8s cluster inspection script)
#!/bin/bash
#Author: xwutx
#Version: v1
#k8s集群日常巡检
# Welcome banner for the k8s inspection: prints the greeting box with the
# script version.
VERSION=v1
function Welcome() {
  printf '%s\n' \
    '################################' \
    '| 欢迎使用k8s集群日常巡检脚本! |' \
    "| 脚本版本为:${VERSION} |" \
    '| |' \
    '###############################'
}
# Checks controller-manager, scheduler and etcd health as reported by
# `kubectl get cs`.
# Globals:  unhealthy_soft_count (written).
# Outputs:  one red "<component> Unhealthy" line per failing component, or a
#           green all-clear line.
# Returns:  1 when kubectl itself fails, so a failed query is no longer
#           reported as "healthy".
function soft_check() {
  local cs_report unhealthy component

  echo -e "----------Controller-manager、Scheduler、Etcd-0检测中----------"
  if ! cs_report=$(kubectl get cs 2>/dev/null); then
    echo -e "\033[31m无法获取组件状态(kubectl get cs 执行失败)\033[0m"
    return 1
  fi

  # Names of the components whose STATUS column is not Healthy
  unhealthy=$(awk 'NR > 1 && NF && $2 != "Healthy" {print $1}' <<<"$cs_report")
  unhealthy_soft_count=$(grep -c . <<<"$unhealthy")

  if (( unhealthy_soft_count >= 1 )); then
    # Every failing component gets its own line; before, only the last name
    # carried the "Unhealthy" suffix.
    while IFS= read -r component; do
      echo -e "\033[31m${component} Unhealthy\033[0m"
    done <<<"$unhealthy"
  else
    echo -e "\033[32mcontroller-manager、scheduler、etcd-0无异常\033[0m"
  fi
}
# Reports today's kubelet log lines that mention "error", taken from the log
# tail shown by `systemctl status kubelet.service`.
# Globals:  kubeletError (written) - number of distinct error messages.
# Outputs:  the de-duplicated error messages, or an all-clear line.
function kubeletCheck() {
  local today kubelet_errors

  # Journal lines start with "Mon DD" using a zero-padded day. Default `date`
  # output is space-padded ("Jan  5") and locale-dependent, so errors were
  # missed on days 1-9; ask date for the matching layout instead.
  today=$(date '+%b %d')

  # Keep today's lines, drop the syslog prefix (message starts at field 10),
  # then keep distinct lines mentioning "error".
  kubelet_errors=$(systemctl status kubelet.service --no-pager --full 2>/dev/null \
    | awk -v day="$today" 'index($0, day) == 1 {
        msg = ""
        for (i = 10; i <= NF; i++) msg = msg $i " "
        print msg
      }' \
    | grep -i error | sort -u)
  kubeletError=$(grep -c . <<<"$kubelet_errors")

  echo -e "\n \n---------- Kubelet状态检测中 ----------"
  if (( kubeletError >= 1 )); then
    echo -e "kubelet错误日志:"
    printf '%s\n' "$kubelet_errors"
  else
    echo -e "kubelet无日志报错"
  fi
}
# Lists pods in every namespace that are neither Running nor Completed.
# Globals:  errorPod (written) - number of such pods.
# Outputs:  the offending rows of `kubectl get pods`, or an all-clear line.
# Returns:  1 when kubectl itself fails, so a failed query is no longer
#           reported as "no problems".
function podsCheck() {
  local pod_table bad_pods

  echo -e "\n \n---------- Pods运行状态检测中 ----------"
  # One query feeds both the count and the listing, so they cannot disagree.
  if ! pod_table=$(kubectl get pods --all-namespaces 2>/dev/null); then
    echo -e "无法获取Pods状态(kubectl get pods 执行失败)"
    return 1
  fi

  # Column 4 is STATUS. Completed pods are finished jobs, not failures.
  bad_pods=$(awk 'NF && $1 != "NAMESPACE" && $4 != "Running" && $4 != "Completed"' \
    <<<"$pod_table")
  errorPod=$(grep -c . <<<"$bad_pods")

  if (( errorPod >= 1 )); then
    echo -e "ErrorPod:"
    printf '%s\n' "$bad_pods"
  else
    echo -e "Pods无异常"
  fi
}
# Flags nodes whose CPU usage exceeds 60%, memory usage exceeds 80%, or whose
# memory requests exceed 95% of allocatable.
# Globals:  memWarn, memRq, memLim, cpuUsed, memUsed (written).
# Outputs:  one line per finding, or an all-clear line when nothing is flagged.
# Returns:  1 when the node list cannot be obtained.
function nodeCheck() {
  local node describe_out top_out
  local -a nodes=()

  echo -e "\n \n---------- Nodes资源使用状态检测中 ----------"
  memWarn=0

  while IFS= read -r node; do
    [[ -n "$node" ]] && nodes+=("$node")
  done < <(kubectl get nodes 2>/dev/null | awk 'NR > 1 {print $1}')
  if (( ${#nodes[@]} == 0 )); then
    echo -e "无法获取节点列表(kubectl get nodes 执行失败)"
    return 1
  fi

  for node in "${nodes[@]}"; do
    # One describe and one top call per node feed all four figures.
    describe_out=$(kubectl describe node "$node" 2>/dev/null)
    top_out=$(kubectl top nodes "$node" 2>/dev/null)
    # First "memory ... (NN%) ... (NN%)" row: requests, then limits
    memRq=$(awk '/memory/ && /%/ {gsub(/[^0-9]/, "", $3); print $3; exit}' <<<"$describe_out")
    memLim=$(awk '/memory/ && /%/ {gsub(/[^0-9]/, "", $5); print $5; exit}' <<<"$describe_out")
    # CPU% and MEMORY% columns of `kubectl top nodes`
    cpuUsed=$(awk 'NR > 1 {gsub(/[^0-9]/, "", $3); print $3; exit}' <<<"$top_out")
    memUsed=$(awk 'NR > 1 {gsub(/[^0-9]/, "", $5); print $5; exit}' <<<"$top_out")

    # Missing metrics are reported explicitly; empty values used to trigger
    # arithmetic syntax errors.
    if [[ -z "$cpuUsed" || -z "$memUsed" ]]; then
      memWarn=$((memWarn + 1))
      echo -e "$node\t无法获取CPU、内存使用率(kubectl top 无数据)"
    elif (( 10#$cpuUsed > 60 || 10#$memUsed > 80 )); then
      memWarn=$((memWarn + 1))
      echo -e "$node\tCPU使用率:$cpuUsed%\t内存使用率:$memUsed%"
    fi
    if [[ -n "$memRq" ]] && (( 10#$memRq > 95 )); then
      memWarn=$((memWarn + 1))
      echo -e "$node\tMem_Requests:$memRq%\tMem_Limits:$memLim%"
    fi
  done

  if (( memWarn == 0 )); then
    echo -e "无节点CPU、内存使用异常"
  fi
}
# Detects pods whose restart counter changed since the previous run by
# diffing a "<pod> <restarts>" snapshot against the one saved last time.
# Globals:  POD_RESTART_STATE_DIR (read, optional) - snapshot directory,
#           default /opt as before; rebootNum (written).
# Outputs:  the diff of the two snapshots, or an all-clear line.
# Returns:  1 when the pod list cannot be obtained or stored (the saved
#           baseline is left untouched in that case).
function podRestart() {
  local state_dir=${POD_RESTART_STATE_DIR:-/opt}
  local old_file="$state_dir/podsold.txt"
  local new_file="$state_dir/podsnew.txt"
  local pod_table restart_diff

  echo -e "\n \n---------- Pods自动重启检测中 ----------"
  # A failed query must not overwrite the baseline with an empty snapshot.
  if ! pod_table=$(kubectl get pods --all-namespaces 2>/dev/null); then
    echo -e "无法获取Pods重启信息(kubectl get pods 执行失败)"
    return 1
  fi
  # Same filter as before, so existing baselines stay comparable: the header
  # row passes the comparison as a string and is present in both files.
  if ! awk '{if ($5 > 0) print $2, $5}' <<<"$pod_table" >"$new_file"; then
    echo -e "无法写入 $new_file"
    return 1
  fi

  # First run: nothing to compare against yet, so just store the baseline.
  if [[ ! -f "$old_file" ]]; then
    echo -e "无自动重启pod"
    mv -f -- "$new_file" "$old_file"
    return 0
  fi

  restart_diff=$(diff -- "$old_file" "$new_file")
  rebootNum=$(grep -c . <<<"$restart_diff")
  if (( rebootNum > 1 )); then
    echo -e "有以下pod重启:"
    printf '%s\n' "$restart_diff"
  else
    echo -e "无自动重启pod"
  fi
  mv -f -- "$new_file" "$old_file"
}
# Entry point of the k8s inspection: runs every check in report order.
main() {
  Welcome
  # Control-plane component check. The original called masterCheck, which is
  # not defined in this file; soft_check is the function that performs it.
  soft_check
  kubeletCheck
  podsCheck
  nodeCheck
  podRestart
}
main "$@"
# 评论区 (comment section of the source web page; not part of the script)