File: //bigscoots/lxd/node_monitor.sh
#!/bin/bash
# Load the shared include before anything else; the checks below reference
# values that are not set in this file (e.g. ${serverip}) — presumably they
# come from this include; verify against common.sh.
. /bigscoots/includes/common.sh
# The alert-throttle marker files live in this directory; create it if absent.
mkdir -p /root/.bigscoots/counters
# Check if this is an LXD node
if [ -f /usr/bin/lxc ] || [ -d /var/snap/lxd ]; then
# --- LXCFS Zombie/Crash Check ---
# Alerts when more than one process is running with the snap LXCFS pidfile on
# its command line. Throttled to once per 24h via the lxcfschk counter file.
if [ ! -f /root/.bigscoots/counters/lxcfschk ]; then
  # pgrep -f matches against the full command line and never reports itself,
  # so no `grep -v grep` filtering is needed and a legitimate match can no
  # longer be dropped by it. The dot is escaped so it only matches a literal
  # ".pid". `wc -l` always prints an integer, even when nothing matches.
  lxcfs_count=$(pgrep -f -- '/var/snap/lxd/common/lxcfs\.pid' | wc -l)
  if [ "$lxcfs_count" -gt 1 ]; then
    # Create the throttle marker first so a concurrent run does not re-alert.
    touch /root/.bigscoots/counters/lxcfschk
    # Get the current active PID from the snap pidfile
    current_lxcfs_pid=$(cat /var/snap/lxd/common/lxcfs.pid 2>/dev/null || echo "Unknown")
    bash /bigscoots/general/slack.sh "#node-alerts" ":warning: $(hostname) - Detected ${lxcfs_count} LXCFS processes! Current PID: ${current_lxcfs_pid}. Ghost processes detected; cleanup required."
    # Lock the alert for 24 hours: a detached screen session removes the
    # marker once the sleep finishes.
    screen -dmS lxcfschk sh -c 'sleep 86400 ; rm -f /root/.bigscoots/counters/lxcfschk'
  fi
fi
# --- ZFS Pool & Health Check ---
# Inspects every pool reported by `zpool list` (not only the first one) and
# sends a Slack alert for each pool that is at/over 90% capacity or whose
# health is anything other than ONLINE. Nodes without zpool, or without any
# pool, are skipped instead of producing an alert with empty fields.
# Throttled to one round of alerts per 5 minutes via the zfs_chk counter file.

# zfs_pool_needs_alert CAP HEALTH
#   CAP    - capacity column as printed by zpool, e.g. "42%"
#   HEALTH - health column as printed by zpool, e.g. "ONLINE"
# Returns 0 when the pool should be reported, 1 otherwise.
zfs_pool_needs_alert() {
  local cap=${1%\%}
  local health=$2
  # Any state other than ONLINE is reported regardless of usage.
  [[ "$health" == "ONLINE" ]] || return 0
  # A non-numeric capacity (empty, "-", ...) cannot be compared; an ONLINE
  # pool without a usable figure is not treated as full.
  [[ "$cap" =~ ^[0-9]+$ ]] || return 1
  # Force base 10 so a value with a leading zero is not parsed as octal.
  (( 10#$cap >= 90 ))
}

if [ ! -f /root/.bigscoots/counters/zfs_chk ] && command -v zpool >/dev/null 2>&1; then
  zfs_alert_sent=0
  # Read the whole listing up front so commands inside the loop cannot
  # consume it through stdin.
  mapfile -t zfs_pool_lines < <(zpool list -H -o name,cap,health)
  for zfs_pool_line in "${zfs_pool_lines[@]}"; do
    # -H output is tab-separated; splitting on tabs only keeps the fields intact.
    IFS=$'\t' read -r POOL_NAME ZFS_USAGE_RAW ZFS_HEALTH <<< "$zfs_pool_line"
    [ -n "$POOL_NAME" ] || continue
    ZFS_USAGE=${ZFS_USAGE_RAW%\%}
    zfs_pool_needs_alert "$ZFS_USAGE_RAW" "$ZFS_HEALTH" || continue

    zfs_alert_sent=1
    touch /root/.bigscoots/counters/zfs_chk
    # Webhook URL is the quoted value of APP_SLACK_WEBHOOK in /root/.slackrc.
    SLACK_WEBHOOK=$(grep -oP '(?<=APP_SLACK_WEBHOOK=")[^"]*' /root/.slackrc)
    # Capacity-only alerts are warnings; an unhealthy pool is escalated.
    EMOJI=":warning:"
    [ "$ZFS_HEALTH" != "ONLINE" ] && EMOJI=":rotating_light:"
    # Inside double quotes "\n" stays a literal backslash-n, which JSON
    # decodes as a line break in the Slack message.
    slack_message="{
  \"channel\": \"#node-alerts\",
  \"blocks\": [
    {
      \"type\": \"section\",
      \"text\": { \"type\": \"mrkdwn\", \"text\": \"$EMOJI *$(hostname)* - ${serverip} - ZFS Alert $EMOJI\" }
    },
    {
      \"type\": \"section\",
      \"fields\": [
        { \"type\": \"mrkdwn\", \"text\": \"*Pool:*\n\`$POOL_NAME\`\" },
        { \"type\": \"mrkdwn\", \"text\": \"*Status:*\n\`$ZFS_HEALTH\`\" },
        { \"type\": \"mrkdwn\", \"text\": \"*Usage:*\n\`$ZFS_USAGE%\`\" }
      ]
    }
  ]
}"
    curl -X POST -H 'Content-type: application/json' --data "$slack_message" "$SLACK_WEBHOOK"
  done
  # One timer per run, however many pools were reported: a detached screen
  # session removes the throttle marker after 5 minutes.
  if [ "$zfs_alert_sent" -eq 1 ]; then
    screen -dmS zfschk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/zfs_chk'
  fi
fi
# # --- Memory Check ---
# if [ ! -f /root/.bigscoots/counters/memchk ]; then
# availmem=$(free -g | awk '/^Mem:/ {print $7}')
# if [ "$availmem" -lt 4 ]; then
# touch /root/.bigscoots/counters/memchk
# bash /bigscoots/general/slack.sh "#node-alerts" ":warning: $(hostname) - ${serverip} - Low Memory: ${availmem}GB Available"
# screen -dmS memchk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/memchk'
# fi
# fi
# --- IPMI / Power Supply Check ---
# Reads the "Power Supply" sensor records from ipmitool and alerts on the
# first record that looks unhealthy. Throttled to one alert per 5 minutes via
# the pduchk counter file.
if [ ! -f /root/.bigscoots/counters/pduchk ]; then
  # Install ipmitool on demand so the check can run on fresh nodes.
  if ! command -v ipmitool >/dev/null 2>&1; then
    apt-get update && apt-get -y install ipmitool
  fi
  ps_status=$(ipmitool sdr type "Power Supply" 2>/dev/null)
  # Empty output (no BMC, tool missing, no matching sensors) means nothing to check.
  if [ -n "$ps_status" ]; then
    # Fault states that ipmitool lists in the last column next to
    # "Presence detected" while the status column can still read "ok"
    # (e.g. "Presence detected, Power Supply AC lost"). A plain substring
    # test for "ok"/"Presence detected" would treat such a record as healthy.
    # "config er" also covers the "Config Erorr" spelling seen in some builds.
    # NOTE(review): state strings taken from ipmitool's sensor tables —
    # confirm against the output of the BMCs in this fleet.
    psu_fault_re='failure|ac lost|out-of-range|config er'
    while read -r line; do
      # Only the asserted-states column (text after the last "|") is searched
      # for fault keywords, so sensor names cannot trigger a match.
      psu_states=${line##*|}
      psu_faulty=0
      if [[ "${psu_states,,}" =~ $psu_fault_re ]]; then
        psu_faulty=1
      elif [[ "$line" != *"Presence detected"* && "$line" != *"ok"* ]]; then
        # Record carries neither healthy marker.
        psu_faulty=1
      fi
      if [ "$psu_faulty" -eq 1 ]; then
        touch /root/.bigscoots/counters/pduchk
        bash /bigscoots/general/slack.sh "#node-alerts" ":red_circle: $(hostname) - Power Supply Issue: \`$line\`"
        # A detached screen session removes the throttle marker after 5 minutes.
        screen -dmS pduchk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/pduchk'
        # One alert per run is enough; remaining records are not inspected.
        break
      fi
    done <<< "$ps_status"
  fi
fi
# --- SMART Drive Check ---
# Runs a SMART health query on every non-removable /dev/sdX and /dev/nvmeXnY
# device and reports each drive that does not come back healthy. Skipped when
# lspci lists a MegaRAID controller. Throttled to one round of alerts per
# 5 minutes via the deaddrivechk counter file.
if [ ! -f /root/.bigscoots/counters/deaddrivechk ] && ! lspci | grep -q "MegaRAID"; then
  # Install smartmontools on demand so the check can run on fresh nodes.
  if ! command -v smartctl >/dev/null 2>&1; then
    apt-get update && apt-get -y install smartmontools
  fi
  drive_alert_sent=0
  for DISK in /dev/sd[a-z] /dev/nvme[0-9]n[0-9]; do
    # An unmatched glob stays literal; skip it.
    [ -e "$DISK" ] || continue
    # Skip removable devices (USB drives, etc.)
    DEV=$(basename "$DISK")
    [[ "$(cat "/sys/block/${DEV}/removable" 2>/dev/null)" == "1" ]] && continue
    # ATA and NVMe drives report "... test result: PASSED"; SCSI/SAS drives
    # report "SMART Health Status: OK" instead, so both count as healthy.
    if ! smartctl -H "$DISK" | grep -Eq 'PASSED|SMART Health Status: OK'; then
      drive_alert_sent=1
      touch /root/.bigscoots/counters/deaddrivechk
      # Keep the full SMART report on disk for later diagnosis.
      log_file="/root/.bigscoots/hardware/disk/${DEV}.$(date +%s).log"
      mkdir -p "$(dirname "$log_file")"
      smartctl -a "$DISK" > "$log_file"
      bash /bigscoots/general/slack.sh "#node-alerts" "Detected failing drive: $DISK on $(hostname). Log: \`$log_file\`"
    fi
  done
  # One timer per run, however many drives were reported: a detached screen
  # session removes the throttle marker after 5 minutes.
  if [ "$drive_alert_sent" -eq 1 ]; then
    screen -dmS drivechk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/deaddrivechk'
  fi
fi
fi
# Safety net for the throttle markers: remove every regular file in the
# counters directory whose modification time is at least one full day old, so
# a marker whose timed removal never ran is still cleared on a later run.
find /root/.bigscoots/counters/ -mtime +0 -type f -exec rm '{}' ';'