HEX
Server: nginx/1.29.3
System: Linux 11979.bigscoots-wpo.com 6.8.0-88-generic #89-Ubuntu SMP PREEMPT_DYNAMIC Sat Oct 11 01:02:46 UTC 2025 x86_64
User: nginx (1068)
PHP: 7.4.33
Disabled: exec,system,passthru,shell_exec,proc_open,proc_close,popen,show_source,cmd# Do not modify this line # 1684243876
Upload Files
File: //proc/1284356/root/bigscoots/lxd/node_monitor.sh
#!/bin/bash

# Load the shared BigScoots shell helpers before any check runs.
# NOTE(review): ${serverip}, used in the ZFS alert further down, is never
# assigned in this script - presumably common.sh defines it; confirm.
. /bigscoots/includes/common.sh

# Directory holding the marker files that throttle repeat alerts.
mkdir -p /root/.bigscoots/counters

# Check if this is an LXD node
if [ -f /usr/bin/lxc ] || [ -d /var/snap/lxd ]; then

    # --- LXCFS Zombie/Crash Check ---
    # Alerts when more than one running process has the snap LXCFS pidfile path
    # on its command line. The marker file below suppresses repeat alerts until
    # a detached screen session removes it 24 hours later.
    if [ ! -f /root/.bigscoots/counters/lxcfschk ]; then
        # pgrep -f matches against full command lines and never reports itself,
        # so no "grep -v grep" filter is needed (that filter would also discard
        # any real match whose command line merely contains "grep").
        # -c prints the number of matches, 0 when there are none.
        lxcfs_count=$(pgrep -fc '/var/snap/lxd/common/lxcfs\.pid')
        # Keep the numeric test below valid even if pgrep printed nothing.
        lxcfs_count=${lxcfs_count:-0}

        if [ "$lxcfs_count" -gt 1 ]; then
            touch /root/.bigscoots/counters/lxcfschk

            # PID stored in the snap pidfile; "Unknown" when it cannot be read
            current_lxcfs_pid=$(cat /var/snap/lxd/common/lxcfs.pid 2>/dev/null || echo "Unknown")

            bash /bigscoots/general/slack.sh "#node-alerts" ":warning: $(hostname) - Detected ${lxcfs_count} LXCFS processes! Current PID: ${current_lxcfs_pid}. Ghost processes detected; cleanup required."

            # Lock the alert for 24 hours
            screen -dmS lxcfschk sh -c 'sleep 86400 ; rm -f /root/.bigscoots/counters/lxcfschk'
        fi
    fi

    # --- ZFS Pool & Health Check ---
    # Decides whether a pool should raise an alert.
    # Arguments: $1 - capacity with the '%' already stripped (may be non-numeric)
    #            $2 - health string as printed by zpool
    #            $3 - optional usage threshold in percent (default 90)
    # Returns:   0 when health is not ONLINE or usage >= threshold, else 1.
    zfs_pool_needs_alert() {
        local usage=$1 health=$2 limit=${3:-90}
        if [ "$health" != "ONLINE" ]; then
            return 0
        fi
        # Only compare numerically when the value really is a number; zpool can
        # print a non-numeric capacity, which would make [ -ge ] emit an error.
        if [[ "$usage" =~ ^[0-9]+$ ]] && [ "$usage" -ge "$limit" ]; then
            return 0
        fi
        return 1
    }

    if [ ! -f /root/.bigscoots/counters/zfs_chk ]; then
        # Scan every pool and stop at the first one that needs an alert. The
        # values are copied out of the read variables because read blanks its
        # targets when it reaches end of input.
        # If zpool reports no pools at all, the fields stay empty and the empty
        # health string still triggers the alert below.
        POOL_NAME="" ZFS_USAGE_RAW="" ZFS_USAGE="" ZFS_HEALTH=""
        while read -r pool_name pool_cap pool_health; do
            POOL_NAME=$pool_name
            ZFS_USAGE_RAW=$pool_cap
            ZFS_USAGE=${pool_cap%%\%*}
            ZFS_HEALTH=$pool_health
            zfs_pool_needs_alert "$ZFS_USAGE" "$ZFS_HEALTH" && break
        done < <(zpool list -H -o name,cap,health)

        if zfs_pool_needs_alert "$ZFS_USAGE" "$ZFS_HEALTH"; then
            touch /root/.bigscoots/counters/zfs_chk
            # Webhook URL is the quoted value of APP_SLACK_WEBHOOK in /root/.slackrc
            SLACK_WEBHOOK=$(grep -oP '(?<=APP_SLACK_WEBHOOK=")[^"]*' /root/.slackrc)
            # Escalate the emoji when the pool itself is unhealthy
            EMOJI=":warning:"
            [ "$ZFS_HEALTH" != "ONLINE" ] && EMOJI=":rotating_light:"

            slack_message="{
                \"channel\": \"#node-alerts\",
                \"blocks\": [
                    {
                        \"type\": \"section\",
                        \"text\": { \"type\": \"mrkdwn\", \"text\": \"$EMOJI *$(hostname)* - ${serverip} - ZFS Alert $EMOJI\" }
                    },
                    {
                        \"type\": \"section\",
                        \"fields\": [
                            { \"type\": \"mrkdwn\", \"text\": \"*Pool:*\n\`$POOL_NAME\`\" },
                            { \"type\": \"mrkdwn\", \"text\": \"*Status:*\n\`$ZFS_HEALTH\`\" },
                            { \"type\": \"mrkdwn\", \"text\": \"*Usage:*\n\`$ZFS_USAGE%\`\" }
                        ]
                    }
                ]
            }"
            curl -X POST -H 'Content-type: application/json' --data "$slack_message" "$SLACK_WEBHOOK"
            # Allow the next alert after 5 minutes
            screen -dmS zfschk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/zfs_chk'
        fi
    fi

#    # --- Memory Check ---
#    if [ ! -f /root/.bigscoots/counters/memchk ]; then
#        availmem=$(free -g | awk '/^Mem:/ {print $7}')
#        if [ "$availmem" -lt 4 ]; then
#            touch /root/.bigscoots/counters/memchk
#            bash /bigscoots/general/slack.sh "#node-alerts" ":warning: $(hostname) - ${serverip} - Low Memory: ${availmem}GB Available"
#            screen -dmS memchk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/memchk'
#        fi
#    fi

    # --- IPMI / Power Supply Check ---
    # Skipped while the pduchk marker file exists.
    if [ ! -f /root/.bigscoots/counters/pduchk ]; then
        # Install ipmitool on demand when it is not on PATH.
        command -v ipmitool >/dev/null 2>&1 || { apt-get update && apt-get -y install ipmitool; }

        psu_report=$(ipmitool sdr type "Power Supply" 2>/dev/null)
        if [ -n "$psu_report" ]; then
            while read -r sensor_row; do
                # A row counts as healthy when it contains either
                # "Presence detected" or "ok"; only rows with neither alert.
                case "$sensor_row" in
                    *"Presence detected"* | *"ok"*) continue ;;
                esac

                touch /root/.bigscoots/counters/pduchk
                bash /bigscoots/general/slack.sh "#node-alerts" ":red_circle: $(hostname) - Power Supply Issue: \`$sensor_row\`"
                # Marker is removed after 5 minutes; report only the first bad row.
                screen -dmS pduchk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/pduchk'
                break
            done <<< "$psu_report"
        fi
    fi

    # --- SMART Drive Check ---
    # Runs only when no deaddrivechk marker exists and lspci lists no MegaRAID
    # controller.
    if [ ! -f /root/.bigscoots/counters/deaddrivechk ] && ! lspci | grep -q "MegaRAID"; then
        # Install smartmontools on demand when smartctl is not on PATH.
        command -v smartctl >/dev/null 2>&1 || { apt-get update && apt-get -y install smartmontools; }

        for DISK in /dev/sd[a-z] /dev/nvme[0-9]n[0-9]; do
            # Guards against a glob pattern that matched no device.
            if [ ! -e "$DISK" ]; then
                continue
            fi

            # Skip removable devices (USB drives, etc.)
            DEV=${DISK##*/}
            removable_flag=$(cat "/sys/block/${DEV}/removable" 2>/dev/null)
            if [ "$removable_flag" = "1" ]; then
                continue
            fi

            # Any health report that lacks the word "PASSED" is handled as a failure.
            # NOTE(review): confirm every drive type in the fleet reports health
            # with that exact word.
            smartctl -H "$DISK" | grep -q "PASSED" && continue

            touch /root/.bigscoots/counters/deaddrivechk
            # Keep the full SMART dump in a timestamped per-disk log.
            log_dir=/root/.bigscoots/hardware/disk
            log_file="${log_dir}/${DEV}.$(date +%s).log"
            mkdir -p "$log_dir"
            smartctl -a "$DISK" > "$log_file"
            bash /bigscoots/general/slack.sh "#node-alerts" "Detected failing drive: $DISK on $(hostname). Log: \`$log_file\`"
            # Marker is removed after 5 minutes.
            screen -dmS drivechk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/deaddrivechk'
        done
    fi
fi

# Cleanup old counters (resets alerts every 24 hours).
# -mtime +0 selects files last modified at least 24 hours ago. rm -f is used
# because the detached "sleep ... ; rm -f" timers started by the checks above
# may delete a marker between find listing it and rm running; "+" hands all
# matches to a single rm invocation instead of one per file.
find /root/.bigscoots/counters/ -type f -mtime +0 -exec rm -f -- {} +