HEX
Server: nginx/1.29.3
System: Linux 11979.bigscoots-wpo.com 6.8.0-88-generic #89-Ubuntu SMP PREEMPT_DYNAMIC Sat Oct 11 01:02:46 UTC 2025 x86_64
User: nginx (1068)
PHP: 7.4.33
Disabled: exec,system,passthru,shell_exec,proc_open,proc_close,popen,show_source,cmd# Do not modify this line # 1684243876
Upload Files
File: //bigscoots/wpo/db/zfs_race_check.sh
#!/bin/bash

# --- CONFIGURATION ---
# Ensure systemd can find all necessary binaries. Exported first so that the
# hostname/awk lookups further down already run with this PATH.
export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

# Webhook that receives the alert payloads.
WEBHOOK_URL="https://n8n.bigscoots.dev/webhook/slack/add-message"
# Bearer token sent in the Authorization header of every webhook request.
# NOTE(review): this credential is committed in the script. A value already
# present in the environment (e.g. systemd Environment=/EnvironmentFile=)
# takes precedence, so the literal fallback can be removed once that is set up.
AUTH_TOKEN="${AUTH_TOKEN:-e82e6baf-7942-43b7-b5c3-46e2dacb15a0}"
# Channel name placed in the "channel" field of the payload.
CHANNEL="wpo-lxd-db"
# Alert only when MORE than this many queries are listed.
THRESHOLD_COUNT=4
# Minimum processlist "time" value (seconds) for a query to be listed.
THRESHOLD_TIME=100
# Holds the message id of the parent alert while a stall is ongoing.
# NOTE(review): fixed file name under /tmp — consider a root-only directory.
STATE_FILE="/tmp/mariadb_stall_parent_id"
# Path written into ExecStart of the generated systemd service.
SCRIPT_PATH="/bigscoots/wpo/db/zfs_race_check.sh"
HOSTNAME=$(hostname)
# First address printed by `hostname -I`.
IP_ADDR=$(hostname -I | awk '{print $1}')

# --- SYSTEMD SELF-DEPLOYMENT ---
#######################################
# Install and start the systemd service + timer that run this script every
# minute. Does nothing when both unit files are already present.
# Globals:   SCRIPT_PATH (read) - script path written into ExecStart
# Arguments: $1 - optional directory for the unit files
#                 (default: /etc/systemd/system)
# Outputs:   progress messages on stdout, failures on stderr
# Returns:   0 if the units are present or were deployed, 1 on any failure
#######################################
deploy_systemd() {
    local unit_dir="${1:-/etc/systemd/system}"
    local service_file="$unit_dir/mariadb-monitor.service"
    local timer_file="$unit_dir/mariadb-monitor.timer"

    # Both units are required: a timer without its service can never fire,
    # so a missing service file must trigger a redeploy as well.
    if [ -f "$service_file" ] && [ -f "$timer_file" ]; then
        return 0
    fi

    # Refuse to generate a service whose ExecStart would have no script.
    if [ -z "$SCRIPT_PATH" ]; then
        echo "SCRIPT_PATH is empty; not deploying systemd units." >&2
        return 1
    fi

    echo "Systemd timer not found. Deploying..."

    # Create Service (unquoted heredoc so $SCRIPT_PATH is expanded now)
    cat <<EOF > "$service_file" || { echo "Failed to write $service_file" >&2; return 1; }
[Unit]
Description=MariaDB Stall Monitor Alert

[Service]
Type=oneshot
ExecStart=/bin/bash $SCRIPT_PATH
EOF

    # Create Timer
    cat <<EOF > "$timer_file" || { echo "Failed to write $timer_file" >&2; return 1; }
[Unit]
Description=Run MariaDB Stall Monitor Every Minute

[Timer]
OnCalendar=*:0/1
Unit=mariadb-monitor.service

[Install]
WantedBy=timers.target
EOF

    systemctl daemon-reload || { echo "systemctl daemon-reload failed" >&2; return 1; }
    systemctl enable --now mariadb-monitor.timer \
        || { echo "Failed to enable mariadb-monitor.timer" >&2; return 1; }
    echo "Systemd timer deployed and started."
}

# Run deployment check
deploy_systemd

# --- DATA SOURCE ---
# PROC_LIST ends up holding one tab-separated row per long-running query:
# id, user, time, state, info.
# Priority 1: Check for manual test file
if [ -f "/root/proc.list" ]; then
    PROC_LIST=$(< /root/proc.list)
else
    # Priority 2: Query MariaDB (using root's .my.cnf for auth)
    # A failed query must not be mistaken for "no stuck queries": an empty
    # list would take the healthy path and delete the stall state file. The
    # client's own stderr is left visible so the cause ends up in the journal.
    if ! PROC_LIST=$(mysql --defaults-extra-file=/root/.my.cnf -N -e "SELECT id, user, time, state, info FROM information_schema.processlist WHERE command != 'Sleep' AND info NOT LIKE '%information_schema.processlist%' AND time >= $THRESHOLD_TIME"); then
        echo "Unable to query the MariaDB processlist; skipping this check." >&2
        exit 1
    fi
fi

# --- IDENTIFY PHYSICAL NODE ---
# Ask the LXD guest API (unix socket inside the container) for the server
# info and pull out its "location" field.
NODE_NAME=$(
    curl --silent --unix-socket /dev/lxd/sock http://lxd/1.0 \
        | jq -r '.location'
)

# No answer at all, or a JSON null location, both fall back to a placeholder.
case "$NODE_NAME" in
    "" | null)
        NODE_NAME="Unknown-Node"
        ;;
esac

# Count active stuck queries (non-blank lines of the process list)
STUCK_COUNT=$(printf '%s\n' "$PROC_LIST" | grep -c -v '^$')

#######################################
# Post one message to the webhook and print the raw response body.
# Globals:   WEBHOOK_URL, AUTH_TOKEN, CHANNEL (read)
# Arguments: $1 - message text (may contain real newlines)
#            $2 - optional parent message id; when given it is sent as
#                 "parent_msg_id" so the message is threaded under the parent
# Returns:   non-zero if the payload cannot be built or curl fails
#######################################
post_slack_message() {
    local message=$1
    local parent_id=${2:-}
    local payload

    # jq performs the JSON string escaping, so quotes, backslashes and control
    # characters in query text, host names or the state file cannot produce an
    # invalid payload.
    if [ -n "$parent_id" ]; then
        payload=$(jq -cn --arg channel "$CHANNEL" --arg parent "$parent_id" --arg message "$message" \
            '{channel: $channel, parent_msg_id: $parent, message: $message}') || return 1
    else
        payload=$(jq -cn --arg channel "$CHANNEL" --arg message "$message" \
            '{channel: $channel, message: $message}') || return 1
    fi

    # --max-time keeps an unresponsive webhook from holding up a script that
    # is scheduled to run every minute.
    curl --silent --location --max-time 15 "$WEBHOOK_URL" \
        --header "Authorization: Bearer $AUTH_TOKEN" \
        --header "Content-Type: application/json" \
        --data "$payload"
}

if [ "$STUCK_COUNT" -gt "$THRESHOLD_COUNT" ]; then
    # Check if we are already in the middle of a reported stall
    PARENT_MSG_ID=$(cat "$STATE_FILE" 2>/dev/null)

    # --- FORMATTING LOGIC ---
    # First 15 rows as a fixed-width table; query text longer than 80
    # characters is cut to 77 plus "...".
    CLEAN_PROC=$(printf '%s\n' "$PROC_LIST" | head -n 15 | awk -F'\t' '{
        query = $5;
        if (length(query) > 80) query = substr(query, 1, 77) "...";
        printf "%-7s | %-12s | %-4s | %-15s | %s\n", $1, $2, $3, $4, query
    }')

    # --- SEND PARENT MESSAGE ---
    if [ -z "$PARENT_MSG_ID" ]; then
        MSG_CONTENT=$(printf ':rotating_light: *MariaDB Stall Alert* :rotating_light:\n*Node:* %s\n*Container:* %s (%s)\n*Stall Count:* %s queries exceeding %ss.' \
            "$NODE_NAME" "$HOSTNAME" "$IP_ADDR" "$STUCK_COUNT" "$THRESHOLD_TIME")

        RESPONSE=$(post_slack_message "$MSG_CONTENT")

        # Extract message_id to allow threading (first match only, so the
        # state file always holds a single id)
        PARENT_MSG_ID=$(printf '%s' "$RESPONSE" | grep -oP '"message_id":\s*"?\K[0-9.]+' | head -n 1)

        if [ -n "$PARENT_MSG_ID" ]; then
            printf '%s\n' "$PARENT_MSG_ID" > "$STATE_FILE"
        fi
    fi

    # --- SEND THREADED DETAIL ---
    # Sent on every run while the stall lasts; skipped when no parent id is known.
    if [ -n "$PARENT_MSG_ID" ]; then
        THREAD_CONTENT=$(printf '*Current Stuck Queue (Truncated):*\n```\n%s\n```' "$CLEAN_PROC")

        post_slack_message "$THREAD_CONTENT" "$PARENT_MSG_ID" > /dev/null
    fi
else
    # Healthy: Remove state file so the next stall creates a fresh Slack thread
    rm -f -- "$STATE_FILE"
fi

exit 0