File: //bigscoots/wpo/db/zfs_race_check.sh
#!/bin/bash
# --- CONFIGURATION ---
# Ensure systemd can find all necessary binaries. Exported first so that the
# external commands run further down in this section (hostname, awk) are
# resolved through the same PATH as the rest of the script.
export PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

# Webhook that relays messages to Slack, and the channel to post into.
WEBHOOK_URL="https://n8n.bigscoots.dev/webhook/slack/add-message"
# NOTE(review): this bearer token is committed in plain text. It should be
# rotated and loaded from a root-only file or the unit's environment instead.
AUTH_TOKEN="e82e6baf-7942-43b7-b5c3-46e2dacb15a0"
CHANNEL="wpo-lxd-db"

# An alert fires when MORE than THRESHOLD_COUNT non-sleeping queries have each
# been running for at least THRESHOLD_TIME seconds.
THRESHOLD_COUNT=4
THRESHOLD_TIME=100

# Holds the Slack message id of the alert currently open, so follow-up details
# are threaded under it. Kept under /run rather than /tmp: the script runs as
# root, and a predictable file name in a world-writable directory can be
# pre-created (or symlinked) by any local user.
STATE_FILE="/run/mariadb_stall_parent_id"

# Absolute path of this script; written into the generated systemd service.
SCRIPT_PATH="/bigscoots/wpo/db/zfs_race_check.sh"

# Identity of this host as shown in the alert text.
HOSTNAME=$(hostname)
# First address reported by `hostname -I`.
IP_ADDR=$(hostname -I | awk '{print $1}')
# --- SYSTEMD SELF-DEPLOYMENT ---
#######################################
# Install and start the systemd service/timer pair that runs this script
# every minute. Does nothing when both unit files already exist.
# Globals:
#   SCRIPT_PATH (read) - absolute path written into the service's ExecStart
# Arguments:
#   $1 - optional directory for the unit files
#        (default: /etc/systemd/system)
# Outputs:
#   progress messages on stdout, failures on stderr
# Returns:
#   0 if the units were already present or were deployed and enabled,
#   1 if a unit file could not be written,
#   2 if systemctl failed to reload or enable the timer.
#######################################
deploy_systemd() {
  local unit_dir=${1:-/etc/systemd/system}
  local service_unit="${unit_dir}/mariadb-monitor.service"
  local timer_unit="${unit_dir}/mariadb-monitor.timer"

  # Both files are required: a timer without its service cannot run anything,
  # so a missing service unit triggers a redeploy as well.
  if [[ -f "$service_unit" && -f "$timer_unit" ]]; then
    return 0
  fi

  echo "Systemd timer not found. Deploying..."

  # Create Service
  if ! cat <<EOF > "$service_unit"
[Unit]
Description=MariaDB Stall Monitor Alert
[Service]
Type=oneshot
ExecStart=/bin/bash $SCRIPT_PATH
EOF
  then
    echo "Failed to write $service_unit" >&2
    # Remove any truncated file so the next run retries the deployment.
    rm -f -- "$service_unit"
    return 1
  fi

  # Create Timer
  if ! cat <<EOF > "$timer_unit"
[Unit]
Description=Run MariaDB Stall Monitor Every Minute
[Timer]
OnCalendar=*:0/1
Unit=mariadb-monitor.service
[Install]
WantedBy=timers.target
EOF
  then
    echo "Failed to write $timer_unit" >&2
    rm -f -- "$timer_unit"
    return 1
  fi

  # Only report success when systemd actually accepted the new units.
  if ! systemctl daemon-reload; then
    echo "systemctl daemon-reload failed; mariadb-monitor.timer is not active." >&2
    return 2
  fi
  if ! systemctl enable --now mariadb-monitor.timer; then
    echo "Failed to enable mariadb-monitor.timer; unit files were left in $unit_dir." >&2
    return 2
  fi

  echo "Systemd timer deployed and started."
  return 0
}
# Run deployment check
deploy_systemd

# --- DATA SOURCE ---
#######################################
# Load the list of long-running queries into PROC_LIST.
# Priority 1: a manual test file, used verbatim when it exists.
# Priority 2: a live query against MariaDB (using root's .my.cnf for auth)
#             returning tab-separated rows: id, user, time, state, info.
# Globals:
#   THRESHOLD_TIME (read) - minimum query age in seconds
#   PROC_LIST (written)   - resulting rows; empty when nothing matched or
#                           when the query failed
# Arguments:
#   $1 - optional path of the manual test file (default: /root/proc.list)
# Outputs:
#   a warning on stderr when the MariaDB query fails
# Returns:
#   0 on success, otherwise the exit status of the mysql client.
#######################################
load_proc_list() {
  local test_file=${1:-/root/proc.list}
  local rc=0

  if [[ -f "$test_file" ]]; then
    PROC_LIST=$(< "$test_file")
    return 0
  fi

  # The client's own stderr stays suppressed, but a failure is no longer
  # silent: an unreachable server would otherwise look exactly like a healthy
  # one with no slow queries.
  PROC_LIST=$(mysql --defaults-extra-file=/root/.my.cnf -N -e "SELECT id, user, time, state, info FROM information_schema.processlist WHERE command != 'Sleep' AND info NOT LIKE '%information_schema.processlist%' AND time >= $THRESHOLD_TIME" 2>/dev/null) || rc=$?

  if (( rc != 0 )); then
    printf 'WARNING: processlist query failed (mysql exit status %s); treating the result as empty.\n' "$rc" >&2
  fi
  return "$rc"
}

load_proc_list
# --- IDENTIFY PHYSICAL NODE ---
#######################################
# Print the value of the "location" field returned by the LXD guest socket
# (/dev/lxd/sock), or "Unknown-Node" when it cannot be determined.
# Arguments:
#   None
# Outputs:
#   the node name on stdout
#######################################
detect_node_name() {
  local name
  # --max-time bounds the request: without it an unresponsive socket would
  # block the whole run indefinitely.
  name=$(curl --silent --max-time 5 --unix-socket /dev/lxd/sock http://lxd/1.0 | jq -r '.location')
  # jq prints the string "null" when the field is absent; no output at all
  # means curl or jq failed.
  if [[ -z "$name" || "$name" == "null" ]]; then
    name="Unknown-Node"
  fi
  printf '%s\n' "$name"
}

NODE_NAME=$(detect_node_name)
# --- STALL DETECTION AND SLACK ALERTING ---

#######################################
# Escape a string for use inside a JSON double-quoted value.
# Backslashes and double quotes are escaped and line breaks become \n.
# Tabs and carriage returns become spaces; every other control character is
# removed, because a raw control character makes the JSON body invalid.
# A trailing newline in the input is not preserved.
# Arguments: $1 - text to escape
# Outputs:   escaped text on stdout (single line, no trailing newline)
#######################################
json_escape() {
  printf '%s' "$1" \
    | LC_ALL=C tr '\t\r' '  ' \
    | LC_ALL=C tr -d '\000-\010\013-\037\177' \
    | sed 's/\\/\\\\/g; s/"/\\"/g' \
    | awk '{ printf "%s%s", (NR > 1 ? "\\n" : ""), $0 }'
}

#######################################
# Build the JSON request body for the Slack webhook.
# Arguments: $1 - channel, $2 - message text,
#            $3 - optional parent message id (posts into that thread)
# Outputs:   the JSON document on stdout
#######################################
build_slack_payload() {
  local channel=$1 message=$2 parent_id=${3:-}
  local payload
  payload="{\"channel\": \"$(json_escape "$channel")\""
  if [[ -n "$parent_id" ]]; then
    payload+=", \"parent_msg_id\": \"$(json_escape "$parent_id")\""
  fi
  payload+=", \"message\": \"$(json_escape "$message")\"}"
  printf '%s' "$payload"
}

#######################################
# Post a message to the Slack webhook.
# Globals:   WEBHOOK_URL, AUTH_TOKEN, CHANNEL (read)
# Arguments: $1 - message text, $2 - optional parent message id
# Outputs:   the webhook's response body on stdout
# Returns:   curl's exit status
#######################################
send_slack_message() {
  local message=$1 parent_id=${2:-}
  # --max-time keeps a slow or unreachable webhook from blocking the run.
  curl --silent --location --max-time 20 "$WEBHOOK_URL" \
    --header "Authorization: Bearer $AUTH_TOKEN" \
    --header "Content-Type: application/json" \
    --data "$(build_slack_payload "$CHANNEL" "$message" "$parent_id")"
}

#######################################
# Extract the first "message_id" value (digits and dots, optionally quoted)
# from a webhook response. Prints nothing when there is none.
# Arguments: $1 - response body
#######################################
extract_message_id() {
  local pattern='"message_id":[[:space:]]*"?([0-9.]+)'
  if [[ "$1" =~ $pattern ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  fi
}

#######################################
# Format tab-separated processlist rows (id, user, time, state, info) read
# from stdin as an aligned table. Blank lines are skipped, at most 15 rows
# are printed and queries longer than 80 characters are cut to 77 plus "...".
#######################################
format_stuck_queries() {
  grep -v '^$' | head -n 15 | awk -F'\t' '{
    query = $5
    if (length(query) > 80) query = substr(query, 1, 77) "..."
    printf "%-7s | %-12s | %-4s | %-15s | %s\n", $1, $2, $3, $4, query
  }'
}

# Count active stuck queries (non-empty lines of the process list)
STUCK_COUNT=$(printf '%s\n' "$PROC_LIST" | grep -c -v '^$')

if (( STUCK_COUNT > THRESHOLD_COUNT )); then
  # Check if we are already in the middle of a reported stall
  PARENT_MSG_ID=$(cat "$STATE_FILE" 2>/dev/null)
  CLEAN_PROC=$(printf '%s\n' "$PROC_LIST" | format_stuck_queries)

  # --- SEND PARENT MESSAGE ---
  if [[ -z "$PARENT_MSG_ID" ]]; then
    printf -v MSG_CONTENT ':rotating_light: *MariaDB Stall Alert* :rotating_light:\n*Node:* %s\n*Container:* %s (%s)\n*Stall Count:* %s queries exceeding %ss.' \
      "$NODE_NAME" "$HOSTNAME" "$IP_ADDR" "$STUCK_COUNT" "$THRESHOLD_TIME"
    RESPONSE=$(send_slack_message "$MSG_CONTENT")
    # Extract message_id to allow threading
    PARENT_MSG_ID=$(extract_message_id "$RESPONSE")
    if [[ -n "$PARENT_MSG_ID" ]]; then
      printf '%s\n' "$PARENT_MSG_ID" > "$STATE_FILE"
    fi
  fi

  # --- SEND THREADED DETAIL ---
  # Skipped when no parent id is known (the parent post failed); the next run
  # will try to open the alert again.
  if [[ -n "$PARENT_MSG_ID" ]]; then
    printf -v THREAD_CONTENT '*Current Stuck Queue (Truncated):*\n```\n%s\n```' "$CLEAN_PROC"
    send_slack_message "$THREAD_CONTENT" "$PARENT_MSG_ID" > /dev/null
  fi
else
  # Healthy: Remove state file so the next stall creates a fresh Slack thread
  if [[ -f "$STATE_FILE" ]]; then
    rm -f -- "$STATE_FILE"
  fi
fi
# Always finish with status 0, whatever happened above.
# NOTE(review): presumably so the systemd oneshot unit is never marked failed
# when an alert could not be delivered — confirm this is intended.
exit 0