File: //bigscoots/wpo/db/monitor_manager.sh
#!/usr/bin/env bash
# CPU watchdog. In a loop it samples overall CPU usage from /proc/stat, raises
# an alert through send_slack_alert once usage reaches ALERT_CPU_PCT, and at
# KILL_CPU_PCT additionally snapshots the MySQL process list and kills running
# SELECT queries. Only one instance runs at a time (flock on LOCK_FILE).
#
# Strict mode: abort on unhandled command failures, on unset variables, and
# when any stage of a pipeline fails.
set -euo pipefail
### --- CONFIG --- ###
COMMON_SH="/bigscoots/includes/common.sh" # sourced by main_loop; expected to define send_slack_alert
# Daily log files are written to LOG_DIR; MySQL snapshots and KILL records go
# to MYSQL_LOG_DIR.
LOG_DIR="/root/.bigscoots/logs/cpu_monitor"
MYSQL_LOG_DIR="${LOG_DIR}/mysql"
# Lock file that main() opens on fd 9 and flocks to prevent duplicate instances.
LOCK_FILE="/var/lock/cpu_monitor.lock"
# CPU thresholds (% of total across all cores, averaged via /proc/stat diff).
# At or above ALERT_CPU_PCT a WARN alert is raised; at or above KILL_CPU_PCT
# the process list is captured and SELECT queries are killed.
ALERT_CPU_PCT=50
KILL_CPU_PCT=90
# Sampling & pacing (all values in seconds)
SAMPLE_SEC=2 # time between the two /proc/stat samples used to compute CPU%
POLL_SEC=5 # pause at the end of every loop iteration (in addition to SAMPLE_SEC spent sampling)
GRACE_SEC=5 # extra pause after issuing KILLs, before the regular POLL_SEC pause
ALERT_COOLDOWN_SEC=300 # don't spam Slack: min seconds between alerts of same severity
# Slack settings, passed to send_slack_alert as channel, tag and domain.
SLACK_CHANNEL="#engineering"
SLACK_TAG="cpu-monitor"
# Fully-qualified host name when `hostname -f` succeeds, plain `hostname` otherwise.
SLACK_DOMAIN="$(hostname -f 2>/dev/null || hostname)"
# MySQL client (expects credentials in /root/.my.cnf or env).
# Kept as a single string and expanded unquoted at the call sites, so it is
# word-split into the command name plus its options.
MYSQL_CLI="mysql --batch --skip-column-names"
# Safety: exclude system threads from kills.
# Spliced verbatim into a SQL "USER NOT IN (...)" list, so every entry must be
# a single-quoted SQL string literal.
MYSQL_EXCLUDE_USERS="'system user','event_scheduler','mysql.sys','rdsadmin'"
### ------------- ###
# Make sure both log directories exist (main() repeats this before locking).
mkdir -p "${LOG_DIR}" "${MYSQL_LOG_DIR}"
# Print the current local time as "YYYY-MM-DD HH:MM:SS +ZZZZ"; log() and the
# MySQL snapshot headers use it as their time prefix.
timestamp() {
  date '+%Y-%m-%d %H:%M:%S %z'
}
# Print today's local date as "YYYY-MM-DD"; logfile() uses it to name the
# daily log file.
today() {
  date '+%Y-%m-%d'
}
# Print the path of the current daily log file: <LOG_DIR>/<YYYY-MM-DD>.log.
logfile() {
  printf '%s\n' "${LOG_DIR}/$(today).log"
}
# Append one "<timestamp> | <message>" line to the current daily log file.
# Arguments: $1 - message text
# Outputs:   nothing on stdout (tee's copy is discarded); the line goes to
#            the file named by logfile()
log() {
  local entry="$1"
  printf '%s | %s\n' "$(timestamp)" "${entry}" \
    | tee -a "$(logfile)" >/dev/null
}
# Simple memory for alert cooldowns: epoch seconds (date +%s) of the last alert
# that should_alert() let through, kept separately for WARN and for every other
# level. They start at 0, so the first check of each kind passes the cooldown.
_last_warn_ts=0
_last_crit_ts=0
# Cooldown gate for alerts.
# Globals:   ALERT_COOLDOWN_SEC (read); _last_warn_ts / _last_crit_ts (read,
#            and updated when the alert is allowed)
# Arguments: $1 - severity; "WARN" uses the warn timestamp, anything else
#            shares the crit timestamp
# Returns:   0 if at least ALERT_COOLDOWN_SEC seconds have passed since the
#            last allowed alert of that kind (and records "now"), 1 otherwise
should_alert() {
  local level="$1"
  local slot current previous

  # Pick which global timestamp this severity is tracked in.
  case "${level}" in
    WARN) slot="_last_warn_ts" ;;
    *) slot="_last_crit_ts" ;;
  esac

  current=$(date +%s)
  previous="${!slot}"

  if (( current - previous < ALERT_COOLDOWN_SEC )); then
    return 1
  fi

  # No local of this name exists here, so printf -v updates the global.
  printf -v "${slot}" '%s' "${current}"
  return 0
}
# Record an alert in the log and forward it to Slack.
# Globals:   SLACK_CHANNEL, SLACK_TAG, SLACK_DOMAIN (read)
# Arguments: $1 - severity label (INFO/WARN/CRIT), used only in the log line
#            $2 - emoji passed through to send_slack_alert
#            $3 - message text
# Returns:   0 even when Slack delivery fails. The script runs under `set -e`
#            and callers invoke this as a plain statement, so a non-zero
#            status here would terminate the whole monitor, at CRIT level
#            before the SELECT kill has been attempted.
send_alert() {
  local level="$1" # INFO/WARN/CRIT
  local emoji="$2"
  local msg="$3"
  local rc=0

  log "[ALERT:${level}] ${msg}"

  # send_slack_alert comes from common.sh (sourced by main_loop).
  # args: channel emoji tag domain message
  send_slack_alert "${SLACK_CHANNEL}" "${emoji}" "${SLACK_TAG}" "${SLACK_DOMAIN}" "${msg}" || rc=$?

  if (( rc != 0 )); then
    # Best effort: note the failure and keep the monitor alive.
    log "[ALERT:${level}] Slack delivery failed (exit ${rc}); continuing."
  fi
  return 0
}
# CPU usage via /proc/stat delta method
# Print the counters of the aggregate "cpu" row of /proc/stat on one line,
# space separated, in file order:
#   user nice system idle iowait irq softirq steal guest guest_nice
# Columns the row does not have come out as empty strings.
read_cpu_line() {
  awk '
    /^cpu / {
      row = $2
      for (col = 3; col <= 11; col++) {
        row = row OFS $col
      }
      print row
    }
  ' /proc/stat
}
# Measure overall CPU utilisation over a short interval.
# Takes two read_cpu_line samples <sample_seconds> apart and reports the share
# of elapsed counter ticks that were neither idle nor iowait.
# Usage:     cpu_usage_pct <sample_seconds>
# Globals:   SAMPLE_SEC (read; default interval)
# Arguments: $1 - seconds to sleep between the two samples (optional)
# Outputs:   percentage with one decimal (e.g. "42.5") on stdout, or "0.0"
#            when the counters did not advance
# Returns:   0 on success; 1 (with a message on stderr) when a sample has
#            fewer than the four counters user/nice/system/idle
cpu_usage_pct() {
  local interval="${1:-${SAMPLE_SEC}}"
  local -a before=() after=()

  # read -a splits the single sample line into fields without the glob
  # expansion an unquoted array=($(...)) assignment would allow.
  IFS=$' \t' read -r -a before <<< "$(read_cpu_line)"
  sleep "${interval}"
  IFS=$' \t' read -r -a after <<< "$(read_cpu_line)"

  if (( ${#before[@]} < 4 || ${#after[@]} < 4 )); then
    printf 'cpu_usage_pct: could not read the aggregate cpu counters\n' >&2
    return 1
  fi

  # Field order: user nice system idle iowait irq softirq steal [guest guest_nice].
  # guest times are already included in user/nice on modern kernels, so they
  # are not read at all; iowait/irq/softirq/steal default to 0 when the row
  # is shorter, which keeps `set -u` from aborting on such rows.
  local idle_before idle_after busy_before busy_after
  idle_before=$(( ${before[3]} + ${before[4]:-0} ))
  idle_after=$(( ${after[3]} + ${after[4]:-0} ))
  busy_before=$(( ${before[0]} + ${before[1]} + ${before[2]} \
    + ${before[5]:-0} + ${before[6]:-0} + ${before[7]:-0} ))
  busy_after=$(( ${after[0]} + ${after[1]} + ${after[2]} \
    + ${after[5]:-0} + ${after[6]:-0} + ${after[7]:-0} ))

  local total_delta idle_delta
  total_delta=$(( (idle_after + busy_after) - (idle_before + busy_before) ))
  idle_delta=$(( idle_after - idle_before ))

  if (( total_delta <= 0 )); then
    # Counters did not move (or went backwards): report idle rather than divide by zero.
    printf '0.0\n'
    return 0
  fi

  # usage = (total_delta - idle_delta) / total_delta * 100
  # LC_ALL=C: some awk builds may format with the locale's decimal separator;
  # callers compare this value numerically and need a "." there.
  LC_ALL=C awk -v used="$(( total_delta - idle_delta ))" -v tot="${total_delta}" \
    'BEGIN { printf "%.1f", (used*100.0)/tot }'
}
# MySQL helpers
# Snapshot the MySQL process list into a timestamped file.
# Writes two sections: SHOW FULL PROCESSLIST (vertical output) and a trimmed
# information_schema.processlist listing (longest-running first, max 200 rows,
# query text cut to 200 characters).
# Globals:   MYSQL_LOG_DIR, MYSQL_CLI (read)
# Outputs:   the snapshot path on stdout, always, even if writing the file failed
# Returns:   0 (best effort: a failed query leaves a [WARN] line in its section)
mysql_proc_list_to_file() {
  local stamp out
  stamp="$(date +%Y%m%dT%H%M%S)"
  out="${MYSQL_LOG_DIR}/mysql-proc-${stamp}.txt"

  # MYSQL_CLI is intentionally unquoted so it splits into command + options.
  # shellcheck disable=SC2086
  {
    echo "### $(timestamp) | SHOW FULL PROCESSLIST;"
    ${MYSQL_CLI} -e "SHOW FULL PROCESSLIST\G" \
      || echo "[WARN] Could not run SHOW FULL PROCESSLIST"
    echo
    echo "### $(timestamp) | information_schema.processlist (trimmed)"
    # Mark a failure here as well, so an empty section is distinguishable
    # from "no rows".
    ${MYSQL_CLI} -e "SELECT ID,USER,HOST,DB,COMMAND,TIME,STATE,LEFT(IFNULL(INFO,''),200) AS INFO FROM information_schema.processlist ORDER BY TIME DESC LIMIT 200;" \
      || echo "[WARN] Could not query information_schema.processlist"
  } > "${out}" || true # snapshot is best effort; never fail the caller

  echo "${out}"
}
# Kill every MySQL connection that is currently running a SELECT.
# Builds one "KILL <id>;" statement per matching row of
# information_schema.processlist (COMMAND='Query', INFO LIKE 'SELECT %',
# user not in MYSQL_EXCLUDE_USERS, not this connection), executes the batch,
# and keeps the statements as killed-selects-<timestamp>.sql in MYSQL_LOG_DIR.
# Globals:   MYSQL_CLI, MYSQL_EXCLUDE_USERS, MYSQL_LOG_DIR (read)
# Returns:   0 in all best-effort cases (nothing matched, listing failed,
#            kills attempted); 1 only if no temp file could be created
kill_all_selects() {
  log "Preparing to kill all SELECT queries in MySQL…"

  local tmp_cmds
  if ! tmp_cmds="$(mktemp)"; then
    log "[WARN] mktemp failed; cannot build KILL statements."
    return 1
  fi

  # Build kill commands, excluding system threads and our own connection.
  # MYSQL_CLI is intentionally unquoted so it splits into command + options.
  # shellcheck disable=SC2086
  if ! ${MYSQL_CLI} -e "
    SELECT CONCAT('KILL ',id,';')
    FROM information_schema.processlist
    WHERE COMMAND='Query'
    AND INFO LIKE 'SELECT %'
    AND USER NOT IN (${MYSQL_EXCLUDE_USERS})
    AND ID <> CONNECTION_ID();
    " > "${tmp_cmds}"; then
    # A failed listing is not the same as "nothing to kill"; say so.
    log "[WARN] Could not list SELECT queries from information_schema.processlist; nothing killed."
    rm -f -- "${tmp_cmds}"
    return 0
  fi

  local to_kill
  to_kill=$(wc -l < "${tmp_cmds}" | tr -d ' ')
  if (( to_kill == 0 )); then
    log "No SELECT queries matched kill criteria."
    rm -f -- "${tmp_cmds}"
    return 0
  fi

  log "Issuing ${to_kill} KILL statements for SELECT queries…"
  # --force: without it the client stops at the first statement that errors.
  # A thread that finished between listing and killing makes its KILL fail,
  # which would leave every later statement in the batch unexecuted.
  # shellcheck disable=SC2086
  ${MYSQL_CLI} --force < "${tmp_cmds}" \
    || log "[WARN] mysql client reported an error while executing KILL statements."

  # Keep a record of what we attempted to kill.
  local killed_log
  killed_log="${MYSQL_LOG_DIR}/killed-selects-$(date +%Y%m%dT%H%M%S).sql"
  if mv -- "${tmp_cmds}" "${killed_log}"; then
    log "Recorded KILL statements to ${killed_log}"
  else
    log "[WARN] Could not record KILL statements to ${killed_log}"
    rm -f -- "${tmp_cmds}"
  fi
  return 0
}
# Endless monitoring loop.
# Each pass: measure CPU over SAMPLE_SEC, log it, raise a WARN alert at or
# above ALERT_CPU_PCT, and at or above KILL_CPU_PCT snapshot the MySQL process
# list, kill SELECT queries and wait GRACE_SEC. Every pass ends with a
# POLL_SEC sleep. Alerts are subject to should_alert's cooldown; the snapshot
# and the kill are not.
# Globals:   COMMON_SH, SLACK_DOMAIN, ALERT_CPU_PCT, KILL_CPU_PCT,
#            SAMPLE_SEC, GRACE_SEC, POLL_SEC (read)
# Returns:   does not return on its own
main_loop() {
  # Make sure common.sh is loaded for send_slack_alert
  # shellcheck disable=SC1090
  source "${COMMON_SH}"

  log "Starting CPU monitor on ${SLACK_DOMAIN} | thresholds: warn>=${ALERT_CPU_PCT}% crit>=${KILL_CPU_PCT}%"

  # awk program: exit status 0 when c >= t. awk is used because c is fractional.
  local _ge_prog='BEGIN{ exit !(c>=t) }'
  local pct procfile

  while true; do
    pct="$(cpu_usage_pct "${SAMPLE_SEC}")"
    log "CPU usage ~${pct}%"

    # WARN threshold
    if awk -v c="${pct}" -v t="${ALERT_CPU_PCT}" "${_ge_prog}" && should_alert "WARN"; then
      send_alert "WARN" ":warning:" "CPU high on ${SLACK_DOMAIN}: ~${pct}% (≥${ALERT_CPU_PCT}%)."
    fi

    # Below the CRIT threshold there is nothing more to do this pass.
    if ! awk -v c="${pct}" -v t="${KILL_CPU_PCT}" "${_ge_prog}"; then
      sleep "${POLL_SEC}"
      continue
    fi

    # CRIT threshold: capture proc list + kill selects
    if should_alert "CRIT"; then
      procfile="$(mysql_proc_list_to_file || true)"
      send_alert "CRIT" ":rotating_light:" \
        "CPU CRITICAL on ${SLACK_DOMAIN}: ~${pct}% (≥${KILL_CPU_PCT}%). MySQL process list saved to ${procfile}. Killing all SELECT queries."
    else
      log "CRIT condition met (no Slack due to cooldown); still capturing proc list and killing SELECTs."
      mysql_proc_list_to_file >/dev/null 2>&1 || true
    fi

    kill_all_selects || true
    sleep "${GRACE_SEC}"
    sleep "${POLL_SEC}"
  done
}
# --- entrypoint with lock to avoid dupes ---
# Entry point: ensure the log directories exist, take an exclusive
# non-blocking flock on LOCK_FILE (held on fd 9), then run the monitor loop.
# If the lock is already held, log that fact and exit 0.
# Globals:   LOG_DIR, MYSQL_LOG_DIR, LOCK_FILE (read)
main() {
  mkdir -p "${LOG_DIR}" "${MYSQL_LOG_DIR}"

  # Open the lock file on fd 9 for the rest of the shell's lifetime.
  exec 9>"${LOCK_FILE}"
  flock -n 9 || {
    log "Another cpu_monitor instance is running; exiting."
    exit 0
  }

  main_loop
}
# Start the monitor, forwarding all command-line arguments to main().
main "$@"