HEX
Server: nginx/1.29.3
System: Linux 11979.bigscoots-wpo.com 6.8.0-88-generic #89-Ubuntu SMP PREEMPT_DYNAMIC Sat Oct 11 01:02:46 UTC 2025 x86_64
User: nginx (1068)
PHP: 7.4.33
Disabled: exec,system,passthru,shell_exec,proc_open,proc_close,popen,show_source,cmd# Do not modify this line # 1684243876
Upload Files
File: //bigscoots/wpo/db/monitor_manager.sh
#!/usr/bin/env bash
#
# CPU monitor: repeatedly samples total CPU usage from /proc/stat, sends Slack
# alerts when usage reaches the thresholds below and, at the critical
# threshold, snapshots the MySQL process list and kills running SELECT queries.
#
# Uses: awk, flock, a mysql client, and the helper file named by COMMON_SH
# (sourced by main_loop; expected to define send_slack_alert).
#
# Strict mode: abort on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

### --- CONFIG --- ###
COMMON_SH="/bigscoots/includes/common.sh"      # sourced at startup; send_slack_alert lives here
LOG_DIR="/root/.bigscoots/logs/cpu_monitor"    # one <YYYY-MM-DD>.log file per day is written here
MYSQL_LOG_DIR="${LOG_DIR}/mysql"               # process-list snapshots and recorded KILL batches
LOCK_FILE="/var/lock/cpu_monitor.lock"         # flock target that keeps the monitor single-instance

# CPU thresholds (% of total across all cores, averaged via /proc/stat diff)
ALERT_CPU_PCT=50      # at or above this: WARN alert
KILL_CPU_PCT=90       # at or above this: CRIT alert, process-list snapshot, SELECT kill

# Sampling & pacing
SAMPLE_SEC=2          # time between /proc/stat samples to compute CPU%
POLL_SEC=5            # how often to re-check in the main loop
GRACE_SEC=5           # extra sleep after issuing KILLs, before the normal poll sleep
ALERT_COOLDOWN_SEC=300  # don't spam Slack: min seconds between alerts of same severity

# Slack settings (passed straight through to send_slack_alert)
SLACK_CHANNEL="#engineering"
SLACK_TAG="cpu-monitor"
# Prefer the fully-qualified host name; fall back to the short name if -f fails.
SLACK_DOMAIN="$(hostname -f 2>/dev/null || hostname)"

# MySQL client (expects credentials in /root/.my.cnf or env).
# Kept as a plain string and expanded unquoted at the call sites so that it
# word-splits into the command plus its options.
MYSQL_CLI="mysql --batch --skip-column-names"
# Safety: exclude system threads from kills.
# This text is interpolated verbatim into a SQL "USER NOT IN (...)" list, so
# every entry must stay a single-quoted SQL string literal.
MYSQL_EXCLUDE_USERS="'system user','event_scheduler','mysql.sys','rdsadmin'"
### ------------- ###

# Create the log directories up front so log() can write immediately
# (main repeats this before taking the lock).
mkdir -p "${LOG_DIR}" "${MYSQL_LOG_DIR}"

# Print the current local time as "YYYY-MM-DD HH:MM:SS +hhmm"; used as the
# prefix of every log line and snapshot header.
timestamp() {
  date '+%F %T %z'
}
# Print today's local date as YYYY-MM-DD; used to name the per-day log file.
today() {
  date '+%F'
}
# Print the path of the current day's log file: <LOG_DIR>/<today>.log
logfile() {
  printf '%s/%s.log\n' "${LOG_DIR}" "$(today)"
}

# Append one "<timestamp> | <message>" line to the current day's log file.
# Arguments: $1 - message text
# Outputs:   nothing on stdout; the line goes to the path printed by logfile
log() {
  local text="$1"
  printf '%s | %s\n' "$(timestamp)" "${text}" >> "$(logfile)"
}

# Cooldown state: epoch seconds at which the last alert of each severity was
# let through (0 = none yet in this process).
_last_warn_ts=0
_last_crit_ts=0

# Decide whether an alert of the given severity may be sent right now.
# Globals:   ALERT_COOLDOWN_SEC (read); _last_warn_ts, _last_crit_ts (read/written)
# Arguments: $1 - severity; "WARN" uses the warn timer, anything else the crit timer
# Returns:   0 (and stamps the matching timer) once the cooldown has elapsed,
#            1 while still inside the cooldown window
# Note: must be called in the current shell (not inside $(...)), otherwise the
# timer update is lost.
should_alert() {
  local level="$1"
  local slot now

  case "${level}" in
    WARN) slot="_last_warn_ts" ;;
    *)    slot="_last_crit_ts" ;;
  esac

  now=$(date +%s)
  if (( now - ${!slot} < ALERT_COOLDOWN_SEC )); then
    return 1
  fi

  # Stamp the selected global timer by name.
  printf -v "${slot}" '%s' "${now}"
  return 0
}

# Record an alert in the local log and forward it to Slack.
# Globals:   SLACK_CHANNEL, SLACK_TAG, SLACK_DOMAIN (read)
# Arguments: $1 - severity label (INFO/WARN/CRIT)
#            $2 - Slack emoji, e.g. ":warning:"
#            $3 - message text
# Returns:   0 even when Slack delivery fails; the failure is logged instead
send_alert() {
  local level="$1"   # INFO/WARN/CRIT
  local emoji="$2"
  local msg="$3"
  log "[ALERT:${level}] ${msg}"

  # send_slack_alert comes from common.sh; args: channel emoji tag domain message.
  # The script runs under `set -e`, so an unguarded non-zero status here
  # (e.g. Slack unreachable) would terminate the whole monitor. Alerting is
  # best-effort: note the failure locally and keep monitoring.
  if ! send_slack_alert "${SLACK_CHANNEL}" "${emoji}" "${SLACK_TAG}" "${SLACK_DOMAIN}" "${msg}"; then
    log "[ALERT:${level}] Slack delivery failed; alert was recorded in the local log only."
  fi
}

# CPU usage via /proc/stat delta method
# Print the counters of the aggregate "cpu" row of /proc/stat on one line,
# space-separated, in file order:
#   user nice system idle iowait irq softirq steal guest guest_nice
read_cpu_line() {
  awk '/^cpu / {
    row = $2
    for (col = 3; col <= 11; col++) {
      row = row " " $col
    }
    print row
  }' /proc/stat
}

# Measure overall CPU utilisation between two samples of the aggregate cpu row.
# Globals:   SAMPLE_SEC (read; default sampling interval)
# Arguments: $1 - seconds to wait between the two samples (optional)
# Outputs:   busy percentage with one decimal on stdout; "0.0" when no ticks
#            elapsed or a sample could not be parsed (a note goes to stderr)
# Returns:   0
cpu_usage_pct() {
  # Usage: cpu_usage_pct <sample_seconds>
  local wait="${1:-${SAMPLE_SEC}}"
  local -a first=() second=()

  IFS=' ' read -r -a first <<<"$(read_cpu_line)"
  sleep "${wait}"
  IFS=' ' read -r -a second <<<"$(read_cpu_line)"

  # Field order: 0 user, 1 nice, 2 system, 3 idle, 4 iowait, 5 irq, 6 softirq,
  # 7 steal. The guest columns that follow are deliberately not read: guest
  # time is already included in user/nice on modern kernels.
  # user..idle must be present; the later columns default to 0 so a shorter
  # row does not trip `set -u`.
  if (( ${#first[@]} < 4 || ${#second[@]} < 4 )); then
    printf 'cpu_usage_pct: could not read the aggregate cpu row\n' >&2
    echo "0.0"
    return 0
  fi

  local idx before after
  local total=0 total2=0 idleAll=0 idleAll2=0
  for idx in {0..7}; do
    before="${first[idx]:-0}"
    after="${second[idx]:-0}"
    if [[ ! "${before}" =~ ^[0-9]+$ || ! "${after}" =~ ^[0-9]+$ ]]; then
      printf 'cpu_usage_pct: non-numeric counter in cpu row\n' >&2
      echo "0.0"
      return 0
    fi
    total=$(( total + before ))
    total2=$(( total2 + after ))
    # idle and iowait together count as "not busy"
    if (( idx == 3 || idx == 4 )); then
      idleAll=$(( idleAll + before ))
      idleAll2=$(( idleAll2 + after ))
    fi
  done

  local totald idled
  totald=$(( total2 - total ))
  idled=$(( idleAll2 - idleAll ))

  # No elapsed ticks (or counters went backwards): report idle rather than divide by zero.
  if (( totald <= 0 )); then
    echo "0.0"
    return 0
  fi

  # usage = (totald - idled) / totald * 100
  awk -v used="$(( totald - idled ))" -v tot="${totald}" 'BEGIN { printf "%.1f", (used*100.0)/tot }'
}

# MySQL helpers
# Dump the current MySQL process list into a new timestamped file.
# Globals:   MYSQL_CLI, MYSQL_LOG_DIR (read)
# Outputs:   the snapshot file's path on stdout (printed even if the dump failed)
# The file gets two sections: the SHOW FULL PROCESSLIST output, then the 200
# longest-running rows of information_schema.processlist with INFO cut to
# 200 characters.
mysql_proc_list_to_file() {
  local snapshot
  snapshot="${MYSQL_LOG_DIR}/mysql-proc-$(date +%Y%m%dT%H%M%S).txt"
  local trimmed_sql="SELECT ID,USER,HOST,DB,COMMAND,TIME,STATE,LEFT(IFNULL(INFO,''),200) AS INFO FROM information_schema.processlist ORDER BY TIME DESC LIMIT 200;"

  # MYSQL_CLI is intentionally unquoted so it splits into command + options.
  # Capturing is best-effort: a failure anywhere in the group is ignored.
  {
    printf '### %s | SHOW FULL PROCESSLIST;\n' "$(timestamp)"
    if ! ${MYSQL_CLI} -e "SHOW FULL PROCESSLIST\G"; then
      printf '%s\n' "[WARN] Could not run SHOW FULL PROCESSLIST"
    fi
    printf '\n'
    printf '### %s | information_schema.processlist (trimmed)\n' "$(timestamp)"
    ${MYSQL_CLI} -e "${trimmed_sql}"
  } > "${snapshot}" || true

  printf '%s\n' "${snapshot}"
}

# Kill every MySQL thread that is currently running a SELECT statement.
# Globals:   MYSQL_CLI, MYSQL_EXCLUDE_USERS, MYSQL_LOG_DIR (read)
# Outputs:   progress via log; the executed KILL batch is kept as
#            <MYSQL_LOG_DIR>/killed-selects-<timestamp>.sql
# Returns:   0 always (best-effort; problems are logged, never fatal)
kill_all_selects() {
  log "Preparing to kill all SELECT queries in MySQL…"

  local tmp_cmds
  if ! tmp_cmds="$(mktemp)"; then
    log "Could not create a temporary file; no KILL statements issued."
    return 0
  fi

  # Build kill commands, excluding system threads and our own connection.
  # MYSQL_CLI is intentionally unquoted so it splits into command + options.
  # A failed listing is reported as such rather than as "nothing matched".
  if ! ${MYSQL_CLI} -e "
    SELECT CONCAT('KILL ',id,';')
    FROM information_schema.processlist
    WHERE COMMAND='Query'
      AND INFO LIKE 'SELECT %'
      AND USER NOT IN (${MYSQL_EXCLUDE_USERS})
      AND ID <> CONNECTION_ID();
  " > "${tmp_cmds}"; then
    log "Could not query information_schema.processlist; no KILL statements issued."
    rm -f "${tmp_cmds}"
    return 0
  fi

  local to_kill
  to_kill=$(wc -l < "${tmp_cmds}" | tr -d ' ')
  if (( to_kill == 0 )); then
    log "No SELECT queries matched kill criteria."
    rm -f "${tmp_cmds}"
    return 0
  fi

  log "Issuing ${to_kill} KILL statements for SELECT queries…"
  # Execute kill commands. --force keeps the client going after an SQL error:
  # a query that finished since the listing makes its KILL fail with "Unknown
  # thread id", and without --force the remaining statements are skipped.
  if ! ${MYSQL_CLI} --force < "${tmp_cmds}"; then
    log "Some KILL statements reported errors (threads may have finished already)."
  fi

  # Log what we attempted to kill
  local killed_log
  killed_log="${MYSQL_LOG_DIR}/killed-selects-$(date +%Y%m%dT%H%M%S).sql"
  if mv "${tmp_cmds}" "${killed_log}"; then
    log "Recorded KILL statements to ${killed_log}"
  else
    log "Could not record KILL statements to ${killed_log}"
    rm -f "${tmp_cmds}"
  fi

  return 0
}

# Monitoring loop; never returns on its own.
# Each pass: sample CPU, log it, raise a WARN alert at ALERT_CPU_PCT, and at
# KILL_CPU_PCT snapshot the MySQL process list, kill SELECT queries and wait
# GRACE_SEC. Every pass ends with a POLL_SEC sleep.
# Globals: COMMON_SH, SLACK_DOMAIN, ALERT_CPU_PCT, KILL_CPU_PCT, SAMPLE_SEC,
#          GRACE_SEC, POLL_SEC (read)
main_loop() {
  # common.sh provides send_slack_alert, which send_alert relies on.
  # shellcheck disable=SC1090
  . "${COMMON_SH}"

  log "Starting CPU monitor on ${SLACK_DOMAIN} | thresholds: warn>=${ALERT_CPU_PCT}% crit>=${KILL_CPU_PCT}%"

  local pct procfile
  # awk does the comparison because pct carries a decimal part.
  local at_least='BEGIN{ exit !(cur>=lim) }'

  while true; do
    pct="$(cpu_usage_pct "${SAMPLE_SEC}")"
    log "CPU usage ~${pct}%"

    # WARN threshold (Slack rate-limited by should_alert)
    if awk -v cur="${pct}" -v lim="${ALERT_CPU_PCT}" "${at_least}" && should_alert "WARN"; then
      send_alert "WARN" ":warning:" "CPU high on ${SLACK_DOMAIN}: ~${pct}% (≥${ALERT_CPU_PCT}%)."
    fi

    # Below the CRIT threshold there is nothing more to do this pass.
    if ! awk -v cur="${pct}" -v lim="${KILL_CPU_PCT}" "${at_least}"; then
      sleep "${POLL_SEC}"
      continue
    fi

    # CRIT threshold: capture proc list + kill selects. The cooldown only
    # silences Slack; the snapshot and the kill happen on every CRIT pass.
    if should_alert "CRIT"; then
      procfile="$(mysql_proc_list_to_file || true)"
      send_alert "CRIT" ":rotating_light:" \
        "CPU CRITICAL on ${SLACK_DOMAIN}: ~${pct}% (≥${KILL_CPU_PCT}%). MySQL process list saved to ${procfile}. Killing all SELECT queries."
    else
      log "CRIT condition met (no Slack due to cooldown); still capturing proc list and killing SELECTs."
      mysql_proc_list_to_file >/dev/null 2>&1 || true
    fi

    kill_all_selects || true
    sleep "${GRACE_SEC}"
    sleep "${POLL_SEC}"
  done
}

# --- entrypoint with lock to avoid dupes ---
# Entry point: make sure the log directories exist, take an exclusive
# non-blocking lock on LOCK_FILE, then hand over to main_loop.
# Globals: LOG_DIR, MYSQL_LOG_DIR, LOCK_FILE (read)
# Exits 0 without monitoring when another instance already holds the lock.
main() {
  mkdir -p "${LOG_DIR}" "${MYSQL_LOG_DIR}"

  # Descriptor 9 stays open for the life of the process, so the lock does too.
  exec 9>"${LOCK_FILE}"
  flock -n 9 || {
    log "Another cpu_monitor instance is running; exiting."
    exit 0
  }

  main_loop
}

# Run the entrypoint, forwarding the script's command-line arguments
# (main does not currently read them).
main "$@"