File: //bigscoots/wpo/extras/wp_cron_debugger.sh
#!/usr/bin/env bash
set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Shared helper library; supplies wpcli() and send_slack_alert().
COMMON_SH='/bigscoots/includes/common.sh'

# Root of the per-domain log tree.
LOG_BASE='/root/.bigscoots/logs/wpcron'

# Memory thresholds, expressed as a percentage of total system RAM.
ALERT_PCT=25 # a warning alert is raised at this share
KILL_PCT=50  # the hook's process group is terminated at this share

# Monitor timing, in seconds.
SAMPLE_SEC=1 # pause between two memory samples
GRACE_SEC=10 # pause between SIGTERM and the SIGKILL follow-up

# Slack routing.
SLACK_CHANNEL='#engineering'
SLACK_TAG='wpcron-monitor'

# ---------------------------------------------------------------------------
# ---- Args ----
# Usage: <script> <domain.com> [single_hook_name]
#   $1        site domain; selects the WordPress path, log directory and lock file
#   $2 / HOOK optional single hook to run (the positional argument wins over $HOOK)
if [[ $# -lt 1 ]]; then
  # Usage errors are diagnostics, so they belong on stderr.
  echo "Usage: $0 <domain.com> [single_hook_name]" >&2
  exit 1
fi
DOMAIN="$1"
# DOMAIN is spliced into filesystem paths below and into shell command strings
# later on, so reject anything that is not hostname-shaped (no '/', quotes,
# whitespace, or a leading '.'/'-') before it is used.
if [[ ! "${DOMAIN}" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]]; then
  echo "Invalid domain: '${DOMAIN}'" >&2
  exit 1
fi
SINGLE_HOOK="${2:-${HOOK:-}}"
WP_PATH="/home/nginx/domains/${DOMAIN}/public"
LOG_DIR="${LOG_BASE}/${DOMAIN}"
HOOK_LOG_DIR="${LOG_DIR}/hooks"
LOCK_FILE="/var/lock/wpcron_runner_${DOMAIN}.lock"
# Creating the hooks directory also creates LOG_DIR, its parent.
mkdir -p "${HOOK_LOG_DIR}"
# Print the current local time as "YYYY-MM-DD HH:MM:SS +ZZZZ".
timestamp() {
  local fmt='%Y-%m-%d %H:%M:%S %z'
  date "+${fmt}"
}
# Print the current local date as "YYYY-MM-DD".
today() {
  local fmt='%Y-%m-%d'
  date "+${fmt}"
}
# Print the path of the current day's log file inside LOG_DIR.
logfile() {
  printf '%s\n' "${LOG_DIR}/$(today).log"
}
# Append one line "<timestamp> | <DOMAIN> | <message>" to the daily log file.
# Arguments: $1 - message text
# Outputs:   nothing on stdout; the line goes only to the file named by logfile().
log() {
  local text="$1"
  printf '%s\n' "$(timestamp) | ${DOMAIN} | ${text}" \
    | tee -a "$(logfile)" >/dev/null
}
# Record an alert in the domain log and forward it to Slack.
# Globals:   SLACK_CHANNEL, SLACK_TAG, DOMAIN (read)
# Arguments: $1 - level (INFO/WARN/CRIT)
#            $2 - emoji
#            $3 - hook name
#            $4 - run id
#            $5 - message text
# Returns:   0 even when Slack delivery fails. Callers run under `set -e`, and a
#            failed notification must not abort the memory monitor (which may
#            be just about to kill a runaway hook).
send_alert() {
  local level="$1" # INFO/WARN/CRIT
  local emoji="$2"
  local hook="$3"
  local run_id="$4"
  local msg="$5"

  log "[ALERT:${level}] ${msg}"

  # send_slack_alert(channel, emoji, tag, domain, message)
  if ! send_slack_alert "${SLACK_CHANNEL}" "${emoji}" "${SLACK_TAG}" "${DOMAIN}" \
    "(${hook} / ${run_id}) ${msg}"; then
    # Best-effort delivery: note the failure locally and carry on.
    log "[ALERT:${level}] Slack delivery failed for (${hook} / ${run_id})" || true
  fi
  return 0
}
# Print the MemTotal value (second field) from /proc/meminfo.
mem_total_kb() {
  local meminfo='/proc/meminfo'
  awk '/MemTotal:/ { print $2 }' "${meminfo}"
}
# Print the summed RSS column that `ps -g <id>` reports for the given id.
# Arguments: $1 - id passed to `ps -g`
# Outputs:   the sum on stdout; 0 when ps lists nothing
# Returns:   0 always. ps exits non-zero when it finds no matching process,
#            which under `pipefail` would fail this function and abort a
#            `set -e` caller right when the monitored process exits.
pg_rss_kb() {
  local pgid="$1"
  { ps -o rss= -g "${pgid}" 2>/dev/null || true; } \
    | awk '{ total += $1 } END { print total + 0 }'
}
# Print part/total as a percentage with one decimal place.
# Arguments: $1 - part
#            $2 - total
# Outputs:   "0" when total is empty, non-numeric, zero or negative (avoids an
#            awk division-by-zero abort); otherwise the percentage as "%.1f".
pct_of_total() {
  awk -v p="${1:-0}" -v t="${2:-0}" 'BEGIN {
    # "t + 0" forces a numeric comparison so an empty total also hits the guard.
    if (t + 0 <= 0) {
      print 0
    } else {
      printf "%.1f", (p * 100.0) / t
    }
  }'
}
#######################################
# Run one WP-Cron hook in its own session and watch its memory use.
#
# The hook is started through setsid so that the whole process tree can be
# measured (pg_rss_kb) and signalled (pkill -g) as a unit. While it runs, the
# summed RSS is sampled every SAMPLE_SEC seconds; one warning alert is sent at
# ALERT_PCT, and at KILL_PCT the group gets SIGTERM, then SIGKILL if the child
# is still alive after GRACE_SEC seconds.
#
# Globals:   COMMON_SH, WP_PATH, HOOK_LOG_DIR, ALERT_PCT, KILL_PCT,
#            SAMPLE_SEC, GRACE_SEC (read)
# Arguments: $1 - hook name
# Outputs:   START/END markers and prefixed hook output in the per-hook log;
#            hook stdout/stderr are also passed through.
# Returns:   0 always, so one failing hook does not stop the caller's loop
#            under `set -e`; failures are reported through send_alert.
#######################################
run_hook() {
  local hook="$1"
  local start_ts end_ts duration run_id hook_log hook_cmd
  local child_pid pgid self_pgid total_kb peak_pct
  local current_kb current_pct
  local peak_kb=0 exceeded_alert=0 exit_code=0

  start_ts=$(date +%s)
  run_id="${hook}-$(date +%Y%m%dT%H%M%S)"
  # A '/' in a hook name cannot be part of a file name; map it to '_'.
  hook_log="${HOOK_LOG_DIR}/${hook//\//_}.log"

  log "Starting hook='${hook}' run_id='${run_id}'"
  echo "$(timestamp) | START ${run_id}" >> "${hook_log}"

  # Build the child command with %q so that quotes or other shell
  # metacharacters in the hook name or paths cannot alter the command.
  printf -v hook_cmd 'source %q; wp cron event run %q --allow-root --path=%q' \
    "${COMMON_SH}" "${hook}" "${WP_PATH}"

  set +e
  # common.sh is sourced in the child shell so its helpers exist there too.
  setsid /bin/bash -lc "${hook_cmd}" \
    > >(awk -v r="${run_id}" '{print strftime("%Y-%m-%d %H:%M:%S %z"), "|", r, "| STDOUT |", $0}' | tee -a "${hook_log}") \
    2> >(awk -v r="${run_id}" '{print strftime("%Y-%m-%d %H:%M:%S %z"), "|", r, "| STDERR |", $0}' | tee -a "${hook_log}" >&2) &
  child_pid=$!
  set -e

  # Give setsid a moment to detach before asking for the process group.
  sleep 0.05
  pgid=$(ps -o pgid= -p "${child_pid}" 2>/dev/null | tr -d ' ' || true)
  self_pgid=$(ps -o pgid= -p "$$" 2>/dev/null | tr -d ' ' || true)
  # If ps saw nothing, or sampled the child before it left our own group,
  # fall back to the child's PID so our own group is never the kill target.
  if [[ -z "${pgid}" || "${pgid}" == "${self_pgid}" ]]; then
    pgid="${child_pid}"
  fi

  total_kb=$(mem_total_kb)

  while kill -0 "${child_pid}" 2>/dev/null; do
    # The group may vanish between kill -0 and ps; treat that as 0 KB.
    current_kb=$(pg_rss_kb "${pgid}") || true
    [[ "${current_kb}" =~ ^[0-9]+$ ]] || current_kb=0
    if (( current_kb > peak_kb )); then
      peak_kb="${current_kb}"
    fi
    current_pct=$(pct_of_total "${current_kb}" "${total_kb}")

    if (( exceeded_alert == 0 )) \
      && awk -v c="${current_pct}" -v t="${ALERT_PCT}" 'BEGIN{exit !(c>=t)}'; then
      send_alert "WARN" ":warning:" "${hook}" "${run_id}" \
        "exceeded ${ALERT_PCT}% RAM (current ~${current_pct}%)." || true
      exceeded_alert=1
    fi

    if awk -v c="${current_pct}" -v t="${KILL_PCT}" 'BEGIN{exit !(c>=t)}'; then
      # A failed alert must never prevent the kill that follows.
      send_alert "CRIT" ":rotating_light:" "${hook}" "${run_id}" \
        "KILLING for exceeding ${KILL_PCT}% RAM (current ~${current_pct}%)." || true
      pkill -TERM -g "${pgid}" || true
      sleep "${GRACE_SEC}"
      if kill -0 "${child_pid}" 2>/dev/null; then
        pkill -KILL -g "${pgid}" || true
      fi
      break
    fi

    sleep "${SAMPLE_SEC}"
  done

  # Capture the real exit status; `wait ... || true` would always yield 0.
  wait "${child_pid}" 2>/dev/null || exit_code=$?

  end_ts=$(date +%s)
  duration=$(( end_ts - start_ts ))
  peak_pct=$(pct_of_total "${peak_kb}" "${total_kb}")

  log "Finished hook='${hook}' run_id='${run_id}' exit=${exit_code} duration=${duration}s peak_mem=${peak_kb}KB (~${peak_pct}%)"
  echo "$(timestamp) | END ${run_id} | exit=${exit_code} | duration=${duration}s | peak=${peak_kb}KB (~${peak_pct}%)" >> "${hook_log}"

  # An `if` rather than `[[ ... ]] && ...`: the short-circuit form makes the
  # function return 1 on success, which terminates a `set -e` caller.
  if (( exit_code != 0 )); then
    send_alert "WARN" ":x:" "${hook}" "${run_id}" \
      "finished with non-zero exit=${exit_code} (duration ${duration}s, peak ~${peak_pct}%)." || true
  fi
  return 0
}
#######################################
# Entry point: take the per-domain lock, find the hooks that are due now and
# run each of them through run_hook.
#
# Globals:   LOG_DIR, LOCK_FILE, WP_PATH, COMMON_SH, SINGLE_HOOK, DOMAIN (read)
# Outputs:   progress lines via log()
# Exits:     0 when another instance holds the lock, when nothing is due, or
#            when SINGLE_HOOK is not due; 1 when WP_PATH is missing or `wp`
#            is unavailable.
#######################################
main() {
  local hook list_cmd
  local -a due_hooks=() hooks=()

  mkdir -p "${LOG_DIR}"

  # Per-domain lock to prevent overlap for the same site
  exec 9>"${LOCK_FILE}"
  if ! flock -n 9; then
    log "Another instance is running for ${DOMAIN}; exiting."
    exit 0
  fi

  # Ensure the path exists (basic sanity)
  if [[ ! -d "${WP_PATH}" ]]; then
    log "WP_PATH not found: ${WP_PATH}"
    exit 1
  fi

  # Make sure common.sh is loaded in this shell too
  # shellcheck disable=SC1090
  source "${COMMON_SH}"
  if ! type -t wp >/dev/null 2>&1; then
    log "wp command not found in PATH; aborting."
    exit 1
  fi

  # %q keeps quotes/metacharacters in the paths from altering the command.
  printf -v list_cmd \
    'source %q; wpcli cron event list --next_run_relative=now --field=hook --path=%q' \
    "${COMMON_SH}" "${WP_PATH}"

  if [[ -n "${SINGLE_HOOK}" ]]; then
    # Only run this hook if it's actually due now. The list is read in full
    # and matched in-shell: piping into `grep -q` under pipefail reports a
    # failure whenever grep exits early and the producer gets SIGPIPE.
    mapfile -t due_hooks < <(/bin/bash -lc "${list_cmd}" || true)
    for hook in ${due_hooks[@]+"${due_hooks[@]}"}; do
      if [[ "${hook}" == "${SINGLE_HOOK}" ]]; then
        hooks=("${SINGLE_HOOK}")
        break
      fi
    done
    if [[ ${#hooks[@]} -eq 0 ]]; then
      log "Single-hook '${SINGLE_HOOK}' is not due now; skipping."
      exit 0
    fi
  else
    # Only hooks whose next run is 'now'
    mapfile -t due_hooks < <(/bin/bash -lc "${list_cmd}" 2>/dev/null || true)
    for hook in ${due_hooks[@]+"${due_hooks[@]}"}; do
      # Skip blank lines so run_hook is never called with an empty name.
      if [[ -n "${hook}" ]]; then
        hooks+=("${hook}")
      fi
    done
  fi

  if [[ ${#hooks[@]} -eq 0 ]]; then
    log "No hooks due now."
    exit 0
  fi

  log "Hooks due now: ${hooks[*]}"
  for hook in "${hooks[@]}"; do
    run_hook "${hook}"
  done
}
# Script entry point: hand every command-line argument to main unchanged.
main "$@"