File: //bigscoots/ovz/node/monitor.sh
#!/bin/sh
source /bigscoots/includes/common.sh
#######################################
# Reap stale logind sessions left behind by crond inside containers.
#
# For every container ID printed by `vzlist -H -o veid`, counts the logind
# sessions whose 5th list-sessions column is closing/abandoned. When more
# than 50 are found, a script is run inside the container (via `vzctl exec`)
# that stops -- or, in dry-run mode, only lists -- sessions whose Service is
# "crond" and whose session file under /run/systemd/sessions is older than
# THRESHOLD_SEC. Containers are processed in parallel.
#
# The whole pass is throttled: it is skipped when SESSION_LEAK_CHK_FILE was
# modified less than SESSION_LEAK_CHK_INTERVAL seconds ago.
#
# Globals (environment overrides are honoured):
#   THRESHOLD_SEC          minimum session age in seconds (default 86400)
#   DRY_RUN                1 = list only, anything else = reap (default 0)
#   SESSION_LEAK_CHK_FILE  throttle stamp file
#                          (default /root/.bigscoots/counters/sessionleakchk)
# Arguments: none
# Returns:   0 when throttled or when the OpenVZ tools are absent; otherwise
#            the status of the final `wait`.
#######################################
sessionleak_check() {
  SESSION_LEAK_CHK_FILE="${SESSION_LEAK_CHK_FILE:-/root/.bigscoots/counters/sessionleakchk}"
  SESSION_LEAK_CHK_INTERVAL=600           # 10 minutes
  THRESHOLD_SEC="${THRESHOLD_SEC:-86400}" # 24h default (override for testing)
  DRY_RUN="${DRY_RUN:-0}"                 # 1 = list only, 0 = actually reap
  local last_run now elapsed CTID

  # A non-numeric threshold would make the age gate inside the container
  # error out and let every crond session through, so fall back to 24h.
  case "$THRESHOLD_SEC" in
    '' | *[!0-9]*) THRESHOLD_SEC=86400 ;;
  esac
  # Both values are spliced into the vzctl command line; keep DRY_RUN to 0/1.
  [ "$DRY_RUN" = "1" ] || DRY_RUN=0

  # Nothing to do on hosts without the OpenVZ tooling.
  command -v vzlist >/dev/null 2>&1 || return 0
  command -v vzctl >/dev/null 2>&1 || return 0

  # throttle
  if [ -f "$SESSION_LEAK_CHK_FILE" ]; then
    last_run=$(stat -c %Y "$SESSION_LEAK_CHK_FILE" 2>/dev/null || echo 0)
    now=$(date +%s)
    elapsed=$((now - last_run))
    if [ "$elapsed" -lt "$SESSION_LEAK_CHK_INTERVAL" ]; then
      return 0
    fi
  fi
  mkdir -p "$(dirname "$SESSION_LEAK_CHK_FILE")" 2>/dev/null
  : > "$SESSION_LEAK_CHK_FILE"

  for CTID in $(vzlist -H -o veid); do
    (
      # how many sessions are in closing/abandoned (any user)
      count=$(vzctl exec "$CTID" \
        "loginctl list-sessions --no-legend 2>/dev/null | awk '\$5 ~ /^(closing|abandoned)\$/ {print \$1}' | wc -l" </dev/null)
      # vzctl may fail or print something unexpected; treat that as "none"
      count=$(printf '%s' "$count" | tr -d '[:space:]')
      case "$count" in
        '' | *[!0-9]*) count=0 ;;
      esac

      # only act if it's getting noisy; tune the 50 as you like
      if [ "$count" -gt 50 ]; then
        if [ "$DRY_RUN" = "1" ]; then
          echo "=== CTID $CTID (dry-run: threshold ${THRESHOLD_SEC}s; $count candidates) ==="
        fi
        # Run the age-gated reaper/listing inside the container. The heredoc
        # is quoted, so nothing below is expanded on the host: every value the
        # inner script needs (including the container ID it logs as VEID) is
        # handed over as an environment assignment on the command line.
        vzctl exec "$CTID" VEID="$CTID" THRESHOLD_SEC="$THRESHOLD_SEC" DRY_RUN="$DRY_RUN" bash -s <<'IN'
# Defaults in case the assignments did not reach this shell; an empty
# threshold would otherwise disable the age gate.
THRESHOLD_SEC=${THRESHOLD_SEC:-86400}
DRY_RUN=${DRY_RUN:-0}
now=$(date +%s)
# format a number of seconds as "Nd HHh MMm SSs"
hms() { s=$1; d=$((s/86400)); s=$((s%86400)); h=$((s/3600)); s=$((s%3600)); m=$((s/60)); s=$((s%60));
printf "%dd %02dh %02dm %02ds" "$d" "$h" "$m" "$s"; }
# iterate by session id; the list is pre-filtered to closing/abandoned
while read -r sid; do
  [ -z "$sid" ] && continue
  # only act on sessions started by the 'crond' service
  service=$(loginctl show-session "$sid" -p Service --value 2>/dev/null)
  [ "$service" != "crond" ] && continue
  # pick session file for timestamp
  f="/run/systemd/sessions/$sid"
  [ -e "$f" ] || f="/run/systemd/sessions/$sid.ref"
  [ -e "$f" ] || continue
  mtime=$(stat -c %Y "$f" 2>/dev/null) || continue
  age=$(( now - mtime ))
  [ "$age" -lt "$THRESHOLD_SEC" ] && continue
  # Get info for logging
  name=$(loginctl show-session "$sid" -p Name --value 2>/dev/null || echo "?")
  state=$(loginctl show-session "$sid" -p State --value 2>/dev/null || echo "unknown")
  if [ "$DRY_RUN" = "1" ]; then
    printf "Would reap CRON session=%s user=%s state=%s since=%s age=%s\n" \
      "$sid" "$name" "$state" "$(date -d "@$mtime" '+%F %T')" "$(hms "$age")"
  else
    # terminate the session scope; failures are ignored on purpose (best effort)
    systemctl stop "session-$sid.scope" &>/dev/null || true
    loginctl terminate-session "$sid" &>/dev/null || true
    logger -t session-reaper "CTID=$VEID reaped CRON sid=$sid user=$name state=$state mtime=$(date -d "@$mtime" '+%F %T') age=${age}s"
  fi
done < <(loginctl list-sessions --no-legend 2>/dev/null | awk '$5 ~ /^(closing|abandoned)$/ {print $1}')
IN
      fi
    ) &
  done
  wait
}
# Make sure the directory that holds the alert flag files exists.
[ -d /root/.bigscoots/counters ] || mkdir -p /root/.bigscoots/counters
if [ -f /etc/openvz-release ] || [[ $(uname -r) == *"stab"* ]]
then
# Disk usage alert: when / or /vz is at least 95% full, post a formatted
# message to the Slack webhook read from /root/.slackrc. The diskchk flag file
# suppresses repeat alerts until a detached screen session removes it 300
# seconds later.
if [ ! -f /root/.bigscoots/counters/diskchk ]; then
  # Use% column of the single data row; -P keeps each filesystem on one line
  usageroot=$(df -Ph / | awk 'NR==2 {print $5}')
  usagevz=$(df -Ph /vz | awk 'NR==2 {print $5}')
  userootp=${usageroot%%\%*}
  usevzp=${usagevz%%\%*}

  # An empty value (e.g. /vz not mounted, df failed) counts as 0% so the
  # numeric comparison cannot error out.
  if [ "${userootp:-0}" -ge 95 ] || [ "${usevzp:-0}" -ge 95 ]; then
    touch /root/.bigscoots/counters/diskchk

    # The size figures are only needed for the message, so compute them here
    # rather than on every run.
    # Total size of the /vz filesystem in GB
    total_size=$(df -BG /vz | awk 'NR==2 {print $2}' | sed 's/G//')
    # Calculate 70% of the total size
    target_used=$(echo "scale=2; $total_size * 0.70" | bc)
    # Current used space in GB
    used_space=$(df -BG /vz | awk 'NR==2 {print $3}' | sed 's/G//')
    # Space that needs to be freed to reach 70% usage
    space_to_free=$(echo "scale=2; $used_space - $target_used" | bc)

    # Read Slack webhook URL from file
    SLACK_WEBHOOK=$(grep -oP '(?<=APP_SLACK_WEBHOOK=")[^"]*' /root/.slackrc)
    # Construct Slack message with better formatting
    # (serverip is not set in this file; presumably provided by common.sh)
    slack_message="{
\"channel\": \"#node-alerts\",
\"blocks\": [
{
\"type\": \"section\",
\"text\": {
\"type\": \"mrkdwn\",
\"text\": \":warning: *$(hostname)* - ${serverip} - Disk Usage Alert :warning:\"
}
},
{
\"type\": \"divider\"
},
{
\"type\": \"section\",
\"fields\": [
{
\"type\": \"mrkdwn\",
\"text\": \"*Root Partition (/):*\n\`\`\`Usage: ${usageroot}\`\`\`\"
},
{
\"type\": \"mrkdwn\",
\"text\": \"*VZ Partition (/vz):*\n\`\`\`Usage: ${usagevz}\`\`\`\"
}
]
},
{
\"type\": \"divider\"
},
{
\"type\": \"section\",
\"text\": {
\"type\": \"mrkdwn\",
\"text\": \"*Additional Information:*\n\`\`\`Total Size: ${total_size}G\nCurrent Used: ${used_space}G\nTarget 70% Used: ${target_used}G\nSpace to Free: ${space_to_free}G\`\`\`\"
}
}
]
}"
    # Send Slack notification
    if [ -n "$SLACK_WEBHOOK" ]; then
      curl -X POST -H 'Content-type: application/json' --data "$slack_message" "$SLACK_WEBHOOK"
    else
      echo "diskchk: APP_SLACK_WEBHOOK not found in /root/.slackrc; alert not sent" >&2
    fi
    screen -dmS diskchk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/diskchk'
  fi
fi
#if [ ! -f /root/.bigscoots/counters/swapchk ]
#then
#
# usedswap=$(free | awk '/^Swap/ { printf("%.2f", $3/$2 * 100.0) }' | cut -d . -f 1)
#
# if [ "$usedswap" -gt 95 ]
# then
# touch /root/.bigscoots/counters/swapchk
# bash /bigscoots/general/slack.sh "#node-alerts" ":warning: $(hostname) - ${serverip} - High Swap Usage: %${usedswap}"
# screen -dmS swapchk sh -c 'sleep 86400 ; rm -f /root/.bigscoots/counters/swapchk'
# fi
#fi
# Low-memory alert: when field 7 of the "Mem" row of `free -g` drops below
# 4 (GB), notify #node-alerts. The memchk flag file suppresses repeat alerts
# until a detached screen session removes it 300 seconds later.
# NOTE(review): field 7 is the "available" column on current procps-ng;
# confirm on the deployed version.
if [ ! -f /root/.bigscoots/counters/memchk ]; then
  availmem=$(free -g | awk '/Mem/ {print $7}')
  # Skip the check rather than erroring when free printed nothing usable.
  case "$availmem" in
    '' | *[!0-9]*) availmem= ;;
  esac
  if [ -n "$availmem" ] && [ "$availmem" -lt 4 ]; then
    touch /root/.bigscoots/counters/memchk
    bash /bigscoots/general/slack.sh "#node-alerts" ":warning: $(hostname) - ${serverip} - High Memory Usage: ${availmem}GB Free"
    screen -dmS memchk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/memchk'
  fi
fi
# Backup-cron alert: if no flag file is present and the invoking user's
# crontab has no line containing "vzbackup", raise the flag, notify
# #node-alerts, and let a detached screen session clear the flag after
# 86400 seconds.
if [ ! -f /root/.bigscoots/counters/vzpbackup ] && ! crontab -l | grep -q vzbackup; then
  touch /root/.bigscoots/counters/vzpbackup
  bash /bigscoots/general/slack.sh "#node-alerts" ":warning: $(hostname) - ${serverip} - The backup script is missing from cronjobs"
  screen -dmS vzpbackup sh -c 'sleep 86400 ; rm -f /root/.bigscoots/counters/vzpbackup'
fi
# Power-supply alert: query the "Power Supply" sensors through ipmitool and
# notify #node-alerts for every sensor whose trailing status text is not
# "Presence detected". Only evaluated when more than one sensor line is
# reported. The pduchk flag file suppresses repeat alerts until a detached
# screen session removes it 300 seconds later.
if [ ! -f /root/.bigscoots/counters/pduchk ]; then
  # Test for the binary itself rather than scanning the whole rpm database.
  if ! command -v ipmitool >/dev/null 2>&1; then
    yum -y install ipmitool
  fi

  # Query the BMC once. awk blanks fields 2-9 so each line is reduced to the
  # first word followed by everything from field 10 on (the status text).
  ps_report=$(ipmitool sdr type "Power Supply" | awk '{$2=$3=$4=$5=$6=$7=$8=$9=""; print $0}')

  if [[ "$(printf '%s\n' "$ps_report" | wc -l)" -gt 1 ]]; then
    printf '%s\n' "$ps_report" | while read -r ps status; do
      if [[ "$status" != 'Presence detected' ]]; then
        touch /root/.bigscoots/counters/pduchk
        # </dev/null: keep the child from consuming the loop's remaining input
        bash /bigscoots/general/slack.sh "#node-alerts" "Power Supply Status: \n :red_circle: $ps $status" </dev/null
        screen -dmS pduchk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/pduchk'
      fi
    done
  fi
fi
# Failing-drive alert: on hosts with no MegaRAID controller in lspci and no
# /usr/StorMan/arcconf, run `smartctl -a -d ata` against every /dev/sd[a-z].
# A non-zero smartctl exit status is treated as a bad drive: the output is
# saved under /root/.bigscoots/hardware/disk/ and #node-alerts is notified.
# The deaddrivechk flag file suppresses repeat alerts until a detached screen
# session removes it 300 seconds later.
if [ ! -f /root/.bigscoots/counters/deaddrivechk ] && ! lspci | grep -q MegaRAID && [ ! -f /usr/StorMan/arcconf ]; then
  if ! command -v smartctl >/dev/null 2>&1; then
    yum -y install smartmontools
  fi

  # Without the binary every disk would fail with "command not found" and be
  # reported as dead, so only scan when smartctl is really available.
  if command -v smartctl >/dev/null 2>&1; then
    disk_log_dir=/root/.bigscoots/hardware/disk
    for DISK in /dev/sd[a-z]; do
      # an unmatched glob stays literal; skip it
      [ -e "$DISK" ] || continue
      if ! smartctl_check=$(smartctl -a -d ata "$DISK"); then
        # Build the log path once so the file written and the path quoted in
        # the alert cannot differ.
        disk_log="$disk_log_dir/$(basename "$DISK").$(date +%s).log"
        mkdir -p "$disk_log_dir"
        printf '%s\n' "$smartctl_check" > "$disk_log"
        touch /root/.bigscoots/counters/deaddrivechk
        bash /bigscoots/general/slack.sh "#node-alerts" "Detected bad drive: $DISK in $(hostname). \nOutput available in: \`$disk_log\`"
        screen -dmS deaddrivechk sh -c 'sleep 300 ; rm -f /root/.bigscoots/counters/deaddrivechk'
      fi
    done
  fi
fi
fi
# Run the container session-leak pass, then delete any regular file in the
# counters directory whose mtime is at least 24 hours old (-mtime +0).
sessionleak_check
find /root/.bigscoots/counters/ -mtime +0 -type f -exec rm {} \;