*** Monitoring cluster node uptime. (node)****
*** /usr/lib/nagios/plugins/cluster-ssh.sh
#!/bin/sh
#CSHL IT Matt N 3/26 SSH BCS palive list based nagios cluster plugin.
#
# Probes TCP port 22 on every node named in the alive list and reports a
# single Nagios result for the whole cluster.
#
# Exit status (Nagios plugin convention):
#   0  every listed node answered on port 22
#   2  at least one listed node did not answer
#   3  the alive list is missing, unreadable or empty, so nothing was checked

ALIVE="/etc/beowulf/bcs/alive"

# Without a readable list the loop below would run zero times and the
# plugin would claim that all nodes are fine; report UNKNOWN instead.
if [ ! -r "$ALIVE" ]
then
    echo "SSH UNKNOWN: cannot read node list $ALIVE"
    exit 3
fi

NODELIST=`cat "$ALIVE"`
NODEDOWNLIST=''
NODEDOWN=0
NODECOUNT=0

# $NODELIST is intentionally unquoted: node names are separated by any
# whitespace (spaces or newlines) in the alive file.
for NAME in $NODELIST
do
    NODECOUNT=$((NODECOUNT + 1))
    #echo pinging $NAME
    #ping -c1 -w1 $NAME 1>/dev/null 2>&1
    if ! /usr/lib/nagios/plugins/check_tcp -H "$NAME" -p 22 1>/dev/null 2>&1
    then
        NODEDOWN=1
        NODEDOWNLIST="$NODEDOWNLIST $NAME"
    fi
done

if [ "$NODECOUNT" -eq 0 ]
then
    echo "SSH UNKNOWN: node list $ALIVE is empty"
    exit 3
fi

# -eq instead of ==: the latter is not defined for test(1) in POSIX sh.
if [ "$NODEDOWN" -eq 1 ]
then
    echo "SSH CRITICAL: node(s) $NODEDOWNLIST not responding."
    exit 2
fi

echo "SSH OK: all nodes are answering"
exit 0

*** /etc/beowulf/bcs/alive - this text file needed to be generated for other commands anyway.
clusternode1 clusternode2 clusternode3 clusternode4 clusternode5 clusternode6 clusternode7 clusternode8 clusternode9 clusternode10 clusternode11 clusternode12 clusternode13 clusternode14 clusternode15 clusternode16 clusternode17 clusternode18 clusternode19 clusternode20 clusternode21 clusternode22 clusternode23 clusternode24 clusternode25 clusternode26 clusternode27 clusternode28 clusternode29 clusternode30 clusternode31 clusternode33 clusternode34 clusternode35 clusternode36 clusternode37 clusternode38 clusternode39 clusternode40 clusternode41

*** Monitoring Rsynced volumes with custom scripts. (node) ****
*** CSHL-safer-rsync.sh - run from cron every 15 minutes.
#!/bin/sh
# CSHL-safer-rsync.sh rsync-share-file
#
# Pushes every source listed in rsync-share-file to root@bigfileserver:/data,
# one rsync invocation per line of the file, unless an rsync started by an
# earlier run is still active. After a pass over the whole file the start
# time of this run is written to /var/log/CSHL-rsync.last.

if [ "$1" = "--help" ]
then
    echo "CSHL-safer-rsync.sh rsync-share-file"
    exit 0
fi

SHAREFILE=$1
LOGFILE=/var/log/CSHL-rsync.log
STAMPFILE=/var/log/CSHL-rsync.last
DATE=`date +%D_%T`

# Refuse to continue without a readable share list; otherwise the loop is
# skipped and the stamp file would still claim a completed run.
if [ -z "$SHAREFILE" ] || [ ! -r "$SHAREFILE" ]
then
    echo "$DATE cannot read share list file '$SHAREFILE'" >> "$LOGFILE"
    echo "CSHL-safer-rsync.sh rsync-share-file" >&2
    exit 1
fi

# Count rsync processes started by a previous run; the [r] bracket keeps
# grep from matching its own command line.
REPSTATE=`ps aux | grep -c "[r]sync -v -b"`

if [ "$REPSTATE" -lt 1 ]
then
    # "|| [ -n ... ]" also processes a last line that lacks a newline.
    while read -r SHARENAME || [ -n "$SHARENAME" ]
    do
        # A blank line would turn the command into a destination-only rsync.
        [ -n "$SHARENAME" ] || continue
        # $SHARENAME is left unquoted so a line may still name several
        # source paths, exactly as before. stdin is detached so nothing
        # inside rsync can consume the rest of the share list.
        rsync -v -b -a -e ssh $SHARENAME --timeout 900 --delete-during --exclude-from=/etc/rsync-excludes-CSHL root@bigfileserver:/data >> "$LOGFILE" < /dev/null
    done < "$SHAREFILE"
    echo "$DATE" > "$STAMPFILE"
else
    # Any running rsync (one or more) means the previous run is not done;
    # the old "greater than 1" test left a single running rsync unlogged.
    echo "Rsync was already running at $DATE" >> "$LOGFILE"
fi

*** CSHL-linux_pair_size_check.sh - run from NRPE daemon
#!/bin/sh
#newhall@cshl.edu 7/12/2005
#
# CSHL-linux_pair_size_check.sh secondary-host-name shares-to-check-file
#
# For every share named in shares-to-check-file, compares the space used
# locally with the space used for the same path on the secondary host
# (queried with df over ssh).
#
# Exit status (Nagios plugin convention):
#   0  no local share uses more GB than its replica
#   2  a local share uses more GB than its replica
#   3  bad arguments, unreadable share list, or usage could not be read

if [ "$1" = "--help" ]
then
    echo "CSHL-linux_pair_size_check.sh secondary-host-name shares-to-check-file"
    exit 0
fi

if [ $# -lt 2 ]
then
    echo "RSYNC Unknown: usage: CSHL-linux_pair_size_check.sh secondary-host-name shares-to-check-file"
    exit 3
fi

SECONDARYHOST=$1
SHARETOCHECKFILE=$2

if [ ! -r "$SHARETOCHECKFILE" ]
then
    echo "RSYNC Unknown: cannot read share list $SHARETOCHECKFILE"
    exit 3
fi

SHARELIST=`cat "$SHARETOCHECKFILE"`
if [ -z "$SHARELIST" ]
then
    echo "RSYNC Unknown: share list $SHARETOCHECKFILE is empty"
    exit 3
fi

# Sums the third (Used) column of df output, skipping the header line.
SUMUSED='!/Filesystem/ { used_gbytes += $3 } END { printf "%d", used_gbytes }'

# $SHARELIST is intentionally unquoted: shares are whitespace separated.
for MYSHARE in $SHARELIST
do
    # -P keeps each filesystem on one line so the Used column stays at $3
    # even when the device name is long.
    LOCALDF=`df -P -B G "$MYSHARE"`
    if [ $? -ne 0 ]
    then
        echo "RSYNC Unknown: cannot read local usage of $MYSHARE"
        exit 3
    fi
    LOCALUSED=`printf '%s\n' "$LOCALDF" | awk "$SUMUSED"`

    REMOTEDF=`ssh "$SECONDARYHOST" df -P -B G "$MYSHARE" < /dev/null`
    # ssh itself exits with 255 when the connection fails; a failing remote
    # df still falls through and is reported as 0 GB on the replica.
    if [ $? -eq 255 ]
    then
        echo "RSYNC Unknown: cannot reach $SECONDARYHOST over ssh"
        exit 3
    fi
    REMOTEUSED=`printf '%s\n' "$REMOTEDF" | awk "$SUMUSED"`

    if [ "$LOCALUSED" -gt "$REMOTEUSED" ]
    then
        echo "RSYNC Critical: $MYSHARE is $LOCALUSED GB, which is larger than $REMOTEUSED GB on replica"
        exit 2
    fi
done

echo "RSYNC OK: local shares listed in $SHARETOCHECKFILE are copied on replica"
exit 0

*** CSHL-linux_pair_run_check.sh - run from nrpe.
#!/bin/sh
#newhall@cshl.edu 7/12/2005
#
# CSHL-linux_pair_run_check.sh hours-to-wait-this-date
#
# Checks that CSHL-safer-rsync.sh has stamped /var/log/CSHL-rsync.last
# today. Before the given hour of the day has passed the check always
# reports OK, to give the first run of the day time to finish.
#
# Exit status (Nagios plugin convention):
#   0  a run was stamped today, or it is still too early to expect one
#   2  the hour has passed and no run was stamped today
#   3  the hour argument is missing or not a number

if [ "$1" = "--help" ]
then
    echo "CSHL-linux_pair_run_check.sh hours-to-wait-this-date"
    exit 0
fi

HOURSTOWAIT=$1
case "$HOURSTOWAIT" in
    ''|*[!0-9]*)
        echo "RSYNC Unknown: hours-to-wait-this-date must be a whole number of hours"
        exit 3
        ;;
esac

STAMPFILE=/var/log/CSHL-rsync.last
# The stamp is written as `date +%D_%T`, so bytes 1-8 hold MM/DD/YY.
# Comparing the full date, not only the day of the month, keeps a stamp
# from the same day of an earlier month from passing. A missing or empty
# stamp file yields an empty string, which never equals today's date.
LASTDATE=`tail -n 1 "$STAMPFILE" 2>/dev/null | cut -b 1-8`
CURRENTDATE=`date +%D`
CURRENTHOUR=`date +%H`

if [ "$CURRENTHOUR" -gt "$HOURSTOWAIT" ]
then
    if [ "$LASTDATE" != "$CURRENTDATE" ]
    then
        echo "RSYNC Critical: rsync has not completed one run this date."
        exit 2
    fi
    echo "RSYNC OK: Rsync has run successfully at least once today."
    exit 0
else
    echo "RSYNC OK: Rsync has some time to complete the first run today."
    exit 0
fi

*** /etc/rsync-shares-CSHL1 - share listing
/data/share2 /data/share3 /data/share1 /data/share5 /data/share4 /data/share6 /data/share7 /data/share8

*** Watching clusters of redundant cisco links through snmp polling. (server)****
*** check-snmp-cisco-port-pool.sh - snmp redundant interface check
*** Called like check-snmp-cisco-port-pool.sh big-cisco-box /etc/nagios/woodbury-link-interfaces 1 4
#!/bin/sh
#CSHL IT Matt N 3/26 SNMP based nagios plugin for a pool of redundant switch interfaces.
#
# check-snmp-cisco-port-pool.sh switch-address MIB-ID-file-path warning-error-count critical-error-count
#
# Polls every interface index listed in the MIB-ID file on the given switch
# and counts how many are down, either administratively or at link level.
#
# Exit status (Nagios plugin convention):
#   0  fewer than warning-error-count interfaces are down
#   1  at least warning-error-count interfaces are down
#   2  at least critical-error-count interfaces are down
#   3  bad arguments, or the MIB-ID file is unreadable or empty

if test $# -lt 4
then
    echo "USAGE: check-snmp-cisco-port-pool.sh switch-address MIB-ID-file-path warning-error-count critical-error-count"
    exit 3
fi

SWITCHADDR=$1
MIBIDFILE=$2
MAXWARN=$3
MAXERROR=$4

# Non-numeric thresholds would make the comparisons below fail and fall
# through to the OK result.
case "$MAXWARN$MAXERROR" in
    *[!0-9]*)
        echo "USAGE: check-snmp-cisco-port-pool.sh switch-address MIB-ID-file-path warning-error-count critical-error-count"
        exit 3
        ;;
esac

if [ ! -r "$MIBIDFILE" ]
then
    echo "SWITCH INTERFACE UNKNOWN: cannot read MIB-ID file $MIBIDFILE"
    exit 3
fi

MIBIDLIST=`cat "$MIBIDFILE"`
if [ -z "$MIBIDLIST" ]
then
    echo "SWITCH INTERFACE UNKNOWN: MIB-ID file $MIBIDFILE is empty"
    exit 3
fi

INTDOWNLIST=''
INTUPLIST=''
INTDOWN=0

# $MIBIDLIST is intentionally unquoted: indexes are whitespace separated.
for NAME in $MIBIDLIST
do
    # An interface counts as up only when both probes succeed:
    #   ...2.2.1.7.<index>  administrative state (IF-MIB ifAdminStatus)
    #   ...2.2.1.8.<index>  link state           (IF-MIB ifOperStatus)
    # The link state is only queried when the administrative check passed.
    if /usr/lib/nagios/plugins/check_snmp -t 500 -P 2c -p 161 -H "$SWITCHADDR" -C ITOVWREAD -o "1.3.6.1.2.1.2.2.1.7.$NAME" -c 1 1>/dev/null 2>&1 &&
       /usr/lib/nagios/plugins/check_snmp -t 500 -P 2c -p 161 -H "$SWITCHADDR" -C ITOVWREAD -o "1.3.6.1.2.1.2.2.1.8.$NAME" -c 1 1>/dev/null 2>&1
    then
        INTUPLIST="$INTUPLIST$NAME,"
    else
        INTDOWN=$((INTDOWN + 1))
        INTDOWNLIST="$INTDOWNLIST$NAME,"
    fi
done

if [ "$INTDOWN" -ge "$MAXERROR" ]
then
    echo "SWITCH INTERFACE CRITICAL: interfaces(s)$INTDOWNLIST failed."
    exit 2
fi

if [ "$INTDOWN" -ge "$MAXWARN" ]
then
    echo "SWITCH INTERFACE WARNING: interfaces(s)$INTDOWNLIST failed."
    exit 1
fi

# Reached only when fewer than warning-error-count interfaces are down; the
# closing "fi" above was missing, which made this result unreachable.
echo "NET LINK(S) OK: $INTUPLIST"
exit 0

*** /etc/nagios/greenlawn-link-interfaces - check-snmp-cisco-port-pool.sh is called with this for a host check
215 216

*** /etc/nagios/woodbury-link-interfaces - check-snmp-cisco-port-pool.sh is called with this for a host check
3 4 213 214