*** Monitoring cluster node uptime. (node)****

*** /usr/lib/nagios/plugins/cluster-ssh.sh

#!/bin/sh
# CSHL IT Matt N 3/26 - Nagios cluster plugin: SSH reachability check driven
# by the BCS "alive" node list.
#
# Probes TCP port 22 on every host named in $ALIVE with check_tcp and reports
# a single aggregated result.
#
# Exit status:
#   0 - every listed node answered on port 22
#   2 - one or more nodes did not answer (their names are printed)
#   3 - the node list could not be read

ALIVE="/etc/beowulf/bcs/alive"
CHECK_TCP="/usr/lib/nagios/plugins/check_tcp"

# Without this guard an unreadable list yields an empty loop and a false
# "all nodes are answering" result.
if [ ! -r "$ALIVE" ]; then
  echo "SSH UNKNOWN: cannot read node list $ALIVE"
  exit 3
fi

NODEDOWNLIST=''
NODEDOWN=0

# Word-splitting of the file contents is intentional: one host name per word.
for NAME in $(cat "$ALIVE"); do
  #echo pinging  $NAME
  #ping -c1 -w1 $NAME 1>/dev/null 2>&1
  if ! "$CHECK_TCP" -H "$NAME" -p 22 >/dev/null 2>&1; then
    NODEDOWN=1
    NODEDOWNLIST="$NODEDOWNLIST $NAME"
  fi
done

# Numeric -eq instead of ==, which is not a POSIX test operator.
if [ "$NODEDOWN" -eq 1 ]; then
  echo "SSH CRITICAL: node(s) $NODEDOWNLIST not responding."
  exit 2
fi

echo "SSH OK: all nodes are answering"
exit 0


*** /etc/beowulf/bcs/alive - this text file needed to be generated for other commands anyway.

clusternode1
clusternode2
clusternode3
clusternode4
clusternode5
clusternode6
clusternode7
clusternode8
clusternode9
clusternode10
clusternode11
clusternode12
clusternode13
clusternode14
clusternode15
clusternode16
clusternode17
clusternode18
clusternode19
clusternode20
clusternode21
clusternode22
clusternode23
clusternode24
clusternode25
clusternode26
clusternode27
clusternode28
clusternode29
clusternode30
clusternode31
clusternode33
clusternode34
clusternode35
clusternode36
clusternode37
clusternode38
clusternode39
clusternode40
clusternode41


*** Monitoring Rsynced volumes with custom scripts. (node) ****


*** CSHL-safer-rsync.sh - run from cron every 15 minutes.

#!/bin/sh
# CSHL-safer-rsync.sh - replicate every share listed in a file to the
# secondary file server, unless a replication run is already in progress.
#
# Usage: CSHL-safer-rsync.sh rsync-share-file
#   rsync-share-file  text file with one source path per line
#
# Files written:
#   /var/log/CSHL-rsync.log   rsync output and "already running" notices
#   /var/log/CSHL-rsync.last  start timestamp (date +%D_%T) of the most recent
#                             pass over the share list

if [ "$1" = "--help" ]; then
  echo "CSHL-safer-rsync.sh rsync-share-file"
  exit 0
fi

# Refuse to continue without a readable share list; otherwise the loop's
# redirect fails and the "last run" stamp below would still be written.
if [ $# -lt 1 ] || [ ! -r "$1" ]; then
  echo "CSHL-safer-rsync.sh rsync-share-file" >&2
  exit 1
fi

SHAREFILE=$1
LOGFILE=/var/log/CSHL-rsync.log
LASTFILE=/var/log/CSHL-rsync.last

# Count rsync processes started by an earlier invocation of this script.
REPSTATE=$(ps aux | grep "rsync -v -b" | grep -v -c grep)
DATE=$(date +%D_%T)

if [ "$REPSTATE" -lt 1 ]; then
  while read -r SHARENAME; do
    # Skip blank lines so rsync is never invoked without a source path.
    [ -n "$SHARENAME" ] || continue
    rsync -v -b -a -e ssh "$SHARENAME" --timeout 900 --delete-during --exclude-from=/etc/rsync-excludes-CSHL root@bigfileserver:/data >> "$LOGFILE"
  done < "$SHAREFILE"
  echo "$DATE" > "$LASTFILE"
else
  # One or more matching rsync processes exist. The original only logged when
  # the count was greater than 1, so a single running rsync went unrecorded.
  echo "Rsync was already running at $DATE" >> "$LOGFILE"
fi


*** CSHL-linux_pair_size_check.sh - run from NRPE daemon

#!/bin/sh
# CSHL-linux_pair_size_check.sh - NRPE check comparing used space of each
# listed share on this host against the same path on the replica host.
#
# Usage: CSHL-linux_pair_size_check.sh secondary-host-name shares-to-check-file
#
# Exit status:
#   0 - no local share uses more GB than its replica
#   2 - a local share uses more GB than the replica (first offender printed)
#   3 - bad arguments or unreadable share list

#newhall@cshl.edu 7/12/2005
if [ "$1" = "--help" ]; then
  echo "CSHL-linux_pair_size_check.sh secondary-host-name shares-to-check-file"
  exit 0
fi

if [ $# -lt 2 ]; then
  echo "USAGE: CSHL-linux_pair_size_check.sh secondary-host-name shares-to-check-file"
  exit 3
fi

SECONDARYHOST=$1
SHARETOCHECKFILE=$2

# An unreadable list would otherwise produce an empty loop and a false OK.
if [ ! -r "$SHARETOCHECKFILE" ]; then
  echo "RSYNC Unknown: cannot read share list $SHARETOCHECKFILE"
  exit 3
fi

# Read `df -B G` output on stdin and print the summed "Used" column (GB).
# awk's numeric conversion ignores the trailing "G" unit suffix.
sum_used_gb() {
  grep -v "Filesystem" | awk '{ used_gbytes += $3 } END { printf "%d", used_gbytes }'
}

# Word-splitting of the file contents is intentional: one share path per word.
for MYSHARE in $(cat "$SHARETOCHECKFILE"); do
  LOCALUSED=$(df -B G "$MYSHARE" | sum_used_gb)
  REMOTEUSED=$(ssh "$SECONDARYHOST" df -B G "$MYSHARE" | sum_used_gb)

  if [ "$LOCALUSED" -gt "$REMOTEUSED" ]; then
    echo "RSYNC Critical: $MYSHARE is $LOCALUSED GB, which is larger than $REMOTEUSED GB on replica"
    exit 2
  fi
done

echo "RSYNC OK: local shares listed in $SHARETOCHECKFILE are copied on replica"
exit 0


*** CSHL-linux_pair_run_check.sh - run from nrpe.

#!/bin/sh
# CSHL-linux_pair_run_check.sh - NRPE check that the rsync job has stamped
# /var/log/CSHL-rsync.last with today's date once the grace hour has passed.
#
# Usage: CSHL-linux_pair_run_check.sh hours-to-wait-this-date
#   hours-to-wait-this-date  hour of day (0-23); no alert is raised until the
#                            current hour is greater than this value
#
# Exit status:
#   0 - still inside the grace period, or the stamp carries today's date
#   2 - grace period over and the stamp is missing or not from today
#   3 - bad arguments

#newhall@cshl.edu 7/12/2005
if [ "$1" = "--help" ]; then
  echo "CSHL-linux_pair_run_check.sh hours-to-wait-this-date"
  exit 0
fi

HOURSTOWAIT=$1

# A missing or non-numeric hour would make the comparison below error out and
# fall through to an OK result.
case "$HOURSTOWAIT" in
  '' | *[!0-9]*)
    echo "USAGE: CSHL-linux_pair_run_check.sh hours-to-wait-this-date"
    exit 3
    ;;
esac

LASTFILE=/var/log/CSHL-rsync.last

# The stamp has the form MM/DD/YY_HH:MM:SS (date +%D_%T). The whole date part
# is compared rather than only the day of month, so a stamp left over from the
# same day of an earlier month does not count as today.
LASTDATE=$(tail -n 1 "$LASTFILE" 2>/dev/null | cut -d_ -f1)
CURRENTDATE=$(date +%D)
# %k pads single-digit hours with a space; strip it before the numeric test.
CURRENTHOUR=$(date +%k | tr -d ' ')

if [ "$CURRENTHOUR" -gt "$HOURSTOWAIT" ]; then

  # String comparison: an empty LASTDATE (missing or empty stamp file) is a
  # mismatch and therefore critical, instead of a test error treated as OK.
  if [ "$LASTDATE" != "$CURRENTDATE" ]; then
    echo "RSYNC Critical: rsync has not completed one run this date."
    exit 2
  fi

  echo "RSYNC OK: Rsync has run successfully at least once today."
  exit 0

else

  echo "RSYNC OK: Rsync has some time to complete the first run today."
  exit 0

fi


*** /etc/rsync-shares-CSHL1 - share listing

/data/share2
/data/share3
/data/share1
/data/share5
/data/share4
/data/share6
/data/share7
/data/share8


*** Watching clusters of redundant cisco links through snmp polling. (server)**** 


*** check-snmp-cisco-port-pool.sh - snmp redundant interface check
*** Called like check-snmp-cisco-port-pool.sh big-cisco-box /etc/nagios/woodbury-link-interfaces 1 4


#!/bin/sh
# CSHL IT Matt N 3/26 - Nagios plugin: SNMP check of a pool of redundant
# switch interfaces.
#
# Usage: check-snmp-cisco-port-pool.sh switch-address MIB-ID-file-path \
#            warning-error-count critical-error-count
#   switch-address        host polled over SNMP v2c
#   MIB-ID-file-path      file listing the interface indexes to poll
#   warning-error-count   number of failed interfaces that raises WARNING
#   critical-error-count  number of failed interfaces that raises CRITICAL
#
# An interface counts as failed when either its administrative state or its
# link state check does not pass.
#
# Exit status: 0 OK, 1 WARNING, 2 CRITICAL, 3 usage error / unreadable file.

if [ $# -lt 4 ]; then
  echo "USAGE: check-snmp-cisco-port-pool.sh switch-address MIB-ID-file-path warning-error-count critical-error-count"
  exit 3
fi

SWITCHADDR=$1
MIBIDFILE=$2
MAXWARN=$3
MAXERROR=$4

# An unreadable index file would otherwise look like "no interfaces failed".
if [ ! -r "$MIBIDFILE" ]; then
  echo "SWITCH INTERFACE UNKNOWN: cannot read interface list $MIBIDFILE"
  exit 3
fi

INTDOWNLIST=''
INTUPLIST=''
INTDOWN=0

# Poll one per-interface status object; succeeds when check_snmp exits 0.
#   $1 - OID prefix, $2 - interface index
snmp_state_ok() {
  /usr/lib/nagios/plugins/check_snmp -t 500 -P 2c -p 161 -H "$SWITCHADDR" -C ITOVWREAD -o "$1.$2" -c 1 >/dev/null 2>&1
}

# Word-splitting of the file contents is intentional: one index per word.
for NAME in $(cat "$MIBIDFILE"); do
  # Administrative state first (.2.2.1.7); the link state (.2.2.1.8) is only
  # polled when the administrative state check passes.
  if snmp_state_ok 1.3.6.1.2.1.2.2.1.7 "$NAME" &&
    snmp_state_ok 1.3.6.1.2.1.2.2.1.8 "$NAME"; then
    INTUPLIST="$INTUPLIST$NAME,"
  else
    INTDOWN=$((INTDOWN + 1))
    INTDOWNLIST="$INTDOWNLIST$NAME,"
  fi
done

if [ "$INTDOWN" -ge "$MAXERROR" ]; then
  echo "SWITCH INTERFACE CRITICAL: interfaces(s)$INTDOWNLIST failed."
  exit 2
fi

# The closing fi of this block was missing in the original, which made the
# script unparsable and the OK path below unreachable.
if [ "$INTDOWN" -ge "$MAXWARN" ]; then
  echo "SWITCH INTERFACE WARNING: interfaces(s)$INTDOWNLIST failed."
  exit 1
fi

echo "NET LINK(S) OK: $INTUPLIST"
exit 0


*** /etc/nagios/greenlawn-link-interfaces - check-snmp-cisco-port-pool.sh is called with this for a host check

215
216


*** /etc/nagios/woodbury-link-interfaces - check-snmp-cisco-port-pool.sh is called with this for a host check 

3
4
213
214