#!/bin/bash
#
# ############ Use it at your own risk ##################################
#
# Author : Vikram Khatri vikram.khatri@us.ibm.com
#
# Purpose: Fix failed offline problem mainly due to alerts that will not go away
#
# Change DB2INST to the instance name of your DB2 pureScale
# This script assumes that there is password less ssh for root
# for all nodes in pureScale cluster
#
# What the script does, in order:
#   1. Refuses to run unless invoked as root and from a host that
#      `mmgetstate -a` lists as active.
#   2. Stops the DB2 instance (db2stop force).
#   3. Force-stops the RSCT peer domain reported by `lsrpdomain -x`.
#   4. If the domain went Offline, removes the .*pgrp files from the shared
#      file system and from ~/sqllib/ctrlha{,mirror} on every active host.
#   5. Restarts the domain, waits for `lsrpnode` to succeed, then starts
#      every entry of db2nodes.cfg (highest id first).
#   6. Reports whether lssam still shows "Failed offline" members.
#
# Exit status: 1 when not root, 255 for every other fatal condition
# (255 is what bash reported for the historical `exit -1`).

# NOTE: `set -e` is intentionally NOT used. The recovery is best-effort and
# must keep going when an individual step (db2stop, stoprpdomain, rm ...)
# returns non-zero.
set -u

# Print a message on stderr and terminate the script.
# Arguments: $1 - exit status, remaining - message text
die() {
  local status=$1
  shift
  printf '%s\n' "$*" >&2
  exit "$status"
}

if [[ $EUID -ne 0 ]]; then
  die 1 "This script should be run as root"
fi

DB2INST=db2psc

# First column of db2nodes.cfg, sorted numerically in descending order.
# `~` is left unexpanded here on purpose: it is resolved by the shell that
# `su -l` starts for the instance owner.
NODES=$(su -l "$DB2INST" -c "cat ~/sqllib/db2nodes.cfg" | awk '{print $1}' | sort -n -r)
echo "Nodes = $NODES"

# Second column of every `mmgetstate -a` line that contains "active".
MEMBERS=$(mmgetstate -a | grep active | awk '{print $2}')
echo "Members = $MEMBERS"

# The script must be started from one of the active hosts.
this_host=$(hostname)
found=false
# shellcheck disable=SC2086 # MEMBERS is a whitespace-separated list; splitting is intended
for MEMBER in $MEMBERS; do
  if [[ "$this_host" == "$MEMBER" ]]; then
    found=true
    break
  fi
done

if [[ "$found" == "false" ]]; then
  die 255 "Run this script from one of the member $MEMBERS"
fi

echo "Stopping db2 instance"
su -l "$DB2INST" -c ". ./.bashrc;db2stop force"
sleep 10

echo "Find out the RSCT domain name"
DOMAIN=$(lsrpdomain -x | awk '{print $1}')
if [[ -z "$DOMAIN" ]]; then
  die 255 "Domain seems to be down already. Exiting ...."
fi

STATUS=$(lsrpdomain -x | awk '{print $2}')
if [[ "$STATUS" != "Online" ]]; then
  die 255 "Domain $DOMAIN is offline. Exiting .... "
fi

echo "Stopping $DOMAIN forcefully and wait for 20 seconds"
stoprpdomain -f "$DOMAIN"
sleep 20

# Cleanup only happens when the domain really went Offline; otherwise the
# script still falls through to the restart below, as it always did.
STATUS2=$(lsrpdomain -x | awk '{print $2}')
if [[ "$STATUS2" == "Offline" ]]; then
  echo "Doing the cleanup"
  DB2FS1=$(db2cluster -cfs -list -filesystem | grep db2fs1 | awk '{print $2}')
  if [[ -n "$DB2FS1" ]]; then
    # The variable part of each path is quoted and guarded with :? so an
    # empty value can never turn the glob into /sqllib_shared/...; the
    # .*pgrp glob itself must stay unquoted to expand.
    echo "Deleting .pgrp from $DB2FS1/$DB2INST/sqllib_shared/ctrlha"
    rm -f -- "${DB2FS1:?}/${DB2INST:?}"/sqllib_shared/ctrlha/.*pgrp
    echo "Deleting .pgrp from $DB2FS1/$DB2INST/sqllib_shared/ctrlhamirror"
    rm -f -- "${DB2FS1:?}/${DB2INST:?}"/sqllib_shared/ctrlhamirror/.*pgrp
  fi

  echo "Deleting pgrp from all nodes"
  # shellcheck disable=SC2086 # intentional word splitting of the host list
  for MEMBER in $MEMBERS; do
    echo "Deleting .pgrp from sqllib/ctrlha and sqllib/ctrlhamirror on $MEMBER"
    # `~` and the glob are expanded remotely by the instance owner's shell.
    ssh "$MEMBER" "su -l $DB2INST -c \"rm -f ~/sqllib/ctrlha/.*pgrp\""
    ssh "$MEMBER" "su -l $DB2INST -c \"rm -f ~/sqllib/ctrlhamirror/.*pgrp\""
  done
fi

echo "Starting domain $DOMAIN"
startrpdomain "$DOMAIN"

# Poll until lsrpnode returns success (no upper bound, same as before).
until lsrpnode > /dev/null 2>&1; do
  echo "Status from lsrpnode is not yet zero. Please be patient. Wait 10 seconds"
  sleep 10
done

echo "Wait for 10 seconds"
sleep 10

# shellcheck disable=SC2086 # intentional word splitting of the node list
for NODE in $NODES; do
  echo "Starting node $NODE"
  su -l "$DB2INST" -c ". ./.bashrc;db2start $NODE"
  echo "Wait for 10 seconds"
  # Fix: the message above used to be printed without any actual pause.
  sleep 10
done

# Count lssam lines that are still "Failed offline" with
# Control=MemberInProblemState.
CNT=$(lssam | grep "Failed offline" | grep -c "Control=MemberInProblemState")
if [[ "$CNT" != "0" ]]; then
  echo "Oops. This did not fix"
fi

lssam
echo "Please wait for 1 minute and try lssam again"

#############################################