#!/bin/bash
#
# ############   Use it at your own risk ################################## 
#
# Author : Vikram Khatri vikram.khatri@us.ibm.com
#
# Purpose: Fix the "Failed offline" problem, mainly caused by alerts that will not go away.
#
#          Change DB2INST to the instance name of your DB2 pureScale cluster.
#          This script assumes that passwordless ssh is set up for root
#          on all nodes in the pureScale cluster.

# Only root may run this script: abort with status 1 for any other
# effective user id.
(( EUID == 0 )) || {
   echo "This script should be run as root" 1>&2
   exit 1
}

# Name of the DB2 instance owner account; change it to match your cluster.
DB2INST=db2psc

# First column of the instance's db2nodes.cfg, sorted numerically in
# descending order. These values are handed to db2start later on.
NODES=$(su -l "$DB2INST" -c "cat ~/sqllib/db2nodes.cfg" | awk '{print $1}' | sort -n -r)
echo "Nodes = $NODES"

# Second column of every "mmgetstate -a" line that contains "active"
# (presumably the host names of the active cluster nodes -- verify against
# the mmgetstate output on your system).
MEMBERS=$(mmgetstate -a | grep active | awk '{print $2}')
echo "Members = $MEMBERS"

# The script must be started on a host whose name appears in MEMBERS.
found=false
for MEMBER in $MEMBERS; do
   [[ "$(hostname)" == "$MEMBER" ]] && found=true
done

if [[ "$found" != "true" ]]; then
   echo "Run this script from one of the member $MEMBERS"
   exit -1
fi

# Force-stop the DB2 instance as the instance owner, then pause for
# 10 seconds before querying RSCT.
echo "Stopping db2 instance"
su -l "$DB2INST" -c ". ./.bashrc;db2stop force"
sleep 10

# The first column of "lsrpdomain -x" is used as the peer domain name.
# No output at all means there is nothing to work with, so give up.
echo "Find out the RSCT domain name"
DOMAIN=$(lsrpdomain -x | awk '{print $1}')
if [[ -z "$DOMAIN" ]]; then
   echo "Domain seems to be down already. Exiting ...."
   exit -1
fi
# Restart the RSCT peer domain, remove stale .pgrp files while it is down,
# then start the DB2 nodes again and report whether the problem persists.
#
# Reads:  DOMAIN, DB2INST, MEMBERS, NODES (set earlier in this script).
# Exits:  with "exit -1" when the domain is not Online to begin with.

# The second column of "lsrpdomain -x" is taken as the domain state.
STATUS=$(lsrpdomain -x | awk '{print $2}')
if [ "$STATUS" == "Online" ] ; then
   echo "Stopping $DOMAIN forcefully and wait for 20 seconds"
   stoprpdomain -f "$DOMAIN"
   sleep 20

   # The cleanup only runs when the domain reports Offline after the wait;
   # otherwise it is skipped and the domain is simply started again below.
   STATUS2=$(lsrpdomain -x | awk '{print $2}')
   if [ "$STATUS2" == "Offline" ] ; then
      echo "Doing the cleanup"
      # Second column of the db2cluster listing line that mentions db2fs1;
      # used below as the directory that holds <instance>/sqllib_shared.
      DB2FS1=$(db2cluster -cfs -list -filesystem | grep db2fs1 | awk '{print $2}')
      if [ "$DB2FS1" != "" ] ; then
         # Variables are quoted; the glob stays outside the quotes so the
         # shell still expands it.
         echo "Deleting .pgrp from $DB2FS1/$DB2INST/sqllib_shared/ctrlha"
         rm -f "$DB2FS1/$DB2INST"/sqllib_shared/ctrlha/.*pgrp
         echo "Deleting .pgrp from $DB2FS1/$DB2INST/sqllib_shared/ctrlhamirror"
         rm -f "$DB2FS1/$DB2INST"/sqllib_shared/ctrlhamirror/.*pgrp
      fi
      echo "Deleting pgrp from all nodes"
      for MEMBER in $MEMBERS
      do
         echo "Deleting .pgrp from sqllib/ctrlha and sqllib/ctrlhamirror on $MEMBER"
         # "~" and the glob are expanded by the instance owner's shell on
         # the remote host, not locally.
         ssh "$MEMBER" "su -l $DB2INST -c \"rm -f ~/sqllib/ctrlha/.*pgrp\""
         ssh "$MEMBER" "su -l $DB2INST -c \"rm -f ~/sqllib/ctrlhamirror/.*pgrp\""
      done
   fi

   echo "Starting domain $DOMAIN"
   startrpdomain "$DOMAIN"

   # Poll until lsrpnode returns success, checking every 10 seconds.
   until lsrpnode > /dev/null 2>&1
   do
      echo "Status from lsrpnode is not yet zero. Please be patient. Wait 10 seconds"
      sleep 10
   done
   echo "Wait for 10 seconds"
   sleep 10

   # Start the nodes one at a time, in the order held in NODES.
   for NODE in $NODES
   do
      echo "Starting node $NODE"
      su -l "$DB2INST" -c ". ./.bashrc;db2start $NODE"
      echo "Wait for 10 seconds"
      # The original printed the message above but never paused.
      sleep 10
   done

   # Count lssam lines that still show a failed-offline member. grep -c
   # prints a bare number, unlike "wc -l", which may pad it with blanks on
   # some platforms and so break a string comparison with "0".
   CNT=$(lssam | grep "Failed offline" | grep -c "Control=MemberInProblemState")
   if [ "$CNT" -ne 0 ] ; then
      echo "Oops. This did not fix"
   fi
   lssam
   echo "Please wait for 1 minute and try lssam again"
else
   echo "Domain $DOMAIN is offline. Exiting .... "
   exit -1
fi
#############################################
