#!/bin/sh
#
#	$Id: ResourceManager.in,v 1.12.2.6 2005/08/31 16:05:07 alan Exp $
#
#	New haresources format:
#
#	machine resource resource resource resource
#
#	Where a resource can be an IP address or a scriptname, or a scriptname
#	and single argument.
#
#	When it's a scriptname with an argument, the argument is connected to
#	the scriptname with "::".  Another way of expressing an IP address is
#	via IPaddr::ip-address, since the script name IPaddr is the one assumed
#	for resources which are spelled like an IP address.
#
#	As an illustration, the following two lines are identical in effect:
#
#	node1	123.45.67.89 httpd
#	node1	IPaddr::123.45.67.89 httpd
#
#       One can also pass multiple arguments to a script by separating each
#       argument with a double colon:
#
#       node1  10.0.0.170 Filesystem::/dev/sda1::/data1::ext2
#
#	Note:  The first resource on each line must be unique within the
#	haresources file.  Maybe I should add a resource type called Unique
#	which can be put on the front of a line to meet this requirement.
#
#

# set -x
#
#	The three ":" commands below are no-ops.  They only become visible
#	when the "set -x" above is enabled: the shell then echoes them, with
#	their arguments, into the trace as a start-of-run banner.
#
: 
: 
: ==================== Starting ResourceManager $* ==========================
 
#	Clear the locale override variables and export that cleared state to
#	child processes (see the note on each line for the reason).
unset LC_ALL; export LC_ALL # Make ifconfig work in France for David Jules :-)
unset LANGUAGE; export LANGUAGE # Make ifconfig work in France for Fabrice :-)
 
#	prefix and exec_prefix are not referenced anywhere else in this file.
prefix=/usr
exec_prefix=/usr
#	HA_DIR is where ipres looks for the haresources file; HA_FUNCS is the
#	helper file sourced below.  Both are exported to child processes.
HA_DIR=/etc/ha.d
HA_FUNCS=$HA_DIR/shellfuncs
export HA_DIR HA_FUNCS
#	Second (fallback) directory searched by scriptpath for resource scripts.
INITDIR=/etc/init.d
#	Printed by the command dispatcher at the bottom of the file when the
#	first argument is not a known action.  It names only some of the
#	actions that the dispatcher accepts.
USAGE="usage: $0 listkeys machine | takegroup key | givegroup key|status resource";
#	NOTE(review): ha_log, ha_debug, HA_DEBUGLOG, HA_RESOURCEDIR and
#	HA_CURHOST are used in this file but never defined in it; presumably
#	they come from the sourced file or from the environment -- confirm.
. $HA_FUNCS

#	Default the stop-retry limit to 10 when the variable is not set at all
#	("=" rather than ":=", so a set-but-empty value is left alone).
#	giveupresourcegroup retries without limit when the value is <= 0.
: ${HA_STOPRETRYMAX=10}

#
#	isip name
#
#	Succeed (status 0) when "name" is spelled like an IP address, fail
#	(status 1) otherwise.  The check is a shell glob, not a validation:
#	it only requires four dot-separated parts that each begin with a
#	digit, so anything may follow each leading digit.
#
isip() {
  case $1 in
    [0-9]*.[0-9]*.[0-9]*.[0-9]*)
      return 0
      ;;
  esac
  return 1
}

#
# reverseargs arg...
#
# Print the arguments in reverse order on a single line, separated by
# single blanks.
#
# Reverseargs still doesn't deal with arguments containing white space
# correctly: the result is one blank-joined string, so the caller cannot
# tell where one argument ends and the next one begins.
# To fix this I think you'd have to change it to put out the arguments
# one per line, and then have the caller use them a line at a time using
# "read" or something.  This could be done either using a tmp file or
# by using eval on the argument numbers starting from $# and going
# down.  But, no existing resource script deals with these either, so
# I'm not too worried yet...
#
reverseargs() {
  reversed=""
  for arg in "$@"
  do
    case $reversed in
      "")	reversed=$arg;;
      *)	reversed="$arg $reversed";;
    esac
  done
  # Print with printf and a quoted operand: an unquoted "echo" would
  # glob-expand the list against the current directory and would treat a
  # leading "-n" or "-e" argument as an option instead of printing it.
  printf '%s\n' "$reversed"
}

#
#	ipres [[grepflags] pattern]
#
#	Print $HA_DIR/haresources in normalized form, keeping only the lines
#	selected by "grep -E [grepflags] pattern" (every line when no pattern
#	is given).  In the normalized form:
#
#	  - comments, empty lines and white-space-only lines are removed,
#	  - a line ended by '\' is joined with the line that follows it,
#	  - every run of blanks/tabs is translated into a single blank,
#	  - the first word of the line (the host name) is in lower case,
#	  - each line ends with a blank, making parsing it easier for dumb
#	    shell scripts (a whole word can be matched as " word ").
#
#	The exit status is that of grep: non-zero when no line is selected.
#
ipres() {
  grepflags=""
  pat='.'
  case $# in
    1)	pat=$1;;
    2)	grepflags=$1; pat=$2;;
  esac

  # "blank or tab" bracket expression.  The tab is generated with printf
  # so that no invisible literal tab has to survive in this source file.
  ipres_tab=`printf '\t'`
  ipres_ws="[$ipres_tab ]"

  #
  #	Explanation of the sed -e expressions below:
  #
  #	1) Strip out comments
  #	2) Repeatedly join lines together when they're ended by '\'
  #		(":a" is a label.  "ta" is a conditional branch to ":a")
  #	3) Append a blank to the end of the line
  #	4) Compress multiple blanks/tabs into a single blank
  #	5) Remove lines that only contain whitespace or are empty
  #	6) Strip off a leading space (if any)
  #
  #	awk then lower-cases the host name; assigning to $1 rebuilds the
  #	record and the empty second print operand restores the trailing
  #	blank.
  #
  #	"grep -E" replaces the obsolescent egrep, which current GNU grep
  #	releases complain about on stderr.  $grepflags is left unquoted on
  #	purpose so that an empty value adds no argument at all.
  #
  sed \
    -e 's%#.*%%' \
    -e :a -e '/\\$/N; s/\\\n//; ta' \
    -e 's%$% %' \
    -e "s%${ipres_ws}${ipres_ws}*% %g" \
    -e "/^${ipres_ws}*\$/ d" \
    -e 's%^ %%' \
    "$HA_DIR/haresources" |
  awk '{ $1 = tolower($1); print $0, "" }' |
  grep -E $grepflags -- "$pat"
}

#
#	ipresline key
#
#	Print the normalized haresources line(s), as produced by ipres, that
#	contain "key" surrounded by blanks.  The key is handed to ipres as
#	part of its search pattern, so it is interpreted as a regular
#	expression rather than as a literal string.
#
ipresline() {
  ipres " ${1} "
}

#
#	KeyResources host-pattern
#
#	Print the second blank-separated field (the first resource, i.e. the
#	group key) of every haresources line whose leading host name matches
#	host-pattern, ignoring case.  The pattern is embedded in a regular
#	expression, which is how the dispatcher's '.*' selects every host.
#
KeyResources() {
  ipres -i "^${1} " |
    cut -d ' ' -f 2
}

#
#	OurGroupKeys
#
#	For every haresources line whose host name is $HA_CURHOST (compared
#	ignoring case), print everything after the host name: fields 2 and
#	onward, i.e. all resources of the group and not only its key.
#
OurGroupKeys() {
  ipres -i "^${HA_CURHOST} " |
    cut -d ' ' -f 2-
}

#
#	OtherGroupKeys
#
#	The complement of OurGroupKeys: for every haresources line whose host
#	name is NOT $HA_CURHOST (compared ignoring case), print everything
#	after the host name (fields 2 and onward).
#
OtherGroupKeys() {
  ipres -iv "^${HA_CURHOST} " |
    cut -d ' ' -f 2-
}


#
#	canonname resource
#
#	Print the canonical spelling of a resource: a resource spelled like an
#	IP address (see isip) becomes "IPaddr::<resource>"; anything else is
#	printed unchanged.
#
#	The argument is quoted everywhere and printed with printf so that it
#	is reproduced verbatim instead of being word-split and glob-expanded
#	against the current directory.
#
canonname() {
  if
    isip "$1"
  then
    printf '%s\n' "IPaddr::$1"
  else
    printf '%s\n' "$1"
  fi
}

#
#	resource2script resource
#
#	Print the name of the script that handles a resource: the canonical
#	resource name (see canonname) with everything from the first "::" on
#	removed.  A resource without "::" is its own script name, and a bare
#	IP address maps to "IPaddr".
#
#	The script name is cut out of the canonical name rather than out of
#	the raw argument; cutting the raw argument returned the address itself
#	for a bare IP address instead of the script that serves it.
#
resource2script() {
  canonname "$1" | sed 's%::.*%%'
}

#
#	resource2arg resource
#
#	Print the list of arguments that follow the script name in a resource,
#	separated by blanks.  Multiple arguments are separated by "::"
#	delimiters in the resource.  Nothing is printed for a resource that
#	has no arguments.
#
#	The arguments are taken from the canonical name (see canonname), so a
#	bare IP address yields the address itself as the argument of IPaddr,
#	and an address followed by "::" arguments keeps the address as the
#	first argument.
#
resource2arg() {
  r2a_canon=`canonname "$1"`
  case $r2a_canon in
    *::*)
      # First expression drops "<script>::", the second turns every
      # remaining "::" delimiter into a blank.
      printf '%s\n' "$r2a_canon" | sed -e 's%[^:]*::%%' -e 's%::% %g'
      ;;
  esac
}

#
#	scriptpath resource
#
#	Print the full path of the script that handles a resource and return
#	0.  The script is the first regular, executable file with the
#	resource's script name found in $HA_RESOURCEDIR, then in $INITDIR.
#	When there is none, an error is logged with ha_log, nothing is
#	printed and the status is 1.
#
scriptpath() {
  script=`canonname "$1"`
  script=`resource2script "$script"`
  # The directory list is left unquoted on purpose: an unset or empty
  # directory variable is then skipped instead of being searched as "/".
  for dir in $HA_RESOURCEDIR $INITDIR
  do
    # Quoted operands: a glob character in the script name must not make
    # the test (and the printed path) match some other file.  Two tests
    # joined by && replace the obsolescent "-a" operator.
    if
      [ -f "$dir/$script" ] && [ -x "$dir/$script" ]
    then
      printf '%s\n' "$dir/$script"
      return 0
    fi
  done

  ha_log "ERROR: Cannot locate resource script $script"
  return 1
}
#
#	Since we're patterned after the LSB's init scripts, here are
#	the exit codes we should be returning for status...
#
#	0	program is running
#	1	program is dead and /var/run pid file exists
#	2	program is dead and /var/lock lock file exists
#	3	program is stopped
#	4-100	reserved for future LSB use
#	100-149	reserved for distribution use
#	150-199	reserved for application use
#	200-254	reserved
#

#
#	we_own_resource resource
#
#	Run the resource's script with its arguments followed by "status" and
#	decide from the text it prints whether we hold the resource:
#
#	  returns 0	the output mentions "running" (but not "not ...
#			running") or contains "OK"
#	  returns 3	anything else, including a missing resource script
#
#	Only the printed text is examined; the exit status of the status
#	command itself is ignored.
#
we_own_resource() {
  arg=`resource2arg "$1"`
  spath=`scriptpath "$1"`

  # Without a script there is nothing to ask, so the resource is not ours.
  # Going on with an empty $spath would run the resource's first argument
  # as if it were the command.
  if
    [ -z "$spath" ]
  then
    return 3
  fi

  # $arg is deliberately unquoted: it is a blank-separated argument list.
  case `"$spath" $arg status` in
    *[Nn][Oo][Tt]\ *[Rr]unning*)	return 3;;
    *[Rr]unning*|*OK*)			return 0;;
    *)					return 3;;
  esac
}


#
#	doscript resource action
#
#	Run the resource's script as "<script> <resource arguments> <action>"
#	with its stderr appended to $HA_DEBUGLOG, logging the call before and
#	after.  Returns the script's exit status, or 1 when no regular,
#	executable script could be located for the resource.
#
doscript() {
  script=`resource2script $1`
  arg=`resource2arg $1`
  spath=`scriptpath $script`

  # Bail out first when there is nothing we can execute.
  if
    [ -f "$spath" -a -x "$spath" ]
  then
    : script located
  else
    ha_log "ERROR: Cannot locate resource script $script"
    return 1
  fi

  ha_log "info: Running $spath $arg $2"
  ha_debug "debug: Starting $spath $arg $2"
  # $arg stays unquoted: it is a blank-separated list of arguments.
  $spath $arg "$2" 2>>$HA_DEBUGLOG
  RC=$?
  ha_debug "debug: $spath $arg $2 done. RC=$RC"
  if
    [ $RC -ne 0 ]
  then
    ha_log "ERROR: Return code $RC from $spath"
  fi
  return $RC
}


#
#	giveupresourcegroup nodename resource1 resource2 ...
#
#	Stop the resources of a group in reverse order.  A failing stop is
#	retried once a second, at most $HA_STOPRETRYMAX times (without limit
#	when that value is <= 0).  When a stop still fails and the resource
#	reports that we hold it, the failure is recorded in the return code
#	and RecoverFromFailedStop is called.  When the resource reports that
#	we do not hold it, the failure is only logged.
#
#	Returns 0, or the status of the last stop that failed on a resource
#	we still hold.
#
giveupresourcegroup() {
  ha_log "info: Releasing resource group: $*"
  shift			# drop the node name, keep the resources
  rc=0
  for j in `reverseargs "$@"`
  do
    retries=0
    while :
    do
      doscript $j stop >>$HA_DEBUGLOG 2>&1
      rc1=$?
      # Leave the retry loop once the retry budget is used up or the
      # stop has succeeded.
      [ $HA_STOPRETRYMAX -le 0 -o  $retries -lt $HA_STOPRETRYMAX ] &&
        [ $rc1 -ne 0 ] || break
      sleep 1
      retries=`expr 1 + "$retries"`
      ha_log "info: Retrying failed stop operation [$j]"
    done

    # Stopped cleanly: go on with the next resource.
    [ $rc1 -ne 0 ] || continue

    if
      we_own_resource $j
    then
      rc=$rc1
      RecoverFromFailedStop	# Bye Bye!
    else
      # Red Hat (and probably others) Kludge!
      ha_log "ERROR: Resource script for $j probably not LSB-compliant."
      ha_log "WARN: it ($j) MUST succeed on a stop when already stopped"
      ha_log "WARN: Machine reboot narrowly avoided!"
    fi
  done
  return $rc
}

#
#	acquireresourcegroup nodename resource1 resource2 ...
#
#	Start the resources of a group from left to right, skipping those we
#	already hold.  At the first start that fails, the whole group is
#	released again with giveupresourcegroup and no further resource is
#	started.
#
#	Returns 0 when every resource is held afterwards, otherwise the exit
#	status of the start that failed.
#
acquireresourcegroup() {
  ha_log "info: Acquiring resource group: $*"
  node="$1"
  shift
  # Shell variables are global: giveupresourcegroup resets "rc" to 0, so
  # the failure status is kept in a variable of our own.  Sharing "rc"
  # made a failed acquisition return success.
  acquire_rc=0
  for j in "$@"
  do
    if
      we_own_resource "$j"
    then
      continue
    fi
    doscript "$j" start
    acquire_rc=$?
    if
      [ $acquire_rc -ne 0 ]
    then
      ha_log "CRIT: Giving up resources due to failure of $j"
      giveupresourcegroup "$node" "$@"
      break
    fi
  done
  rc=$acquire_rc
  return $acquire_rc
}

#
#	GiveUpGroup key
#
#	Release the resource group whose haresources line contains "key".
#	We may be given a resource to give up that we don't own...
#	Nothing is done, and 0 is returned, when no line contains the key;
#	otherwise the status of giveupresourcegroup is returned.
#
GiveUpGroup() {
  #	Get the list of resources we've been requested to give up...
  haresources=`ipresline $1`
  [ -n "$haresources" ] || return 0
  #	Unquoted on purpose: the line is split into node name and resources.
  giveupresourcegroup $haresources
}

#
#	TakeGroup key
#
#	Acquire the resource group whose haresources line contains "key".
#	Nothing is done, and 0 is returned, when no line contains the key;
#	otherwise the status of acquireresourcegroup is returned.
#
TakeGroup() {
  #	Get the list of resources we've been requested to take...
  haresources=`ipresline $1`
  [ -n "$haresources" ] || return 0
  #	Unquoted on purpose: the line is split into node name and resources.
  acquireresourcegroup $haresources
}

#
# StatGroup key
#
# Determine the status of all the resources in the resource group whose
# haresources line contains "key".
#
# Results are echoed to stdout:
#
# NONE:	None of the resources in the resource group are held
#      (or there is no such resource group)
# ALL:	All of the resources in the resource group are held
# SOME:	Some of the resources in the resource group are held
#
StatGroup() {
  result="FirstTime"

  statgroup_line=`ipresline "$1"`
  # No such resource group.  This has to be caught before "set": with no
  # operands at all, "set" prints every shell variable to stdout, which
  # would end up in front of the answer.
  if
    [ -z "$statgroup_line" ]
  then
    echo NONE
    return 0
  fi

  # Split the line into positional parameters ("--" keeps a word that
  # starts with "-" from being taken as an option), then drop the node
  # name so that only the resources remain.
  set -- $statgroup_line
  shift
  for resource
  do
    if
      we_own_resource "$resource"
    then
      case $result in
        FirstTime)	result=ALL;;
        # Held now, but an earlier one was not: the answer is known.
        NONE)		echo SOME; return 0;;
      esac
    else
      case $result in
        FirstTime)	result=NONE;;
        # Not held now, but an earlier one was: the answer is known.
        ALL)		echo SOME; return 0;;
      esac
    fi
  done

  case $result in
    FirstTime)	echo NONE;;
    *)		echo $result;;
  esac
}

#
#	VerifyAllIdle
#
#	Verify that the key resource of every resource group is idle.
#	Each key resource that is found active is reported through ha_log
#	and on stderr.  This function does not return: it exits the script
#	with the number of active key resources as exit status (0 when all
#	of them are idle).
#
#	What an active resource means is that if you have a shared disk and
#	it's already mounted before you start heartbeat, then you could have
#	it mounted simultaneously on both sides.  If this happens then your
#	disk data is toast!  So, this is sometimes VERY BAD INDEED!
#
#	The most *common* cause for this message is that you told your OS to
#	manage one of the IP addresses that you asked heartbeat to manage.
#	You can't put both your OS startup scripts and heartbeat in charge of
#	HA resources.  This particular case is discussed in detail in the docs.
#
VerifyAllIdle() {
  rc=0
  for rsc in `KeyResources ".*"`
  do
    # Idle resources are what we want to see: nothing to report.
    we_own_resource $rsc || continue

    ha_log "CRITICAL: Resource $rsc is active, and should not be!"
    ha_log "CRITICAL: Non-idle resources can affect data integrity!" >&2
    ha_log "info: If you don't know what this means, then get help!"
    ha_log "info: Read the docs and/or source to $0 for more details."
    # The same warning again, straight to stderr.
    printf '%s\n' \
      "CRITICAL: Resource $rsc is active, and should not be!" \
      "CRITICAL: Non-idle resources can affect data integrity!" \
      "info: If you don't know what this means, then get help!" \
      "info: Read the docs and/or the source to $0 for more details." >&2
    rc=`expr $rc + 1`
  done

  case $rc in
    0)	;;
    *)	ha_log "CRITICAL: Non-idle resources will affect resource takeback!"
	ha_log "CRITICAL: Non-idle resources may affect data integrity!"
	;;
  esac
  exit $rc
}

#
#	If we are unable to stop a resource, then everything is in a
#	hosed state.  The only way out is through a reboot...
#
#	RecoverFromFailedStop takes no arguments.  It logs the failure,
#	sends signal 9 to the heartbeat, ipfail and ccm processes and then
#	tries to reboot the machine: first with "/sbin/reboot -f" and, when
#	that is missing or fails, with "/sbin/init 6".  When neither works
#	the inability to reboot is logged and the function returns.
#
RecoverFromFailedStop() {
  ha_log "CRIT: Resource STOP failure. Reboot required!"
  ha_log "CRIT: Killing heartbeat ungracefully!"
  # pkill selects processes by name pattern; -9 cannot be caught by them.
  for name in heartbeat ipfail ccm
  do
    pkill -9 $name
  done
  # Each reboot method is attempted only if its binary is executable.
  # The ":" commands are no-ops that merely label the branch in a trace.
  if
    [ -x /sbin/reboot ] && /sbin/reboot -f
  then
    : OK - reboot succeeded
  elif
    [ -x /sbin/init ] && /sbin/init 6
  then
    : OK - init 6 succeeded
  else
    ha_log "CRIT: Unable to force reboot."
  fi
}

#
#	Command dispatcher: $1 selects the action, $2 is its argument.
#	Nothing but comments follows this statement in the file, so the
#	status of the selected action becomes the exit status of the script.
#	An unknown (or missing) action prints $USAGE on stderr and exits 1.
#
case $1 in

  # Key resource of every group configured for machine $2.
  listkeys)	KeyResources "$2";;

  # Key resource of every group, whatever the machine.
  allkeys)	KeyResources '.*';;

  # Resources of the groups configured for $HA_CURHOST.
  ourkeys)	OurGroupKeys;;

  # Resources of the groups configured for any other host.
  otherkeys)	OtherGroupKeys;;

  # Status 0 if resource $2 is held by us, 3 if it is not.
  status)	we_own_resource $2;;

  # Release / acquire the group containing key $2; all output of the
  # operation is appended to $HA_DEBUGLOG.
  givegroup)	GiveUpGroup $2 >>$HA_DEBUGLOG 2>&1 ;;

  takegroup)	TakeGroup $2 >>$HA_DEBUGLOG 2>&1 ;;

  # NOTE(review): StatGroup reports ALL/SOME/NONE on stdout, but stdout is
  # redirected to $HA_DEBUGLOG here, so the caller never sees the answer.
  # Confirm whether that is intended.
  statgroup)	StatGroup $2 >>$HA_DEBUGLOG 2>&1 ;;

  # Exits with the number of key resources that are active.
  verifyallidle)VerifyAllIdle ;;

  *)		echo "$USAGE" >&2
  		echo "" >&2
		exit 1;;
esac

#
#	$Log: ResourceManager.in,v $
#	Revision 1.12.2.6  2005/08/31 16:05:07  alan
#	Fixed a bug which make host names in haresources files to have
#	hostnames in all lower case letters.
#	
#	Revision 1.12.2.5  2005/02/14 07:37:05  horms
#	Omit blank lines from output of allkeys
#	
#	Revision 1.12.2.4  2005/01/11 14:55:06  alan
#	Activated a patch provided by lge a long time ago - with a little
#	change brought to our attention by Umbrae <umbrae@gmail.com>
#	
#	Revision 1.26  2004/11/10 18:37:07  alan
#	Put in code to stop starting resources when we get a failure.
#	
#	Revision 1.23  2004/10/01 13:10:34  lge
#	micro fixes
#	 initialize logfacility = -1 in hb_cluster_new()
#	 fix off by one error in media_idx range check in read_child_dispatch()
#	 add commented out "not running" branch to we_own_resource() in ResourceManager.in
#	Revision 1.12.2.3  2004/12/17 16:13:22  alan
#	Fixed a bug introduced by last fix.
#	This new bug was pointed out by Kenneth Geisshirt <kenneth@geisshirt.dk>
#	who also supplied a patch for it.  But, if this doesn't work right yet,
#	then it's my fault, since I didn't take his patch exactly.
#	
#	Revision 1.12.2.2  2004/12/14 22:06:37  alan
#	Put in a patch to make us give up partially-acquired resources groups after a failure.
#	
#	Revision 1.12.2.1  2004/04/20 05:05:29  alan
#	Backported everything from 1.3.0 to 1.2.1
#	
#	Revision 1.22  2004/04/16 19:01:00  gshi
#	added a missing 'done' statement
#	
#	Revision 1.21  2004/04/16 17:06:10  alan
#	Fixed a bug in ResourceManager where it warns about bad resource scripts
#	when they're OK.  Problem noted by Jens Schmalzing <j.s@lmu.de>.  Thanks Jens!
#	
#	Revision 1.20  2004/04/16 04:48:05  alan
#	Put in a fix which gets rid of LMB's non-portable sed expression.
#	
#	Revision 1.19  2004/04/15 18:15:50  alan
#	Fixed a bug in the last fix to ResourceManager - I didn't check return codes correctly.
#	
#	Revision 1.18  2004/04/15 13:18:38  alan
#	Put in a kludge to permit the new code to avoid an unnecessary reboot for
#	Red Hat and other non-LSB-compliant init scripts.
#	
#	Revision 1.17  2004/04/12 21:43:43  alan
#	Changed the function that runs scripts to return the proper return code.
#	
#	Revision 1.16  2004/03/31 22:17:10  alan
#	Minor changes to the reboot-on-stop-failure patch.
#	These changes suggested by Jan-Frode Myklebust <janfrode@parallab.uib.no>
#	They make it more solaris-compatible, and also give an additional reboot attempt
#	option.
#	
#	Revision 1.15  2004/03/31 17:52:25  alan
#	Put in code to cause the machine to (eventually) reboot if a resource
#	stop fails.
#	
#	Revision 1.14  2004/03/26 05:00:47  alan
#	Added the Local Resource Manager from Zhen Huang and Jiang Dong Sun to CVS.
#	
#	Revision 1.13  2004/03/25 10:17:28  lars
#	Part I: Lower-case hostnames whereever they are coming in. STONITH
#	module audit to follow.
#	
#	Revision 1.12  2003/09/19 19:12:35  alan
#	Changed resourcemanager to not give an ERROR instead of a WARNING if
#	one of the resource scripts it starts gives a bad return code.
#	
#	Revision 1.11  2003/07/22 09:50:23  alan
#	Added code to pay attention more to return codes in ResourceManager,
#	and also to avoid starting any resource twice (barring race conditions).
#	
#	Revision 1.10  2003/07/01 19:01:20  alan
#	removed the checks to see if we already own a resource group
#	before giving it up.
#	
#	Revision 1.9  2003/05/17 05:28:46  alan
#	Implemented a modified version of nice_failback allowing it to
#	move subsets of resources around - not just all of them...
#	
#	Revision 1.8  2002/04/07 21:08:56  alan
#	Put in more stern warnings about resources not being idle when we start up...
#	
#	Revision 1.7  2002/02/15 06:53:16  horms
#	Small changes to allow heartbeat to work on Solaris 8
#	* use unset LC_ALL and unset LANGUAGE instead of LC_ALL=en and LANGUAGE=en
#	* provide LOG_PERROR, as Solaris doesn't
#	-- Horms
#	
#	Revision 1.6  2002/01/24 16:00:43  alan
#	Put in a patch sent by Matt Soffen.
#	
#	Revision 1.5  2001/10/24 20:46:29  alan
#	A large number of patches.  They are in these categories:
#		Fixes from Matt Soffen
#		Fixes to test environment things - including changing some ERRORs to
#			WARNings and vice versa.
#		etc.
#	
#	Revision 1.4  2001/08/11 22:59:25  alan
#	Finished the patch from Lorn Kay <lorn_kay@hotmail.com>
#	Also reverted a patch from Horms which turned off strict warnings on Linux.
#	
#	Revision 1.3  2001/08/11 22:12:55  alan
#	Made ResourceManager status return 0 if we own the resource and 3
#	if we don't (to be consistent with the LSB).
#	This patch was suggested by Lorn Kay.
#	
#	Revision 1.2  2001/06/28 20:35:00  alan
#	Patch from Juri to install our scripts with paths patched appropriately.
#	
#	Revision 1.1  2001/06/28 12:16:44  alan
#	Committed the *rest* of Juri Haberland's script patch that I thought I
#	had already applied :-(.
#	
#	Revision 1.10  2001/02/07 07:10:21  alan
#	Added code to verify that all resources are idle when starting heartbeat.
#	
#	Revision 1.9  2001/02/07 07:01:02  alan
#	Added a verifyallidle action to the ResourceManger for use in
#	startup scripts.
#	
#	Revision 1.8  2001/02/01 11:52:05  alan
#	Change things to that things occur in the right order.
#	We need to not start timing message reception until we're completely started.
#	We need to Stonith the other guy before we take over their resources.
#	
#	Revision 1.7  2000/12/06 09:11:31  jacob
#	Resources in ha.d/haresources can be split over more lines by concatenating
#	the lines with \ .
#	
#	Revision 1.6  2000/10/25 19:49:33  eric
#	Added the ability to specify multiple arguments to a resource script.
#	
#	Revision 1.5  2000/07/26 04:14:10  alan
#	Fixed a bug in a sed expression in ResourceManager
#	There was a missing 'g'.
#	
#	Revision 1.4  2000/06/12 06:11:09  alan
#	Changed resource takeover order to left-to-right
#	Added new version of nice_failback.  Hopefully it works wonderfully!
#	Regularized some error messages
#	Print the version of heartbeat when starting
#	Hosts now have three statuses {down, up, active}
#	SuSE compatability due to Friedrich Lobenstock and alanr
#	Other minor tweaks, too numerous to mention.
#	
#	Revision 1.3  2000/05/03 20:23:59  alan
#	Fixed a bug where the code was RedHat dependent.  It now allows the standard
#	SuSE OK response for status requests from scripts.
#	
#	Revision 1.2  1999/11/10 20:33:04  alan
#	Deleted /proc/ha directory from build list
#	Added #!/bin/sh to lots (all?) of the scripts...
#	
#	Revision 1.1.1.1  1999/09/23 15:31:24  alanr
#	High-Availability Linux
#	
#	Revision 1.7  1999/09/01 05:32:12  alanr
#	Put in a fix from Gregor Howey <ghowey@bremer-nachrichten.de>
#	where Gregor found that I had stripped off the ::resourceid
#	part of a string resulting in some bad calls later on.
#
#	It looks like he either always used IPaddr:xxx.yy.zzz or he actually
#	read the code and understood it.  Amazing!
#
#	Revision 1.6  1999/08/17 13:25:08  alanr
#	more comment changes.
#
#	Revision 1.5  1999/08/17 13:22:47  alanr
#	Fixed up comments on the format of the ipresources file.
#
#	Revision 1.4  1999/08/17 13:17:05  alanr
#	fixed up white space.
#
#	Revision 1.3  1999/08/17 13:16:07  alanr
#	Tidied up logging when running resource scripts.
#	Also added log messages.
#
#
