#!/bin/sh
#
#
#       OCF Resource Agent compliant drbd resource script.
#
# Copyright (c) 2004 - 2007 SUSE LINUX Products GmbH, Lars Marowsky-Bree
#                    All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.  Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
#

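# OCF instance parameters and CRM meta attributes read by this agent:
#	OCF_RESKEY_drbd_resource
#	OCF_RESKEY_drbdconf
#	OCF_RESKEY_clone_overrides_hostname
#	OCF_RESKEY_CRM_meta_clone
#	OCF_RESKEY_CRM_meta_clone_max
#	OCF_RESKEY_CRM_meta_clone_node_max
#	OCF_RESKEY_master_max
#	OCF_RESKEY_master_node_max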


#######################################################################
# Initialization:

# Load the OCF shell function library. When OCF_DEBUG_LIBRARY is set it names
# an alternative file to source instead of the stock library under OCF_ROOT.
# Both paths are quoted so that a directory containing whitespace does not
# get split into several words by the shell.
if [ -n "$OCF_DEBUG_LIBRARY" ]; then
	. "$OCF_DEBUG_LIBRARY"
else
	. "${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs"
fi

#######################################################################

#
# meta_data
#
# Print the resource agent's XML meta-data description on stdout and
# terminate the script with $OCF_SUCCESS. The document is static, so the
# here-document delimiter is quoted and nothing inside it is expanded.
#
meta_data() {
	cat <<'END_OF_METADATA'
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="drbd">
<version>1.1</version>

<longdesc lang="en">
Master/Slave OCF Resource Agent for DRBD
</longdesc>

<shortdesc lang="en">This resource agent manages a Distributed
Replicated Block Device (DRBD) object as a master/slave
resource. DRBD is a mechanism for replicating storage; please see the
documentation for setup details.</shortdesc>

<parameters>
<parameter name="drbd_resource" unique="1" required="1">
<longdesc lang="en">
The name of the drbd resource from the drbd.conf file.
</longdesc>
<shortdesc lang="en">drbd resource name</shortdesc>
<content type="string" default="drbd0" />
</parameter>

<parameter name="drbdconf">
<longdesc lang="en">
Full path to the drbd.conf file.
</longdesc>
<shortdesc lang="en">Path to drbd.conf</shortdesc>
<content type="string" default="/etc/drbd.conf"/>
</parameter>

<parameter name="clone_overrides_hostname">
<longdesc lang="en">
Whether or not to override the hostname with the clone number. This can
be used to create floating peer configurations; drbd will be told to
use node_&#60;cloneno&#62; as the hostname instead of the real uname,
which can then be used in drbd.conf.
</longdesc>
<shortdesc lang="en">Override drbd hostname</shortdesc>
<content type="boolean" default="no"/>
</parameter>


<parameter name="clone_max" required="1">
<longdesc lang="en">
Number of clones of this drbd resource. Do not fiddle with the default.
</longdesc>
<shortdesc lang="en">Number of clones</shortdesc>
<content type="integer" default="2"/>
</parameter>

<parameter name="clone_node_max" required="1">
<longdesc lang="en">
Clones per node. Do not fiddle with the default.
</longdesc>
<shortdesc lang="en">Number of nodes</shortdesc>
<content type="integer" default="1"/>
</parameter>

<parameter name="master_max" required="1">
<longdesc lang="en">
Maximum number of active primaries. Do not fiddle with the default.
</longdesc>
<shortdesc lang="en">Number of primaries</shortdesc>
<content type="integer" default="1"/>
</parameter>

<parameter name="master_node_max" required="1">
<longdesc lang="en">
Maximum number of primaries per node. Do not fiddle with the default.
</longdesc>
<shortdesc lang="en">Number of primaries per node</shortdesc>
<content type="integer" default="1"/>
</parameter>
</parameters>

<actions>
<action name="start"   timeout="240" />
<action name="promote"   timeout="90" />
<action name="demote"   timeout="90" />
<action name="notify"   timeout="90" />
<action name="stop"    timeout="100" />
<action name="monitor" depth="0"  timeout="20" interval="20" start-delay="1m" role="Slave" />
<action name="monitor" depth="0"  timeout="20" interval="10" start-delay="1m" role="Master" />
<action name="meta-data"  timeout="5" />
<action name="validate-all"  timeout="30" />
</actions>
</resource-agent>
END_OF_METADATA

	exit $OCF_SUCCESS
}

#
# do_cmd <command> [args...]
#
# Run a command with stdout and stderr captured together, log the call,
# its exit code and its output through ocf_log (at "err" level on failure,
# "debug" level on success), print the captured output on stdout and return
# the command's own exit code.
#
# Globals: RESOURCE (read, used as log prefix).
#
do_cmd() {
	local cmd="$*"
	local cmd_out
	local ret

	ocf_log debug "$RESOURCE: Calling $cmd"
	# The assignment is kept separate from "local": the status of
	# "local var=$(...)" is that of "local" itself (always 0), which
	# would mask every failure of the command.
	# "$@" runs the arguments exactly as passed in, instead of
	# re-splitting (and re-globbing) the flattened "$*" string.
	cmd_out=$("$@" 2>&1)
	ret=$?

	if [ $ret -ne 0 ]; then
		ocf_log err "$RESOURCE: Called $cmd"
		ocf_log err "$RESOURCE: Exit code $ret"
		ocf_log err "$RESOURCE: Command output: $cmd_out"
	else
		ocf_log debug "$RESOURCE: Exit code $ret"
		ocf_log debug "$RESOURCE: Command output: $cmd_out"
	fi

	# Left unquoted as before, so the output is emitted as a single
	# whitespace-normalized line.
	echo $cmd_out

	return $ret
}

#
# do_drbdadm <drbdadm-subcommand> [args...]
#
# Run "$DRBDADM -c $DRBDCONF <args>" with stdout and stderr captured
# together, log the call and its result through ocf_log (at "err" level on
# failure, "debug" level on success), print the cleaned-up output as a
# single line on stdout and return drbdadm's own exit code.
#
# Globals: DRBDADM, DRBDCONF, RESOURCE (all read).
#
do_drbdadm() {
	local cmd="$DRBDADM -c $DRBDCONF $*"
	local cmd_out
	local cmd_ret
	local ret

	ocf_log debug "$RESOURCE: Calling $cmd"
	# The assignment is kept separate from "local": the status of
	# "local var=$(...)" is that of "local" itself (always 0), which
	# would make every drbdadm failure look like success.
	# $DRBDADM stays unquoted in case it carries extra words; the
	# config path and the arguments are passed through unsplit.
	cmd_out=$($DRBDADM -c "$DRBDCONF" "$@" 2>&1)
	ret=$?
	# Trim the garbage drbdadm likes to print when using the node
	# override feature. The unquoted echo joins the output onto one
	# line first, so the greedy pattern can span the original lines.
	cmd_ret=$(echo $cmd_out | sed -e 's/found __DRBD_NODE__.*<<//;')

	if [ $ret -ne 0 ]; then
		ocf_log err "$RESOURCE: Called $cmd"
		ocf_log err "$RESOURCE: Exit code $ret"
		ocf_log err "$RESOURCE: Command output: $cmd_ret"
	else
		ocf_log debug "$RESOURCE: Exit code $ret"
		ocf_log debug "$RESOURCE: Command output: $cmd_ret"
	fi

	# Unquoted on purpose: strips the leading blank left behind by
	# the sed above and keeps the result on one line.
	echo $cmd_ret

	return $ret
}

#
# drbd_init
#
# Set up the globals used by every action:
#   CRM_MASTER  command prefix for crm_master (lifetime "reboot")
#   RESOURCE    drbd resource name, from OCF_RESKEY_drbd_resource
#   CLONE_NO    clone instance number, from OCF_RESKEY_CRM_meta_clone
#   DRBDCONF    path to drbd.conf, from OCF_RESKEY_drbdconf
#               (defaults to /etc/drbd.conf)
#
# If the configuration file does not exist the script exits: with
# $OCF_NOT_RUNNING for the "monitor" action, with $OCF_ERR_INSTALLED for
# everything else.
#
# When OCF_RESKEY_clone_overrides_hostname is yes/true/on/1 (any case),
# __DRBD_NODE__ is exported as "node_<clone number>".
#
drbd_init() {
	# NOTE(review): check_binary comes from outside this file;
	# presumably it aborts when the binary is missing - confirm.
	check_binary $DRBDADM
	CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot "

	RESOURCE="$OCF_RESKEY_drbd_resource"
	CLONE_NO="$OCF_RESKEY_CRM_meta_clone"
	DRBDCONF="${OCF_RESKEY_drbdconf:=/etc/drbd.conf}"

	if [ ! -f "$DRBDCONF" ]; then
		ocf_log err "drbd.conf not installed."
		# ACTION must be double-quoted so that it is expanded;
		# in single quotes the literal text $ACTION never matched
		# "monitor" and this branch was unreachable.
		if [ "$ACTION" = 'monitor' ]; then
			exit $OCF_NOT_RUNNING
		else
			exit $OCF_ERR_INSTALLED
		fi
	fi

	case "$OCF_RESKEY_clone_overrides_hostname" in
	[Yy][Ee][Ss]|[Tt][Rr][Uu][Ee]|[Oo][Nn]|1)
		__DRBD_NODE__="node_${CLONE_NO}"
		export __DRBD_NODE__
		ocf_log info "$RESOURCE: Using hostname $__DRBD_NODE__"
		;;
	esac

}


#######################################################################

#
# drbd_usage
#
# Print a short usage summary (script name plus the supported actions)
# on stdout.
#
drbd_usage() {
	printf '%s\n' \
		"usage: $0 {start|stop|monitor|validate-all|promote|demote|notify|meta-data}" \
		"" \
		"Expects to have a fully populated OCF RA-compliant environment set."
}

#
# is_drbd_enabled
#
# Succeeds (returns 0) when /proc/drbd exists as a regular file, which the
# callers in this script treat as "the drbd module is loaded"; returns 1
# otherwise.
#
is_drbd_enabled () {
	[ -f /proc/drbd ]
}


#
# drbd_get_status
#
# Query drbdadm for the role and the connection state of $RESOURCE and
# publish the result in these globals:
#   DRBD_STATE         raw "role" output, expected as <local>/<remote>
#   DRBD_STATE_LOCAL   "Primary", "Secondary" or "Not configured"
#   DRBD_STATE_REMOTE  "Primary", "Secondary", "Unknown" or "Not configured"
#   DRBD_CSTATE        one of the known connection states, else "Unconfigured"
#
drbd_get_status() {
	DRBD_STATE=$(do_drbdadm role $RESOURCE)
	# Split "<local>/<remote>" with parameter expansion rather than
	# piping an unquoted echo through sed: no extra processes, and the
	# value is no longer subject to pathname expansion.
	DRBD_STATE_LOCAL=${DRBD_STATE%%/*}	# text before the first "/"
	DRBD_STATE_REMOTE=${DRBD_STATE##*/}	# text after the last "/"
	DRBD_CSTATE=$(do_drbdadm cstate $RESOURCE)

	# Sanitize the various states, drbdadm is quite annoying; so if it
	# outputs something which doesn't make sense, translate it into
	# a harmless state:

	case "$DRBD_STATE_LOCAL" in
		"Not configured"|"Primary"|"Secondary") ;;
		*)	DRBD_STATE_LOCAL="Not configured" ;;
	esac

	case "$DRBD_STATE_REMOTE" in
		"Primary"|"Secondary"|"Unknown") ;;
		*)	DRBD_STATE_REMOTE="Not configured" ;;
	esac

	case "$DRBD_CSTATE" in
		Unconfigured|StandAlone|Unconnected|Timeout|BrokenPipe) ;;
		NetworkFailure|WFConnection|WFReportParams|Connected|SkippedSyncS) ;;
		SkippedSyncT|WFBitMapS|WFBitMapT|SyncSource|SyncTarget) ;;
		PausedSyncS|PausedSyncT) ;;
		*) DRBD_CSTATE="Unconfigured" ;;
	esac

	ocf_log debug "$RESOURCE status: $DRBD_STATE $DRBD_STATE_LOCAL $DRBD_STATE_REMOTE $DRBD_CSTATE"
}

#
# drbd_start
#
# Bring $RESOURCE up:
#   1. load the drbd kernel module if /proc/drbd is missing,
#   2. return success straight away if the resource is already configured,
#   3. otherwise run "drbdadm up" and verify the resource ended up
#      in Secondary role.
#
# Returns $OCF_SUCCESS, or $OCF_ERR_GENERIC when the module cannot be
# loaded, "drbdadm up" fails, or the resource is not Secondary afterwards.
#
drbd_start() {
	if is_drbd_enabled; then
		: OK
	else
		# The command substitution is intentionally unquoted: the
		# module parameters have to be passed as separate words.
		do_cmd modprobe -s drbd $($DRBDADM sh-mod-parms) || {
			# Plain message: the former $'\n' suffix is not
			# POSIX sh (dash logs a literal "$\n") and only
			# appended a stray newline under bash.
			ocf_log err "Cannot load the drbd module."
			return $OCF_ERR_GENERIC
		}
		ocf_log debug "$RESOURCE start: Module loaded."
	fi

	drbd_get_status

	if [ "$DRBD_STATE_LOCAL" != "Not configured" ]; then
		ocf_log debug "$RESOURCE start: already configured."
		return $OCF_SUCCESS
	fi

	if do_drbdadm up $RESOURCE ; then
		# Re-read the state; a freshly started resource must be
		# Secondary.
		drbd_get_status
		if [ "$DRBD_STATE_LOCAL" != "Secondary" ]; then
			ocf_log err "$RESOURCE start: not in Secondary mode after start."
			return $OCF_ERR_GENERIC
		fi

		ocf_log debug "$RESOURCE start: succeeded."
		return $OCF_SUCCESS
	else
		ocf_log err "$RESOURCE: Failed to start up."
		return $OCF_ERR_GENERIC
	fi
}

#
# drbd_update_prefs
#
# Refresh the drbd status and push a master preference score for this
# node through $CRM_MASTER, chosen from the connection state:
#   Connected                                      -> 75
#   SyncSource/PausedSyncS/WFBitMapS/SkippedSyncS  -> 100
#   WFConnection                                   -> 10
#   anything else                                  -> 5
# Always returns $OCF_SUCCESS.
#
drbd_update_prefs() {
	local score

	drbd_get_status

	# TODO: This is probably way too complex.
	case $DRBD_CSTATE in
	Connected)
		score=75
		;;
	SyncSource|PausedSyncS|WFBitMapS|SkippedSyncS)
		score=100
		;;
	WFConnection)
		# TODO:
		# (Inconsistent || Diskless && WFConnection) should be
		# -infinity.
		# This one implies we'll try to promote even on
		# disconnected nodes, but that might not work.
		score=10
		;;
	*)
		score=5
		;;
	esac

	do_cmd $CRM_MASTER -v $score

	return $OCF_SUCCESS
}

#
# drbd_stop
#
# Take $RESOURCE down. Returns $OCF_SUCCESS when the module is not loaded,
# when the resource is already unconfigured, or when "drbdadm down" worked
# and the resource reports "Not configured" afterwards; returns
# $OCF_ERR_GENERIC otherwise. The master preference is cleared whenever
# the module is loaded.
#
drbd_stop() {
	local down_rc

	# Do not bother if drbd is not enabled
	is_drbd_enabled || {
		ocf_log debug "$RESOURCE stop: drbd not loaded."
		return $OCF_SUCCESS
	}

	drbd_get_status

	# Clear preference for becoming master
	do_cmd $CRM_MASTER -D

	case "$DRBD_STATE_LOCAL" in
	"Not configured")
		ocf_log debug "$RESOURCE stop: already unconfigured."
		return $OCF_SUCCESS
		;;
	esac

	# TODO: this is a _force_ operation. we may need to kill higher
	# levels to be able to down drbd. figure out how...
	do_drbdadm down $RESOURCE
	down_rc=$?
	if [ $down_rc -ne 0 ]; then
		ocf_log err "$RESOURCE stop: Failed with exit code: $down_rc"
		return $OCF_ERR_GENERIC
	fi
	ocf_log debug "$RESOURCE stop: drbdadm down succeeded."

	# TODO: If drbdadm propagated error codes, this
	# wouldn't be needed.
	drbd_get_status
	if [ "$DRBD_STATE_LOCAL" != "Not configured" ]; then
		ocf_log err "$RESOURCE stop: Not stopped."
		return $OCF_ERR_GENERIC
	fi

	return $OCF_SUCCESS
}

#
# drbd_monitor
#
# Report the state of $RESOURCE on this node:
#   $OCF_NOT_RUNNING     module not loaded, or resource not configured
#   $OCF_RUNNING_MASTER  local role is Primary
#   $OCF_SUCCESS         local role is Secondary
#   $OCF_ERR_GENERIC     any other local state
#
# TODO: Think about how to monitor drbd and what constitutes
# failure cases...
# diskless etc?
# A secondary node which is supposed to be primary?
#
# TODO: we ought to update the preferences here occasionally (a
# drbd_update_prefs call in the Primary and Secondary branches), but
# that causes transitions right now ...
#
drbd_monitor() {
	is_drbd_enabled || {
		ocf_log warn "$RESOURCE monitor: drbd module not loaded"
		return $OCF_NOT_RUNNING
	}

	drbd_get_status

	case "$DRBD_STATE_LOCAL" in
	"Not configured")
		ocf_log debug "$RESOURCE monitor: resource not configured"
		return $OCF_NOT_RUNNING
		;;
	"Primary")
#		drbd_update_prefs
		return $OCF_RUNNING_MASTER
		;;
	"Secondary")
#		drbd_update_prefs
		return $OCF_SUCCESS
		;;
	esac

	ocf_log err "$RESOURCE monitor: unexpected local state: $DRBD_STATE_LOCAL"
	return $OCF_ERR_GENERIC
}

#
# drbd_promote
#
# Switch $RESOURCE to Primary role on this node. Returns $OCF_SUCCESS when
# the resource already is Primary or becomes Primary; returns
# $OCF_ERR_GENERIC when the module is not loaded, the resource is not
# Secondary to begin with, "drbdadm primary" fails, or the resource is
# still not Primary after the call.
#
drbd_promote() {
	local adm_rc

	is_drbd_enabled || {
		ocf_log err "drbd is not enabled"
		return $OCF_ERR_GENERIC
	}

	drbd_get_status

	case "$DRBD_STATE_LOCAL" in
	"Primary")
		ocf_log info "$RESOURCE promote: already primary"
		return $OCF_SUCCESS
		;;
	"Secondary")
		# The only state a promotion may start from.
		;;
	*)
		ocf_log warn "$RESOURCE promote: Not secondary to start with."
		return $OCF_ERR_GENERIC
		;;
	esac

	do_drbdadm primary $RESOURCE
	adm_rc=$?
	if [ $adm_rc -ne 0 ]; then
		ocf_log err "$RESOURCE promote: Failed with exit code $adm_rc."
		return $OCF_ERR_GENERIC
	fi

	# TODO: WORK AROUND because drbdadm has a bug and
	# reports success even if it failed :-(
	drbd_get_status
	if [ "$DRBD_STATE_LOCAL" != "Primary" ]; then
		ocf_log err "$RESOURCE promote: Not primary despite drbdadm call."
		return $OCF_ERR_GENERIC
	fi

	ocf_log info "$RESOURCE promote: primary succeeded"
	return $OCF_SUCCESS
}

#
# drbd_demote
#
# Switch $RESOURCE to Secondary role on this node.
#
# Returns:
#   $OCF_SUCCESS      module not loaded, already Secondary, or demotion
#                     succeeded
#   $OCF_NOT_RUNNING  resource is not configured
#   $OCF_ERR_GENERIC  "drbdadm secondary" failed, or the resource is
#                     still Primary afterwards
#
drbd_demote() {
	if is_drbd_enabled; then
		: OK
	else
		# A stopped resource also is demoted.
		ocf_log debug "$RESOURCE demote: module not loaded"
		return $OCF_SUCCESS
	fi

	# Fetch the current state first: without this call
	# DRBD_STATE_LOCAL is unset (or stale) and none of the checks
	# below can ever match.
	drbd_get_status

	if [ "$DRBD_STATE_LOCAL" = "Secondary" ]; then
		ocf_log debug "$RESOURCE demote: already secondary"
		return $OCF_SUCCESS
	fi
	if [ "$DRBD_STATE_LOCAL" = "Not configured" ]; then
		ocf_log debug "$RESOURCE demote: already stopped"
		return $OCF_NOT_RUNNING
	fi

	# TODO: this is a _force_ operation. we may need to kill higher
	# levels (or switch them to r/o) to be able to demote drbd.
	# figure out how...
	if do_drbdadm secondary $RESOURCE ; then
		# Re-read the state so the check below looks at the result
		# of the demotion rather than the state from before it.
		drbd_get_status
		if [ "$DRBD_STATE_LOCAL" = "Primary" ]; then
			ocf_log err "$RESOURCE demote: still primary!"
			return $OCF_ERR_GENERIC
		fi

		ocf_log debug "$RESOURCE demote: succeeded"
		return $OCF_SUCCESS
	else
		ocf_log err "$RESOURCE demote: Failed with exit code $?."
		return $OCF_ERR_GENERIC
	fi
}

#
# drbd_notify
#
# Handle a notification from the cluster manager. The notification type
# and operation come from OCF_RESKEY_CRM_meta_notify_type and
# OCF_RESKEY_CRM_meta_notify_operation; the numbers of active, starting
# and stopping resources are obtained by counting the words in the
# matching OCF_RESKEY_CRM_meta_notify_*_resource variables.
#
# Only "post" notifications trigger any work: after a "start" with exactly
# two active resources the peer connection is (re)established, and after
# every "post" notification the master preference is refreshed.
# Always returns $OCF_SUCCESS.
#
drbd_notify() {
	local kind="$OCF_RESKEY_CRM_meta_notify_type"
	local oper="$OCF_RESKEY_CRM_meta_notify_operation"
	local active_cnt stop_cnt start_cnt

	# Count the entries of each list by loading it into the
	# positional parameters (word splitting intended).
	set -- $OCF_RESKEY_CRM_meta_notify_active_resource
	active_cnt="$#"
	set -- $OCF_RESKEY_CRM_meta_notify_stop_resource
	stop_cnt="$#"
	set -- $OCF_RESKEY_CRM_meta_notify_start_resource
	start_cnt="$#"

	ocf_log debug "$RESOURCE notify: $kind for $oper - counts: active $active_cnt - starting $start_cnt - stopping $stop_cnt"

	# "pre" notifications need no action at the moment.
	# TODO: on pre/promote, resist promotion of the other side in
	# case we are already primary - though the CRM should not even
	# attempt that.

	if [ "$kind" = "post" ]; then
		# TODO: Entire block which follows redundant?
		if [ "$oper" = "start" ] && [ "$active_cnt" -eq 2 ]; then
			# The other side is running, so we ought
			# to connect and wait for that.
			do_drbdadm connect $RESOURCE
			do_drbdadm wait_connect $RESOURCE
			# TODO: If this can cause a hang if the
			# other side isn't connected or goes
			# away during that, maybe just sleep
			# here for 5-10s or take out the entire
			# block
		fi

		# post/stop deliberately does nothing:
		# TODO BUG: disconnect seems to force
		# non-primary mode?!?
		#### do_drbdadm disconnect $RESOURCE
		#
		# TODO: If we are secondary, do we need to do
		# anything about a stopped primary in case we
		# had an outdated flag...?

		# After something has been done is a good time to
		# recheck our status:
		drbd_update_prefs
	fi

	return $OCF_SUCCESS
}

#
# drbd_validate_all
#
# Validate the agent configuration.
#
# Returns:
#   $OCF_SUCCESS         everything checks out
#   $OCF_ERR_CONFIGURED  the configuration file is missing, "drbdadm dump"
#                        rejects it, or the clone/master options differ
#                        from the only supported layout (2 clones, 1 per
#                        node, 1 master, 1 master per node)
#   $OCF_ERR_ARGS        no resource name was given
#
drbd_validate_all () {
	# First check the configuration file
	if [ -n "$DRBDCONF" ] && [ ! -f "$DRBDCONF" ]; then
		ocf_log err "Configuration file does not exist: $DRBDCONF"
		return $OCF_ERR_CONFIGURED
	fi

	# Check the resource name, it should appear in DRBDCONF
	if [ -z "$RESOURCE" ]; then
		ocf_log err "No resource name specified!"
		return $OCF_ERR_ARGS
	fi

	# do_drbdadm prints the command output on stdout, so stdout has to
	# be discarded as well to keep the dump off the caller's output;
	# redirecting stderr alone had no effect.
	if do_drbdadm dump $RESOURCE >/dev/null 2>&1 ; then
		:
	else
		ocf_log err "Invalid configuration file $DRBDCONF"
		return $OCF_ERR_CONFIGURED
	fi

	# Unset options fall back to the defaults advertised in the
	# meta-data. The test is written as "all equal" with the test
	# errors silenced, so a non-integer value (which makes "[" fail
	# with status 2) counts as a misconfiguration instead of
	# slipping through.
	if [ "${OCF_RESKEY_CRM_meta_clone_max:-2}" -eq 2 ] 2>/dev/null \
	 && [ "${OCF_RESKEY_CRM_meta_clone_node_max:-1}" -eq 1 ] 2>/dev/null \
	 && [ "${OCF_RESKEY_master_node_max:-1}" -eq 1 ] 2>/dev/null \
	 && [ "${OCF_RESKEY_master_max:-1}" -eq 1 ] 2>/dev/null ; then
		:
	else
		ocf_log err "Clone options misconfigured."
		# "return" like every other branch; the dispatcher ends
		# the script with this status either way.
		return $OCF_ERR_CONFIGURED
	fi

	return $OCF_SUCCESS
}


# Entry point. The script takes exactly one argument, the name of the
# action to perform.
[ $# -eq 1 ] || {
	echo "Incorrect parameter count."
	drbd_usage
	exit $OCF_ERR_ARGS
}

ACTION=$1
case $ACTION in
meta-data)
	# meta_data terminates the script itself.
	meta_data
	;;
validate-all)
	# No explicit exit: the script ends after this case statement
	# with the status of drbd_validate_all.
	drbd_init
	drbd_validate_all
	;;
start|stop|monitor|promote|demote|notify)
	# Every action that touches drbd requires root.
	ocf_is_root || {
		ocf_log err "You must be root to perform this operation."
		exit $OCF_ERR_PERM
	}

	drbd_init
	# Dispatch to the drbd_<action> function of the same name.
	drbd_$ACTION
	exit $?
	;;
usage|help)
	drbd_usage
	exit $OCF_SUCCESS
	;;
*)
	drbd_usage
	exit $OCF_ERR_ARGS
	;;
esac

