#!/usr/bin/bash
#
# ocf:pacemaker:HealthSMART resource agent
#
# Copyright 2009-2023 the Pacemaker project contributors
#
# The version control history for this file may have further details.
#
# This source code is licensed under the GNU General Public License version 2
# (GPLv2) WITHOUT ANY WARRANTY.
#

#
# Checks the S.M.A.R.T. status of all given drives and writes the #health-smart
# status into the CIB
#
#######################################################################

#######################################################################
# Initialization:

# Load the OCF shell helper library.  OCF_FUNCTIONS may be preset by the caller;
# otherwise it is derived from OCF_ROOT.
: "${OCF_FUNCTIONS:=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}"
. "${OCF_FUNCTIONS}"

# Use the first command-line argument as the action unless one is already set.
: "${__OCF_ACTION:=$1}"

# Give every environment variable the agent reads an explicit default, which
# also keeps static analysis tools quiet about unset variables.
: "${OCF_RESKEY_CRM_meta_interval:=0}"
: "${OCF_RESKEY_CRM_meta_globally_unique:=true}"
: "${OCF_RESKEY_temp_warning:=}"
: "${OCF_RESKEY_temp_lower_limit:=}"
: "${OCF_RESKEY_temp_upper_limit:=}"
: "${OCF_RESKEY_drives:=/dev/sda}"
: "${OCF_RESKEY_devices:=}"
: "${OCF_RESKEY_state:=}"
: "${OCF_RESKEY_smartctl:=/usr/sbin/smartctl}"
: "${OCF_RESKEY_dampen:=5s}"

# Split the space-separated parameters into arrays so they can be iterated by
# index later.  The expansions are deliberately left unquoted: word splitting
# is what turns each list into separate array elements.
DRIVES=(${OCF_RESKEY_drives})
DEVICES=(${OCF_RESKEY_devices})

#######################################################################

# Print the agent's OCF meta-data XML on stdout.
#
# The here-document delimiter is unquoted, so the default shown for the
# "state" parameter is expanded from HA_VARRUN (one trailing slash removed)
# and OCF_RESOURCE_INSTANCE at the time the function is called.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<resource-agent name="HealthSMART" version="2.1.9-1.el9">
<version>1.1</version>

<longdesc lang="en">
System health agent that checks the S.M.A.R.T. status of the given drives and
updates the #health-smart attribute.
</longdesc>
<shortdesc lang="en">SMART health status</shortdesc>

<parameters>
<parameter name="state" unique-group="state">
<longdesc lang="en">
Location to store the resource state in.
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state" />
</parameter>

<parameter name="drives" reloadable="1">
<longdesc lang="en">
The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
</longdesc>
<shortdesc lang="en">Drives to check</shortdesc>
<content type="string" default="/dev/sda" />
</parameter>

<parameter name="devices" reloadable="1">
<longdesc lang="en">
The device type(s) to assume for the drive(s) being tested as a SPACE separated list.
</longdesc>
<shortdesc lang="en">Device types</shortdesc>
<content type="string" />
</parameter>

<parameter name="temp_lower_limit" reloadable="1">
<longdesc lang="en">
Lower limit of the temperature in deg C of the drive(s). Below this limit the status of #health-smart will be red.
</longdesc>
<shortdesc lang="en">Lower limit for the red smart attribute</shortdesc>
<content type="string" default="0"/>
</parameter>

<parameter name="temp_upper_limit" reloadable="1">
<longdesc lang="en">
Upper limit of the temperature in deg C of the drive(s). If the drive reports
a temperature higher than this value the status of #health-smart will be red.
</longdesc>
<shortdesc lang="en">Upper limit for red smart attribute</shortdesc>
<content type="string" default="60"/>
</parameter>

<parameter name="temp_warning" reloadable="1">
<longdesc lang="en">
Number of deg C below/above the upper/lower temp limits at which point the status of #health-smart will change to yellow.
</longdesc>
<shortdesc lang="en">Deg C below/above the upper limits for yellow smart attribute</shortdesc>
<content type="string" default="5"/>
</parameter>

<parameter name="smartctl" reloadable="1">
<longdesc lang="en">
The path to the smartctl program, used for querying device health.
</longdesc>
<shortdesc lang="en">The path to the smartctl program</shortdesc>
<content type="string" default="/usr/sbin/smartctl"/>
</parameter>

<parameter name="dampen" reloadable="1">
<longdesc lang="en">
The time to wait (dampening) for further changes to occur
</longdesc>
<shortdesc lang="en">Dampening interval</shortdesc>
<content type="string" default="5s"/>
</parameter>

</parameters>

<actions>
<action name="start"        timeout="10s" />
<action name="stop"         timeout="10s" />
<action name="monitor"      timeout="10s" interval="10s" start-delay="0s" />
<action name="meta-data"    timeout="5s" />
<action name="validate-all"   timeout="10s" depth="0" />
<action name="reload-agent"   timeout="20s" />
</actions>
</resource-agent>
END
}

#######################################################################

# Compare one drive temperature against the configured limits and, when it is
# out of range, set the #health-smart attribute accordingly.
#
# Globals read: lower_red_limit, upper_red_limit, lower_yellow_limit,
#               upper_yellow_limit, OCF_RESKEY_dampen; DRIVE and DEVICE are
#               used only in the log text.
# Arguments:    $1 - temperature in deg C; may be empty when none was found.
# Returns:      1 after setting #health-smart to "red" or "yellow";
#               0 when the value is within limits, empty, or not an integer.
check_temperature() {
    local temp="$1"
    local colour
    local verdict

    # No temperature reported: nothing to evaluate.
    if [ -z "${temp}" ]; then
        return 0
    fi

    # A non-integer (or multi-line) value cannot be compared; skip it with a
    # warning instead of letting every numeric test below print an error.
    if ! [[ "${temp}" =~ ^[-+]?[0-9]+$ ]]; then
        ocf_log warn "Drive ${DRIVE} ${DEVICE} reported an unusable temperature: ${temp}"
        return 0
    fi

    # Red limits are checked before yellow ones so the more severe state wins.
    # "[" is used rather than "(( ))" so values with leading zeros stay decimal.
    if [ "${temp}" -lt "${lower_red_limit}" ]; then
        colour="red"
        verdict="too cold"
    elif [ "${temp}" -gt "${upper_red_limit}" ]; then
        colour="red"
        verdict="too hot"
    elif [ "${temp}" -lt "${lower_yellow_limit}" ]; then
        colour="yellow"
        verdict="quite cold"
    elif [ "${temp}" -gt "${upper_yellow_limit}" ]; then
        colour="yellow"
        verdict="quite hot"
    else
        return 0
    fi

    ocf_log info "Drive ${DRIVE} ${DEVICE} ${verdict}: ${temp} C"
    attrd_updater -n "#health-smart" -B "${colour}" -d "${OCF_RESKEY_dampen}"
    return 1
}

# Sanity-check the drive and device-type lists.
#
# Globals read: DRIVES, DEVICES (arrays), OCF_ERR_ARGS
# Exits the script with OCF_ERR_ARGS when
#   - DEVICES is non-empty but its length differs from DRIVES, or
#   - any entry of DRIVES does not start with "/dev/".
common_checks() {
    local drive_count=${#DRIVES[@]}
    local device_count=${#DEVICES[@]}
    local drive

    # Each item in $OCF_RESKEY_drives must have a corresponding item in
    # $OCF_RESKEY_devices with the device type.  Alternately,
    # $OCF_RESKEY_devices can be empty.
    if [ "${device_count}" -gt 0 ] && [ "${drive_count}" -ne "${device_count}" ]; then
        ocf_log err "OCF_RESKEY_devices must be empty or the same length as OCF_RESKEY_drives."
        exit $OCF_ERR_ARGS
    fi

    # Each item in $OCF_RESKEY_drives must look like a device node.
    for drive in "${DRIVES[@]}"; do
        if [[ "${drive}" != /dev/* ]]; then
            ocf_log err "Drive in OCF_RESKEY_drives does not look like a device node: ${drive}"
            exit $OCF_ERR_ARGS
        fi
    done
}


# Compute the temperature limits and verify that S.M.A.R.T. is enabled on
# every configured drive.
#
# Globals read:    OCF_RESKEY_temp_warning, OCF_RESKEY_temp_lower_limit,
#                  OCF_RESKEY_temp_upper_limit, OCF_RESKEY_devices,
#                  OCF_RESKEY_smartctl, DRIVES, DEVICES
# Globals written: lower_red_limit, lower_yellow_limit, upper_red_limit,
#                  upper_yellow_limit, DRIVE, DEVICE
# Exits the script with OCF_ERR_INSTALLED when the smartctl -i output for a
# drive does not contain "SMART support is: Enabled".
init_smart() {
    local yellow_threshold="${OCF_RESKEY_temp_warning:-5}"
    local ndx
    local -a type_args

    # Set temperature defaults: red below 0 or above 60 deg C, yellow within
    # ${yellow_threshold} degrees of either red limit.
    lower_red_limit="${OCF_RESKEY_temp_lower_limit:-0}"
    lower_yellow_limit=$((${lower_red_limit}+${yellow_threshold}))

    upper_red_limit="${OCF_RESKEY_temp_upper_limit:-60}"
    upper_yellow_limit=$((${upper_red_limit}-${yellow_threshold}))

    for ndx in "${!DRIVES[@]}"; do
        DRIVE=${DRIVES[$ndx]}

        # Pass an explicit device type only when device types are configured.
        type_args=()
        if [ -n "${OCF_RESKEY_devices}" ]; then
            DEVICE=${DEVICES[$ndx]}
            type_args=(-d "${DEVICE}")
        fi

        if ! "${OCF_RESKEY_smartctl}" "${type_args[@]}" -i "${DRIVE}" \
                | grep -q "SMART support is: Enabled"; then
            ocf_log err "S.M.A.R.T. not enabled for drive ${DRIVE}"
            exit $OCF_ERR_INSTALLED
        fi
    done
}

# Print a short usage summary (supported actions) on stdout.
HealthSMART_usage() {
    printf '%s\n' \
        "usage: $0 {start|stop|monitor|validate-all|meta-data|reload-agent}" \
        "" \
        "Expects to have a fully populated OCF RA-compliant environment set."
}

# Start action: nothing to do when monitor already reports success; otherwise
# create the state file that marks the resource as running.
#
# Returns OCF_SUCCESS when monitor succeeds, else the exit status of touch.
HealthSMART_start() {
    local monitor_status

    HealthSMART_monitor
    monitor_status=$?

    # Already running, so leave the state file alone.
    [ "${monitor_status}" -eq "${OCF_SUCCESS}" ] && return "${OCF_SUCCESS}"

    touch "${OCF_RESKEY_state}"
}

# Stop action: delete the #health-smart attribute and remove the state file.
#
# Returns OCF_SUCCESS when the state file is gone afterwards (including when
# it did not exist to begin with), OCF_ERR_GENERIC when it cannot be removed.
HealthSMART_stop() {
    attrd_updater -D -n "#health-smart"

    # -f makes a missing state file a success: stopping a resource that is
    # already stopped must not be reported as a failure.
    if rm -f -- "${OCF_RESKEY_state}"; then
        return $OCF_SUCCESS
    fi

    return $OCF_ERR_GENERIC
}

# Monitor action: query every configured drive and publish the result in the
# #health-smart attribute.
#
# Globals read:    OCF_RESKEY_state, OCF_RESKEY_smartctl, OCF_RESKEY_devices,
#                  OCF_RESKEY_dampen, DRIVES, DEVICES
# Globals written: DRIVE, DEVICE (also read by check_temperature for logging)
# Returns:
#   OCF_NOT_RUNNING when the state file does not exist;
#   OCF_SUCCESS     otherwise.  The attribute is set to "red" on the first
#                   drive whose health check does not report PASSED, left as
#                   set by check_temperature when that returns non-zero, and
#                   set to "green" when every drive passes.
HealthSMART_monitor() {
    local ndx
    local temperature
    local -a type_args

    common_checks

    # Test for presence of the smartctl binary that is actually executed below
    check_binary "${OCF_RESKEY_smartctl}"

    init_smart

    # Monitor _MUST!_ differentiate correctly between running
    # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
    # That is THREE states, not just yes/no.
    if [ ! -f "${OCF_RESKEY_state}" ]; then
        return $OCF_NOT_RUNNING
    fi

    for ndx in "${!DRIVES[@]}"; do
        DRIVE=${DRIVES[$ndx]}

        # Pass an explicit device type only when device types are configured.
        type_args=()
        if [ -n "${OCF_RESKEY_devices}" ]; then
            DEVICE=${DEVICES[$ndx]}
            type_args=(-d "${DEVICE}")
        fi

        # Check overall S.M.A.R.T. status
        if ! "${OCF_RESKEY_smartctl}" "${type_args[@]}" -H "${DRIVE}" \
                | grep -q "SMART overall-health self-assessment test result: PASSED"; then
            attrd_updater -n "#health-smart" -B "red" -d "${OCF_RESKEY_dampen}"
            return $OCF_SUCCESS
        fi

        # Check drive temperature(s): tenth field of the attribute line that
        # starts with "194".
        temperature=$("${OCF_RESKEY_smartctl}" "${type_args[@]}" -A "${DRIVE}" \
            | awk '/^194/ { print $10 }')
        if ! check_temperature "${temperature}"; then
            # check_temperature has already updated the attribute.
            return $OCF_SUCCESS
        fi
    done

    attrd_updater -n "#health-smart" -B "green" -d "${OCF_RESKEY_dampen}"
    return $OCF_SUCCESS
}

# Validate the configuration.
#
# common_checks always runs.  When OCF_CHECK_LEVEL is "10" the host-specific
# checks run as well: the configured smartctl binary must be present,
# init_smart must pass, and the directory holding the state file must be
# writable.
#
# Returns OCF_SUCCESS, or OCF_ERR_ARGS when no file can be created in the
# state directory.  common_checks and init_smart exit the script themselves
# on failure.
HealthSMART_validate() {
    local state_dir
    local probe

    common_checks

    # Host-specific checks
    if [ "$OCF_CHECK_LEVEL" = "10" ]; then
        # Test for presence of the smartctl binary the agent will execute
        check_binary "${OCF_RESKEY_smartctl}"

        init_smart

        # Is the state directory writable?  Probe with a unique temporary
        # file so that no existing file is ever touched or removed.
        state_dir=$(dirname "$OCF_RESKEY_state")
        probe=$(mktemp "${state_dir}/HealthSMART.XXXXXX") || return $OCF_ERR_ARGS
        rm -f -- "${probe}"
    fi

    return $OCF_SUCCESS
}

# reload-agent action: performs no work and always reports OCF_SUCCESS.
HealthSMART_reload_agent() {
    return "${OCF_SUCCESS}"
}


# Compute the default state file path and store it in OCF_RESKEY_state.
#
# Globals read:    HA_VARRUN (one trailing slash is removed),
#                  OCF_RESOURCE_INSTANCE, OCF_RESKEY_CRM_meta_globally_unique
# Globals written: OCF_RESKEY_state
#
# When globally-unique is "false", the trailing clone marker (":<number>"
# directly before ".state") is stripped from the path.
HealthSMART_default_state() {
    local state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"

    if [ "${OCF_RESKEY_CRM_meta_globally_unique}" = "false" ]; then
        # Strip off the trailing clone marker.  The sed script is quoted so
        # that "\." reaches sed as a literal dot and the shell never treats
        # the pattern as a glob.
        state=$(printf '%s\n' "${state}" | sed 's/:[0-9][0-9]*\.state/.state/')
    fi

    OCF_RESKEY_state="${state}"
}

# Only fall back to the computed default when no state file was configured.
if [ -z "$OCF_RESKEY_state" ]; then
    HealthSMART_default_state
fi

# Dispatch the requested action.  Informational actions and unknown actions
# exit directly; every other action falls through so that its exit status is
# logged before the script exits with it.
case "${__OCF_ACTION}" in
    meta-data)
        meta_data
        exit "${OCF_SUCCESS}"
        ;;
    usage|help)
        HealthSMART_usage
        exit "${OCF_SUCCESS}"
        ;;
    start)
        HealthSMART_start
        ;;
    stop)
        HealthSMART_stop
        ;;
    monitor)
        # Monitoring only runs when validation succeeded.
        HealthSMART_validate && HealthSMART_monitor
        ;;
    validate-all)
        HealthSMART_validate
        ;;
    reload-agent)
        HealthSMART_reload_agent
        ;;
    *)
        HealthSMART_usage
        exit "${OCF_ERR_UNIMPLEMENTED}"
        ;;
esac
action_rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : ${action_rc}"
exit "${action_rc}"

# vim: set filetype=sh expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80:
