From fdd9c743674d9c06a68cef0ffbe7f4b0d5b599ab Mon Sep 17 00:00:00 2001
From: Imanol-Mikel Barba Sabariego
Date: Sun, 18 Feb 2024 20:39:10 +0100
Subject: [PATCH] Adding wear leveling changes to check_smart. Adding check_systemd and check_zfs

---
Review notes (ignored by git am; not part of the commit message):
 * getSmartAttrValue: std::string::substr takes a length, so the field is
   now cut with end_pos - pos; the column counter is unsigned like `col`.
 * check_systemd.sh: a missing unit argument now exits 3 (UNKNOWN) instead
   of running `systemctl status` with no unit; $UNIT is quoted.
 * check_zfs.sh: the free-space WARNING branch returns 1 (was 2); check_smart
   exit codes >= 3 are reported as UNKNOWN instead of falling through to OK;
   pool and disk arguments are quoted.

 check_smart/auxiliar.cpp       |   2 +-
 check_smart/auxiliar.h         |   0
 check_smart/check_smart.cpp    |  51 ++++++++++++++++++++++++++++++++++++++++++---------
 check_smart/check_smart.h      |  26 +++++++++++++++++++++-----
 check_systemd/check_systemd.sh |  14 ++++++++++++++
 check_zfs/check_zfs.sh         | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 222 insertions(+), 15 deletions(-)
 mode change 100644 => 100755 check_smart/auxiliar.cpp
 mode change 100644 => 100755 check_smart/auxiliar.h
 mode change 100644 => 100755 check_smart/check_smart.h
 create mode 100755 check_systemd/check_systemd.sh
 create mode 100755 check_zfs/check_zfs.sh

diff --git a/check_smart/auxiliar.cpp b/check_smart/auxiliar.cpp
old mode 100644
new mode 100755
index abea7dc..9599ce0
--- a/check_smart/auxiliar.cpp
+++ b/check_smart/auxiliar.cpp
@@ -52,4 +52,4 @@ int exec(string cmd, string *output)
         }
     }
     return pclose(pipe)/256;
-}
+}
\ No newline at end of file
diff --git a/check_smart/auxiliar.h b/check_smart/auxiliar.h
old mode 100644
new mode 100755
index b68c3de..b68c3de
--- a/check_smart/auxiliar.h
+++ b/check_smart/auxiliar.h
diff --git a/check_smart/check_smart.cpp b/check_smart/check_smart.cpp
index d757f66..f511a50 100644
--- a/check_smart/check_smart.cpp
+++ b/check_smart/check_smart.cpp
@@ -1,9 +1,32 @@
 #include "check_smart.h"
+#include <regex>

 const char *servicename = (const char*)"SMART";

-int getSmartAttrValue(string line) {
-    return stoi(line.substr(line.find_last_of(" ")+1));
+int getSmartAttrValue(string line, unsigned int col) { // parses field `col` (0-based) of `line` as an int
+    line = std::regex_replace(line, std::regex("\\s+"), " "); // collapse whitespace runs to one space
+    line = std::regex_replace(line, std::regex("^ "), "");    // drop the leading space, if any
+
+    // Walk to the space that precedes field `col`; throws if the line has fewer fields
+    size_t pos = 0;
+    unsigned int cnt = 0;
+
+    while(cnt != col) {
+        pos = line.find(" ", pos+1);
+        if (pos == std::string::npos) {
+            throw std::runtime_error("Column out of range");
+        }
+        cnt++;
+    }
+    if(pos != 0) {
+        ++pos; // step over the separator onto the first character of the field
+    }
+
+    size_t end_pos = line.find(" ", pos);
+    if(end_pos != std::string::npos) {
+        return stoi(line.substr(pos, end_pos - pos)); // substr takes a length, not an end index
+    }
+    return stoi(line.substr(pos));
 }

 int getSmartAttrID(string line) {
@@ -88,7 +111,7 @@ int evalStatus(const char* disk, int driveType, string *status) {
                     continue;
                 }
                 if(getSmartAttrID(line) == id) {
-                    attrMap[id].value = getSmartAttrValue(line);
+                    attrMap[id].value = getSmartAttrValue(line, attrMap[id].col);
                 }
             }
         }
@@ -107,11 +130,21 @@ int evalStatus(const char* disk, int driveType, string *status) {
         }

         int veredict = 0;
-        if(attr.value > attr.threshold_warn) {
-            veredict = WARN;
-        }
-        if(attr.threshold_crit != -1 && attr.value > attr.threshold_crit) {
-            veredict = CRIT;
+
+        if(attr.lower_than) { // alarm when the value falls below the thresholds
+            if(attr.value < attr.threshold_warn) {
+                veredict = WARN;
+            }
+            if(attr.threshold_crit != -1 && attr.value < attr.threshold_crit) {
+                veredict = CRIT;
+            }
+        } else { // alarm when the value rises above the thresholds
+            if(attr.value > attr.threshold_warn) {
+                veredict = WARN;
+            }
+            if(attr.threshold_crit != -1 && attr.value > attr.threshold_crit) {
+                veredict = CRIT;
+            }
         }

         switch(veredict) {
@@ -183,7 +216,7 @@ int main(int argc, char **argv) {
         switch(c) {
             case 'h':
                 printHelp(true);
-		return OK;
+                return OK;
             case 'V':
                 printVersion();
                 return OK;
diff --git a/check_smart/check_smart.h b/check_smart/check_smart.h
old mode 100644
new mode 100755
index 020c2d6..78a4320
--- a/check_smart/check_smart.h
+++ b/check_smart/check_smart.h
@@ -48,6 +48,8 @@ struct SMARTAttr
     int threshold_warn;
     int threshold_crit;
     bool optional;
+    unsigned int col;   // 0-based whitespace-separated field of the attribute line to parse
+    bool lower_than;    // true: WARN/CRIT when value < threshold; false: when value > threshold
 };

 typedef struct SMARTAttr SMARTAttr;
@@ -59,6 +61,8 @@ SMARTAttr reallocated = {
     .threshold_warn = 0,
     .threshold_crit = -1,
     .optional = false,
+    .col = 9,
+    .lower_than = false,
 };

 SMARTAttr pending = {
@@ -66,8 +70,10 @@ SMARTAttr pending = {
     .name = "Current_Pending_Sector",
     .value = -1,
     .threshold_warn = 0,
-	.threshold_crit = -1,
+    .threshold_crit = -1,
     .optional = false,
+    .col = 9,
+    .lower_than = false,
 };

 SMARTAttr off_uncorrect = {
@@ -77,24 +83,30 @@ SMARTAttr off_uncorrect = {
     .threshold_warn = 0,
     .threshold_crit = 0,
     .optional = false,
+    .col = 9,
+    .lower_than = false,
 };

 SMARTAttr wear = {
     .id = WEAR_COUNT_ID,
     .name = "Wear_Leveling_Count",
     .value = -1,
-    .threshold_warn = 80,
-    .threshold_crit = 90,
+    .threshold_warn = 20,
+    .threshold_crit = 10,
     .optional = true,
+    .col = 3,
+    .lower_than = true,
 };

 SMARTAttr wearout = {
     .id = MEDIA_WEAROUT_ID,
     .name = "Media_Wearout_Indicator",
     .value = -1,
-    .threshold_warn = 80,
-    .threshold_crit = 90,
+    .threshold_warn = 20,
+    .threshold_crit = 10,
     .optional = true,
+    .col = 3,
+    .lower_than = true,
 };

 SMARTAttr badblocks = {
@@ -104,6 +116,8 @@ SMARTAttr badblocks = {
     .threshold_warn = 0,
     .threshold_crit = 0,
     .optional = false,
+    .col = 9,
+    .lower_than = false,
 };

 SMARTAttr rep_uncorrect = {
@@ -113,6 +127,8 @@ SMARTAttr rep_uncorrect = {
     .threshold_warn = 0,
     .threshold_crit = -1,
     .optional = false,
+    .col = 9,
+    .lower_than = false,
 };

 map<int, SMARTAttr> prepareAttrMap(int driveType);
diff --git a/check_systemd/check_systemd.sh b/check_systemd/check_systemd.sh
new file mode 100755
index 0000000..53f04e1
--- /dev/null
+++ b/check_systemd/check_systemd.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# Usage: check_systemd.sh UNIT -- exits 2 (CRITICAL) unless `systemctl status UNIT` succeeds.
+UNIT=$1
+[[ -n $UNIT ]] || { echo "SERVICE - UNKNOWN: no unit given"; exit 3; }
+systemctl status "$UNIT" > /dev/null
+RET=$?
+
+if [[ $RET != 0 ]]; then
+    echo "SERVICE $UNIT - CRITICAL: Not running"
+    exit 2
+fi
+
+echo "SERVICE $UNIT - OK"
+exit 0
diff --git a/check_zfs/check_zfs.sh b/check_zfs/check_zfs.sh
new file mode 100755
index 0000000..7525a57
--- /dev/null
+++ b/check_zfs/check_zfs.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+# smart_disks POOL: run check_smart on every /dev/* member of POOL and report the worst exit code.
+function smart_disks() {
+    if [[ $# != 1 ]]; then
+        echo "Wrong number of arguments"
+        return 3
+    fi
+
+    POOL=$1
+    DISKS=$(zpool status -P "$POOL" | sed -E 's/\t/ /g' | sed -E 's/[ ]+/ /g' | sed -E 's/^ //g' | grep "^/dev" | cut -d' ' -f 1)
+    ERR_OUTPUT=""
+    HIGHEST_RES=0
+    for disk in $DISKS; do # deliberately unquoted: one device path per word
+        OUTPUT=$(/usr/lib64/nagios/plugins/check_smart "$disk")
+        RES=$?
+        if [[ $RES != 0 ]]; then
+            ERR_OUTPUT="$ERR_OUTPUT - $OUTPUT"
+        fi
+        if [[ $RES -gt $HIGHEST_RES ]]; then
+            HIGHEST_RES=$RES
+        fi
+    done
+
+    echo -n "ZFS POOL SMART $POOL "
+    if [[ $HIGHEST_RES == 1 ]]; then
+        echo "WARNING${ERR_OUTPUT}"
+        return 1
+    elif [[ $HIGHEST_RES == 2 ]]; then
+        echo "CRITICAL${ERR_OUTPUT}"
+        return 2
+    elif [[ $HIGHEST_RES -ge 3 ]]; then # any other non-zero code (e.g. 127) must not fall through to OK
+        echo "UNKNOWN${ERR_OUTPUT}"
+        return 3
+    fi
+
+    echo "OK"
+    return 0
+}
+# pool_errors POOL: CRITICAL unless the pool state is ONLINE, WARNING if the errors line is not clean.
+function pool_errors() {
+    if [[ $# != 1 ]]; then
+        echo "Wrong number of arguments"
+        return 3
+    fi
+
+    OUTPUT="$(zpool status "$1")"
+    if [[ $? != 0 ]]; then
+        return 3
+    fi
+
+    STATUS=$(echo "$OUTPUT" | grep -P "^\s*state:" | sed 's/ //g' | grep -P "^\s*state" | cut -d ':' -f 2)
+    if [[ $? != 0 ]]; then
+        return 3
+    fi
+
+    ERRORS=$(echo "$OUTPUT" | grep -P "^errors:")
+    if [[ $? != 0 ]]; then
+        return 3
+    fi
+
+    if [[ $STATUS != "ONLINE" ]]; then
+        echo "ZFS POOL ERRORS $1 - CRITICAL: status: $STATUS"
+        return 2
+    fi
+
+    if [[ $ERRORS != "errors: No known data errors" ]]; then
+        echo "ZFS POOL ERRORS $1 - WARNING: $ERRORS"
+        return 1
+    fi
+
+    echo "ZFS POOL ERRORS $1 - OK"
+    return 0
+}
+# pool_free POOL ACTION POOL -c PCT -w PCT: alert when the free percentage is below PCT.
+function pool_free() {
+    if [[ $# != 7 ]]; then
+        echo "Wrong number of arguments"
+        return 3
+    fi
+    POOL=$3 # the dispatcher passes "$POOL" followed by the full command line, so $3 is the pool again
+
+    FREE_RAW=$(zfs list -po available "$POOL" | tail -n 1)
+    if [[ $? != 0 ]]; then
+        return 3
+    fi
+
+    USED_RAW=$(zfs list -po used "$POOL" | tail -n 1)
+    if [[ $? != 0 ]]; then
+        return 3
+    fi
+    TOTAL_RAW=$((USED_RAW + FREE_RAW))
+    TOTAL=$(echo "scale=2; $TOTAL_RAW / (1024^3)" | bc)
+    FREE=$(echo "scale=2; $FREE_RAW / (1024^3)" | bc)
+    PERC_FREE=$(echo "scale=2; 100*$FREE_RAW/$TOTAL_RAW" | bc)
+
+    CRITICAL="0"
+    WARNING="0"
+    TEMP=$(getopt -o c:w: -- "$@")
+    eval set -- "$TEMP"
+
+    for opt; do
+        case "$opt" in
+            -c) CRITICAL=$2; shift 2 ;;
+            -w) WARNING=$2; shift 2 ;;
+        esac
+    done
+
+    if [[ $CRITICAL == "0" || $WARNING == "0" ]]; then
+        echo "Missing -c or -w arguments"
+        return 3
+    fi
+
+    if [[ $(echo "$PERC_FREE < $CRITICAL" | bc) == 1 ]]; then
+        echo "ZFS POOL FREE $POOL - CRITICAL: ${PERC_FREE}% ${FREE}/${TOTAL} GB"
+        return 2
+    elif [[ $(echo "$PERC_FREE < $WARNING" | bc) == 1 ]]; then
+        echo "ZFS POOL FREE $POOL - WARNING: ${PERC_FREE}% ${FREE}/${TOTAL} GB"
+        return 1
+    fi
+
+    echo "ZFS POOL FREE $POOL - OK: ${PERC_FREE}% ${FREE}/${TOTAL} GB"
+    return 0
+}
+
+if [[ $# -lt 2 ]]; then
+    echo "Wrong number of arguments"
+    exit 3
+fi
+
+ACTION=$1
+POOL=$2
+
+if [[ $ACTION == "smart" ]]; then
+    smart_disks "$POOL"
+    exit $?
+elif [[ $ACTION == "errors" ]]; then
+    pool_errors "$POOL"
+    exit $?
+elif [[ $ACTION == "free" ]]; then
+    pool_free "$POOL" "$@"
+    exit $?
+fi
+echo "Unknown command: $ACTION"
+exit 3
-- 
libgit2 0.22.2