diff --git a/README.md b/README.md index d00b147..d75789a 100644 --- a/README.md +++ b/README.md @@ -644,6 +644,136 @@ nagios::check::elasticsearch::args_jvm_usage: '-N 10.0.0.1 -C 90 -W 80' nagios::check::elasticsearch::args_nodes: '-E 5' # Expected nodes in cluester ``` + +## OpenSearch + +The `opensearch` checks monitor OpenSearch clusters, nodes, and indices using a single +`opensearch_check_nrpe.sh` script with multiple **modes** (or “actions”). + +Each mode performs a distinct health or capacity check and can be enabled or disabled individually, or in logical groups (for example, enabling only cluster-level checks). + +You must provide valid credentials for an account with permission to call the standard cluster APIs (`_cluster/health`, `_nodes/stats`, `_cat/*`, etc.). + +```yaml +nagios::check::opensearch::user: 'nagios' +nagios::check::opensearch::pass: 'mysupersecretpassword' +``` + +### Example user creation (OpenSearch Security plugin) + +Example user definition (`/usr/share/opensearch/plugins/opensearch-security/tools/securityadmin.sh`): + +```json +{ + "nagios_monitor": { + "hash": "$2y$12$...", + "roles": [ "monitoring_user" ], + "description": "Nagios monitoring account" + } +} +``` + +Minimal role definition for `monitoring_user`: + +```json +{ + "monitoring_user": { + "cluster": [ + "cluster:monitor/main", + "cluster:monitor/health", + "cluster:monitor/state", + "cluster:monitor/stats", + "cluster:monitor/nodes/stats", + "cluster:monitor/nodes/info", + "cluster:monitor/allocation/explain", + "cluster:monitor/settings/get" + ], + "indices": [ + { + "names": [ "*" ], + "privileges": [ "read", "view_index_metadata" ] + } + ] + } +} +``` + +### Enabling and disabling checks + +Completely disable OpenSearch monitoring on a host: + +```yaml +nagios::check::opensearch::ensure: 'absent' +``` + +Selectively disable specific modes: + +```yaml +nagios::check::opensearch::modes_disabled: + - 'thread_pool_queues' + - 'no_replica_indices' +``` + +Or enable only certain checks: + +```yaml +nagios::check::opensearch::modes_enabled: + - 'cluster_status' + - 'jvm_usage' + - 'disk_usage' +``` + +Each mode can also receive its own threshold arguments through `mode_args`: + +```yaml +nagios::check::opensearch::mode_args: + jvm_usage: '-w 75 -c 90' + disk_usage: '-w 80 -c 95' + shard_capacity: '-w 70 -c 90' +``` + +### Available modes + +|Mode|Description| +|---|---| +|**cluster_status**|Checks the overall cluster health (`green`, `yellow`, or `red`). Warns if `yellow`, critical if `red`.| +|**nodes**|Checks the number of active nodes in the cluster and compares against `-E `.| +|**unassigned_shards**|Reports unassigned primary or replica shards in the cluster.| +|**jvm_usage**|Checks JVM heap memory usage on the current node.| +|**disk_usage**|Checks disk utilization percentage for the node running the check.| +|**thread_pool_queues**|Monitors the `search` thread pool queue size for the node.| +|**no_replica_indices**|Reports user indices configured without replicas (excluding system indices).| +|**node_uptime**|Warns if the node has been up for less than 10 minutes (useful after restarts).| +|**check_disk_space_for_resharding**|Estimates whether remaining disk space is sufficient to reshuffle shards after a node loss.| +|**shard_capacity**|Checks total cluster shard utilization vs. the `cluster.max_shards_per_node` limit. 
Warns/criticals based on % usage.| + + +### Example configuration + +Enable OpenSearch monitoring with all default checks: + +```yaml +class { 'nagios::check::opensearch': + ensure => present, + host => 'opensearch.internal', + port => '9200', + user => 'nagios', + pass => 'mysupersecretpassword', + modes_enabled => [ + 'cluster_status', + 'nodes', + 'unassigned_shards', + 'jvm_usage', + 'disk_usage', + 'thread_pool_queues', + 'no_replica_indices', + 'node_uptime', + 'check_disk_space_for_resharding', + 'shard_capacity', + ], +} +``` + ## Fluent Bit The Fluent Bit monitoring uses the `/api/v1/health` endpoint to determine the health status of Fluent Bit. diff --git a/files/scripts/opensearch_check_nrpe.sh b/files/scripts/opensearch_check_nrpe.sh new file mode 100644 index 0000000..ad6038b --- /dev/null +++ b/files/scripts/opensearch_check_nrpe.sh @@ -0,0 +1,539 @@ +#!/bin/bash +# Default values +HOST="localhost" +PORT="9200" +USER="" +PASSWORD="" +SSL_VERIFY=true +WARN_THRESHOLD="" +CRIT_THRESHOLD="" +CURRENT_NODE=$(hostname) + +# Valid types array +VALID_TYPES=(cluster_status nodes unassigned_shards jvm_usage disk_usage thread_pool_queues no_replica_indices node_uptime check_disk_space_for_resharding shard_capacity) + +# Function to join array elements +join_by() { + local IFS="$1" + shift + echo "$*" +} + +# Function to print usage with descriptions for each check +print_usage() { + echo "Usage: $0 [options]" + echo "Options:" + echo " -H Specify the OpenSearch host. Default is 'localhost'." + echo " -P Specify the OpenSearch port. Default is '9200'." + echo " -u Specify the OpenSearch username." + echo " -p Specify the OpenSearch password." + echo " -t Specify the check type. Types are:" + echo " - cluster_status: Check the overall cluster status." + echo " - nodes: Check the number of nodes in the cluster." + echo " - unassigned_shards: Check for unassigned shards in the cluster." + echo " - jvm_usage: Check JVM memory usage." + echo " - disk_usage: Check disk space usage." + echo " - thread_pool_queues: Check thread pool queue sizes." + echo " - no_replica_indices: Check for indices without replicas." + echo " - node_uptime: Check if node uptime is less than 10 minutes." + echo " - check_disk_space_for_resharding: Check disk space for index resharding." + echo " - shard_capacity: Check total shard usage vs. the cluster.max_shards_per_node limit." + echo " -k Skip SSL verification. Use with caution." + echo " -w Set a warning threshold." + echo " -c Set a critical threshold." + echo " -E Specify the expected node count for the 'nodes' check." + echo " -N Specify the node name. Defaults to the hostname." + echo " -h Display this help message and exit." +} + +# Parse command-line options +while getopts ":H:P:u:p:N:E:t:w:W:c:C:kh" opt; do + case ${opt} in + H ) + HOST=$OPTARG + ;; + P ) + PORT=$OPTARG + ;; + u ) + USER=$OPTARG + ;; + p ) + PASSWORD=$OPTARG + ;; + N ) + CURRENT_NODE=$OPTARG + ;; + E) + EXPECTED_NODE_COUNT=$OPTARG + ;; + t ) + TYPE=$OPTARG + ;; + k ) + SSL_VERIFY=false + ;; + w | W) + WARN_THRESHOLD=$OPTARG + ;; + c | C) + CRIT_THRESHOLD=$OPTARG + ;; + h ) + print_usage + exit 0 + ;; + * ) + print_usage + exit 3 # Unknown + ;; + esac +done + +# Verify if type is set +if [[ -z "$TYPE" ]]; then + echo "Error: -t option is required." + print_usage + exit 3 # Unknown +fi + +# Verify if type is valid +if ! [[ " ${VALID_TYPES[*]} " =~ " $TYPE " ]]; then + echo "Invalid type specified. Valid types are: $(join_by ', ' "${VALID_TYPES[@]}")." 
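+    # Nagios plugin exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN;
+    # an unrecognised -t value is reported as UNKNOWN rather than CRITICAL.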
+ exit 3 # Unknown +fi + +# Construct URL and CURL options +OPENSEARCH_URL="https://${HOST}:${PORT}" +CREDENTIALS="$USER:$PASSWORD" +CURL_OPTS="-s" +if [[ $SSL_VERIFY == false ]]; then + CURL_OPTS="$CURL_OPTS -k" +fi + +# Enhanced CURL execution with error handling +execute_curl() { + local url=$1 + response=$(curl $CURL_OPTS -u $CREDENTIALS "$url" 2>&1) + curl_status=$? + + if [[ $curl_status -ne 0 ]]; then + echo "CURL error: $response" + exit 2 # CRITICAL + else + echo "$response" | jq . > /dev/null 2>&1 + jq_status=$? + if [[ $jq_status -ne 0 ]]; then + echo "Failed to parse JSON response: $response" + exit 2 # CRITICAL + fi + fi + + echo "$response" +} + +# Function to get cluster health +get_cluster_health() { + local url="$OPENSEARCH_URL/_cluster/health" + response=$(curl $CURL_OPTS -u $CREDENTIALS "$url" 2>&1) + curl_status=$? + + # Check if curl command succeeded + if [[ $curl_status -ne 0 ]]; then + echo "CRITICAL: Failed to retrieve cluster health from OpenSearch - CURL error." + exit 2 # CRITICAL + else + # Attempt to parse the response using jq + echo "$response" | jq . > /dev/null 2>&1 + jq_status=$? + + # Check if jq succeeded in parsing the response + if [[ $jq_status -ne 0 ]]; then + echo "CRITICAL: Failed to parse JSON response for cluster health. Response may not be in valid JSON format." + exit 2 # CRITICAL + fi + fi + + echo "$response" +} + +# Verify authentication credentials +verify_credentials() { + local url="$OPENSEARCH_URL/_cluster/health" + local http_code=$(curl -o /dev/null $CURL_OPTS -s -w "%{http_code}" -u $CREDENTIALS "$url") + + if [[ $http_code -eq 401 ]]; then + echo "CRITICAL: Invalid authentication credentials." + exit 2 # CRITICAL + elif [[ $http_code != 200 ]]; then + echo "UNKNOWN: Unable to verify authentication credentials." + exit 2 # CRITICAL + fi +} + +# Verify authentication credentials before proceeding +verify_credentials + +# Adjusted function to get nodes stats optionally for a specific node +get_nodes_stats() { + local node_name=$1 + local url="$OPENSEARCH_URL/_nodes/stats" + if [[ -n "$node_name" ]]; then + # If a node name is provided, adjust the URL or filter logic accordingly + url="$OPENSEARCH_URL/_nodes/$node_name/stats" + fi + echo $(execute_curl "$url") +} + +# Function to get disk space usage +check_disk_usage() { + # Set default warning and critical thresholds if not provided + if [[ -z "$WARN_THRESHOLD" ]]; then + WARN_THRESHOLD=70 # Default warning threshold at 70% + fi + + if [[ -z "$CRIT_THRESHOLD" ]]; then + CRIT_THRESHOLD=90 # Default critical threshold at 90% + fi + + local response=$(execute_curl "$OPENSEARCH_URL/_cat/allocation?format=json") + local node_info=$(echo "$response" | jq -r --arg node "$CURRENT_NODE" '.[] | select(.node == $node)') + local percent=$(echo "$node_info" | jq -r '.["disk.percent"] // "n/a"') + + if [[ "$node_info" == "" ]]; then + echo "UNKNOWN: Node $CURRENT_NODE not found in the cluster." + exit 2 # CRITICAL + elif [[ "$percent" == "n/a" ]]; then + echo "UNKNOWN: Disk information for node $CURRENT_NODE is unavailable." 
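+        # Note: like the other UNKNOWN conditions in this script, this exits 2
+        # (CRITICAL) rather than the conventional 3.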
+ exit 2 # CRITICAL + else + local used=$(echo "$node_info" | jq -r '.["disk.used"]') + local total=$(echo "$node_info" | jq -r '.["disk.total"]') + local avail=$(echo "$node_info" | jq -r '.["disk.avail"]') + local perf_data="'$CURRENT_NODE'_used=${used}; '$CURRENT_NODE'_total=${total}; '$CURRENT_NODE'_avail=${avail};" + + if [[ "$percent" -ge "$CRIT_THRESHOLD" ]]; then + echo "CRITICAL: Disk usage on $CURRENT_NODE is critical: ${percent}% used | $perf_data" + exit 2 # CRITICAL + elif [[ "$percent" -ge "$WARN_THRESHOLD" ]]; then + echo "WARNING: Disk usage on $CURRENT_NODE is high: ${percent}% used | $perf_data" + exit 1 # WARNING + else + echo "OK: Disk usage on $CURRENT_NODE is within thresholds: ${percent}% used | $perf_data" + exit 0 # OK + fi + fi +} + +# Function to get thread pool queue size +check_thread_pool_queues() { + # Adjust to filter by the current node's hostname + local response=$(execute_curl "$OPENSEARCH_URL/_cat/thread_pool/search?h=node_name,queue&v") + + if [[ -z "$response" ]]; then + echo "UNKNOWN: Unable to retrieve thread pool queue information." + exit 2 # CRITICAL + fi + + local queue_size=$(echo "$response" | awk -v node="$CURRENT_NODE" '$1 == node {print $2}') + + if [[ -z "$queue_size" ]]; then + echo "UNKNOWN: No data for node $CURRENT_NODE." + exit 2 # CRITICAL + fi + + # Compare the queue size to the warning threshold + if [[ "$queue_size" -gt "$WARN_THRESHOLD" ]]; then + echo "WARNING: High search thread pool queue on $CURRENT_NODE: $queue_size" + exit 1 # WARNING + else + echo "Thread pool queue OK on $CURRENT_NODE | '$CURRENT_NODE'_queue=$queue_size;" + exit 0 # OK + fi +} + + +# Function to check for indices with no replicas +check_no_replica_indices() { + local response=$(execute_curl "$OPENSEARCH_URL/_cat/indices?h=index,rep&s=index") + local indices_with_no_replicas=$(echo "$response" | awk '$1 !~ /^\./ && $2 == "0" {print $1}') + + if [[ -n "$indices_with_no_replicas" ]]; then + echo "CRITICAL: The following user indices have no replicas: $indices_with_no_replicas" + exit 2 # CRITICAL + else + echo "OK: All user indices have replicas." + exit 0 # OK + fi +} + +# Function to check if uptime is less than 10 minutes +check_node_uptime() { + local node_name=$1 + # Fetch node stats. + local node_stats=$(execute_curl "$OPENSEARCH_URL/_nodes/$node_name/stats") + + # Extract uptime in milliseconds + local uptime_ms=$(echo "$node_stats" | jq -r ".nodes[] | select(.name == \"$node_name\") | .jvm.uptime_in_millis") + + # Validate uptime_ms is numeric + if ! [[ "$uptime_ms" =~ ^[0-9]+$ ]]; then + echo "UNKNOWN: Unable to retrieve or validate uptime for node $node_name." + exit 2 # CRITICAL + fi + + # Convert uptime from milliseconds to minutes for performance data + local uptime_minutes=$((uptime_ms / 60000)) + + # Convert uptime from milliseconds to human-readable format for message display + local days=$((uptime_ms / 86400000)) + local hours=$(( (uptime_ms % 86400000) / 3600000 )) + local minutes_display=$(( (uptime_ms % 3600000) / 60000 )) + + # Prepare uptime string in a human-readable format + local uptime_string="${days}d ${hours}h ${minutes_display}m" + + # Prepare performance data including uptime in minutes + local perf_data="'uptime_minutes'=$uptime_minutes" + + # Check if uptime is less than 10 minutes + if [[ "$uptime_minutes" -lt 10 ]]; then + echo "WARNING: OpenSearch node $node_name uptime is less than 10 minutes ($uptime_string). | $perf_data" + exit 1 # WARNING + else + echo "OK: OpenSearch node $node_name uptime is $uptime_string. 
| $perf_data" + exit 0 # OK + fi +} + +check_shard_capacity() { + # Defaults if not provided + local warn="${WARN_THRESHOLD:-80}" # % used + local crit="${CRIT_THRESHOLD:-95}" # % used + + # Pull cluster stats (has both node counts and shard totals) + local stats + stats=$(execute_curl "$OPENSEARCH_URL/_cluster/stats") + + # Total shards (primaries + replicas) currently open/active + local total_shards + total_shards=$(echo "$stats" | jq -r '.indices.shards.total // 0') + + # Data node count + local data_nodes + data_nodes=$(echo "$stats" | jq -r '.nodes.count.data // 0') + + if [[ -z "$data_nodes" || "$data_nodes" -le 0 ]]; then + echo "CRITICAL: No data nodes reported by cluster - OpenSearch cluster likely unhealthy or unreachable." + exit 2 + fi + + # Get max_shards_per_node from settings (persistent -> transient -> defaults) + local settings + settings=$(execute_curl "$OPENSEARCH_URL/_cluster/settings?include_defaults=true") + + # Try persistent, then transient, then defaults + local mspn + mspn=$(echo "$settings" | jq -r ' + .persistent.cluster.max_shards_per_node // .transient.cluster.max_shards_per_node // .defaults.cluster.max_shards_per_node // empty + ') + # Fallback to 1000 if empty + if [[ -z "$mspn" || "$mspn" == "null" ]]; then + mspn=1000 + fi + + # Compute cluster-wide ceiling + local max_total=$(( mspn * data_nodes )) + + if [[ "$max_total" -le 0 ]]; then + echo "CRITICAL: Invalid shard ceiling (max_shards_per_node=$mspn, data_nodes=$data_nodes)." + exit 2 + fi + + # Used percentage + local used_pct + used_pct=$(echo "scale=2; ($total_shards*100)/$max_total" | bc) + # Headroom (absolute shards) + local headroom=$(( max_total - total_shards )) + + # Perfdata + local perf="total_shards=$total_shards;;;$max_total; max_shards=$max_total;;;0; used_pct=${used_pct}%;$warn;$crit;0;100 headroom=$headroom;;;0;" + + # Evaluate thresholds + # Compare as decimals via bc + if (( $(echo "$used_pct >= $crit" | bc -l) )); then + echo "CRITICAL: Shard capacity used ${used_pct}% ($total_shards/$max_total). per_node=$mspn data_nodes=$data_nodes headroom=$headroom | $perf" + exit 2 + elif (( $(echo "$used_pct >= $warn" | bc -l) )); then + echo "WARNING: Shard capacity used ${used_pct}% ($total_shards/$max_total). per_node=$mspn data_nodes=$data_nodes headroom=$headroom | $perf" + exit 1 + else + echo "OK: Shard capacity used ${used_pct}% ($total_shards/$max_total). per_node=$mspn data_nodes=$data_nodes headroom=$headroom | $perf" + exit 0 + fi +} + +# Perform checks based on type +case "$TYPE" in + cluster_status) + cluster_health=$(get_cluster_health) + if [[ $? -ne 0 ]]; then + # If get_cluster_health exited with a non-zero status, it has already handled the error. + return + fi + cluster_status=$(echo "$cluster_health" | jq -r '.status') + number_of_nodes=$(echo "$cluster_health" | jq -r '.number_of_nodes') + perf_data="nodes=$number_of_nodes" + case "$cluster_status" in + green) + echo "OK: Cluster status is GREEN. All systems functional. | $perf_data" + exit 0 + ;; + yellow) + echo "WARNING: Cluster status is YELLOW. Data is available but some replicas are not allocated. This could affect redundancy and failover capabilities. | $perf_data" + exit 1 + ;; + red) + echo "CRITICAL: Cluster status is RED. Data is not fully available due to unallocated shards. Immediate action required. | $perf_data" + exit 2 + ;; + *) + echo "UNKNOWN: Cluster status is UNKNOWN - $cluster_status. Action may be required. 
| $perf_data" + exit 2 + ;; + esac + ;; + nodes) + cluster_health=$(get_cluster_health) + current_nodes=$(echo "$cluster_health" | jq -r '.number_of_nodes') + perf_data="'current_nodes'=$current_nodes" + if [[ -z "$EXPECTED_NODE_COUNT" ]]; then + echo "INFO: Nodes = $current_nodes (No number of expected nodes provided) | $perf_data" + exit 0 # OK (Informational) + else + if [[ "$current_nodes" -lt "$EXPECTED_NODE_COUNT" ]]; then + echo "WARNING: Number of nodes ($current_nodes) is below the expected count ($EXPECTED_NODE_COUNT). | $perf_data" + exit 1 # WARNING + else + echo "OK: Nodes = $current_nodes (Expected count met or exceeded) | $perf_data" + exit 0 # OK + fi + fi + ;; + unassigned_shards) + cluster_health=$(get_cluster_health) + if [[ $? -ne 0 ]]; then + # If get_cluster_health exited with a non-zero status, it has already handled the error. + exit 2 # Exit with the same status to indicate failure. + fi + unassigned_shards=$(echo "$cluster_health" | jq -r '.unassigned_shards') + + # Ensure that WARN_THRESHOLD and CRIT_THRESHOLD have default values if not set + if [[ -z "$WARN_THRESHOLD" ]]; then + WARN_THRESHOLD=5 # Default warning threshold + fi + + if [[ -z "$CRIT_THRESHOLD" ]]; then + CRIT_THRESHOLD=10 # Default critical threshold + fi + + # Perf data string for graphing + perf_data="'unassigned_shards'=$unassigned_shards;$WARN_THRESHOLD;$CRIT_THRESHOLD;0;" + + # Check if unassigned_shards is a valid number + if ! [[ "$unassigned_shards" =~ ^[0-9]+$ ]]; then + echo "CRITICAL: Unable to retrieve the number of unassigned shards. | $perf_data" + exit 2 # CRITICAL + fi + + # Compare the number of unassigned shards against the thresholds and include perf data in the output + if (( unassigned_shards < WARN_THRESHOLD )); then + echo "OK: Number of unassigned shards is within threshold: $unassigned_shards | $perf_data" + exit 0 # OK + elif (( unassigned_shards >= WARN_THRESHOLD && unassigned_shards < CRIT_THRESHOLD )); then + echo "WARNING: High number of unassigned shards: $unassigned_shards | $perf_data" + exit 1 # WARNING + else + echo "CRITICAL: Very high number of unassigned shards: $unassigned_shards | $perf_data" + exit 2 # CRITICAL + fi + ;; + jvm_usage) + # Ensure that WARN_THRESHOLD and CRIT_THRESHOLD have default values if not set + if [[ -z "$WARN_THRESHOLD" ]]; then + WARN_THRESHOLD=70 # Default warning threshold + fi + + if [[ -z "$CRIT_THRESHOLD" ]]; then + CRIT_THRESHOLD=90 # Default critical threshold + fi + + # Get JVM stats for the current node only + nodes_stats=$(get_nodes_stats $CURRENT_NODE) + jvm_heap_used_percent=$(echo "$nodes_stats" | jq -r ".nodes[] | select(.name == \"$CURRENT_NODE\") | .jvm.mem.heap_used_percent") + + if [[ -z "$jvm_heap_used_percent" || "$jvm_heap_used_percent" == "null" ]]; then + echo "UNKNOWN: No JVM stats available for node $CURRENT_NODE." 
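+        # An empty value or the literal string "null" from jq means the node was
+        # not found or reported no heap metric; exit 2 (CRITICAL) in either case.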
+ exit 2 # CRITICAL + fi + + # Perf data string for graphing + perf_data="'jvm_heap_used_percent'=$jvm_heap_used_percent%;$WARN_THRESHOLD;$CRIT_THRESHOLD;0;100" + + # Compare the JVM heap usage against the thresholds and include perf data in the output + if (( $(echo "$jvm_heap_used_percent < $WARN_THRESHOLD" | bc -l) )); then + echo "OK: JVM Heap Used on $CURRENT_NODE is within threshold: ${jvm_heap_used_percent}% | $perf_data" + exit 0 # OK + elif (( $(echo "$jvm_heap_used_percent >= $WARN_THRESHOLD && $jvm_heap_used_percent < $CRIT_THRESHOLD" | bc -l) )); then + echo "WARNING: JVM Heap Used on $CURRENT_NODE is high: ${jvm_heap_used_percent}% | $perf_data" + exit 1 # WARNING + else + echo "CRITICAL: JVM Heap Used on $CURRENT_NODE is very high: ${jvm_heap_used_percent}% | $perf_data" + exit 2 # CRITICAL + fi + ;; + disk_usage) + check_disk_usage + ;; + thread_pool_queues) + check_thread_pool_queues + ;; + no_replica_indices) + check_no_replica_indices + ;; + node_uptime) + check_node_uptime $CURRENT_NODE + ;; + check_disk_space_for_resharding) + # Fetch cluster stats for disk space and number of data nodes + cluster_stats=$(execute_curl "$OPENSEARCH_URL/_cluster/stats") + total_disk_space=$(echo "$cluster_stats" | jq '.nodes.fs.total_in_bytes') + available_disk_space=$(echo "$cluster_stats" | jq '.nodes.fs.available_in_bytes') + number_of_data_nodes=$(echo "$cluster_stats" | jq '.nodes.count.data') + + # Calculate the average disk space that would be required per node after resharding (excluding one node) + if ((number_of_data_nodes > 1)); then + space_required_per_node_after_resharding=$(( (total_disk_space - available_disk_space) / (number_of_data_nodes - 1) )) + + # Check if there's enough available disk space for resharding after hypothetically losing one data node + if (( space_required_per_node_after_resharding > available_disk_space )); then + echo "WARNING: There might not be enough disk space for index resharding if a data node fails." + exit 1 # WARNING + else + echo "OK: Sufficient disk space for index resharding after a data node failure." + exit 0 # OK + fi + else + echo "UNKNOWN: Insufficient data nodes to calculate disk space for resharding." + exit 2 # CRITICAL + fi + ;; + shard_capacity) + check_shard_capacity + ;; + *) + # This should theoretically never be reached due to the prior validation + echo "This check type is not implemented in the script. Please contact the administrator if you believe this is an error." 
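+        # Unhandled modes are still reported with exit 2 (CRITICAL in the Nagios convention).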
+ exit 2 # Unknown + ;; +esac \ No newline at end of file diff --git a/lib/facter/nagios_opensearch.rb b/lib/facter/nagios_opensearch.rb new file mode 100644 index 0000000..65a4e60 --- /dev/null +++ b/lib/facter/nagios_opensearch.rb @@ -0,0 +1,13 @@ +# Create custom nagios_opensearch if opensearch binary is found + +binaries = [ + '/usr/share/opensearch/bin/opensearch', + '/opt/opensearch/bin/opensearch', + '/opt/opensearch/current/bin/opensearch' + ] + +binaries.each do |filename| + if File.exists?(filename) + Facter.add('nagios_opensearch') { setcode { true } } + end +end \ No newline at end of file diff --git a/manifests/check/opensearch.pp b/manifests/check/opensearch.pp new file mode 100644 index 0000000..e500ae1 --- /dev/null +++ b/manifests/check/opensearch.pp @@ -0,0 +1,122 @@ +class nagios::check::opensearch ( + # Absent by default for testing + Enum['present','absent'] $ensure = 'present', + String $args = '', + Optional[String] $host = '127.0.0.1', + Optional[String] $port = undef, + Optional[String] $node = undef, + Optional[Integer] $expected_nodes = undef, + Optional[String] $user = 'admin', + Optional[String] $pass = 'admin', + Array[String] $modes_enabled = [], + Array[String] $modes_disabled = [], + Optional[Hash[String, String]] $mode_args = {}, + Optional[String] $check_title = $::nagios::client::host_name, + Optional[String] $check_period = $::nagios::client::service_check_period, + Optional[String] $contact_groups = $::nagios::client::service_contact_groups, + Optional[String] $first_notification_delay = $::nagios::client::service_first_notification_delay, + Optional[String] $max_check_attempts = $::nagios::client::service_max_check_attempts, + Optional[String] $notification_period = $::nagios::client::service_notification_period, + Optional[String] $use = $::nagios::client::service_use, + Optional[String] $servicegroups = $::nagios::client::service_servicegroups, +) { + + # Set options from parameters unless already set inside args + if $args !~ /-H/ and $host != undef { + $arg_h = "-H ${host} " + } else { + $arg_h = '' + } + if $args !~ /-P/ and $port != undef { + $arg_p = "-P ${port} " + } else { + $arg_p = '' + } + if $args !~ /-N/ and $node != undef { + $arg_n = "-N ${node} " + } else { + $arg_n = '' + } + if $args !~ /-u/ and $user != undef { + $arg_u = "-u ${user} " + } else { + $arg_u = '' + } + if $args !~ /-p/ and $pass != undef { + $arg_pass = "-p ${pass} " + } else { + $arg_pass = '' + } + if $args !~ /-E/ and $expected_nodes != undef { + $arg_enodes = "-E ${expected_nodes} " + } else { + $arg_enodes = '' + } + + $globalargs = strip("${arg_h}${arg_p}${arg_n}${arg_u}${arg_pass}${arg_enodes}${args}") + + # We need jq and bc installed + $packages = [ 'bc' , 'jq' ] + $packages.each |$package_name| { + package { $package_name: ensure => installed } + } + + # Custom check script + file { '/usr/lib64/nagios/plugins/opensearch_check_nrpe.sh': + ensure => file, + owner => 'root', + group => 'root', + mode => '0755', + source => "puppet:///modules/${module_name}/scripts/opensearch_check_nrpe.sh", + } + + # Define Nagios checks for each mode + $check_commands = { + 'cluster_status' => 'cluster_status', + 'nodes' => 'nodes', + 'unassigned_shards' => 'unassigned_shards', + 'jvm_usage' => 'jvm_usage', + 'disk_usage' => 'disk_usage', + 'thread_pool_queues' => 'thread_pool_queues', + 'no_replica_indices' => 'no_replica_indices', + 'node_uptime' => 'node_uptime', + 'check_disk_space_for_resharding' => 'check_disk_space_for_resharding', + 'shard_capacity' => 
'shard_capacity', + } + + # Define Nagios checks for each mode + # Need to solve the user and password for monitoring + $check_commands.each |$mode, $command| { + if !($mode in $modes_disabled) and (empty($modes_enabled) or $mode in $modes_enabled) { + # Determine if mode_args is defined and has a key for the current mode + $args_mode = $mode_args ? { + undef => '', + default => $mode_args[$mode] ? { + undef => '', + default => $mode_args[$mode] + } + } + $fullargs = strip("${globalargs} ${args_mode}") + + nagios::client::nrpe_file { "check_opensearch_${mode}": + ensure => $ensure, + plugin => 'opensearch_check_nrpe.sh', + args => "$fullargs -t ${command}", + require => File['/usr/lib64/nagios/plugins/opensearch_check_nrpe.sh'], + } + + nagios::service { "check_opensearch_${mode}_${check_title}": + ensure => $ensure, + check_command => "check_nrpe_opensearch_${mode}", + service_description => "opensearch_${mode}", + servicegroups => $servicegroups, + check_period => $check_period, + contact_groups => $contact_groups, + first_notification_delay => $first_notification_delay, + notification_period => $notification_period, + max_check_attempts => $max_check_attempts, + use => $use, + } + } + } +} \ No newline at end of file diff --git a/manifests/check/opensearch/mode.pp b/manifests/check/opensearch/mode.pp new file mode 100644 index 0000000..903c07b --- /dev/null +++ b/manifests/check/opensearch/mode.pp @@ -0,0 +1,55 @@ +define nagios::check::opensearch::mode ( + $ensure, + $globalargs, + $modes_enabled, + $modes_disabled, + $servicegroups, + $check_title, + $check_period, + $contact_groups, + $first_notification_delay, + $max_check_attempts, + $notification_period, + $use, +) { + $mode = $title + if $ensure == 'absent' or + ( $modes_disabled != [] and $mode in $modes_disabled ) or + ( $modes_enabled != [] and ! 
( $mode in $modes_enabled ) ) { + + $ensure_mode = 'absent' + $fullargs = undef + + } else { + $ensure_mode = $ensure + # Get the args passed to the main class for our mode + $args_mode = getvar("::nagios::check::opensearch::args_${mode}") + if $mode == 'split_brain' { + # split_brain mode needs the node address through'-N', '-H' option is not need it + # since the query output already answer with the full cluster status + $fullargs = regsubst(strip("${globalargs} ${args_mode}"),'-H','-N') + } else { + $fullargs = strip("${globalargs} ${args_mode}") + } + } + + nagios::client::nrpe_file { "check_opensearch_${mode}": + ensure => $ensure_mode, + plugin => "check_es_${mode}", + args => $fullargs, + } + + nagios::service { "check_opensearch_${mode}_${check_title}": + ensure => $ensure_mode, + check_command => "check_nrpe_opensearch_${mode}", + service_description => "opensearch_${mode}", + servicegroups => $servicegroups, + check_period => $check_period, + contact_groups => $contact_groups, + first_notification_delay => $first_notification_delay, + notification_period => $notification_period, + max_check_attempts => $max_check_attempts, + use => $use, + } + +} \ No newline at end of file diff --git a/manifests/server.pp b/manifests/server.pp index 91512ff..35f806c 100644 --- a/manifests/server.pp +++ b/manifests/server.pp @@ -1024,6 +1024,36 @@ nagios_command { 'check_nrpe_elasticsearch_unassigned_shards': command_line => "${nrpe} -c check_elasticsearch_unassigned_shards", } + nagios_command { 'check_nrpe_opensearch_cluster_status': + command_line => "${nrpe} -c check_opensearch_cluster_status", + } + nagios_command { 'check_nrpe_opensearch_disk_usage': + command_line => "${nrpe} -c check_opensearch_disk_usage", + } + nagios_command { 'check_nrpe_opensearch_nodes': + command_line => "${nrpe} -c check_opensearch_nodes", + } + nagios_command { 'check_nrpe_opensearch_unassigned_shards': + command_line => "${nrpe} -c check_opensearch_unassigned_shards", + } + nagios_command { 'check_nrpe_opensearch_jvm_usage': + command_line => "${nrpe} -c check_opensearch_jvm_usage", + } + nagios_command { 'check_nrpe_opensearch_thread_pool_queues': + command_line => "${nrpe} -c check_opensearch_thread_pool_queues", + } + nagios_command { 'check_nrpe_opensearch_no_replica_indices': + command_line => "${nrpe} -c check_opensearch_no_replica_indices", + } + nagios_command { 'check_nrpe_opensearch_node_uptime': + command_line => "${nrpe} -c check_opensearch_node_uptime", + } + nagios_command { 'check_nrpe_opensearch_check_disk_space_for_resharding': + command_line => "${nrpe} -c check_opensearch_check_disk_space_for_resharding", + } + nagios_command { 'check_nrpe_opensearch_shard_capacity': + command_line => "${nrpe} -c check_opensearch_shard_capacity", + } nagios_command { 'check_nrpe_fluentbit_health': command_line => "${nrpe} -c check_fluentbit_health", }