New macros:

gjanders · gjanders · commit 4177a45c0c10 · 2022-09-03T14:55:10.000+10:00
- `splunkadmins_shutdown_time_by_period`

New alerts:
- `MonitoringConsole - Check OS ulimits via REST`
- `SearchHeadLevel - Detect bundle pushes no longer occurring`

New reports:
- `DeploymentServer - Count by application` - contributed by @trex (radler)
- `IndexerLevel - DataModel Acceleration - Indexes in use`
- `SearchHeadLevel - Knowledge bundle status on indexers`
- `SearchHeadLevel - Knowledge bundle replication times metrics.log`

Updated alerts:
- `AllSplunkEnterpriseLevel - Splunkd Log Messages Admins Only`

Updated dashboards:
- `splunk_introspection_io_stats` - updated names/description of fields used
- `indexer_max_data_queue_sizes_by_name` - minor tweak to replication queue queries
- `indexer_max_data_queue_sizes_by_name_v8` - minor tweak to replication queue queries
- `splunk_forwarder_output_tuning` - comment update only

Updated macros:
- `splunkadmins_shutdown_time_by_period(4)` to work as expected

Added link to Admins Little Helper for Splunk and TrackMe
README.md improvements
diff --git a/README.md b/README.md
diff --git a/app.manifest b/app.manifest
@@ -5,7 +5,7 @@
     "id": {
       "group": null,
       "name": "SplunkAdmins",
-      "version": "3.0.0"
+      "version": "3.0.1"
     },
     "author": [
       {
diff --git a/default/app.conf b/default/app.conf
@@ -12,7 +12,7 @@ label = SplunkAdmins
 [launcher]
 author = Gareth Anderson
 description = Alerts and dashboards as described in the Splunk 2017 conf presentation How did you get so big?
-version = 3.0.0
+version = 3.0.1
 
 [package]
 id = SplunkAdmins
diff --git a/default/data/ui/nav/default.xml b/default/data/ui/nav/default.xml
@@ -9,6 +9,7 @@
 				<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%2520-%2520Core%2520Dumps%2520Disabled">Core Dumps Disabled</a> 
 				<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%20-%20Transparent%20Huge%20Pages%20is%20enabled%20and%20should%20not%20be">Transparent Huge Pages is enabled and should not be</a>
 				<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%20-%20ulimit%20on%20Splunk%20enterprise%20servers%20is%20below%208192">ulimit on Splunk enterprise servers is below 8192</a>
+				<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FMonitoringConsole%20-%20Check%20OS%20ulimits%20via%20REST">MonitoringConsole - Check OS ulimits via REST</a>
 			</collection>   
 			<collection label="Failures">
 				<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%2520-%2520KVStore%2520Process%2520Terminated">KVStore Process Terminated</a>	
@@ -84,6 +85,7 @@
                         <a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%20-%20Splunkd%20Log%20Messages%20Admins%20Only">Splunkd Log Messages Admins Only</a>
 	                <a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FDeploymentServer%20-%20Error%20Found%20On%20Deployment%20Server">Error Found On Deployment Server</a>
                 </collection>
+		<saved name="DeploymentServer - Count by application" />		
 	</collection>
 	<collection label="ForwarderLevel">  
 		<collection label="OS Level Issues">  
@@ -93,6 +95,7 @@
 			<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FForwarderLevel%20-%20Splunk%20Universal%20Forwarders%20that%20are%20time%20shifting">Splunk Universal Forwarders that are time shifting</a>
 			<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FForwarderLevel%20-%20Splunk%20universal%20forwarders%20with%20ulimit%20issues">Splunk universal forwarders with ulimit issues</a>			
 			<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FForwarderLevel%20-%20Splunk%20Universal%20Forwarders%20Exceeding%20the%20File%20Descriptor%20Cache">Splunk Universal Forwarders Exceeding the File Descriptor Cache</a>			
+			<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FMonitoringConsole%20-%20Check%20OS%20ulimits%20via%20REST">MonitoringConsole - Check OS ulimits via REST (useful for HF's only)</a>
 		</collection>		
 		<collection label="File Monitoring issues">
 			<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FForwarderLevel%20-%20crcSalt%20or%20initCrcLength%20change%20may%20be%20required">crcSalt or initCrcLength change may be required</a>
@@ -101,6 +104,7 @@
 		</collection>			
 		<collection label="Deployment Server">
 			<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkLevel%20-%20Splunk%20forwarders%20that%20are%20not%20talking%20to%20the%20deployment%20server">Splunk forwarders that are not talking to the deployment server</a>
+			<saved name="DeploymentServer - Count by application" />
 		</collection>
 		<collection label="Splunk Level Issues">
 			<collection label="Performance">
@@ -208,6 +212,7 @@
                                 <view name="hec_performance" />
 				<view name="splunk_introspection_io_stats" />
 				<saved name="IndexerLevel - Knowledge bundle upload stats" />
+				<saved name="SearchHeadLevel - Knowledge bundle replication times metrics.log" />							
 	                        <saved name="SearchHeadLevel - Search Messages field extractor slow" />
 				<saved name="IndexerLevel - IndexWriter pause duration" />				
 			</collection>
@@ -283,13 +288,14 @@
                         <saved name="SearchHeadLevel - Search Queries summary exact match" />
                         <saved name="SearchHeadLevel - Search Queries summary exact match by user" />
                         <saved name="SearchHeadLevel - Search Queries summary exact match by index" />
-                        <saved name="SearchHeadLevel - IndexesPerUser Report" />
+                        <saved name="SearchHeadLevel - IndexesPerUser Report" />			
                         <saved name="IndexerLevel - RemoteSearches Indexes Stats" />
-                        <saved name="IndexerLevel - RemoteSearches Indexes Stats Wilcard" />
+                        <saved name="IndexerLevel - RemoteSearches Indexes Stats Wilcard" />			
 		</collection>      
 		<collection label="Data Models">				
 			<saved name="SearchHeadLevel - Data Model Acceleration Completion Status" />
 			<saved name="SearchHeadLevel - DataModel Fields" />
+			<saved name="IndexerLevel - DataModel Acceleration - Indexes in use" />
                         <a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20datamodel%20errors%20in%20splunkd">datamodel errors in splunkd</a>			
 			<view name="data_model_rebuild_monitor" />
 			<view name="data_model_status" />			
@@ -310,6 +316,7 @@
 			<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20datamodel%20errors%20in%20splunkd">datamodel errors in splunkd</a>
                         <a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FIndexerLevel%20-%20SmartStore%20-%20Bucket%20cache%20errors%20audit%20logs">IndexerLevel - SmartStore - Bucket cache errors audit logs</a>
                         <a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkLevel%20-%20No%20recent%20metrics.log%20data">AllSplunkLevel - No recent metrics.log data</a>
+			<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20Detect%20bundle%20pushes%20no%20longer%20occurring">SearchHeadLevel - Detect bundle pushes no longer occurring</a>
 	                <collection label="Generic">
 	                        <a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%20-%20Splunkd%20Log%20Messages%20Admins%20Only">Splunkd Log Messages Admins Only</a>
 	                        <a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20Search%20Messages%20user%20level">Search Messages user level</a>
@@ -343,6 +350,7 @@
 				<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20Splunk%20Scheduler%20logs%20have%20not%20appeared%20in%20the%20last">Splunk Scheduler logs have not appeared in the last</a>
 	  		</collection>									
 			<collection label="Other">
+				<saved name="SearchHeadLevel - Knowledge bundle replication times metrics.log" />							
 	                        <saved name="SearchHeadLevel - audit logs showing all time searches" />
 	                        <saved name="IndexerLevel - RemoteSearches find all time searches" />
 				<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20Excessive%20REST%20API%20usage">SearchHeadLevel - Excessive REST API usage</a>			
@@ -361,6 +369,7 @@
 			<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FIndexerLevel%20-%20Slow%20peer%20from%20remote%20searches">Slow peer from remote searches</a>
                         <saved name="SearchHeadLevel - Search Messages field extractor slow" />
 			<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20Excessive%20REST%20API%20usage">SearchHeadLevel - Excessive REST API usage</a>			
+			<saved name="SearchHeadLevel - Knowledge bundle replication times metrics.log" />			
 		</collection>		
 		<collection label="Proactive">			
 			<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20LDAP%20users%20have%20been%20disabled%20or%20left%20the%20company%20cleanup%20required">LDAP users have been disabled or left the company cleanup required</a>		
@@ -412,6 +421,8 @@
 			<view name="knowledge_objects_by_app" />
 			<view name="lookups_in_use_finder" />
 			<view name="lookup_audit" />
+			<saved name="SearchHeadLevel - Knowledge bundle status on indexers" />
+			<saved name="SearchHeadLevel - Knowledge bundle replication times metrics.log" />			
 		</collection>	
 		<collection label="Summary_Reports">
         		<saved name="SearchHeadLevel - platform_stats.audit metrics searches" />
@@ -454,6 +465,8 @@
 			<a href="https://github.com/dpaper-splunk/public/tree/master/dashboards" target="_blank">Extended Search Reporting (and others)</a>
 			<a href="https://github.com/nicovdw/splunk_concurrency_helper" target="_blank">Search Scheduler Tuning searches</a>
 			<a href="https://splunkbase.splunk.com/app/6449/" target="_blank">Sideview UI (User Activity details)</a>
+			<a href="https://splunkbase.splunk.com/app/6368/" target="_blank">Admins Little Helper for Splunk (btool, bundle utils and similar)</a>
+			<a href="https://splunkbase.splunk.com/app/4621/" target="_blank">TrackMe (Data Ingestion)</a>
 		</collection>
 	</collection>	
         <collection label="Summary_Reports">
diff --git a/default/data/ui/views/indexer_max_data_queue_sizes_by_name.xml b/default/data/ui/views/indexer_max_data_queue_sizes_by_name.xml
@@ -127,7 +127,13 @@
       <chart>
         <title>The replication queue appears to directly relate to the indexing queue, any blockage of the indexing queue will then block the replication queue and temporarily slow data ingestion. The replication queue appears to be extremely sensitive to the other indexers indexing queue so it can be a useful measure of an issue...</title>
         <search>
-          <query>index=_internal `indexerhosts` "replication queue for " "full" OR "has room now" sourcetype=splunkd | rename peer AS guid | join guid [| rest /services/search/distributed/peers | table guid peerName] | transaction bid peer endswith="has room now" keeporphans=true | timechart span=1m count, max(duration) AS duration by peerName</query>
+          <query>index=_internal `indexerhosts` "replication queue for " "full" OR "has room now" sourcetype=splunkd
+| rename peer AS guid
+| join guid
+    [| rest /services/search/distributed/peers
+    | table guid peerName]
+| transaction bid guid endswith="has room now" keeporphans=true
+| timechart span=1m count, max(duration) AS duration by peerName</query>	  
           <earliest>-60m@m</earliest>
           <latest>now</latest>
           <sampleRatio>1</sampleRatio>
diff --git a/default/data/ui/views/indexer_max_data_queue_sizes_by_name_v8.xml b/default/data/ui/views/indexer_max_data_queue_sizes_by_name_v8.xml
@@ -127,7 +127,13 @@
       <chart>
         <title>The replication queue appears to directly relate to the indexing queue, any blockage of the indexing queue will then block the replication queue and temporarily slow data ingestion. The replication queue appears to be extremely sensitive to the other indexers indexing queue so it can be a useful measure of an issue...</title>
         <search>
-          <query>index=_internal `indexerhosts` "replication queue for " "full" OR "has room now" sourcetype=splunkd | rename peer AS guid | join guid [| rest /services/search/distributed/peers | table guid peerName] | transaction bid peer endswith="has room now" keeporphans=true | timechart span=1m count, max(duration) AS duration by peerName</query>
+          <query>index=_internal `indexerhosts` "replication queue for " "full" OR "has room now" sourcetype=splunkd
+| rename peer AS guid
+| join guid
+    [| rest /services/search/distributed/peers
+    | table guid peerName]
+| transaction bid guid endswith="has room now" keeporphans=true
+| timechart span=1m count, max(duration) AS duration by peerName</query>
           <earliest>-60m@m</earliest>
           <latest>now</latest>
           <sampleRatio>1</sampleRatio>
diff --git a/default/data/ui/views/splunk_forwarder_output_tuning.xml b/default/data/ui/views/splunk_forwarder_output_tuning.xml
@@ -105,7 +105,7 @@
           <p>Purpose of destination count table? metrics.log only records the tcpout data *if* the connection is open at the time the metrics.log writes, so the count is to sanity-check that the numbers of connections matches the number of forwarders on the backend (this will happen with the below outputs.conf settings combined with regular data flow)</p>
           <br/>
 <p><a href="https://www.linkedin.com/pulse/splunk-asynchronous-forwarding-lightning-fast-data-ingestor-rawat"> Splunk Asynchronous Forwarding (Lightning-fast data ingestor)</a></p>
-    <p>Purpose of the data output per-second timechart? The current goal is to get close to switching indexers every second for an output group (per-pipeline), note that this will result in more open connections to indexers so only really works if this is deployed to a moderate number of intermediate forwarders (HF's or similar). Note that you want to do this with autoLBVolume, if you lower autoLBFrequency to a very short time period you may result in un-even data balance due to switching frequently when forwarding smaller volumes of data</p>
+    <p>Purpose of the data output per-second timechart? The current goal is to get close to switching indexers every second for an output group (per-pipeline), note that this will result in more open connections to indexers so only really works if this is deployed to a moderate number of intermediate forwarders (HF's or similar). Note that you want to do this with autoLBVolume, if you lower autoLBFrequency to a very short time period you may end up with uneven data balance due to switching frequently when forwarding smaller volumes of data. In my testing so far, aiming above the average kb/s for the autoLBVolume appears to work well; going too low does not work well</p>
     <p>Please read the linked article for information on these settings, note that when using async forwarding the open file descriptor usage is higher than without async forwarding as the connections are held open by forwarders. So this works great on an intermediate forwarding tier, this may not work so well with a very large number of forwarders</p>
     <p>Also note that the maxQueueSize should not be below 10MB (10MB minimum size)</p>
     <p>Finally while this also works on UF's, there are some reasons why you may want to consider HF's if you are running an intermediate tier, answers post <a href="https://community.splunk.com/t5/Getting-Data-In/Wrongly-merged-Events-permanently-blocked-tcpout-queue-with/m-p/508743">Wrongly merged Events/permanently blocked tcpout queue with Intermediate Universal Forwarder</a></p>
diff --git a/default/data/ui/views/splunk_introspection_io_stats.xml b/default/data/ui/views/splunk_introspection_io_stats.xml
@@ -21,13 +21,13 @@
     <panel>
       <title>data.avg_total_ms (average wait time)</title>
       <chart>
-        <title>perc95 average wait time per host (sum of all disks)</title>
+        <title>perc95 total io service time per host (sum of all disks avg_total_ms)</title>
         <search>
           <query>index=_introspection sourcetype=splunk_resource_usage component=IOStats $hosts$ data.device=nvme* 
-| eval avg_wait_ms = 'data.avg_total_ms' 
+| eval avg_total_ms = 'data.avg_total_ms', comment="You may wish to change sum(avg_total_ms) for perc95 or similar depending on your setup..." 
 | bin _time span=$span$
-| stats sum(avg_wait_ms) AS avg_wait_ms by host, _time
-| timechart span=$span$ partial=f limit=99 perc95(avg_wait_ms) AS avg_wait_ms by host</query>
+| stats sum(avg_total_ms) AS avg_total_ms by host, _time
+| timechart span=$span$ partial=f limit=99 perc95(avg_total_ms) AS avg_total_ms by host</query>
           <earliest>$time.earliest$</earliest>
           <latest>$time.latest$</latest>
           <sampleRatio>1</sampleRatio>
diff --git a/default/macros.conf b/default/macros.conf
@@ -151,6 +151,22 @@ index=_internal (`$macroName$`) sourcetype=splunkd `splunkadmins_splunkd_source`
 | rex mode=sed field=search "s/\"//g"
 iseval = 0
 
+# variation of the above that bins the shutdown events by the period argument, so each time block produces its own exclusion window
+[splunkadmins_shutdown_time_by_period(4)]
+args = macroName, minTimeContingency, maxTimeContingency, period
+definition = search `comment("Send an exclusion list in terms of a search result for the time when any indexer was shutdown")`\
+index=_internal (`$macroName$`) sourcetype=splunkd `splunkadmins_splunkd_source` (CASE("Shutting down")) OR "Shutdown complete in" OR "Received shutdown signal." OR "master has instructed peer to restart" OR "Performing early shutdown tasks"\
+| eval message=coalesce(message,event_message)\
+| bin _time span=$period$\
+| stats min(_time) AS logTime by message, host, _time\
+| stats min(logTime) AS minTime, max(logTime) AS maxTime by _time\
+| eval minTime=minTime - $minTimeContingency$, maxTime=maxTime + $maxTimeContingency$\
+| eval search=" _time>" . minTime . " _time<" .maxTime\
+| fields search\
+| format\
+| rex mode=sed field=search "s/\"//g"
+iseval = 0
+
 
 ##############
 #
@@ -840,3 +856,19 @@ iseval = 0
 [splunkadmins_indexerqueue_count]
 definition = 1
 iseval = 0
+
+[splunkadmins_deploymentserver_splunkserver]
+definition = splunk_server=localhost
+iseval = 0
+
+[splunkadmins_sh_knowledgebundle_metrics_filter]
+definition = where replication_time_msec>200000
+iseval = 0
+
+[splunkadmins_sh_knowledgebundle_metrics_timespan]
+definition = 60m
+iseval = 0
+
+[splunkadmins_bundlepush_span]
+definition = 10m
+iseval = 0
diff --git a/default/savedsearches.conf b/default/savedsearches.conf

Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@`
`5`	`5`	`"id": {`
`6`	`6`	`"group": null,`
`7`	`7`	`"name": "SplunkAdmins",`
`8`		`- "version": "3.0.0"`
	`8`	`+ "version": "3.0.1"`
`9`	`9`	`},`
`10`	`10`	`"author": [`
`11`	`11`	`{`