Skip to content

Commit 4177a45

Browse files
committed
New macros:
- `splunkadmins_shutdown_time_by_period` New alerts: - `MonitoringConsole - Check OS ulimits via REST` - `SearchHeadLevel - Detect bundle pushes no longer occurring` New reports: - `DeploymentServer - Count by application` - contributed by @trex (radler) - `IndexerLevel - DataModel Acceleration - Indexes in use` - `SearchHeadLevel - Knowledge bundle status on indexers` - `SearchHeadLevel - Knowledge bundle replication times metrics.log` Updated alerts: - `AllSplunkEnterpriseLevel - Splunkd Log Messages Admins Only` Updated dashboards: - `splunk_introspection_io_stats` - updated names/description of fields used - `indexer_max_data_queue_sizes_by_name` - minor tweak to replication queue queries - `indexer_max_data_queue_sizes_by_name_v8` - minor tweak to replication queue queries - `splunk_forwarder_output_tuning` - comment update only Updated macros: - `splunkadmins_shutdown_time_by_period(4)` to work as expected Added link to Admins Little Helper for Splunk and TrackMe README.md improvements
1 parent 67192d0 commit 4177a45

10 files changed

+419
-124
lines changed

README.md

Lines changed: 173 additions & 105 deletions
Large diffs are not rendered by default.

app.manifest

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"id": {
66
"group": null,
77
"name": "SplunkAdmins",
8-
"version": "3.0.0"
8+
"version": "3.0.1"
99
},
1010
"author": [
1111
{

default/app.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ label = SplunkAdmins
1212
[launcher]
1313
author = Gareth Anderson
1414
description = Alerts and dashboards as described in the Splunk 2017 conf presentation How did you get so big?
15-
version = 3.0.0
15+
version = 3.0.1
1616

1717
[package]
1818
id = SplunkAdmins

default/data/ui/nav/default.xml

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%2520-%2520Core%2520Dumps%2520Disabled">Core Dumps Disabled</a>
1010
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%20-%20Transparent%20Huge%20Pages%20is%20enabled%20and%20should%20not%20be">Transparent Huge Pages is enabled and should not be</a>
1111
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%20-%20ulimit%20on%20Splunk%20enterprise%20servers%20is%20below%208192">ulimit on Splunk enterprise servers is below 8192</a>
12+
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FMonitoringConsole%20-%20Check%20OS%20ulimits%20via%20REST">MonitoringConsole - Check OS ulimits via REST</a>
1213
</collection>
1314
<collection label="Failures">
1415
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%2520-%2520KVStore%2520Process%2520Terminated">KVStore Process Terminated</a>
@@ -84,6 +85,7 @@
8485
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%20-%20Splunkd%20Log%20Messages%20Admins%20Only">Splunkd Log Messages Admins Only</a>
8586
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FDeploymentServer%20-%20Error%20Found%20On%20Deployment%20Server">Error Found On Deployment Server</a>
8687
</collection>
88+
<saved name="DeploymentServer - Count by application" />
8789
</collection>
8890
<collection label="ForwarderLevel">
8991
<collection label="OS Level Issues">
@@ -93,6 +95,7 @@
9395
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FForwarderLevel%20-%20Splunk%20Universal%20Forwarders%20that%20are%20time%20shifting">Splunk Universal Forwarders that are time shifting</a>
9496
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FForwarderLevel%20-%20Splunk%20universal%20forwarders%20with%20ulimit%20issues">Splunk universal forwarders with ulimit issues</a>
9597
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FForwarderLevel%20-%20Splunk%20Universal%20Forwarders%20Exceeding%20the%20File%20Descriptor%20Cache">Splunk Universal Forwarders Exceeding the File Descriptor Cache</a>
98+
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FMonitoringConsole%20-%20Check%20OS%20ulimits%20via%20REST">MonitoringConsole - Check OS ulimits via REST (useful for HF's only)</a>
9699
</collection>
97100
<collection label="File Monitoring issues">
98101
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FForwarderLevel%20-%20crcSalt%20or%20initCrcLength%20change%20may%20be%20required">crcSalt or initCrcLength change may be required</a>
@@ -101,6 +104,7 @@
101104
</collection>
102105
<collection label="Deployment Server">
103106
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkLevel%20-%20Splunk%20forwarders%20that%20are%20not%20talking%20to%20the%20deployment%20server">Splunk forwarders that are not talking to the deployment server</a>
107+
<saved name="DeploymentServer - Count by application" />
104108
</collection>
105109
<collection label="Splunk Level Issues">
106110
<collection label="Performance">
@@ -208,6 +212,7 @@
208212
<view name="hec_performance" />
209213
<view name="splunk_introspection_io_stats" />
210214
<saved name="IndexerLevel - Knowledge bundle upload stats" />
215+
<saved name="SearchHeadLevel - Knowledge bundle replication times metrics.log" />
211216
<saved name="SearchHeadLevel - Search Messages field extractor slow" />
212217
<saved name="IndexerLevel - IndexWriter pause duration" />
213218
</collection>
@@ -283,13 +288,14 @@
283288
<saved name="SearchHeadLevel - Search Queries summary exact match" />
284289
<saved name="SearchHeadLevel - Search Queries summary exact match by user" />
285290
<saved name="SearchHeadLevel - Search Queries summary exact match by index" />
286-
<saved name="SearchHeadLevel - IndexesPerUser Report" />
291+
<saved name="SearchHeadLevel - IndexesPerUser Report" />
287292
<saved name="IndexerLevel - RemoteSearches Indexes Stats" />
288-
<saved name="IndexerLevel - RemoteSearches Indexes Stats Wilcard" />
293+
<saved name="IndexerLevel - RemoteSearches Indexes Stats Wilcard" />
289294
</collection>
290295
<collection label="Data Models">
291296
<saved name="SearchHeadLevel - Data Model Acceleration Completion Status" />
292297
<saved name="SearchHeadLevel - DataModel Fields" />
298+
<saved name="IndexerLevel - DataModel Acceleration - Indexes in use" />
293299
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20datamodel%20errors%20in%20splunkd">datamodel errors in splunkd</a>
294300
<view name="data_model_rebuild_monitor" />
295301
<view name="data_model_status" />
@@ -310,6 +316,7 @@
310316
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20datamodel%20errors%20in%20splunkd">datamodel errors in splunkd</a>
311317
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FIndexerLevel%20-%20SmartStore%20-%20Bucket%20cache%20errors%20audit%20logs">IndexerLevel - SmartStore - Bucket cache errors audit logs</a>
312318
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkLevel%20-%20No%20recent%20metrics.log%20data">AllSplunkLevel - No recent metrics.log data</a>
319+
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20Detect%20bundle%20pushes%20no%20longer%20occurring">SearchHeadLevel - Detect bundle pushes no longer occurring</a>
313320
<collection label="Generic">
314321
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FAllSplunkEnterpriseLevel%20-%20Splunkd%20Log%20Messages%20Admins%20Only">Splunkd Log Messages Admins Only</a>
315322
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20Search%20Messages%20user%20level">Search Messages user level</a>
@@ -343,6 +350,7 @@
343350
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20Splunk%20Scheduler%20logs%20have%20not%20appeared%20in%20the%20last">Splunk Scheduler logs have not appeared in the last</a>
344351
</collection>
345352
<collection label="Other">
353+
<saved name="SearchHeadLevel - Knowledge bundle replication times metrics.log" />
346354
<saved name="SearchHeadLevel - audit logs showing all time searches" />
347355
<saved name="IndexerLevel - RemoteSearches find all time searches" />
348356
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20Excessive%20REST%20API%20usage">SearchHeadLevel - Excessive REST API usage</a>
@@ -361,6 +369,7 @@
361369
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FIndexerLevel%20-%20Slow%20peer%20from%20remote%20searches">Slow peer from remote searches</a>
362370
<saved name="SearchHeadLevel - Search Messages field extractor slow" />
363371
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20Excessive%20REST%20API%20usage">SearchHeadLevel - Excessive REST API usage</a>
372+
<saved name="SearchHeadLevel - Knowledge bundle replication times metrics.log" />
364373
</collection>
365374
<collection label="Proactive">
366375
<a href="/app/SplunkAdmins/alert?s=%2FservicesNS%2Fnobody%2FSplunkAdmins%2Fsaved%2Fsearches%2FSearchHeadLevel%20-%20LDAP%20users%20have%20been%20disabled%20or%20left%20the%20company%20cleanup%20required">LDAP users have been disabled or left the company cleanup required</a>
@@ -412,6 +421,8 @@
412421
<view name="knowledge_objects_by_app" />
413422
<view name="lookups_in_use_finder" />
414423
<view name="lookup_audit" />
424+
<saved name="SearchHeadLevel - Knowledge bundle status on indexers" />
425+
<saved name="SearchHeadLevel - Knowledge bundle replication times metrics.log" />
415426
</collection>
416427
<collection label="Summary_Reports">
417428
<saved name="SearchHeadLevel - platform_stats.audit metrics searches" />
@@ -454,6 +465,8 @@
454465
<a href="https://github.com/dpaper-splunk/public/tree/master/dashboards" target="_blank">Extended Search Reporting (and others)</a>
455466
<a href="https://github.com/nicovdw/splunk_concurrency_helper" target="_blank">Search Scheduler Tuning searches</a>
456467
<a href="https://splunkbase.splunk.com/app/6449/" target="_blank">Sideview UI (User Activity details)</a>
468+
<a href="https://splunkbase.splunk.com/app/6368/" target="_blank">Admins Little Helper for Splunk (btool, bundle utils and similar)</a>
469+
<a href="https://splunkbase.splunk.com/app/4621/" target="_blank">TrackMe (Data Ingestion)</a>
457470
</collection>
458471
</collection>
459472
<collection label="Summary_Reports">

default/data/ui/views/indexer_max_data_queue_sizes_by_name.xml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,13 @@
127127
<chart>
128128
<title>The replication queue appears to directly relate to the indexing queue; any blockage of the indexing queue will then block the replication queue and temporarily slow data ingestion. The replication queue appears to be extremely sensitive to the other indexers' indexing queue so it can be a useful measure of an issue...</title>
129129
<search>
130-
<query>index=_internal `indexerhosts` "replication queue for " "full" OR "has room now" sourcetype=splunkd | rename peer AS guid | join guid [| rest /services/search/distributed/peers | table guid peerName] | transaction bid peer endswith="has room now" keeporphans=true | timechart span=1m count, max(duration) AS duration by peerName</query>
130+
<query>index=_internal `indexerhosts` "replication queue for " "full" OR "has room now" sourcetype=splunkd
131+
| rename peer AS guid
132+
| join guid
133+
[| rest /services/search/distributed/peers
134+
| table guid peerName]
135+
| transaction bid guid endswith="has room now" keeporphans=true
136+
| timechart span=1m count, max(duration) AS duration by peerName</query>
131137
<earliest>-60m@m</earliest>
132138
<latest>now</latest>
133139
<sampleRatio>1</sampleRatio>

default/data/ui/views/indexer_max_data_queue_sizes_by_name_v8.xml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,13 @@
127127
<chart>
128128
<title>The replication queue appears to directly relate to the indexing queue; any blockage of the indexing queue will then block the replication queue and temporarily slow data ingestion. The replication queue appears to be extremely sensitive to the other indexers' indexing queue so it can be a useful measure of an issue...</title>
129129
<search>
130-
<query>index=_internal `indexerhosts` "replication queue for " "full" OR "has room now" sourcetype=splunkd | rename peer AS guid | join guid [| rest /services/search/distributed/peers | table guid peerName] | transaction bid peer endswith="has room now" keeporphans=true | timechart span=1m count, max(duration) AS duration by peerName</query>
130+
<query>index=_internal `indexerhosts` "replication queue for " "full" OR "has room now" sourcetype=splunkd
131+
| rename peer AS guid
132+
| join guid
133+
[| rest /services/search/distributed/peers
134+
| table guid peerName]
135+
| transaction bid guid endswith="has room now" keeporphans=true
136+
| timechart span=1m count, max(duration) AS duration by peerName</query>
131137
<earliest>-60m@m</earliest>
132138
<latest>now</latest>
133139
<sampleRatio>1</sampleRatio>

default/data/ui/views/splunk_forwarder_output_tuning.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@
105105
<p>Purpose of destination count table? metrics.log only records the tcpout data *if* the connection is open at the time the metrics.log writes, so the count is to sanity-check that the number of connections matches the number of forwarders on the backend (this will happen with the below outputs.conf settings combined with regular data flow)</p>
106106
<br/>
107107
<p><a href="https://www.linkedin.com/pulse/splunk-asynchronous-forwarding-lightning-fast-data-ingestor-rawat"> Splunk Asynchronous Forwarding (Lightning-fast data ingestor)</a></p>
108-
<p>Purpose of the data output per-second timechart? The current goal is to get close to switching indexers every second for an output group (per-pipeline), note that this will result in more open connections to indexers so only really works if this is deployed to a moderate number of intermediate forwarders (HF's or similar). Note that you want to do this with autoLBVolume, if you lower autoLBFrequency to a very short time period you may result in un-even data balance due to switching frequently when forwarding smaller volumes of data</p>
108+
<p>Purpose of the data output per-second timechart? The current goal is to get close to switching indexers every second for an output group (per-pipeline), note that this will result in more open connections to indexers so only really works if this is deployed to a moderate number of intermediate forwarders (HF's or similar). Note that you want to do this with autoLBVolume, if you lower autoLBFrequency to a very short time period you may result in un-even data balance due to switching frequently when forwarding smaller volumes of data. In my testing so far, setting autoLBVolume above the average kb/s appears to work well, while setting it too low does not</p>
109109
<p>Please read the linked article for information on these settings, note that when using async forwarding the open file descriptor usage is higher than without async forwarding as the connections are held open by forwarders. So this works great on an intermediate forwarding tier, this may not work so well with a very large number of forwarders</p>
110110
<p>Also note that the maxQueueSize should not be below 10MB (10MB minimum size)</p>
111111
<p>Finally while this also works on UF's, there are some reasons why you may want to consider HF's if you are running an intermediate tier, answers post <a href="https://community.splunk.com/t5/Getting-Data-In/Wrongly-merged-Events-permanently-blocked-tcpout-queue-with/m-p/508743">Wrongly merged Events/permanently blocked tcpout queue with Intermediate Universal Forwarder</a></p>

default/data/ui/views/splunk_introspection_io_stats.xml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,13 @@
2121
<panel>
2222
<title>data.avg_total_ms (average wait time)</title>
2323
<chart>
24-
<title>perc95 average wait time per host (sum of all disks)</title>
24+
<title>perc95 total io service time per host (sum of all disks avg_total_ms)</title>
2525
<search>
2626
<query>index=_introspection sourcetype=splunk_resource_usage component=IOStats $hosts$ data.device=nvme*
27-
| eval avg_wait_ms = 'data.avg_total_ms'
27+
| eval avg_total_ms = 'data.avg_total_ms', comment="You may wish to change sum(avg_total_ms) for perc95 or similar depending on your setup..."
2828
| bin _time span=$span$
29-
| stats sum(avg_wait_ms) AS avg_wait_ms by host, _time
30-
| timechart span=$span$ partial=f limit=99 perc95(avg_wait_ms) AS avg_wait_ms by host</query>
29+
| stats sum(avg_total_ms) AS avg_total_ms by host, _time
30+
| timechart span=$span$ partial=f limit=99 perc95(avg_total_ms) AS avg_total_ms by host</query>
3131
<earliest>$time.earliest$</earliest>
3232
<latest>$time.latest$</latest>
3333
<sampleRatio>1</sampleRatio>

default/macros.conf

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,22 @@ index=_internal (`$macroName$`) sourcetype=splunkd `splunkadmins_splunkd_source`
151151
| rex mode=sed field=search "s/\"//g"
152152
iseval = 0
153153

154+
# variation of the above to utilise smaller blocks of time during the search
155+
[splunkadmins_shutdown_time_by_period(4)]
156+
args = macroName, minTimeContingency, maxTimeContingency, period
157+
definition = search `comment("Send an exclusion list in terms of a search result for the time when any indexer was shutdown")`\
158+
index=_internal (`$macroName$`) sourcetype=splunkd `splunkadmins_splunkd_source` (CASE("Shutting down")) OR "Shutdown complete in" OR "Received shutdown signal." OR "master has instructed peer to restart" OR "Performing early shutdown tasks"\
159+
| eval message=coalesce(message,event_message)\
160+
| bin _time span=$period$\
161+
| stats min(_time) AS logTime by message, host, _time\
162+
| stats min(logTime) AS minTime, max(logTime) AS maxTime by _time\
163+
| eval minTime=minTime - $minTimeContingency$, maxTime=maxTime + $maxTimeContingency$\
164+
| eval search=" _time>" . minTime . " _time<" .maxTime\
165+
| fields search\
166+
| format\
167+
| rex mode=sed field=search "s/\"//g"
168+
iseval = 0
169+
154170

155171
##############
156172
#
@@ -840,3 +856,19 @@ iseval = 0
840856
[splunkadmins_indexerqueue_count]
841857
definition = 1
842858
iseval = 0
859+
860+
[splunkadmins_deploymentserver_splunkserver]
861+
definition = splunk_server=localhost
862+
iseval = 0
863+
864+
[splunkadmins_sh_knowledgebundle_metrics_filter]
865+
definition = where replication_time_msec>200000
866+
iseval = 0
867+
868+
[splunkadmins_sh_knowledgebundle_metrics_timespan]
869+
definition = 60m
870+
iseval = 0
871+
872+
[splunkadmins_bundlepush_span]
873+
definition = 10m
874+
iseval = 0

0 commit comments

Comments
 (0)