Skip to content

Commit deaa777

Browse files
authored
Merge branch 'prometheus-community:master' into allamiro-dev
2 parents bdffbf2 + 39b36d0 commit deaa777

File tree

10 files changed

+267
-63
lines changed

10 files changed

+267
-63
lines changed

.github/workflows/lint.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ jobs:
99
flake8:
1010
runs-on: ubuntu-latest
1111
steps:
12-
- uses: actions/checkout@v3
13-
- uses: actions/setup-python@v4
12+
- uses: actions/checkout@v4
13+
- uses: actions/setup-python@v5
1414
with:
15-
python-version: '3.11'
15+
python-version: '3.12'
1616
- uses: py-actions/flake8@v2
1717

1818
shellcheck:
1919
runs-on: ubuntu-latest
2020
steps:
21-
- uses: actions/checkout@v3
21+
- uses: actions/checkout@v4
2222
- uses: ludeeus/action-shellcheck@master

apt_info.py

Lines changed: 34 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,26 @@
11
#!/usr/bin/env python3
2-
#
3-
# Description: Expose metrics from apt. This is inspired by and
4-
# intended to be a replacement for the original apt.sh.
5-
#
6-
# This script deliberately does *not* update the apt cache. You need
7-
# something else to run `apt update` regularly for the metrics to be
8-
# up to date. This can be done in numerous ways, but the canonical way
9-
# is to use the normal `APT::Periodic::Update-Package-Lists`
10-
# setting.
11-
#
12-
# This, for example, will enable a nightly job that runs `apt update`:
13-
#
14-
# echo 'APT::Periodic::Update-Package-Lists "1";' > /etc/apt/apt.conf.d/99_auto_apt_update.conf
15-
#
16-
# See /usr/lib/apt/apt.systemd.daily for details.
17-
#
18-
# Dependencies: python3-apt, python3-prometheus-client
19-
#
20-
# Authors: Kyle Fazzari <kyrofa@ubuntu.com>
21-
# Daniel Swarbrick <dswarbrick@debian.org>
2+
3+
"""
4+
Description: Expose metrics from apt. This is inspired by and
5+
intended to be a replacement for the original apt.sh.
6+
7+
This script deliberately does *not* update the apt cache. You need
8+
something else to run `apt update` regularly for the metrics to be
9+
up to date. This can be done in numerous ways, but the canonical way
10+
is to use the normal `APT::Periodic::Update-Package-Lists`
11+
setting.
12+
13+
This, for example, will enable a nightly job that runs `apt update`:
14+
15+
echo 'APT::Periodic::Update-Package-Lists "1";' > /etc/apt/apt.conf.d/99_auto_apt_update.conf
16+
17+
See /usr/lib/apt/apt.systemd.daily for details.
18+
19+
Dependencies: python3-apt, python3-prometheus-client
20+
21+
Authors: Kyle Fazzari <kyrofa@ubuntu.com>
22+
Daniel Swarbrick <dswarbrick@debian.org>
23+
"""
2224

2325
import apt
2426
import apt_pkg
@@ -52,12 +54,8 @@ def _convert_candidates_to_upgrade_infos(candidates):
5254

5355

5456
def _write_pending_upgrades(registry, cache):
55-
# Discount any changes that apply to packages that aren't installed (e.g.
56-
# count an upgrade to package A that adds a new dependency on package B as
57-
# only one upgrade, not two). See the following issue for more details:
58-
# https://github.com/prometheus-community/node-exporter-textfile-collector-scripts/issues/85
5957
candidates = {
60-
p.candidate for p in cache.get_changes() if p.is_installed and p.marked_upgrade
58+
p.candidate for p in cache if p.is_upgradable
6159
}
6260
upgrade_list = _convert_candidates_to_upgrade_infos(candidates)
6361

@@ -69,7 +67,10 @@ def _write_pending_upgrades(registry, cache):
6967

7068

7169
def _write_held_upgrades(registry, cache):
72-
held_candidates = {p.candidate for p in cache if p.is_upgradable and p.marked_keep}
70+
held_candidates = {
71+
p.candidate for p in cache
72+
if p.is_upgradable and p._pkg.selected_state == apt_pkg.SELSTATE_HOLD
73+
}
7374
upgrade_list = _convert_candidates_to_upgrade_infos(held_candidates)
7475

7576
if upgrade_list:
@@ -89,13 +90,16 @@ def _write_autoremove_pending(registry, cache):
8990
def _write_cache_timestamps(registry):
9091
g = Gauge('apt_package_cache_timestamp_seconds', "Apt update last run time.", registry=registry)
9192
apt_pkg.init_config()
92-
if apt_pkg.config.find_b("APT::Periodic::Update-Package-Lists"):
93+
if (
94+
apt_pkg.config.find_b("APT::Periodic::Update-Package-Lists") and
95+
os.path.isfile("/var/lib/apt/periodic/update-success-stamp")
96+
):
9397
# if we run updates automatically with APT::Periodic, we can
94-
# check this timestamp file
98+
# check this timestamp file if it exists
9599
stamp_file = "/var/lib/apt/periodic/update-success-stamp"
96100
else:
97-
# if not, let's just fallback on the lists directory
98-
stamp_file = '/var/lib/apt/lists'
101+
# if not, let's just fall back on the `partial` path inside the lists directory
102+
stamp_file = '/var/lib/apt/lists/partial'
99103
try:
100104
g.set(os.stat(stamp_file).st_mtime)
101105
except OSError:

btrfs_stats.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
from prometheus_client import CollectorRegistry, Gauge, generate_latest
1313

1414

15+
DEVICE_PATTERN = re.compile(r"^\[([^\]]+)\]\.(\S+)\s+(\d+)$")
16+
17+
1518
def get_btrfs_mount_points():
1619
"""List all btrfs mount points.
1720
@@ -47,7 +50,7 @@ def get_btrfs_errors(mountpoint):
4750
continue
4851
# Sample line:
4952
# [/dev/vdb1].flush_io_errs 0
50-
m = re.search(r"^\[([^\]]+)\]\.(\S+)\s+(\d+)$", line.decode("utf-8"))
53+
m = DEVICE_PATTERN.match(line.decode("utf-8"))
5154
if not m:
5255
raise RuntimeError("unexpected output from btrfs: '%s'" % line)
5356
yield m.group(1), m.group(2), int(m.group(3))

directory-size.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,4 @@
1313
echo "# HELP node_directory_size_bytes Disk space used by some directories"
1414
echo "# TYPE node_directory_size_bytes gauge"
1515
du --block-size=1 --summarize "$@" \
16-
| sed -ne 's/\\/\\\\/;s/"/\\"/g;s/^\([0-9]\+\)\t\(.*\)$/node_directory_size_bytes{directory="\2"} \1/p'
16+
| awk '{ print "node_directory_size_bytes{directory=\"" $2 "\"} " $1 }'

needrestart_info.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
Description: Expose metrics from needrestart.
5+
6+
This script runs needrestart in batch mode. It will never ask for input
7+
and will never restart or upgrade anything.
8+
9+
Dependencies: python >= 3.7, python3-prometheus-client, needrestart
10+
11+
Authors: RomainMou
12+
"""
13+
14+
import sys
15+
import time
16+
import subprocess
17+
from collections import Counter
18+
from enum import Enum
19+
20+
from prometheus_client import (
21+
CollectorRegistry,
22+
Gauge,
23+
generate_latest,
24+
)
25+
26+
27+
class KernelStatus(Enum):
    """Kernel status codes, as carried by the NEEDRESTART-KSTA line."""

    UNKNOWN = 0
    CURRENT = 1
    ABI_UPGRADE = 2
    VERSION_UPGRADE = 3
32+
33+
34+
class MicroCodeStatus(Enum):
    """Microcode status codes, as carried by the NEEDRESTART-UCSTA line."""

    UNKNOWN = 0
    CURRENT = 1
    OBSOLETE = 2
38+
39+
40+
class NeedRestartData:
    """Parsed result of a single ``needrestart -b`` run.

    The batch output is read as ``KEY: value`` lines. Known keys fill the
    version / kernel / microcode attributes; every other key is only
    counted, and the counts for services, containers and sessions are
    exposed as ``services_count``, ``containers_count`` and
    ``sessions_count``.
    """

    def __init__(self, needrestart_output):
        """Parse the text output of needrestart's batch mode.

        Args:
            needrestart_output: Text made of ``KEY: value`` lines.

        Raises:
            ValueError: If a NEEDRESTART-KSTA / NEEDRESTART-UCSTA value is
                not an integer known to KernelStatus / MicroCodeStatus.
        """
        # Defaults, kept when the corresponding key is absent from the output.
        self.timestamp = int(time.time())
        self.version = None
        self.kernel_status = None
        self.microcode_status = None
        self.kernel_current_version = ""
        self.kernel_expected_version = ""
        self.microcode_current_version = ""
        self.microcode_expected_version = ""
        needrestart_counter = Counter()

        # Parse the command output line by line.
        for line in needrestart_output.splitlines():
            key, separator, value = line.partition(": ")
            if not separator:
                # Blank or free-form line that is not "KEY: value"; skip it
                # instead of aborting the whole run with a ValueError.
                continue

            if key == "NEEDRESTART-VER":
                self.version = value
            # Kernel information
            elif key == "NEEDRESTART-KCUR":
                self.kernel_current_version = value
            elif key == "NEEDRESTART-KEXP":
                self.kernel_expected_version = value
            elif key == "NEEDRESTART-KSTA":
                self.kernel_status = KernelStatus(int(value))
            # Microcode information
            elif key == "NEEDRESTART-UCCUR":
                self.microcode_current_version = value
            elif key == "NEEDRESTART-UCEXP":
                self.microcode_expected_version = value
            elif key == "NEEDRESTART-UCSTA":
                self.microcode_status = MicroCodeStatus(int(value))
            # Anything else is only counted, one hit per line.
            else:
                needrestart_counter[key] += 1

        # Counter returns 0 for keys that never appeared.
        self.services_count = needrestart_counter["NEEDRESTART-SVC"]
        self.containers_count = needrestart_counter["NEEDRESTART-CONT"]
        self.sessions_count = needrestart_counter["NEEDRESTART-SESS"]
79+
80+
81+
def write_timestamp(registry, needrestart_data):
    """Register the run-timestamp gauge, labelled with the needrestart version.

    Args:
        registry: Registry the gauge is attached to.
        needrestart_data: Object providing ``version`` and ``timestamp``.
    """
    timestamp_gauge = Gauge(
        "needrestart_timestamp_seconds",
        "information about the version and when it was last run",
        labelnames=["version"],
        registry=registry,
    )
    labelled = timestamp_gauge.labels(needrestart_data.version)
    labelled.set(needrestart_data.timestamp)
89+
90+
91+
def write_kernel(registry, needrestart_data):
    """Register the kernel status gauge when a kernel status was parsed.

    The gauge value is the numeric value of ``kernel_status``; the current
    and expected kernel versions are attached as labels. Nothing is
    registered when ``kernel_status`` is falsy (e.g. still ``None``).

    Args:
        registry: Registry the gauge is attached to.
        needrestart_data: Object providing the kernel status and versions.
    """
    if not needrestart_data.kernel_status:
        return

    kernel_gauge = Gauge(
        "needrestart_kernel_status_info",
        "information about the kernel status",
        labelnames=["current", "expected"],
        registry=registry,
    )
    labelled = kernel_gauge.labels(
        needrestart_data.kernel_current_version,
        needrestart_data.kernel_expected_version,
    )
    labelled.set(needrestart_data.kernel_status.value)
103+
104+
105+
def write_microcode(registry, needrestart_data):
    """Register the microcode status gauge when a microcode status was parsed.

    The gauge value is the numeric value of ``microcode_status``; the
    current and expected microcode versions are attached as labels. Nothing
    is registered when ``microcode_status`` is falsy (e.g. still ``None``).

    Args:
        registry: Registry the gauge is attached to.
        needrestart_data: Object providing the microcode status and versions.
    """
    if not needrestart_data.microcode_status:
        return

    microcode_gauge = Gauge(
        "needrestart_microcode_status_info",
        "information about the microcode status",
        labelnames=["current", "expected"],
        registry=registry,
    )
    labelled = microcode_gauge.labels(
        needrestart_data.microcode_current_version,
        needrestart_data.microcode_expected_version,
    )
    labelled.set(needrestart_data.microcode_status.value)
117+
118+
119+
def write_services(registry, needrestart_data):
    """Register a gauge set to ``needrestart_data.services_count``.

    Args:
        registry: Registry the gauge is attached to.
        needrestart_data: Object providing ``services_count``.
    """
    services_gauge = Gauge(
        "needrestart_services_total",
        "number of services requiring a restart",
        registry=registry,
    )
    services_gauge.set(needrestart_data.services_count)
126+
127+
128+
def write_containers(registry, needrestart_data):
    """Register a gauge set to ``needrestart_data.containers_count``.

    Args:
        registry: Registry the gauge is attached to.
        needrestart_data: Object providing ``containers_count``.
    """
    containers_gauge = Gauge(
        "needrestart_containers_total",
        "number of containers requiring a restart",
        registry=registry,
    )
    containers_gauge.set(needrestart_data.containers_count)
135+
136+
137+
def write_sessions(registry, needrestart_data):
    """Register a gauge set to ``needrestart_data.sessions_count``.

    Args:
        registry: Registry the gauge is attached to.
        needrestart_data: Object providing ``sessions_count``.
    """
    sessions_gauge = Gauge(
        "needrestart_sessions_total",
        "number of sessions requiring a restart",
        registry=registry,
    )
    sessions_gauge.set(needrestart_data.sessions_count)
144+
145+
146+
def main():
    """Run ``needrestart -b``, parse its output and print the metrics.

    The metrics are rendered with ``generate_latest`` and written to
    stdout. If needrestart exits non-zero, or anything else goes wrong
    while running or parsing it, an error is written to stderr and the
    process exits with status 1 without printing any metrics.
    """
    registry = CollectorRegistry()

    try:
        needrestart_output = subprocess.run(
            ["needrestart", "-b"], capture_output=True, text=True, check=True
        ).stdout
        needrestart_data = NeedRestartData(needrestart_output)
    except subprocess.CalledProcessError as e:
        print(f"Error executing needrestart:\n{e}", file=sys.stderr)
        # capture_output=True swallowed needrestart's own stderr; forward it
        # so the actual failure reason is not lost.
        if e.stderr:
            print(e.stderr, file=sys.stderr, end="")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report and exit rather than dump a traceback.
        print(f"An unexpected error occurred:\n{e}", file=sys.stderr)
        sys.exit(1)

    write_timestamp(registry, needrestart_data)
    write_kernel(registry, needrestart_data)
    write_microcode(registry, needrestart_data)
    write_services(registry, needrestart_data)
    write_containers(registry, needrestart_data)
    write_sessions(registry, needrestart_data)

    # generate_latest returns bytes that already end with a newline.
    print(generate_latest(registry).decode(), end="")
169+
170+
171+
if __name__ == "__main__":
172+
main()

ntpd_metrics.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,13 @@
99
from prometheus_client import CollectorRegistry, Gauge, generate_latest
1010

1111
# NTP peers status, with no DNS lookups.
12-
ntpq_cmd = ['ntpq', '-np']
12+
ntpq_cmd = ['ntpq', '-np', '-W', '255']
1313
ntpq_rv_cmd = ['ntpq', '-c', 'rv 0 offset,sys_jitter,rootdisp,rootdelay']
1414

1515
# Regex to match all of the fields in the output of ntpq -np
1616
metrics_fields = [
17-
r'^(?P<status>.)(?P<remote>[\w\.]+)',
18-
r'(?P<refid>[\w\.]+)',
17+
r'^(?P<status>.)(?P<remote>[\w\.:]+)',
18+
r'(?P<refid>[\w\.:]+)',
1919
r'(?P<stratum>\d+)',
2020
r'(?P<type>\w)',
2121
r'(?P<when>\d+)',

smartmon.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,13 @@
4141
'host_writes_mib',
4242
'host_writes_32mib',
4343
'load_cycle_count',
44+
'lifetime_writes_gib',
4445
'media_wearout_indicator',
46+
'percent_lifetime_remain',
4547
'wear_leveling_count',
4648
'nand_writes_1gib',
4749
'offline_uncorrectable',
50+
'percent_lifetime_remain',
4851
'power_cycle_count',
4952
'power_on_hours',
5053
'program_fail_count',
@@ -60,10 +63,14 @@
6063
'temperature_case',
6164
'temperature_celsius',
6265
'temperature_internal',
66+
'total_bad_block',
6367
'total_lbas_read',
6468
'total_lbas_written',
69+
'total_writes_gib',
70+
'total_reads_gib',
6571
'udma_crc_error_count',
6672
'unsafe_shutdown_count',
73+
'unexpect_power_loss_ct',
6774
'workld_host_reads_perc',
6875
'workld_media_wear_indic',
6976
'workload_minutes',

smartmon.sh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ load_cycle_count
4343
media_wearout_indicator
4444
nand_writes_1gib
4545
offline_uncorrectable
46+
percent_lifetime_remain
4647
power_cycle_count
4748
power_on_hours
4849
program_fail_cnt_total
@@ -166,12 +167,13 @@ format_output() {
166167
awk -F'{' "${output_format_awk}"
167168
}
168169

169-
smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')"
170+
smartctl_version="$(/usr/sbin/smartctl -V | awk 'NR==1 && $1 == "smartctl" {print $2}')"
170171

171172
echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output
172173

173-
if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then
174-
exit
174+
# Exit if "smartctl" version is lower than 6
175+
if [[ ${smartctl_version%.*} -lt 6 ]]; then
176+
exit 0
175177
fi
176178

177179
device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')"

0 commit comments

Comments
 (0)