Skip to content

Commit ee3df94

Browse files
authored
Add recover_status parser lustrefs_exporter (#118)
* Add recovery_status to lustrefs_exporter and enhance recovery status parser to include additional metrics:
  - RecoveryDuration
  - RecoveryTimeRemaining
  - RecoveryTotalClients
* Address review comments
* Use test_case macro instead of regular test
1 parent 07b820a commit ee3df94

26 files changed

+785
-80
lines changed

Cargo.lock

Lines changed: 34 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ serde_json = "1"
3535
serde_yaml = "0.9"
3636
serial_test = "3.2"
3737
sysinfo = "0.29"
38+
test-case = "3.3"
3839
thiserror = "2"
3940
tokio = "1"
4041
tower = "0.5"

lustre-collector/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ tracing-subscriber.workspace = true
1919
[dev-dependencies]
2020
include_dir.workspace = true
2121
insta.workspace = true
22+
test-case.workspace = true
2223
tokio = { workspace = true, features = ["full"] }
2324
criterion = { workspace = true, features = ["html_reports", "async_tokio"] }
2425
sysinfo.workspace = true
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
obdfilter.200NVX2-OST0000.recovery_status=
2+
status: COMPLETE
3+
recovery_start: 1761567697
4+
recovery_duration: 1
5+
completed_clients: 8/8
6+
replayed_requests: 0
7+
last_transno: 17184814233
8+
VBR: DISABLED
9+
IR: ENABLED
10+
obdfilter.200NVX2-OST0003.recovery_status=
11+
status: COMPLETE
12+
recovery_start: 1759494115
13+
recovery_duration: 15
14+
completed_clients: 8/8
15+
replayed_requests: 0
16+
last_transno: 12934942643
17+
VBR: DISABLED
18+
IR: DISABLED
19+
obdfilter.200NVX2-OST0004.recovery_status=
20+
status: COMPLETE
21+
recovery_start: 1759494115
22+
recovery_duration: 14
23+
completed_clients: 8/8
24+
replayed_requests: 0
25+
last_transno: 12934956643
26+
VBR: DISABLED
27+
IR: DISABLED
28+
obdfilter.200NVX2-OST0007.recovery_status=
29+
status: COMPLETE
30+
recovery_start: 1759494115
31+
recovery_duration: 14
32+
completed_clients: 8/8
33+
replayed_requests: 0
34+
last_transno: 12934943652
35+
VBR: DISABLED
36+
IR: DISABLED

lustre-collector/src/parser.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use crate::{
66
ldlm, llite, mdd_parser,
77
mds::{self, client_count_parser},
88
mgs::mgs_parser,
9-
nodemap, osd_parser, oss, quota, top_level_parser,
9+
nodemap, osd_parser, oss, quota, recovery_status_parser, top_level_parser,
1010
types::Record,
1111
};
1212
use combine::{Parser, Stream, choice, error::ParseError, many};
@@ -24,6 +24,7 @@ pub fn params() -> Vec<String> {
2424
.chain(mdd_parser::params())
2525
.chain(quota::params())
2626
.chain(nodemap::params())
27+
.chain(recovery_status_parser::params())
2728
.collect()
2829
}
2930

@@ -44,6 +45,7 @@ where
4445
mdd_parser::parse().map(|x| vec![x]),
4546
quota::parse().map(|x| vec![x]),
4647
nodemap::parse().map(|x| vec![x]),
48+
recovery_status_parser::parse(),
4749
)))
4850
.map(|xs: Vec<_>| xs.into_iter().flatten().collect())
4951
}
@@ -86,6 +88,7 @@ mod tests {
8688
test_fixtures!(test_lustre_2_14_0_ddn145_fixtures, "*ddn145*");
8789

8890
test_fixtures!(test_lustre_2_14_0_ddn133_fixtures, "*ddn133*");
91+
test_fixtures!(test_lustre_2_14_0_ddn212_fixtures, "*ddn212*");
8992

9093
#[test]
9194
fn test_params() {

lustre-collector/src/recovery_status_parser.rs

Lines changed: 106 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
// Use of this source code is governed by a MIT-style
33
// license that can be found in the LICENSE file.
44

5+
use std::iter;
6+
57
use crate::{
68
base_parsers::{digits, param, period, target, till_newline},
79
types::{Param, Record, RecoveryStatus, Target, TargetStat, TargetStats, TargetVariant},
@@ -71,7 +73,8 @@ where
7173
})
7274
}
7375

74-
fn clients_line<I>(x: &'static str) -> impl Parser<I, Output = u64>
76+
/// Parses a client line containing the completed count and an optional total count (e.g., "completed: 5/10").
77+
fn clients_line<I>(x: &'static str) -> impl Parser<I, Output = (u64, Option<u64>)>
7578
where
7679
I: Stream<Token = char>,
7780
I::Error: ParseError<I::Token, I::Range, I::Position>,
@@ -83,15 +86,18 @@ where
8386
optional((token('/'), digits())),
8487
optional(newline().map(drop).or(eof())),
8588
)
86-
.map(|(_, _, x, _, _): (_, _, u64, _, _)| x)
89+
.map(|(_, _, x, y, _): (_, _, u64, Option<(_, u64)>, _)| (x, y.map(|(_, v)| v)))
8790
}
8891

8992
#[derive(Debug)]
9093
enum RecoveryStat {
91-
Status(RecoveryStatus),
9294
Completed(u64),
9395
Connected(u64),
9496
Evicted(u64),
97+
RecoveryDuration(u64),
98+
Status(RecoveryStatus),
99+
TimeRemaining(u64),
100+
Total(Option<u64>),
95101
}
96102

97103
pub struct StatName(pub String);
@@ -105,6 +111,20 @@ where
105111
many1(alpha_num().or(one_of("_-".chars()))).map(StatName)
106112
}
107113

114+
/// Parses all the simple recovery stats for a target, which are just plain u64 values
115+
fn simple_client_stat<I>(
116+
name: &'static str,
117+
constructor: fn(u64) -> RecoveryStat,
118+
) -> impl Parser<I, Output = Vec<RecoveryStat>>
119+
where
120+
I: Stream<Token = char>,
121+
I::Error: ParseError<I::Token, I::Range, I::Position>,
122+
{
123+
clients_line(name)
124+
.skip(optional(newline()))
125+
.map(move |(x, _)| vec![constructor(x)])
126+
}
127+
108128
fn target_recovery_stats<I>() -> impl Parser<I, Output = Vec<RecoveryStat>>
109129
where
110130
I: Stream<Token = char>,
@@ -113,22 +133,23 @@ where
113133
many(choice((
114134
status_line()
115135
.skip(optional(newline()))
116-
.map(RecoveryStat::Status)
117-
.map(Some),
118-
clients_line("completed_clients")
119-
.skip(optional(newline()))
120-
.map(RecoveryStat::Completed)
121-
.map(Some),
136+
.map(|x| vec![RecoveryStat::Status(x)]),
137+
simple_client_stat("recovery_duration", RecoveryStat::RecoveryDuration),
138+
simple_client_stat("completed_clients", RecoveryStat::Completed),
139+
simple_client_stat("time_remaining", RecoveryStat::TimeRemaining),
140+
simple_client_stat("evicted_clients", RecoveryStat::Evicted),
122141
clients_line("connected_clients")
123142
.skip(optional(newline()))
124-
.map(RecoveryStat::Connected)
125-
.map(Some),
126-
clients_line("evicted_clients")
127-
.skip(optional(newline()))
128-
.map(RecoveryStat::Evicted)
129-
.map(Some),
143+
.map(|(x, y)| {
144+
iter::once(RecoveryStat::Connected(x))
145+
.chain(
146+
y.map(|total| vec![RecoveryStat::Total(Some(total))])
147+
.unwrap_or_default(),
148+
)
149+
.collect()
150+
}),
130151
// This will ignore any line/field we don't care about
131-
attempt((stat_name(), token(':'), till_newline().skip(newline()))).map(|_| None),
152+
attempt((stat_name(), token(':'), till_newline().skip(newline()))).map(|_| vec![]),
132153
)))
133154
.map(|xs: Vec<_>| xs.into_iter().flatten().collect())
134155
}
@@ -176,6 +197,28 @@ where
176197
value: *value,
177198
})
178199
}
200+
RecoveryStat::RecoveryDuration(value) => {
201+
TargetStats::RecoveryDuration(TargetStat {
202+
kind,
203+
param: param.clone(),
204+
target: target.clone(),
205+
value: *value,
206+
})
207+
}
208+
RecoveryStat::TimeRemaining(value) => {
209+
TargetStats::RecoveryTimeRemaining(TargetStat {
210+
kind,
211+
param: param.clone(),
212+
target: target.clone(),
213+
value: *value,
214+
})
215+
}
216+
RecoveryStat::Total(value) => TargetStats::RecoveryTotalClients(TargetStat {
217+
kind,
218+
param: param.clone(),
219+
target: target.clone(),
220+
value: value.unwrap_or(0),
221+
}),
179222
})
180223
.collect()
181224
})
@@ -186,20 +229,19 @@ where
186229
I: Stream<Token = char>,
187230
I::Error: ParseError<I::Token, I::Range, I::Position>,
188231
{
189-
many(
190-
(
191-
target_status(),
192-
skip_until(attempt(ost_or_mdt().map(drop)).or(eof())),
193-
)
194-
.map(|(x, _)| x.into_iter().map(Record::Target).collect()),
232+
(
233+
target_status(),
234+
skip_until(attempt(ost_or_mdt().map(drop)).or(eof())),
195235
)
196-
.map(|x: Vec<Vec<Record>>| x.into_iter().flatten().collect())
236+
.map(|(x, _)| x.into_iter().map(Record::Target).collect())
197237
}
198238

199239
#[cfg(test)]
200240
mod tests {
201-
use crate::recovery_status_parser::{clients_line, parse, target_recovery_stats};
241+
use crate::parser::parse;
242+
use crate::recovery_status_parser::{clients_line, target_recovery_stats};
202243
use combine::{Parser, parser::EasyParser, stream::position};
244+
use test_case::test_case;
203245

204246
#[test]
205247
fn test_multiple() {
@@ -235,14 +277,12 @@ mod tests {
235277
insta::assert_debug_snapshot!(records);
236278
}
237279

238-
#[test]
239-
fn test_clients_line() {
240-
let result = clients_line("completed_clients").parse("completed_clients: 3/7\n");
241-
assert_eq!(result, Ok((3, "")));
242-
let result = clients_line("connected_clients").parse("connected_clients: 3/7\n");
243-
assert_eq!(result, Ok((3, "")));
244-
let result = clients_line("completed_clients").parse("completed_clients: 3\n");
245-
assert_eq!(result, Ok((3, "")));
280+
#[test_case("completed_clients", "completed_clients: 3/7\n", (3, Some(7)); "completed clients with total")]
281+
#[test_case("connected_clients", "connected_clients: 3/7\n", (3, Some(7)); "connected clients with total")]
282+
#[test_case("completed_clients", "completed_clients: 3\n", (3, None); "completed clients without total")]
283+
fn test_clients_line(field_name: &'static str, input: &str, expected: (u64, Option<u64>)) {
284+
let result = clients_line(field_name).parse(input);
285+
assert_eq!(result, Ok((expected, "")));
246286
}
247287

248288
#[test]
@@ -259,7 +299,19 @@ IR: ENABLED
259299

260300
let (records, _): (Vec<_>, _) = target_recovery_stats().parse(x).unwrap();
261301

262-
insta::assert_debug_snapshot!(records);
302+
insta::assert_debug_snapshot!(records, @r"
303+
[
304+
Status(
305+
Complete,
306+
),
307+
RecoveryDuration(
308+
150,
309+
),
310+
Completed(
311+
4,
312+
),
313+
]
314+
");
263315
}
264316

265317
#[test]
@@ -275,6 +327,26 @@ completed_clients: 3
275327

276328
let (records, _): (Vec<_>, _) = target_recovery_stats().parse(x).unwrap();
277329

278-
insta::assert_debug_snapshot!(records);
330+
insta::assert_debug_snapshot!(records, @r"
331+
[
332+
Status(
333+
Recovering,
334+
),
335+
TimeRemaining(
336+
119,
337+
),
338+
Connected(
339+
3,
340+
),
341+
Total(
342+
Some(
343+
7,
344+
),
345+
),
346+
Completed(
347+
3,
348+
),
349+
]
350+
");
279351
}
280352
}

lustre-collector/src/snapshots/lustre_collector__parser__tests__params.snap

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,4 +67,6 @@ expression: params()
6767
"qmt.*.{dt,md}-*.glb-grp",
6868
"nodemap.*.dt_stats",
6969
"nodemap.*.md_stats",
70+
"obdfilter.*OST*.recovery_status",
71+
"mdt.*MDT*.recovery_status",
7072
]

0 commit comments

Comments
 (0)