Skip to content

Commit b4b32d3

Browse files
committed
shell: output: make kvs output limit configurable
Problem: The job shell truncates output at 10M to workaround current KVS limitations with very large eventlogs in possibly long-running system instances. However, this is unnecessarily limiting in single-user subinstances where it is most convenient to have all job output folded into the output of a batch job. Convert the KVS output limit to a configurable value set via the job shell option `output.limit`. Keep the default limit of 10M if the userid of the enclosing instance broker does not match the current userid, and disable the limit completely if the broker and shell userids match. Update shell output truncation tests that would now fail with the new behavior. Ensure output limit is not enforced in the single user test instance. Note that this allows a user to disable the limit in a multiuser instance as well via `-o output.limit=0`, but this was possible anyway since the limit is not really enforced by the instance (a user could build a custom job shell for instance).
1 parent 248600b commit b4b32d3

File tree

2 files changed

+77
-16
lines changed

2 files changed

+77
-16
lines changed

src/shell/output.c

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
#include "src/common/libeventlog/eventlog.h"
5454
#include "src/common/libeventlog/eventlogger.h"
5555
#include "src/common/libioencode/ioencode.h"
56+
#include "src/common/libutil/parse_size.h"
5657
#include "ccan/str/str.h"
5758

5859
#include "task.h"
@@ -61,8 +62,7 @@
6162
#include "builtins.h"
6263
#include "log.h"
6364

64-
#define OUTPUT_LIMIT_BYTES 1024*1024*10
65-
#define OUTPUT_LIMIT_STRING "10MB"
65+
#define MULTIUSER_OUTPUT_LIMIT "10M"
6666

6767
enum {
6868
FLUX_OUTPUT_TYPE_TERM = 1,
@@ -82,6 +82,8 @@ struct shell_output_type_file {
8282

8383
struct shell_output {
8484
flux_shell_t *shell;
85+
const char *kvs_limit_string;
86+
size_t kvs_limit_bytes;
8587
struct eventlogger *ev;
8688
double batch_timeout;
8789
int refcount;
@@ -341,6 +343,9 @@ static bool check_kvs_output_limit (struct shell_output *out,
341343
size_t *bytesp;
342344
size_t prev;
343345

346+
if (out->kvs_limit_bytes == 0)
347+
return false;
348+
344349
if (is_stdout) {
345350
stream = "stdout";
346351
bytesp = &out->stdout_bytes;
@@ -353,13 +358,13 @@ static bool check_kvs_output_limit (struct shell_output *out,
353358
prev = *bytesp;
354359
*bytesp += len;
355360

356-
if (*bytesp > OUTPUT_LIMIT_BYTES) {
361+
if (*bytesp > out->kvs_limit_bytes) {
357362
/* Only log an error when the threshold is reached.
358363
*/
359-
if (prev <= OUTPUT_LIMIT_BYTES)
364+
if (prev <= out->kvs_limit_bytes)
360365
shell_warn ("%s will be truncated, %s limit exceeded",
361366
stream,
362-
OUTPUT_LIMIT_STRING);
367+
out->kvs_limit_string);
363368
return true;
364369
}
365370
return false;
@@ -385,7 +390,7 @@ static int shell_output_kvs (struct shell_output *out)
385390
out->stdout_bytes : out->stderr_bytes;
386391
shell_warn ("%s: %zu of %zu bytes truncated",
387392
is_stdout ? "stdout" : "stderr",
388-
total - OUTPUT_LIMIT_BYTES,
393+
total - out->kvs_limit_bytes,
389394
total);
390395
}
391396
}
@@ -1135,6 +1140,53 @@ static int shell_lost (flux_plugin_t *p,
11351140
return 0;
11361141
}
11371142

1143+
static int get_output_limit (struct shell_output *out)
1144+
{
1145+
json_t *val = NULL;
1146+
uint64_t size;
1147+
1148+
/* Set default to unlimited (0) for single-user instances,
1149+
* O/w use the default multiuser output limit:
1150+
*/
1151+
if (out->shell->broker_owner == getuid())
1152+
out->kvs_limit_string = "0";
1153+
else
1154+
out->kvs_limit_string = MULTIUSER_OUTPUT_LIMIT;
1155+
1156+
if (flux_shell_getopt_unpack (out->shell,
1157+
"output",
1158+
"{s?o}",
1159+
"limit", &val) < 0) {
1160+
shell_log_error ("Unable to unpack shell output.limit");
1161+
return -1;
1162+
}
1163+
if (val != NULL) {
1164+
if (json_is_integer (val)) {
1165+
out->kvs_limit_bytes = (size_t) json_integer_value (val);
1166+
if (out->kvs_limit_bytes > 0) {
1167+
/* Need a string representation of limit for errors
1168+
*/
1169+
char *s = strdup (encode_size (out->kvs_limit_bytes));
1170+
if (s && flux_shell_aux_set (out->shell, NULL, s, free) < 0)
1171+
free (s);
1172+
else
1173+
out->kvs_limit_string = s;
1174+
}
1175+
return 0;
1176+
}
1177+
if (!(out->kvs_limit_string = json_string_value (val))) {
1178+
shell_log_error ("Unable to convert output.limit to string");
1179+
return -1;
1180+
}
1181+
}
1182+
if (parse_size (out->kvs_limit_string, &size) < 0) {
1183+
shell_log_errno ("Invalid KVS output.limit=%s", out->kvs_limit_string);
1184+
return -1;
1185+
}
1186+
out->kvs_limit_bytes = (size_t) size;
1187+
return 0;
1188+
}
1189+
11381190
struct shell_output *shell_output_create (flux_shell_t *shell)
11391191
{
11401192
struct shell_output *out;
@@ -1147,6 +1199,8 @@ struct shell_output *shell_output_create (flux_shell_t *shell)
11471199
out->stdout_buffer_type = "line";
11481200
out->stderr_buffer_type = "none";
11491201

1202+
if (get_output_limit (out) < 0)
1203+
goto error;
11501204
if (shell_output_check_alternate_output (out) < 0)
11511205
goto error;
11521206
if (shell_output_check_alternate_buffer_type (out) < 0)

t/t2606-job-shell-output-redirection.t

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -348,15 +348,22 @@ test_expect_success 'job-shell: shell errors are captured in error file' '
348348
test_expect_code 127 flux run --error=test.err nosuchcommand &&
349349
grep "nosuchcommand: No such file or directory" test.err
350350
'
351-
test_expect_success LONGTEST 'job-shell: output to kvs is truncated at 10MB' '
352-
dd if=/dev/urandom bs=10240 count=800 | base64 --wrap 79 >expected &&
353-
flux run cat expected >output 2>truncate.error &&
354-
test_debug "cat truncate.error" &&
355-
grep "stdout.*truncated" truncate.error
356-
'
357-
test_expect_success LONGTEST 'job-shell: stderr to kvs is truncated at 10MB' '
358-
dd if=/dev/urandom bs=10240 count=800 | base64 --wrap 79 >expected &&
359-
flux run sh -c "cat expected >&2" >truncate2.error 2>&1 &&
360-
grep "stderr.*truncated" truncate2.error
351+
test_expect_success 'job-shell: kvs output truncatation works' '
352+
flux run -o output.limit=5 echo 0123456789 2>trunc.err &&
353+
test_debug "cat trunc.err" &&
354+
grep "stdout.*truncated" trunc.err
355+
'
356+
test_expect_success 'job-shell: stderr truncation works' '
357+
flux run -o output.limit=5 \
358+
sh -c "echo 0123456789 >&2" >trunc2.error 2>&1 &&
359+
grep "stderr.*truncated" trunc2.error
360+
'
361+
test_expect_success LONGTEST 'job-shell: no truncation at 10MB for single-user job' '
362+
dd if=/dev/urandom bs=10240 count=800 | base64 --wrap 79 >10M+ &&
363+
flux run cat 10M+ >10M+.output &&
364+
test_cmp 10M+ 10M+.output
365+
'
366+
test_expect_success 'job-shell: invalid output.limit string is rejected' '
367+
test_must_fail flux run -o output.limit=foo hostname
361368
'
362369
test_done

0 commit comments

Comments
 (0)