Skip to content

Commit e71f17b

Browse files
authored
Merge pull request #5732 from grondo/shell-output-limit-relax
shell: make kvs output limit configurable and default single-user jobs to unlimited
2 parents cc54bb9 + 74b4faa commit e71f17b

File tree

10 files changed

+124
-18
lines changed

10 files changed

+124
-18
lines changed

doc/man1/common/job-shell-options.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,8 @@
3939

4040
* - :option:`hwloc.xmlfile`
4141
- Write hwloc XML gathered by job to a file and set ``HWLOC_XMLFILE``
42+
43+
* - :option:`output.limit`
44+
- Set KVS output limit to SIZE bytes, where SIZE may be a floating point
45+
value including optional SI units: k, K, M, G. This value is ignored
46+
if output is directed to a file with :option:`--output`.

doc/man1/flux-shell.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,14 @@ plugins include:
318318
this option directly, as it will be set automatically by options
319319
of higher level commands such as :man1:`flux-submit`.
320320

321+
.. option:: output.limit=SIZE
322+
323+
Truncate KVS output after SIZE bytes have been written. SIZE may
324+
be a floating point value with optional SI units k, K, M, G. A value of
325+
0 is considered unlimited. The default KVS output limit is 10M for jobs
326+
in a multi-user instance or unlimited for single-user instance jobs.
327+
This value is ignored if output is directed to a file.
328+
321329
.. option:: output.{stdout,stderr}.path=PATH
322330

323331
Set job stderr/out file output to PATH.

doc/man3/flux_shell_get_info.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ with the following layout:
3939
::
4040

4141
"jobid":I,
42+
"instance_owner":i,
4243
"rank":i,
4344
"size":i,
4445
"ntasks";i,

src/shell/internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
struct flux_shell {
2727
flux_jobid_t jobid;
2828
int broker_rank;
29+
uid_t broker_owner;
2930
char hostname [MAXHOSTNAMELEN + 1];
3031
int protocol_fd[2];
3132

src/shell/output.c

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
#include "src/common/libeventlog/eventlog.h"
5454
#include "src/common/libeventlog/eventlogger.h"
5555
#include "src/common/libioencode/ioencode.h"
56+
#include "src/common/libutil/parse_size.h"
5657
#include "ccan/str/str.h"
5758

5859
#include "task.h"
@@ -61,8 +62,7 @@
6162
#include "builtins.h"
6263
#include "log.h"
6364

64-
#define OUTPUT_LIMIT_BYTES 1024*1024*10
65-
#define OUTPUT_LIMIT_STRING "10MB"
65+
#define MULTIUSER_OUTPUT_LIMIT "10M"
6666

6767
enum {
6868
FLUX_OUTPUT_TYPE_TERM = 1,
@@ -82,6 +82,8 @@ struct shell_output_type_file {
8282

8383
struct shell_output {
8484
flux_shell_t *shell;
85+
const char *kvs_limit_string;
86+
size_t kvs_limit_bytes;
8587
struct eventlogger *ev;
8688
double batch_timeout;
8789
int refcount;
@@ -341,6 +343,9 @@ static bool check_kvs_output_limit (struct shell_output *out,
341343
size_t *bytesp;
342344
size_t prev;
343345

346+
if (out->kvs_limit_bytes == 0)
347+
return false;
348+
344349
if (is_stdout) {
345350
stream = "stdout";
346351
bytesp = &out->stdout_bytes;
@@ -353,13 +358,13 @@ static bool check_kvs_output_limit (struct shell_output *out,
353358
prev = *bytesp;
354359
*bytesp += len;
355360

356-
if (*bytesp > OUTPUT_LIMIT_BYTES) {
361+
if (*bytesp > out->kvs_limit_bytes) {
357362
/* Only log an error when the threshold is reached.
358363
*/
359-
if (prev <= OUTPUT_LIMIT_BYTES)
364+
if (prev <= out->kvs_limit_bytes)
360365
shell_warn ("%s will be truncated, %s limit exceeded",
361366
stream,
362-
OUTPUT_LIMIT_STRING);
367+
out->kvs_limit_string);
363368
return true;
364369
}
365370
return false;
@@ -385,7 +390,7 @@ static int shell_output_kvs (struct shell_output *out)
385390
out->stdout_bytes : out->stderr_bytes;
386391
shell_warn ("%s: %zu of %zu bytes truncated",
387392
is_stdout ? "stdout" : "stderr",
388-
total - OUTPUT_LIMIT_BYTES,
393+
total - out->kvs_limit_bytes,
389394
total);
390395
}
391396
}
@@ -1135,6 +1140,53 @@ static int shell_lost (flux_plugin_t *p,
11351140
return 0;
11361141
}
11371142

1143+
static int get_output_limit (struct shell_output *out)
1144+
{
1145+
json_t *val = NULL;
1146+
uint64_t size;
1147+
1148+
/* Set default to unlimited (0) for single-user instances,
1149+
* O/w use the default multiuser output limit:
1150+
*/
1151+
if (out->shell->broker_owner == getuid())
1152+
out->kvs_limit_string = "0";
1153+
else
1154+
out->kvs_limit_string = MULTIUSER_OUTPUT_LIMIT;
1155+
1156+
if (flux_shell_getopt_unpack (out->shell,
1157+
"output",
1158+
"{s?o}",
1159+
"limit", &val) < 0) {
1160+
shell_log_error ("Unable to unpack shell output.limit");
1161+
return -1;
1162+
}
1163+
if (val != NULL) {
1164+
if (json_is_integer (val)) {
1165+
out->kvs_limit_bytes = (size_t) json_integer_value (val);
1166+
if (out->kvs_limit_bytes > 0) {
1167+
/* Need a string representation of limit for errors
1168+
*/
1169+
char *s = strdup (encode_size (out->kvs_limit_bytes));
1170+
if (s && flux_shell_aux_set (out->shell, NULL, s, free) < 0)
1171+
free (s);
1172+
else
1173+
out->kvs_limit_string = s;
1174+
}
1175+
return 0;
1176+
}
1177+
if (!(out->kvs_limit_string = json_string_value (val))) {
1178+
shell_log_error ("Unable to convert output.limit to string");
1179+
return -1;
1180+
}
1181+
}
1182+
if (parse_size (out->kvs_limit_string, &size) < 0) {
1183+
shell_log_errno ("Invalid KVS output.limit=%s", out->kvs_limit_string);
1184+
return -1;
1185+
}
1186+
out->kvs_limit_bytes = (size_t) size;
1187+
return 0;
1188+
}
1189+
11381190
struct shell_output *shell_output_create (flux_shell_t *shell)
11391191
{
11401192
struct shell_output *out;
@@ -1147,6 +1199,8 @@ struct shell_output *shell_output_create (flux_shell_t *shell)
11471199
out->stdout_buffer_type = "line";
11481200
out->stderr_buffer_type = "none";
11491201

1202+
if (get_output_limit (out) < 0)
1203+
goto error;
11501204
if (shell_output_check_alternate_output (out) < 0)
11511205
goto error;
11521206
if (shell_output_check_alternate_buffer_type (out) < 0)

src/shell/shell.c

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,25 @@ static int reconnect (flux_t *h, void *arg)
253253
return 0;
254254
}
255255

256+
static uid_t get_instance_owner (flux_t *h)
257+
{
258+
const char *s;
259+
char *endptr;
260+
int id;
261+
262+
if (!(s = flux_attr_get (h, "security.owner"))) {
263+
shell_log_errno ("error fetching security.owner attribute");
264+
return (uid_t) 0;
265+
}
266+
errno = 0;
267+
id = strtoul (s, &endptr, 10);
268+
if (errno != 0 || *endptr != '\0') {
269+
shell_log_error ("error parsing security.owner=%s", s);
270+
return (uid_t) 0;
271+
}
272+
return (uid_t) id;
273+
}
274+
256275
static void shell_connect_flux (flux_shell_t *shell)
257276
{
258277
uint32_t rank;
@@ -274,6 +293,8 @@ static void shell_connect_flux (flux_shell_t *shell)
274293
shell_log_errno ("error fetching broker rank");
275294
shell->broker_rank = rank;
276295

296+
shell->broker_owner = get_instance_owner (shell->h);
297+
277298
if (plugstack_call (shell->plugstack, "shell.connect", NULL) < 0)
278299
shell_log_errno ("shell.connect");
279300
}
@@ -427,9 +448,10 @@ static json_t *flux_shell_get_info_object (flux_shell_t *shell)
427448
return o;
428449

429450
if (!(o = json_pack_ex (&err, 0,
430-
"{ s:I s:i s:i s:i s:s s:O s:O s:{ s:i }}",
451+
"{ s:I s:i s:i s:i s:i s:s s:O s:O s:{ s:i }}",
431452
"jobid", shell->info->jobid,
432453
"rank", shell->info->shell_rank,
454+
"instance_owner", (int) shell->broker_owner,
433455
"size", shell->info->shell_size,
434456
"ntasks", shell->info->total_ntasks,
435457
"service", shell_svc_name (shell->svc),

src/shell/shell.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ const struct taskmap *flux_shell_get_taskmap (flux_shell_t *shell);
105105
/* Return shell info as a JSON string.
106106
* {
107107
* "jobid":I,
108+
* "instance_owner":i,
108109
* "rank":i,
109110
* "size":i,
110111
* "ntasks";i,

t/system/0001-basic.t

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,10 @@ test_expect_success 'flux jobs lists job with correct userid' '
1515
test_expect_success 'flux proxy can submit jobs to system instance' '
1616
flux proxy $(flux getattr local-uri) flux submit true
1717
'
18+
test_expect_success 'flux-shell limits kvs output to 10M for guest jobs' '
19+
dd if=/dev/urandom bs=10240 count=800 | base64 --wrap 79 >large.in &&
20+
flux run -vvv cat large.in >large.out 2>trunc.err &&
21+
ls -lh large* &&
22+
test_debug "cat trunc.err" &&
23+
grep "stdout.*truncated" trunc.err
24+
'

t/t2606-job-shell-output-redirection.t

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -348,15 +348,22 @@ test_expect_success 'job-shell: shell errors are captured in error file' '
348348
test_expect_code 127 flux run --error=test.err nosuchcommand &&
349349
grep "nosuchcommand: No such file or directory" test.err
350350
'
351-
test_expect_success LONGTEST 'job-shell: output to kvs is truncated at 10MB' '
352-
dd if=/dev/urandom bs=10240 count=800 | base64 --wrap 79 >expected &&
353-
flux run cat expected >output 2>truncate.error &&
354-
test_debug "cat truncate.error" &&
355-
grep "stdout.*truncated" truncate.error
356-
'
357-
test_expect_success LONGTEST 'job-shell: stderr to kvs is truncated at 10MB' '
358-
dd if=/dev/urandom bs=10240 count=800 | base64 --wrap 79 >expected &&
359-
flux run sh -c "cat expected >&2" >truncate2.error 2>&1 &&
360-
grep "stderr.*truncated" truncate2.error
351+
test_expect_success 'job-shell: kvs output truncatation works' '
352+
flux run -o output.limit=5 echo 0123456789 2>trunc.err &&
353+
test_debug "cat trunc.err" &&
354+
grep "stdout.*truncated" trunc.err
355+
'
356+
test_expect_success 'job-shell: stderr truncation works' '
357+
flux run -o output.limit=5 \
358+
sh -c "echo 0123456789 >&2" >trunc2.error 2>&1 &&
359+
grep "stderr.*truncated" trunc2.error
360+
'
361+
test_expect_success LONGTEST 'job-shell: no truncation at 10MB for single-user job' '
362+
dd if=/dev/urandom bs=10240 count=800 | base64 --wrap 79 >10M+ &&
363+
flux run cat 10M+ >10M+.output &&
364+
test_cmp 10M+ 10M+.output
365+
'
366+
test_expect_success 'job-shell: invalid output.limit string is rejected' '
367+
test_must_fail flux run -o output.limit=foo hostname
361368
'
362369
test_done

t/t9000-system.t

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ alias test_expect_success='expect_success_wrap'
6565
#
6666
for testscript in ${FLUX_SOURCE_DIR}/t/system/${T9000_SYSTEM_GLOB}; do
6767
TEST_LABEL="$(basename $testscript)"
68-
source $testscript
68+
. $testscript
6969
done
7070

7171
test_done

0 commit comments

Comments
 (0)