1
- //########################################################################
2
- // This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
3
- // License: MS-RSL – see LICENSE.md for details
4
- //########################################################################
1
+ /*
2
+ * This file is part of CoCalc: Copyright © 2020–2024 Sagemath, Inc.
3
+ * License: MS-RSL – see LICENSE.md for details
4
+ */
5
5
6
6
// Execute code in a subprocess.
7
7
@@ -21,6 +21,7 @@ import getLogger from "@cocalc/backend/logger";
21
21
import { envToInt } from "@cocalc/backend/misc/env-to-number" ;
22
22
import { aggregate } from "@cocalc/util/aggregate" ;
23
23
import { callback_opts } from "@cocalc/util/async-utils" ;
24
+ import { PROJECT_EXEC_DEFAULT_TIMEOUT_S } from "@cocalc/util/consts/project" ;
24
25
import { to_json , trunc , uuid , walltime } from "@cocalc/util/misc" ;
25
26
import {
26
27
ExecuteCodeOutputAsync ,
@@ -38,10 +39,15 @@ import { ProcessStats } from "./process-stats";
38
39
39
40
const log = getLogger ( "execute-code" ) ;
40
41
41
- const ASYNC_CACHE_MAX = envToInt ( "COCALC_PROJECT_ASYNC_EXEC_CACHE_MAX" , 100 ) ;
42
- const ASYNC_CACHE_TTL_S = envToInt ( "COCALC_PROJECT_ASYNC_EXEC_TTL_S" , 60 * 60 ) ;
42
+ const PREFIX = "COCALC_PROJECT_ASYNC_EXEC" ;
43
+ const ASYNC_CACHE_MAX = envToInt ( `${ PREFIX } _CACHE_MAX` , 100 ) ;
44
+ const ASYNC_CACHE_TTL_S = envToInt ( `${ PREFIX } _TTL_S` , 60 * 60 ) ;
43
45
// for async execution: how often (in seconds) to check up on the child process tree
44
- const MONITOR_INTERVAL_S = envToInt ( "COCALC_PROJECT_MONITOR_INTERVAL_S" , 60 ) ;
46
+ const MONITOR_INTERVAL_S = envToInt ( `${ PREFIX } _MONITOR_INTERVAL_S` , 60 ) ;
47
+ const MONITOR_STATS_LENGTH_MAX = envToInt (
48
+ `${ PREFIX } _MONITOR_STATS_LENGTH_MAX` ,
49
+ 100 ,
50
+ ) ;
45
51
46
52
const asyncCache = new LRU < string , ExecuteCodeOutputAsync > ( {
47
53
max : ASYNC_CACHE_MAX ,
@@ -56,6 +62,8 @@ function asyncCacheUpdate(job_id: string, upd) {
56
62
const obj = asyncCache . get ( job_id ) ;
57
63
if ( Array . isArray ( obj ?. stats ) && Array . isArray ( upd . stats ) ) {
58
64
obj . stats . push ( ...upd . stats ) ;
65
+ // truncate to $MONITOR_STATS_LENGTH_MAX, by discarding the initial entries
66
+ obj . stats = obj . stats . slice ( obj . stats . length - MONITOR_STATS_LENGTH_MAX ) ;
59
67
}
60
68
asyncCache . set ( job_id , { ...obj , ...upd } ) ;
61
69
}
@@ -73,7 +81,14 @@ export const execute_code: ExecuteCodeFunctionWithCallback = aggregate(
73
81
( opts : ExecuteCodeOptionsWithCallback ) : void => {
74
82
( async ( ) => {
75
83
try {
76
- opts . cb ?.( undefined , await executeCodeNoAggregate ( opts ) ) ;
84
+ let data = await executeCodeNoAggregate ( opts ) ;
85
+ if ( isExecuteCodeOptionsAsyncGet ( opts ) && data . type === "async" ) {
86
+ // stats could contain a lot of data. we only return it if requested.
87
+ if ( opts . async_stats !== true ) {
88
+ data = { ...data , stats : undefined } ;
89
+ }
90
+ }
91
+ opts . cb ?.( undefined , data ) ;
77
92
} catch ( err ) {
78
93
opts . cb ?.( err ) ;
79
94
}
@@ -101,7 +116,7 @@ async function executeCodeNoAggregate(
101
116
}
102
117
103
118
opts . args ??= [ ] ;
104
- opts . timeout ??= 10 ;
119
+ opts . timeout ??= PROJECT_EXEC_DEFAULT_TIMEOUT_S ;
105
120
opts . ulimit_timeout ??= true ;
106
121
opts . err_on_exit ??= true ;
107
122
opts . verbose ??= true ;
@@ -166,15 +181,15 @@ async function executeCodeNoAggregate(
166
181
if ( opts . async_call ) {
167
182
// we return an ID, the caller can then use it to query the status
168
183
opts . max_output ??= 1024 * 1024 ; // we limit how much we keep in memory, to avoid problems;
169
- opts . timeout ??= 10 * 60 ;
184
+ opts . timeout ??= PROJECT_EXEC_DEFAULT_TIMEOUT_S ;
170
185
const job_id = uuid ( ) ;
171
- const start = new Date ( ) ;
186
+ const start = Date . now ( ) ;
172
187
const job_config : ExecuteCodeOutputAsync = {
173
188
type : "async" ,
174
- stdout : `Process started running at ${ start . toISOString ( ) } ` ,
189
+ stdout : "" ,
175
190
stderr : "" ,
176
191
exit_code : 0 ,
177
- start : start . getTime ( ) ,
192
+ start,
178
193
job_id,
179
194
status : "running" ,
180
195
} ;
@@ -184,15 +199,14 @@ async function executeCodeNoAggregate(
184
199
{ ...opts , origCommand, job_id, job_config } ,
185
200
async ( err , result ) => {
186
201
try {
187
- const started = asyncCache . get ( job_id ) ?. start ?? 0 ;
188
202
const info : Omit <
189
203
ExecuteCodeOutputAsync ,
190
204
"stdout" | "stderr" | "exit_code"
191
205
> = {
192
206
job_id,
193
207
type : "async" ,
194
- elapsed_s : ( Date . now ( ) - started ) / 1000 ,
195
- start : start . getTime ( ) ,
208
+ elapsed_s : ( Date . now ( ) - start ) / 1000 ,
209
+ start,
196
210
status : "error" ,
197
211
} ;
198
212
if ( err ) {
@@ -311,21 +325,20 @@ function doSpawn(
311
325
let stdout_is_done = false ;
312
326
let killed = false ;
313
327
let callback_done = false ;
314
- let monitorRef : NodeJS . Timer | null = null ;
315
328
let timer : NodeJS . Timeout | undefined = undefined ;
316
329
317
330
// periodically check up on the child process tree and record stats
318
331
// this also keeps the entry in the cache alive, when the ttl is less than the duration of the execution
319
- async function setupMonitor ( ) {
332
+ async function startMonitor ( ) {
320
333
const pid = child . pid ;
321
- const job_id = opts . job_id ;
322
- if ( job_id == null || pid == null ) return ;
334
+ const { job_id, job_config } = opts ;
335
+ if ( job_id == null || pid == null || job_config == null ) return ;
323
336
const monitor = new ProcessStats ( ) ;
324
337
await monitor . init ( ) ;
325
338
await new Promise ( ( done ) => setTimeout ( done , 1000 ) ) ;
326
339
if ( callback_done ) return ;
327
340
328
- monitorRef = setInterval ( async ( ) => {
341
+ while ( true ) {
329
342
const { procs } = await monitor . processes ( Date . now ( ) ) ;
330
343
// reconstruct process tree
331
344
const children : { [ pid : number ] : number [ ] } = { } ;
@@ -337,26 +350,26 @@ function doSpawn(
337
350
// we only consider those, which are the pid itself or one of its children
338
351
const { rss, pct_cpu, cpu_secs } = sumChildren ( procs , children , pid ) ;
339
352
340
- let obj = asyncCache . get ( job_id ) ;
341
- obj ??= opts . job_config ; // in case the cache "forgot" about it
342
- if ( obj != null ) {
343
- obj . pid = pid ;
344
- obj . stats ??= [ ] ;
345
- obj . stats . push ( {
346
- timestamp : Date . now ( ) ,
347
- mem_rss : rss ,
348
- cpu_pct : pct_cpu ,
349
- cpu_secs,
350
- } ) ;
351
- asyncCache . set ( job_id , obj ) ;
352
- }
353
- } , 1000 * MONITOR_INTERVAL_S ) ;
354
- }
353
+ // ?? fallback, in case the cache "forgot" about it
354
+ const obj = asyncCache . get ( job_id ) ?? job_config ;
355
+ obj . pid = pid ;
356
+ obj . stats ??= [ ] ;
357
+ obj . stats . push ( {
358
+ timestamp : Date . now ( ) ,
359
+ mem_rss : rss ,
360
+ cpu_pct : pct_cpu ,
361
+ cpu_secs,
362
+ } ) ;
363
+ asyncCache . set ( job_id , obj ) ;
364
+
365
+ // initially, we record more frequently, but then we space it out up until the interval (probably 1 minute)
366
+ const elapsed_s = ( Date . now ( ) - job_config . start ) / 1000 ;
367
+ // i.e. after 6 minutes, we check every minute
368
+ const next_s = Math . max ( 1 , Math . floor ( elapsed_s / 6 ) ) ;
369
+ const wait_s = Math . min ( next_s , MONITOR_INTERVAL_S ) ;
370
+ await new Promise ( ( done ) => setTimeout ( done , wait_s * 1000 ) ) ;
355
371
356
- function clearMonitor ( ) {
357
- if ( monitorRef != null ) {
358
- clearInterval ( monitorRef ) ;
359
- monitorRef = null ;
372
+ if ( callback_done ) return ;
360
373
}
361
374
}
362
375
@@ -436,7 +449,8 @@ function doSpawn(
436
449
} ) ;
437
450
438
451
if ( opts . job_id && child . pid ) {
439
- setupMonitor ( ) ;
452
+ // we don't await it, it runs until $callback_done is true
453
+ startMonitor ( ) ;
440
454
}
441
455
442
456
const finish = ( err ?) => {
@@ -449,9 +463,8 @@ function doSpawn(
449
463
// we already finished up.
450
464
return ;
451
465
}
452
- // finally finish up.
466
+ // finally finish up – this will also terminate the monitor
453
467
callback_done = true ;
454
- clearMonitor ( ) ;
455
468
456
469
if ( timer != null ) {
457
470
clearTimeout ( timer ) ;
0 commit comments