Skip to content

Commit 352a2ed

Browse files
joerunde authored and njhill committed
Log failures to /dev/termination-log
This contains simple try/catch logic to write fatal failures to /dev/termination-log
1 parent bc371de commit 352a2ed

File tree

4 files changed

+101
-28
lines changed

4 files changed

+101
-28
lines changed

launcher/src/main.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@ use std::thread;
1313
use std::thread::sleep;
1414
use std::time::{Duration, Instant};
1515
use std::{fs, io};
16+
use std::fs::File;
1617
use std::env::VarError;
1718
use std::ffi::OsString;
1819
use std::os::unix::process::CommandExt;
20+
use String;
1921
use tracing::{info, warn};
2022

2123
// In most cases this gives the best performance for inferencing
@@ -81,6 +83,20 @@ struct Args {
8183
}
8284

8385
fn main() -> ExitCode {
86+
// Register a panic handler up-front to write to /dev/termination-log
87+
let default_hook = std::panic::take_hook();
88+
std::panic::set_hook(Box::new(move |panic_info| {
89+
if let Some(&s) = panic_info.payload().downcast_ref::<&str>() {
90+
_ = write_termination_log(s);
91+
} else if let Some(s) = panic_info.payload().downcast_ref::<String>() {
92+
_ = write_termination_log(s);
93+
}
94+
// No else case: If we cannot get good panic info, we won't write anything to the
95+
// termination log. The system logs should contain better information.
96+
default_hook(panic_info);
97+
}));
98+
99+
84100
// Pattern match configuration
85101
let args = Args::parse();
86102

@@ -647,3 +663,11 @@ fn resolve_tokenizer_path(model_name: &str, revision: Option<&str>) -> Result<St
647663
}
648664
}
649665
}
666+
667+
/// Writes `msg`, followed by a newline, to the termination log at
/// `/dev/termination-log`.
///
/// The file is created if it does not exist and truncated if it does, so the
/// log only ever holds the most recent message. This matches the Python
/// `write_termination_log` helper in this commit, which opens the file with
/// mode `"w"`.
///
/// # Errors
///
/// Returns the underlying [`io::Error`] if the file cannot be opened or
/// written. The panic hook in `main` deliberately discards this result.
fn write_termination_log(msg: &str) -> Result<(), io::Error> {
    // `File::create` is write + create + truncate. Without the truncate, a
    // shorter message written over existing content would leave stale bytes
    // from the previous message at the end of the file.
    let mut f = File::create("/dev/termination-log")?;
    writeln!(f, "{msg}")?;
    Ok(())
}

router/src/main.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
/// Text Generation Inference external gRPC server entrypoint
22
use clap::Parser;
3+
use std::fs::File;
4+
use std::io;
5+
use std::io::Write;
36
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
47
use text_generation_client::ShardedClient;
58
use text_generation_router::server;
@@ -48,6 +51,19 @@ struct Args {
4851
}
4952

5053
fn main() -> Result<(), std::io::Error> {
54+
// Register a panic handler up-front to write to /dev/termination-log
55+
let default_hook = std::panic::take_hook();
56+
std::panic::set_hook(Box::new(move |panic_info| {
57+
if let Some(&s) = panic_info.payload().downcast_ref::<&str>() {
58+
_ = write_termination_log(s);
59+
} else if let Some(s) = panic_info.payload().downcast_ref::<String>() {
60+
_ = write_termination_log(s);
61+
}
62+
// No else case: If we cannot get good panic info, we won't write anything to the
63+
// termination log. The system logs should contain better information.
64+
default_hook(panic_info);
65+
}));
66+
5167
// Get args
5268
let args = Args::parse();
5369

@@ -141,3 +157,11 @@ fn main() -> Result<(), std::io::Error> {
141157
Ok(())
142158
})
143159
}
160+
161+
/// Writes `msg`, followed by a newline, to the termination log at
/// `/dev/termination-log`.
///
/// The file is created if it does not exist and truncated if it does, so the
/// log only ever holds the most recent message. This matches the Python
/// `write_termination_log` helper in this commit, which opens the file with
/// mode `"w"`.
///
/// # Errors
///
/// Returns the underlying [`io::Error`] if the file cannot be opened or
/// written. The panic hook in `main` deliberately discards this result.
fn write_termination_log(msg: &str) -> Result<(), io::Error> {
    // `File::create` is write + create + truncate. Without the truncate, a
    // shorter message written over existing content would leave stale bytes
    // from the previous message at the end of the file.
    let mut f = File::create("/dev/termination-log")?;
    writeln!(f, "{msg}")?;
    Ok(())
}

server/text_generation_server/cli.py

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -32,36 +32,47 @@ def serve(
3232
uds_path: Path = "/tmp/text-generation",
3333
):
3434
from text_generation_server import server
35+
from text_generation_server.utils.termination import write_termination_log
3536

3637
if sharded:
37-
assert (
38-
os.getenv("RANK", None) is not None
39-
), "RANK must be set when sharded is True"
40-
assert (
41-
os.getenv("WORLD_SIZE", None) is not None
42-
), "WORLD_SIZE must be set when sharded is True"
43-
assert (
44-
os.getenv("MASTER_ADDR", None) is not None
45-
), "MASTER_ADDR must be set when sharded is True"
46-
assert (
47-
os.getenv("MASTER_PORT", None) is not None
48-
), "MASTER_PORT must be set when sharded is True"
49-
50-
server.serve(
51-
model_name,
52-
revision,
53-
deployment_framework,
54-
dtype,
55-
# Downgrade enum into str for easier management later on
56-
None if quantize is None else quantize.value,
57-
max_sequence_length,
58-
max_new_tokens,
59-
max_batch_size,
60-
batch_safety_margin,
61-
sharded,
62-
cuda_process_memory_fraction,
63-
uds_path
64-
)
38+
try:
39+
assert (
40+
os.getenv("RANK", None) is not None
41+
), "RANK must be set when sharded is True"
42+
assert (
43+
os.getenv("WORLD_SIZE", None) is not None
44+
), "WORLD_SIZE must be set when sharded is True"
45+
assert (
46+
os.getenv("MASTER_ADDR", None) is not None
47+
), "MASTER_ADDR must be set when sharded is True"
48+
assert (
49+
os.getenv("MASTER_PORT", None) is not None
50+
), "MASTER_PORT must be set when sharded is True"
51+
except AssertionError as e:
52+
write_termination_log(str(e))
53+
raise e
54+
55+
try:
56+
server.serve(
57+
model_name,
58+
revision,
59+
deployment_framework,
60+
dtype,
61+
# Downgrade enum into str for easier management later on
62+
None if quantize is None else quantize.value,
63+
max_sequence_length,
64+
max_new_tokens,
65+
max_batch_size,
66+
batch_safety_margin,
67+
sharded,
68+
cuda_process_memory_fraction,
69+
uds_path
70+
)
71+
except Exception as e:
72+
# Any exceptions in the blocking server thread here should mean that
73+
# the server terminated due to an error
74+
write_termination_log(str(e))
75+
raise e
6576

6677

6778
@app.command()
server/text_generation_server/utils/termination.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
"""Utils for properly logging process termination"""
2+
3+
4+
def write_termination_log(msg: str, file: str="/dev/termination-log") -> None:
    """Best-effort write of ``msg`` (plus a trailing newline) to the termination logfile.

    The file is opened in ``"w"`` mode, so any previous contents are replaced.

    Args:
        msg: The message to record.
        file: Path of the termination logfile; defaults to ``/dev/termination-log``.

    Returns:
        None. Failures while opening or writing the file are deliberately ignored.
    """

    try:
        # Encode explicitly as UTF-8 and replace anything unencodable. With the
        # locale default encoding and strict error handling, a message holding
        # e.g. surrogate-escaped path bytes raises after the file has already
        # been truncated, and the handler below would leave the log empty.
        with open(file, "w", encoding="utf-8", errors="replace") as termination_file:
            termination_file.write(f"{msg}\n")
    except Exception:
        # Ignore any errors writing to the termination logfile.
        # Users can fall back to the stdout logs, and we don't want to pollute
        # those with an error here.
        pass

0 commit comments

Comments
 (0)