Skip to content

Commit b1467fd

Browse files
jayasifacebook-github-bot
authored andcommitted
Client trace id for connecting events on backend and frontend (#897)
Summary: Pull Request resolved: #897 Introducing a client context, which currently only contains a trace id, which can be used to connect events on the client and worker side. The trace id will be logged to scuba, and is saved as an env var on the process. get_trace_id_or_create_new is first called when we initialize the rust bindings and the trace_id itself is first assigned to be the same as the execution id. Then, as part of the alloc we also pass this trace id along. For RemoteProcessAllocator, the Allocate message will take the client context, and on receiving it, set it in the AllocSpec before allocating the new processes. The ProcessAlloc will set the env var after reading the trace_id from the match_labels of the AllocSpec. It is only set when the client has set it and passed it with the RemoteAlloc. It will default to "". Reviewed By: eliothedeman, vidhyav Differential Revision: D79537187 fbshipit-source-id: fbea0889f65616f2faecd959cae7000d0dbcbfe8
1 parent 3b11820 commit b1467fd

File tree

10 files changed

+293
-2
lines changed

10 files changed

+293
-2
lines changed

hyperactor_mesh/src/alloc/process.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ use hyperactor::sync::monitor;
3333
use ndslice::view::Extent;
3434
use nix::sys::signal;
3535
use nix::unistd::Pid;
36+
use serde::Deserialize;
37+
use serde::Serialize;
3638
use tokio::io;
3739
use tokio::process::Command;
3840
use tokio::sync::Mutex;
@@ -56,6 +58,8 @@ use crate::shortuuid::ShortUuid;
5658
/// The maximum number of log lines to tail keep for managed processes.
5759
const MAX_TAIL_LOG_LINES: usize = 100;
5860

61+
pub const CLIENT_TRACE_ID_LABEL: &str = "CLIENT_TRACE_ID";
62+
5963
/// An allocator that allocates procs by executing managed (local)
6064
/// processes. ProcessAllocator is configured with a [`Command`] (template)
6165
/// to spawn external processes. These processes must invoke [`hyperactor_mesh::bootstrap`] or
@@ -102,10 +106,26 @@ impl Allocator for ProcessAllocator {
102106
children: JoinSet::new(),
103107
running: true,
104108
failed: false,
109+
client_context: ClientContext {
110+
trace_id: spec
111+
.constraints
112+
.match_labels
113+
.get(CLIENT_TRACE_ID_LABEL)
114+
.cloned()
115+
.unwrap_or_else(|| "".to_string()),
116+
},
105117
})
106118
}
107119
}
108120

121+
// Client Context is saved in ProcessAlloc, and is also passed in
122+
// the RemoteProcessAllocator's Allocate method
123+
#[derive(Debug, Clone, Serialize, Deserialize)]
124+
pub struct ClientContext {
125+
/// Trace ID for correlating logs across client and worker processes
126+
pub trace_id: String,
127+
}
128+
109129
/// An allocation produced by [`ProcessAllocator`].
110130
pub struct ProcessAlloc {
111131
name: ShortUuid,
@@ -121,6 +141,7 @@ pub struct ProcessAlloc {
121141
children: JoinSet<(usize, ProcStopReason)>,
122142
running: bool,
123143
failed: bool,
144+
client_context: ClientContext,
124145
}
125146

126147
#[derive(EnumAsInner)]
@@ -389,6 +410,10 @@ impl ProcessAlloc {
389410
bootstrap::BOOTSTRAP_ADDR_ENV,
390411
self.bootstrap_addr.to_string(),
391412
);
413+
cmd.env(
414+
bootstrap::CLIENT_TRACE_ID_ENV,
415+
self.client_context.trace_id.as_str(),
416+
);
392417
cmd.env(bootstrap::BOOTSTRAP_INDEX_ENV, index.to_string());
393418
cmd.env(bootstrap::BOOTSTRAP_LOG_CHANNEL, log_channel.to_string());
394419
cmd.stdout(Stdio::piped());

hyperactor_mesh/src/alloc/remoteprocess.rs

Lines changed: 173 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,15 @@ use tokio_stream::wrappers::WatchStream;
5555
use tokio_util::sync::CancellationToken;
5656

5757
use crate::alloc::Alloc;
58+
use crate::alloc::AllocConstraints;
5859
use crate::alloc::AllocSpec;
5960
use crate::alloc::Allocator;
6061
use crate::alloc::AllocatorError;
6162
use crate::alloc::ProcState;
6263
use crate::alloc::ProcStopReason;
6364
use crate::alloc::ProcessAllocator;
65+
use crate::alloc::process::CLIENT_TRACE_ID_LABEL;
66+
use crate::alloc::process::ClientContext;
6467

6568
/// Control messages sent from remote process allocator to local allocator.
6669
#[derive(Debug, Clone, Serialize, Deserialize, Named)]
@@ -74,6 +77,10 @@ pub enum RemoteProcessAllocatorMessage {
7477
/// Ordered list of hosts in this allocation. Can be used to
7578
/// pre-populate the any local configurations such as torch.dist.
7679
hosts: Vec<String>,
80+
/// Client context which is passed to the ProcessAlloc
81+
/// Todo: Once RemoteProcessAllocator moves to mailbox,
82+
/// the client_context will go to the message header instead
83+
client_context: Option<ClientContext>,
7784
},
7885
/// Stop allocation.
7986
Stop,
@@ -196,15 +203,25 @@ impl RemoteProcessAllocator {
196203
view,
197204
bootstrap_addr,
198205
hosts,
206+
client_context,
199207
}) => {
200208
tracing::info!("received allocation request for view: {}", view);
201209
ensure_previous_alloc_stopped(&mut active_allocation).await;
202210
tracing::info!("allocating...");
203211

204212
// Create the corresponding local allocation spec.
213+
let mut constraints: AllocConstraints = Default::default();
214+
if let Some(context) = &client_context {
215+
constraints = AllocConstraints {
216+
match_labels: HashMap::from([(
217+
CLIENT_TRACE_ID_LABEL.to_string(),
218+
context.trace_id.to_string(),
219+
)]
220+
)};
221+
}
205222
let spec = AllocSpec {
206223
extent: view.extent(),
207-
constraints: Default::default(),
224+
constraints,
208225
};
209226

210227
match process_allocator.allocate(spec.clone()).await {
@@ -749,10 +766,15 @@ impl RemoteProcessAlloc {
749766
"failed to dial remote {} for host {}",
750767
remote_addr, host.id
751768
))?;
769+
770+
let trace_id = hyperactor_telemetry::trace::get_or_create_trace_id();
771+
let client_context = Some(ClientContext { trace_id });
772+
752773
tx.post(RemoteProcessAllocatorMessage::Allocate {
753774
view,
754775
bootstrap_addr: self.bootstrap_addr.clone(),
755776
hosts: hostnames.clone(),
777+
client_context,
756778
});
757779

758780
self.hosts_by_offset.insert(offset, host.id.clone());
@@ -1280,6 +1302,7 @@ mod test {
12801302
view: extent.clone().into(),
12811303
bootstrap_addr,
12821304
hosts: vec![],
1305+
client_context: None,
12831306
})
12841307
.await
12851308
.unwrap();
@@ -1419,6 +1442,7 @@ mod test {
14191442
view: extent.clone().into(),
14201443
bootstrap_addr,
14211444
hosts: vec![],
1445+
client_context: None,
14221446
})
14231447
.await
14241448
.unwrap();
@@ -1519,6 +1543,7 @@ mod test {
15191543
view: extent.clone().into(),
15201544
bootstrap_addr: bootstrap_addr.clone(),
15211545
hosts: vec![],
1546+
client_context: None,
15221547
})
15231548
.await
15241549
.unwrap();
@@ -1539,6 +1564,7 @@ mod test {
15391564
view: extent.clone().into(),
15401565
bootstrap_addr,
15411566
hosts: vec![],
1567+
client_context: None,
15421568
})
15431569
.await
15441570
.unwrap();
@@ -1632,6 +1658,7 @@ mod test {
16321658
view: extent.clone().into(),
16331659
bootstrap_addr,
16341660
hosts: vec![],
1661+
client_context: None,
16351662
})
16361663
.await
16371664
.unwrap();
@@ -1721,6 +1748,7 @@ mod test {
17211748
view: extent.clone().into(),
17221749
bootstrap_addr,
17231750
hosts: vec![],
1751+
client_context: None,
17241752
})
17251753
.await
17261754
.unwrap();
@@ -1747,6 +1775,150 @@ mod test {
17471775
remote_allocator.terminate();
17481776
handle.await.unwrap().unwrap();
17491777
}
1778+
1779+
#[timed_test::async_timed_test(timeout_secs = 15)]
1780+
async fn test_trace_id_propagation() {
1781+
let config = hyperactor::config::global::lock();
1782+
let _guard = config.override_key(
1783+
hyperactor::config::REMOTE_ALLOCATOR_HEARTBEAT_INTERVAL,
1784+
Duration::from_secs(60),
1785+
);
1786+
hyperactor_telemetry::initialize_logging(ClockKind::default());
1787+
let serve_addr = ChannelAddr::any(ChannelTransport::Unix);
1788+
let bootstrap_addr = ChannelAddr::any(ChannelTransport::Unix);
1789+
let (_, mut rx) = channel::serve(bootstrap_addr.clone()).await.unwrap();
1790+
1791+
let extent = extent!(host = 1, gpu = 1);
1792+
let tx = channel::dial(serve_addr.clone()).unwrap();
1793+
let test_world_id: WorldId = id!(test_world_id);
1794+
let test_trace_id = "test_trace_id_12345";
1795+
1796+
// Create a mock alloc that we can verify receives the correct trace id
1797+
let mut alloc = MockAlloc::new();
1798+
alloc.expect_world_id().return_const(test_world_id.clone());
1799+
alloc.expect_extent().return_const(extent.clone());
1800+
alloc.expect_next().return_const(None);
1801+
1802+
// Create a mock allocator that captures the AllocSpec passed to it
1803+
let mut allocator = MockAllocator::new();
1804+
allocator
1805+
.expect_allocate()
1806+
.times(1)
1807+
.withf(move |spec: &AllocSpec| {
1808+
// Verify that the trace id is correctly set in the constraints
1809+
spec.constraints
1810+
.match_labels
1811+
.get(CLIENT_TRACE_ID_LABEL)
1812+
.is_some_and(|trace_id| trace_id == test_trace_id)
1813+
})
1814+
.return_once(|_| Ok(MockAllocWrapper::new(alloc)));
1815+
1816+
let remote_allocator = RemoteProcessAllocator::new();
1817+
let handle = tokio::spawn({
1818+
let remote_allocator = remote_allocator.clone();
1819+
async move {
1820+
remote_allocator
1821+
.start_with_allocator(serve_addr, allocator, None)
1822+
.await
1823+
}
1824+
});
1825+
1826+
// Send allocate message with client context containing trace id
1827+
tx.send(RemoteProcessAllocatorMessage::Allocate {
1828+
view: extent.clone().into(),
1829+
bootstrap_addr,
1830+
hosts: vec![],
1831+
client_context: Some(ClientContext {
1832+
trace_id: test_trace_id.to_string(),
1833+
}),
1834+
})
1835+
.await
1836+
.unwrap();
1837+
1838+
// Verify we get the allocated message
1839+
let m = rx.recv().await.unwrap();
1840+
assert_matches!(
1841+
m,
1842+
RemoteProcessProcStateMessage::Allocated { world_id, view }
1843+
if world_id == test_world_id && view.extent() == extent
1844+
);
1845+
1846+
// Verify we get the done message since the mock alloc returns None immediately
1847+
let m = rx.recv().await.unwrap();
1848+
assert_matches!(m, RemoteProcessProcStateMessage::Done(world_id) if world_id == test_world_id);
1849+
1850+
remote_allocator.terminate();
1851+
handle.await.unwrap().unwrap();
1852+
}
1853+
1854+
#[timed_test::async_timed_test(timeout_secs = 15)]
1855+
async fn test_trace_id_propagation_no_client_context() {
1856+
let config = hyperactor::config::global::lock();
1857+
let _guard = config.override_key(
1858+
hyperactor::config::REMOTE_ALLOCATOR_HEARTBEAT_INTERVAL,
1859+
Duration::from_secs(60),
1860+
);
1861+
hyperactor_telemetry::initialize_logging(ClockKind::default());
1862+
let serve_addr = ChannelAddr::any(ChannelTransport::Unix);
1863+
let bootstrap_addr = ChannelAddr::any(ChannelTransport::Unix);
1864+
let (_, mut rx) = channel::serve(bootstrap_addr.clone()).await.unwrap();
1865+
1866+
let extent = extent!(host = 1, gpu = 1);
1867+
let tx = channel::dial(serve_addr.clone()).unwrap();
1868+
let test_world_id: WorldId = id!(test_world_id);
1869+
1870+
// Create a mock alloc
1871+
let mut alloc = MockAlloc::new();
1872+
alloc.expect_world_id().return_const(test_world_id.clone());
1873+
alloc.expect_extent().return_const(extent.clone());
1874+
alloc.expect_next().return_const(None);
1875+
1876+
// Create a mock allocator that verifies no trace id is set when client_context is None
1877+
let mut allocator = MockAllocator::new();
1878+
allocator
1879+
.expect_allocate()
1880+
.times(1)
1881+
.withf(move |spec: &AllocSpec| {
1882+
// Verify that no trace id is set in the constraints when client_context is None
1883+
spec.constraints.match_labels.is_empty()
1884+
})
1885+
.return_once(|_| Ok(MockAllocWrapper::new(alloc)));
1886+
1887+
let remote_allocator = RemoteProcessAllocator::new();
1888+
let handle = tokio::spawn({
1889+
let remote_allocator = remote_allocator.clone();
1890+
async move {
1891+
remote_allocator
1892+
.start_with_allocator(serve_addr, allocator, None)
1893+
.await
1894+
}
1895+
});
1896+
1897+
// Send allocate message without client context
1898+
tx.send(RemoteProcessAllocatorMessage::Allocate {
1899+
view: extent.clone().into(),
1900+
bootstrap_addr,
1901+
hosts: vec![],
1902+
client_context: None,
1903+
})
1904+
.await
1905+
.unwrap();
1906+
1907+
// Verify we get the allocated message
1908+
let m = rx.recv().await.unwrap();
1909+
assert_matches!(
1910+
m,
1911+
RemoteProcessProcStateMessage::Allocated { world_id, view }
1912+
if world_id == test_world_id && view.extent() == extent
1913+
);
1914+
1915+
// Verify we get the done message since the mock alloc returns None immediately
1916+
let m = rx.recv().await.unwrap();
1917+
assert_matches!(m, RemoteProcessProcStateMessage::Done(world_id) if world_id == test_world_id);
1918+
1919+
remote_allocator.terminate();
1920+
handle.await.unwrap().unwrap();
1921+
}
17501922
}
17511923

17521924
#[cfg(test)]

hyperactor_mesh/src/bootstrap.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ use crate::proc_mesh::mesh_agent::MeshAgent;
2929

3030
pub const BOOTSTRAP_ADDR_ENV: &str = "HYPERACTOR_MESH_BOOTSTRAP_ADDR";
3131
pub const BOOTSTRAP_INDEX_ENV: &str = "HYPERACTOR_MESH_INDEX";
32+
pub const CLIENT_TRACE_ID_ENV: &str = "MONARCH_CLIENT_TRACE_ID";
3233
/// A channel used by each process to receive its own stdout and stderr
3334
/// Because stdout and stderr can only be obtained by the parent process,
3435
/// they need to be streamed back to the process.

hyperactor_telemetry/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ mod pool;
3434
pub mod recorder;
3535
mod spool;
3636
pub mod sqlite;
37+
pub mod trace;
3738
use std::io::IsTerminal;
3839
use std::io::Write;
3940
use std::str::FromStr;

hyperactor_telemetry/src/trace.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
use std::env;
10+
11+
use crate::env::execution_id;
12+
13+
const MONARCH_CLIENT_TRACE_ID: &str = "MONARCH_CLIENT_TRACE_ID";
14+
15+
/// Returns the current trace ID if it exists, or None if it doesn't.
16+
/// This function does not create a new trace ID if one doesn't exist.
17+
/// Todo: Eventually use Message Headers to relay this traceid instead of
18+
/// env vars
19+
pub fn get_trace_id() -> Option<String> {
20+
if let Ok(env_id) = env::var(MONARCH_CLIENT_TRACE_ID) {
21+
if !env_id.is_empty() {
22+
return Some(env_id);
23+
}
24+
}
25+
26+
// No trace ID found
27+
None
28+
}
29+
30+
/// Returns the current trace ID, if none exists, set the current execution id as the trace id.
31+
/// This ensures that the client trace id and execution id is the same.
32+
/// The trace ID remains the same as long as it is in the same process.
33+
/// Use this method only on the client side.
34+
pub fn get_or_create_trace_id() -> String {
35+
if let Ok(existing_trace_id) = std::env::var(MONARCH_CLIENT_TRACE_ID) {
36+
if !existing_trace_id.is_empty() {
37+
return existing_trace_id;
38+
}
39+
}
40+
41+
let trace_id = execution_id().clone();
42+
// Safety: Can be unsound if there are multiple threads
43+
// reading and writing the environment.
44+
unsafe {
45+
std::env::set_var(MONARCH_CLIENT_TRACE_ID, trace_id.clone());
46+
}
47+
48+
trace_id
49+
}

0 commit comments

Comments
 (0)