Skip to content

Commit 171bb4c

Browse files
mariusaefacebook-github-bot
authored andcommitted
parse direct procs / actors (meta-pytorch#800)
Summary: Pull Request resolved: meta-pytorch#800 This change introduces a new kind of identifier used to parse direct procs. As we plan to phase out ranked procs, this will become the only kind of identifier in the system. ``` # The proc "proc-name" at tcp:[::1]:1234 tcp:[::1]:1234,proc-name # The root actor "actor-name" at the above proc tcp:[::1]:1234,proc-name,actor-name # ... equivalent to: tcp:[::1]:1234,proc-name,actor-name[0] # A child actor of the above actor: tcp:[::1]:1234,proc-name,actor-name[123] ``` ## Alternatives considered The central challenge with this syntax is satisfying the following simultaneously: 1) Unambiguous parsing: all while mixing channel addresses and procs, actor names, and PIDs 2) A shell-friendly syntax: we do not want to have to impose quoting or escaping in typical cases 3) Make it simple, drawing on only a few lexemes For example, the first requirement precludes a straightforward URL syntax, consider: ``` tcp://[::1]:1234/proc-name/actor-name/pid ``` but what happens when you introduce unix sockets? ``` unix:///path/to/socket/proc-name/actor-name/pid ``` How do you delimit the socket address from the proc name? One alternative is to introduce another delimiter, e.g.: ``` unix:///path/to/socket//proc-name/actor-name/pid ``` but now this looks rather like line noise! And it's hard to parse! One alternative I liked is to quote the channel address: this sets it apart, and signifies a separate syntax. ``` "tcp:[::1]:1234".proc-name.actor-name[pid] ``` but the shell parses the quotes! Similar issues exist for other alternatives, which can be parsed unambiguously, but would make the grammar more complicated: ``` # Brackets compete with IPv6 addresses, requiring recursive descent parsing unix[/foo/bar].proc-name.actor-name[pid] # "::" clash with IPv6 addresses, requiring additional context to parse correctly unix:/foo/bar::proc-name::actor-name[pid] ``` And so on. All of this being said: if you find a clearly superior alternative, I am all ears. ghstack-source-id: 301740077 exported-using-ghexport Reviewed By: shayne-fletcher Differential Revision: D79856404 fbshipit-source-id: 60b479091af52c3f15f2c6bf446639fab05f60ee
1 parent 7313f41 commit 171bb4c

File tree

2 files changed

+88
-47
lines changed

2 files changed

+88
-47
lines changed

hyperactor/src/parse.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ pub enum Token<'a> {
3838
Colon,
3939
/// "."
4040
Dot,
41+
/// ","
42+
Comma,
4143

4244
// Special token to denote an invalid element. It is used to poison
4345
// the parser.
@@ -56,7 +58,7 @@ impl<'a> Lexer<'a> {
5658
Self {
5759
// TODO: compose iterators directly; would be simpler with
5860
// existential type support.
59-
tokens: chop(input, &["[", "]", ".", "@"]).collect(),
61+
tokens: chop(input, &["[", "]", ".", "@", ","]).collect(),
6062
}
6163
}
6264
}
@@ -72,6 +74,7 @@ impl<'a> Iterator for Lexer<'a> {
7274
Some("@") => Some(Token::At),
7375
Some(":") => Some(Token::Colon),
7476
Some(".") => Some(Token::Dot),
77+
Some(",") => Some(Token::Comma),
7578
Some(elem) => Some({
7679
if let Ok(uint) = elem.parse::<usize>() {
7780
Token::Uint(uint)
@@ -180,7 +183,7 @@ mod tests {
180183

181184
#[test]
182185
fn test_lexer() {
183-
let tokens = Lexer::new("foo.bar[123]");
186+
let tokens = Lexer::new("foo.bar[123],baz");
184187
assert_eq!(
185188
tokens.collect::<Vec<Token>>(),
186189
vec![
@@ -189,7 +192,9 @@ mod tests {
189192
Token::Elem("bar"),
190193
Token::LeftBracket,
191194
Token::Uint(123),
192-
Token::RightBracket
195+
Token::RightBracket,
196+
Token::Comma,
197+
Token::Elem("baz"),
193198
]
194199
)
195200
}

hyperactor/src/reference.rs

Lines changed: 80 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ macro_rules! id {
302302
pub use id;
303303

304304
/// The type of error encountered while parsing references.
305-
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
305+
#[derive(thiserror::Error, Debug)]
306306
pub enum ReferenceParsingError {
307307
/// The parser expected a token, but it reached the end of the token stream.
308308
#[error("expected token")]
@@ -323,45 +323,82 @@ pub enum ReferenceParsingError {
323323
/// The parser encountered the wrong reference type.
324324
#[error("wrong reference type: expected {0}")]
325325
WrongType(String),
326+
327+
/// An invalid channel address was encountered while parsing the reference.
328+
#[error("invalid channel address {0}: {1}")]
329+
InvalidChannelAddress(String, anyhow::Error),
326330
}
327331

328332
impl FromStr for Reference {
329333
type Err = ReferenceParsingError;
330334

331335
fn from_str(addr: &str) -> Result<Self, Self::Err> {
332-
let reference = parse! {
333-
Lexer::new(addr);
334-
335-
// world
336-
Token::Elem(world) => Self::World(WorldId(world.into())),
337-
338-
// world[rank]
339-
Token::Elem(world) Token::LeftBracket Token::Uint(rank) Token::RightBracket =>
340-
Self::Proc(ProcId::Ranked(WorldId(world.into()), rank)),
341-
342-
// world[rank].actor (implied pid=0)
343-
Token::Elem(world) Token::LeftBracket Token::Uint(rank) Token::RightBracket
344-
Token::Dot Token::Elem(actor) =>
345-
Self::Actor(ActorId(ProcId::Ranked(WorldId(world.into()), rank), actor.into(), 0)),
346-
347-
// world[rank].actor[pid]
348-
Token::Elem(world) Token::LeftBracket Token::Uint(rank) Token::RightBracket
349-
Token::Dot Token::Elem(actor)
350-
Token::LeftBracket Token::Uint(pid) Token::RightBracket =>
351-
Self::Actor(ActorId(ProcId::Ranked(WorldId(world.into()), rank), actor.into(), pid)),
352-
353-
// world[rank].actor[pid][port]
354-
Token::Elem(world) Token::LeftBracket Token::Uint(rank) Token::RightBracket
355-
Token::Dot Token::Elem(actor)
356-
Token::LeftBracket Token::Uint(pid) Token::RightBracket
357-
Token::LeftBracket Token::Uint(index) Token::RightBracket =>
358-
Self::Port(PortId(ActorId(ProcId::Ranked(WorldId(world.into()), rank), actor.into(), pid), index as u64)),
359-
360-
// world.actor
361-
Token::Elem(world) Token::Dot Token::Elem(actor) =>
362-
Self::Gang(GangId(WorldId(world.into()), actor.into())),
363-
};
364-
Ok(reference?)
336+
// First, try to parse a "new style" reference:
337+
// 1) If the reference contains a comma (anywhere), it is a new style reference;
338+
// commas were not a valid lexeme in the previous reference format.
339+
// 2) This is a bit ugly, but we bypass the tokenizer prior to this comma,
340+
// try to parse a channel address, and then parse the remainder.
341+
342+
match addr.split_once(",") {
343+
Some((channel_addr, rest)) => {
344+
let channel_addr = channel_addr.parse().map_err(|err| {
345+
ReferenceParsingError::InvalidChannelAddress(channel_addr.to_string(), err)
346+
})?;
347+
348+
Ok(parse! {
349+
Lexer::new(rest);
350+
351+
// channeladdr,proc_name
352+
Token::Elem(proc_name) =>
353+
Self::Proc(ProcId::Direct(channel_addr, proc_name.to_string())),
354+
355+
// channeladdr,proc_name,actor_name
356+
Token::Elem(proc_name) Token::Comma Token::Elem(actor_name) =>
357+
Self::Actor(ActorId(ProcId::Direct(channel_addr, proc_name.to_string()), actor_name.to_string(), 0)),
358+
359+
// channeladdr,proc_name,actor_name[rank]
360+
Token::Elem(proc_name) Token::Comma Token::Elem(actor_name)
361+
Token::LeftBracket Token::Uint(rank) Token::RightBracket =>
362+
Self::Actor(ActorId(ProcId::Direct(channel_addr, proc_name.to_string()), actor_name.to_string(), rank)),
363+
}?)
364+
}
365+
366+
// "old style" / "ranked" reference
367+
None => {
368+
Ok(parse! {
369+
Lexer::new(addr);
370+
371+
// world
372+
Token::Elem(world) => Self::World(WorldId(world.into())),
373+
374+
// world[rank]
375+
Token::Elem(world) Token::LeftBracket Token::Uint(rank) Token::RightBracket =>
376+
Self::Proc(ProcId::Ranked(WorldId(world.into()), rank)),
377+
378+
// world[rank].actor (implied pid=0)
379+
Token::Elem(world) Token::LeftBracket Token::Uint(rank) Token::RightBracket
380+
Token::Dot Token::Elem(actor) =>
381+
Self::Actor(ActorId(ProcId::Ranked(WorldId(world.into()), rank), actor.into(), 0)),
382+
383+
// world[rank].actor[pid]
384+
Token::Elem(world) Token::LeftBracket Token::Uint(rank) Token::RightBracket
385+
Token::Dot Token::Elem(actor)
386+
Token::LeftBracket Token::Uint(pid) Token::RightBracket =>
387+
Self::Actor(ActorId(ProcId::Ranked(WorldId(world.into()), rank), actor.into(), pid)),
388+
389+
// world[rank].actor[pid][port]
390+
Token::Elem(world) Token::LeftBracket Token::Uint(rank) Token::RightBracket
391+
Token::Dot Token::Elem(actor)
392+
Token::LeftBracket Token::Uint(pid) Token::RightBracket
393+
Token::LeftBracket Token::Uint(index) Token::RightBracket =>
394+
Self::Port(PortId(ActorId(ProcId::Ranked(WorldId(world.into()), rank), actor.into(), pid), index as u64)),
395+
396+
// world.actor
397+
Token::Elem(world) Token::Dot Token::Elem(actor) =>
398+
Self::Gang(GangId(WorldId(world.into()), actor.into())),
399+
}?)
400+
}
401+
}
365402
}
366403
}
367404

@@ -519,16 +556,6 @@ impl FromStr for ProcId {
519556
type Err = ReferenceParsingError;
520557

521558
fn from_str(addr: &str) -> Result<Self, Self::Err> {
522-
// We first try to parse the proc id as a channel address; otherwise
523-
// as a ranked reference. These grammars are currently non-overlapping,
524-
// but we need to be careful when we add new channel address types.
525-
//
526-
// Over time, we will deprecate ranked references and provide a robustly
527-
// unambiguous syntax.
528-
if let Ok(channel_addr) = addr.parse::<ChannelAddr>() {
529-
// TODO: parse names
530-
return Ok(ProcId::Direct(channel_addr, "".to_string()));
531-
}
532559
match addr.parse()? {
533560
Reference::Proc(proc_id) => Ok(proc_id),
534561
_ => Err(ReferenceParsingError::WrongType("proc".into())),
@@ -1258,6 +1285,15 @@ mod tests {
12581285
"test.testactor",
12591286
GangId(WorldId("test".into()), "testactor".into()).into(),
12601287
),
1288+
(
1289+
"tcp:[::1]:1234,test,testactor[123]",
1290+
ActorId(
1291+
ProcId::Direct("tcp:[::1]:1234".parse().unwrap(), "test".to_string()),
1292+
"testactor".to_string(),
1293+
123,
1294+
)
1295+
.into(),
1296+
),
12611297
];
12621298

12631299
for (s, expected) in cases {

0 commit comments

Comments
 (0)