
Commit 7b3fc82

Merge pull request #159 from firstbatchxyz/erhant/better-publish-timestamp-logic
Better publish timestamps & better batch logic
2 parents daa9fba + 0ef85b5 commit 7b3fc82

7 files changed, +86 -63 lines changed

Cargo.lock

Lines changed: 7 additions & 7 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 2 additions & 7 deletions
```diff
@@ -9,7 +9,7 @@ default-members = ["compute"]
 
 [workspace.package]
 edition = "2021"
-version = "0.2.29"
+version = "0.2.30"
 license = "Apache-2.0"
 readme = "README.md"
@@ -18,14 +18,9 @@ readme = "README.md"
 inherits = "release"
 debug = true
 
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
 [workspace.dependencies]
 # async stuff
-tokio-util = { version = "0.7.10", features = [
-    "rt",
-] } # tokio-util provides CancellationToken
+tokio-util = { version = "0.7.10", features = ["rt"] }
 tokio = { version = "1", features = ["macros", "rt-multi-thread", "signal"] }
 async-trait = "0.1.81"
```

compute/src/main.rs

Lines changed: 3 additions & 2 deletions
```diff
@@ -118,8 +118,9 @@ async fn main() -> Result<()> {
     let node_token = cancellation.clone();
     task_tracker.spawn(async move {
         if let Err(err) = node.run(node_token).await {
-            log::error!("Node launch error: {}", err);
-            panic!("Node failed.")
+            log::error!("Error within main node loop: {}", err);
+            log::error!("Shutting down node.");
+            node.shutdown().await.expect("could not shutdown node");
         };
         log::info!("Closing node.")
     });
```
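For context, a minimal, self-contained sketch of the spawn-and-shutdown pattern this hunk moves to, built on `tokio-util`'s `CancellationToken` and `TaskTracker` (the workspace pins `tokio-util` with the `rt` feature above). The `Node` type and its `run`/`shutdown` signatures are assumptions inferred from the diff, not the crate's real API:

```rust
use tokio_util::{sync::CancellationToken, task::TaskTracker};

// Hypothetical stand-in for the compute node; `run`/`shutdown` signatures
// are inferred from the diff, not taken from the real crate.
struct Node;

impl Node {
    async fn run(&mut self, token: CancellationToken) -> Result<(), String> {
        token.cancelled().await; // real loop: select over messages until cancelled
        Ok(())
    }

    async fn shutdown(&mut self) -> Result<(), String> {
        Ok(()) // real code: close channels, leave the network, etc.
    }
}

#[tokio::main]
async fn main() {
    let cancellation = CancellationToken::new();
    let task_tracker = TaskTracker::new();

    let mut node = Node;
    let node_token = cancellation.clone();
    task_tracker.spawn(async move {
        // on error, attempt a clean shutdown instead of panicking
        // (the real code logs via log::error!)
        if let Err(err) = node.run(node_token).await {
            eprintln!("Error within main node loop: {}", err);
            node.shutdown().await.expect("could not shutdown node");
        }
    });

    // cancel and wait for the spawned task to finish
    cancellation.cancel();
    task_tracker.close();
    task_tracker.wait().await;
}
```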

compute/src/node.rs

Lines changed: 8 additions & 4 deletions
```diff
@@ -331,10 +331,9 @@ impl DriaComputeNode {
 
         loop {
             tokio::select! {
-                // check peer count every now and then
-                _ = peer_refresh_interval.tick() => self.handle_diagnostic_refresh().await,
-                // available nodes are refreshed every now and then
-                _ = available_node_refresh_interval.tick() => self.handle_available_nodes_refresh().await,
+                // prioritize the branches in the order below
+                biased;
+
                 // a Workflow message to be published is received from the channel
                 // this is expected to be sent by the workflow worker
                 publish_msg_opt = self.publish_rx.recv() => {
@@ -358,6 +357,11 @@ impl DriaComputeNode {
                         break;
                     };
                 },
+
+                // check peer count every now and then
+                _ = peer_refresh_interval.tick() => self.handle_diagnostic_refresh().await,
+                // available nodes are refreshed every now and then
+                _ = available_node_refresh_interval.tick() => self.handle_available_nodes_refresh().await,
                 // a GossipSub message is received from the channel
                 // this is expected to be sent by the p2p client
                 gossipsub_msg_opt = self.message_rx.recv() => {
```
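`biased;` is a real `tokio::select!` modifier: it makes the macro poll branches top-to-bottom instead of in random order, so earlier branches win whenever several are ready at once. A standalone sketch of the reordering above (the channel and interval here are illustrative, not the node's actual fields):

```rust
use std::time::Duration;
use tokio::sync::mpsc;
use tokio::time::interval;

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<u32>(8);
    let mut ticker = interval(Duration::from_millis(100));

    tokio::spawn(async move {
        for i in 0..3 {
            tx.send(i).await.unwrap();
        }
        // dropping `tx` here closes the channel, ending the loop below
    });

    loop {
        tokio::select! {
            // poll branches in the order written: queued messages are
            // always drained before a tick is handled
            biased;

            msg = rx.recv() => match msg {
                Some(msg) => println!("message: {}", msg),
                None => break, // channel closed
            },
            _ = ticker.tick() => println!("tick"),
        }
    }
}
```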

compute/src/payloads/stats.rs

Lines changed: 18 additions & 0 deletions
```diff
@@ -12,7 +12,12 @@ pub struct TaskStats {
     /// Timestamp at which the task was published back to network.
     pub published_at: u128,
     /// Time taken to execute the task.
+    /// FIXME: will be removed after
     pub execution_time: u128,
+    /// Timestamp at which the task execution had started.
+    pub execution_started_at: u128,
+    /// Timestamp at which the task execution had finished.
+    pub execution_ended_time: u128,
 }
 
 impl TaskStats {
@@ -33,7 +38,20 @@ impl TaskStats {
         self
     }
 
+    /// Records the execution start time within `execution_started_at`.
+    pub fn record_execution_started_at(mut self) -> Self {
+        self.execution_started_at = get_current_time_nanos();
+        self
+    }
+
+    /// Records the execution end time within `execution_ended_time`.
+    pub fn record_execution_ended_at(mut self) -> Self {
+        self.execution_ended_time = get_current_time_nanos();
+        self
+    }
+
     /// Records the execution time of the task.
+    /// TODO: #[deprecated = "will be removed later"]
     pub fn record_execution_time(mut self, started_at: Instant) -> Self {
         self.execution_time = Instant::now().duration_since(started_at).as_nanos();
         self
```
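To see how the new builder-style recorders compose, here is a reduced sketch: the struct keeps only the two new fields, and `get_current_time_nanos` is assumed (from the `u128` fields) to be a nanoseconds-since-UNIX-epoch helper; the real definition lives elsewhere in the crate:

```rust
use std::time::{SystemTime, UNIX_EPOCH};

// Assumed helper: nanoseconds since the UNIX epoch, matching the u128 fields.
fn get_current_time_nanos() -> u128 {
    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("time went backwards")
        .as_nanos()
}

#[derive(Debug, Default)]
pub struct TaskStats {
    pub execution_started_at: u128,
    pub execution_ended_time: u128,
}

impl TaskStats {
    pub fn record_execution_started_at(mut self) -> Self {
        self.execution_started_at = get_current_time_nanos();
        self
    }

    pub fn record_execution_ended_at(mut self) -> Self {
        self.execution_ended_time = get_current_time_nanos();
        self
    }
}

fn main() {
    let stats = TaskStats::default().record_execution_started_at();
    // ... execute the task ...
    let stats = stats.record_execution_ended_at();
    // wall-clock duration is recoverable from the two timestamps
    let elapsed_ns = stats.execution_ended_time - stats.execution_started_at;
    println!("execution took {} ns", elapsed_ns);
}
```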

compute/src/workers/workflow.rs

Lines changed: 44 additions & 32 deletions
```diff
@@ -78,7 +78,7 @@ impl WorkflowsWorker {
 
         if let Some(task) = task {
             log::info!("Processing single workflow for task {}", task.task_id);
-            WorkflowsWorker::execute((task, self.publish_tx.clone())).await
+            WorkflowsWorker::execute((task, &self.publish_tx)).await
         } else {
             return self.shutdown();
         };
@@ -93,76 +93,85 @@ impl WorkflowsWorker {
     ///
     /// Batch size must NOT be larger than `MAX_BATCH_SIZE`, otherwise will panic.
     pub async fn run_batch(&mut self, batch_size: usize) {
-        // TODO: need some better batch_size error handling here
+        assert!(
+            batch_size <= Self::MAX_BATCH_SIZE,
+            "Batch size must not be larger than {}",
+            Self::MAX_BATCH_SIZE
+        );
+
         loop {
-            // get tasks in batch from the channel
-            let mut task_buffer = Vec::new();
-            let num_tasks = self
-                .workflow_rx
-                .recv_many(&mut task_buffer, batch_size)
-                .await;
-
-            if num_tasks == 0 {
-                return self.shutdown();
+            let mut tasks = Vec::new();
+
+            // get tasks in batch from the channel, we enter the loop if:
+            // (1) there are no tasks, or,
+            // (2) there are tasks less than the batch size and the channel is not empty
+            while tasks.len() == 0 || (tasks.len() < batch_size && !self.workflow_rx.is_empty()) {
+                let limit = batch_size - tasks.len();
+                match self.workflow_rx.recv_many(&mut tasks, limit).await {
+                    // 0 tasks returned means that the channel is closed
+                    0 => return self.shutdown(),
+                    _ => {
+                        // wait a small amount of time to allow for more tasks to be sent into the channel
+                        tokio::time::sleep(std::time::Duration::from_millis(256)).await;
+                    }
+                }
             }
 
             // process the batch
+            let num_tasks = tasks.len();
+            debug_assert!(
+                num_tasks <= batch_size,
+                "number of tasks cant be larger than batch size"
+            );
+            debug_assert!(num_tasks != 0, "number of tasks cant be zero");
             log::info!("Processing {} workflows in batch", num_tasks);
-            let mut batch = task_buffer
-                .into_iter()
-                .map(|b| (b, self.publish_tx.clone()));
+            let mut batch = tasks.into_iter().map(|b| (b, &self.publish_tx));
             match num_tasks {
                 1 => {
-                    let r0 = WorkflowsWorker::execute(batch.next().unwrap()).await;
-                    vec![r0]
+                    WorkflowsWorker::execute(batch.next().unwrap()).await;
                 }
                 2 => {
-                    let (r0, r1) = tokio::join!(
+                    tokio::join!(
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap())
                     );
-                    vec![r0, r1]
                 }
                 3 => {
-                    let (r0, r1, r2) = tokio::join!(
+                    tokio::join!(
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap())
                     );
-                    vec![r0, r1, r2]
                 }
                 4 => {
-                    let (r0, r1, r2, r3) = tokio::join!(
+                    tokio::join!(
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap())
                     );
-                    vec![r0, r1, r2, r3]
                 }
                 5 => {
-                    let (r0, r1, r2, r3, r4) = tokio::join!(
+                    tokio::join!(
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap())
                     );
-                    vec![r0, r1, r2, r3, r4]
                 }
                 6 => {
-                    let (r0, r1, r2, r3, r4, r5) = tokio::join!(
+                    tokio::join!(
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap())
                     );
-                    vec![r0, r1, r2, r3, r4, r5]
                 }
                 7 => {
-                    let (r0, r1, r2, r3, r4, r5, r6) = tokio::join!(
+                    tokio::join!(
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
@@ -171,10 +180,9 @@ impl WorkflowsWorker {
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap())
                     );
-                    vec![r0, r1, r2, r3, r4, r5, r6]
                 }
                 8 => {
-                    let (r0, r1, r2, r3, r4, r5, r6, r7) = tokio::join!(
+                    tokio::join!(
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap()),
@@ -184,7 +192,6 @@ impl WorkflowsWorker {
                         WorkflowsWorker::execute(batch.next().unwrap()),
                         WorkflowsWorker::execute(batch.next().unwrap())
                     );
-                    vec![r0, r1, r2, r3, r4, r5, r6, r7]
                 }
                 _ => {
                     unreachable!(
@@ -199,23 +206,28 @@ impl WorkflowsWorker {
 
     /// Executes a single task, and publishes the output.
     pub async fn execute(
-        (input, publish_tx): (WorkflowsWorkerInput, mpsc::Sender<WorkflowsWorkerOutput>),
+        (input, publish_tx): (WorkflowsWorkerInput, &mpsc::Sender<WorkflowsWorkerOutput>),
     ) {
+        let mut stats = input.stats;
+
         let mut memory = ProgramMemory::new();
 
+        // TODO: will be removed later
         let started_at = std::time::Instant::now();
+        stats = stats.record_execution_started_at();
         let result = input
             .executor
             .execute(input.entry.as_ref(), &input.workflow, &mut memory)
             .await;
+        stats = stats.record_execution_ended_at();
 
         let output = WorkflowsWorkerOutput {
             result,
             public_key: input.public_key,
             task_id: input.task_id,
             model_name: input.model_name,
             batchable: input.batchable,
-            stats: input.stats.record_execution_time(started_at),
+            stats: stats.record_execution_time(started_at),
         };
 
         if let Err(e) = publish_tx.send(output).await {
```
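The interesting part of the new batch logic is the accumulation loop: keep calling `recv_many` until at least one task is buffered, top up while the channel still holds more (capped at `batch_size`), and treat a return of 0 as a closed channel. A generic sketch of just that loop, with an illustrative `collect_batch` helper standing in for the worker (the 256 ms top-up delay is taken from the diff):

```rust
use tokio::sync::mpsc;

// Standalone sketch of the batch-accumulation loop, generic over the task type.
// Returns None once the channel is closed, signalling the caller to shut down.
async fn collect_batch<T>(rx: &mut mpsc::Receiver<T>, batch_size: usize) -> Option<Vec<T>> {
    assert!(batch_size > 0, "batch size must be positive");
    let mut tasks = Vec::new();

    // keep receiving while we have no tasks at all, or while we are below the
    // batch size and the channel still has buffered tasks to drain
    while tasks.is_empty() || (tasks.len() < batch_size && !rx.is_empty()) {
        let limit = batch_size - tasks.len();
        match rx.recv_many(&mut tasks, limit).await {
            0 => return None, // channel closed
            _ => {
                // brief pause so more tasks can arrive and join this batch
                tokio::time::sleep(std::time::Duration::from_millis(256)).await;
            }
        }
    }

    Some(tasks)
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<u32>(64);
    for i in 0..5 {
        tx.send(i).await.unwrap();
    }
    drop(tx); // close the channel after the queued items

    while let Some(batch) = collect_batch(&mut rx, 8).await {
        println!("processing {} tasks", batch.len());
    }
}
```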

p2p/src/client.rs

Lines changed: 4 additions & 11 deletions
```diff
@@ -129,16 +129,6 @@ impl DriaP2PClient {
             swarm.dial(rpc_addr.clone())?;
         }
 
-        // add rpcs as explicit peers
-        // TODO: may not be necessary
-        // for rpc_peer_id in &nodes.rpc_peerids {
-        //     log::info!("Adding {} as explicit peer.", rpc_peer_id);
-        //     swarm
-        //         .behaviour_mut()
-        //         .gossipsub
-        //         .add_explicit_peer(rpc_peer_id);
-        // }
-
         // create commander
         let (cmd_tx, cmd_rx) = mpsc::channel(COMMAND_CHANNEL_BUFSIZE);
         let commander = DriaP2PCommander::new(cmd_tx, protocol.clone());
@@ -161,7 +151,9 @@ impl DriaP2PClient {
     pub async fn run(mut self) {
         loop {
             tokio::select! {
-                event = self.swarm.select_next_some() => self.handle_event(event).await,
+                // this is a special keyword that changes the polling order from random to linear,
+                // which will effectively prioritize commands over events
+                biased;
                 command = self.cmd_rx.recv() => match command {
                     Some(c) => self.handle_command(c).await,
                     // channel closed, thus shutting down the network event loop
@@ -170,6 +162,7 @@ impl DriaP2PClient {
                         return
                     },
                 },
+                event = self.swarm.select_next_some() => self.handle_event(event).await,
            }
        }
    }
```
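The shutdown convention in this loop — returning when `cmd_rx` yields `None` — falls out of `mpsc` semantics: a receiver returns `None` once every sender has been dropped. A stripped-down sketch of the commander pattern without the libp2p swarm (the `Command` variants and `Client` struct are illustrative, not the real `DriaP2PCommander` API):

```rust
use tokio::sync::mpsc;

// Hypothetical command set; the real commander wraps a channel like this.
enum Command {
    Publish(String),
    Shutdown,
}

struct Client {
    cmd_rx: mpsc::Receiver<Command>,
}

impl Client {
    async fn run(mut self) {
        loop {
            match self.cmd_rx.recv().await {
                Some(Command::Publish(msg)) => println!("publishing: {}", msg),
                // explicit shutdown, or channel closed because all senders dropped
                Some(Command::Shutdown) | None => {
                    println!("shutting down the network event loop");
                    return;
                }
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let (cmd_tx, cmd_rx) = mpsc::channel(32);
    let handle = tokio::spawn(Client { cmd_rx }.run());

    cmd_tx.send(Command::Publish("hello".into())).await.unwrap();
    cmd_tx.send(Command::Shutdown).await.unwrap();
    handle.await.unwrap();
}
```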
