Skip to content

Commit 50ffca4

Browse files
authored
[core][stats-die/03bis] improve scheduler_placement_time_s metric (#58217)
Change the unit of `scheduler_placement_time` from seconds to mili-seconds. The current bucket is in the range of 0.1s to 2.5 hours which doesn't make sense. According to a sample of data, the range we are interested in would be from us to s. Thanks @ZacAttack for pointing this out. ``` Note: This is an internal (non–public-facing) metric, so we only need to update its usage within Ray (e.g., the dashboard). A simple code change should suffice. ``` <img width="1609" height="421" alt="505491038-c5d81017-b86c-406f-acf4-614560752062" src="https://github.com/user-attachments/assets/cc647b97-42ec-42eb-bf01-4d1867940207" /> Test: - CI Signed-off-by: Cuong Nguyen <[email protected]>
1 parent 7271b0c commit 50ffca4

25 files changed

+60
-69
lines changed

python/ray/tests/test_scheduling_2.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -787,17 +787,17 @@ def ready(self):
787787
placement_metric_condition = get_metric_check_condition(
788788
[
789789
MetricSamplePattern(
790-
name="ray_scheduler_placement_time_s_bucket",
790+
name="ray_scheduler_placement_time_ms_bucket",
791791
value=1.0,
792792
partial_label_match={"WorkloadType": "Actor"},
793793
),
794794
MetricSamplePattern(
795-
name="ray_scheduler_placement_time_s_bucket",
795+
name="ray_scheduler_placement_time_ms_bucket",
796796
value=1.0,
797797
partial_label_match={"WorkloadType": "Task"},
798798
),
799799
MetricSamplePattern(
800-
name="ray_scheduler_placement_time_s_bucket",
800+
name="ray_scheduler_placement_time_ms_bucket",
801801
value=1.0,
802802
partial_label_match={"WorkloadType": "PlacementGroup"},
803803
),

src/ray/common/metrics.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,15 @@ inline ray::stats::Gauge GetObjectStoreMemoryGaugeMetric() {
7373
};
7474
}
7575

76-
inline ray::stats::Histogram GetSchedulerPlacementTimeSHistogramMetric() {
76+
inline ray::stats::Histogram GetSchedulerPlacementTimeMsHistogramMetric() {
7777
return ray::stats::Histogram{
78-
/*name=*/"scheduler_placement_time_s",
78+
/*name=*/"scheduler_placement_time_ms",
7979
/*description=*/
80-
"The time it takes for a worklod (task, actor, placement group) to "
80+
"The time it takes for a workload (task, actor, placement group) to "
8181
"be placed. This is the time from when the tasks dependencies are "
8282
"resolved to when it actually reserves resources on a node to run.",
83-
/*unit=*/"s",
84-
/*boundaries=*/{0.1, 1, 10, 100, 1000, 10000},
83+
/*unit=*/"ms",
84+
/*boundaries=*/{1, 10, 100, 1000, 10000},
8585
/*tag_keys=*/{"WorkloadType"},
8686
};
8787
}

src/ray/common/task/task_spec.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -612,15 +612,15 @@ bool TaskSpecification::IsRetriable() const {
612612
}
613613

614614
void TaskSpecification::EmitTaskMetrics(
615-
ray::observability::MetricInterface &scheduler_placement_time_s_histogram) const {
616-
double duration_s = (GetMessage().lease_grant_timestamp_ms() -
617-
GetMessage().dependency_resolution_timestamp_ms()) /
618-
1000;
615+
ray::observability::MetricInterface &scheduler_placement_time_ms_histogram) const {
616+
double duration_ms = GetMessage().lease_grant_timestamp_ms() -
617+
GetMessage().dependency_resolution_timestamp_ms();
619618

620619
if (IsActorCreationTask()) {
621-
scheduler_placement_time_s_histogram.Record(duration_s, {{"WorkloadType", "Actor"}});
620+
scheduler_placement_time_ms_histogram.Record(duration_ms,
621+
{{"WorkloadType", "Actor"}});
622622
} else {
623-
scheduler_placement_time_s_histogram.Record(duration_s, {{"WorkloadType", "Task"}});
623+
scheduler_placement_time_ms_histogram.Record(duration_ms, {{"WorkloadType", "Task"}});
624624
}
625625
}
626626

src/ray/common/task/task_spec.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ class TaskSpecification : public MessageWrapper<rpc::TaskSpec> {
365365
bool IsRetriable() const;
366366

367367
void EmitTaskMetrics(
368-
ray::observability::MetricInterface &scheduler_placement_time_s_histogram) const;
368+
ray::observability::MetricInterface &scheduler_placement_time_ms_histogram) const;
369369

370370
/// \return true if task events from this task should be reported.
371371
bool EnableTaskEvents() const;

src/ray/core_worker/core_worker_process.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -552,7 +552,7 @@ std::shared_ptr<CoreWorker> CoreWorkerProcessImpl::CreateCoreWorker(
552552
return rpc::TensorTransport::OBJECT_STORE;
553553
},
554554
boost::asio::steady_timer(io_service_),
555-
*scheduler_placement_time_s_histogram_);
555+
*scheduler_placement_time_ms_histogram_);
556556

557557
auto report_locality_data_callback = [this](
558558
const ObjectID &object_id,
@@ -797,8 +797,8 @@ CoreWorkerProcessImpl::CoreWorkerProcessImpl(const CoreWorkerOptions &options)
797797
new ray::stats::Gauge(GetActorByStateGaugeMetric()));
798798
total_lineage_bytes_gauge_ = std::unique_ptr<ray::stats::Gauge>(
799799
new ray::stats::Gauge(GetTotalLineageBytesGaugeMetric()));
800-
scheduler_placement_time_s_histogram_ = std::unique_ptr<ray::stats::Histogram>(
801-
new ray::stats::Histogram(GetSchedulerPlacementTimeSHistogramMetric()));
800+
scheduler_placement_time_ms_histogram_ = std::unique_ptr<ray::stats::Histogram>(
801+
new ray::stats::Histogram(GetSchedulerPlacementTimeMsHistogramMetric()));
802802

803803
// Initialize event framework before starting up worker.
804804
if (RayConfig::instance().event_log_reporter_enabled() && !options_.log_dir.empty()) {

src/ray/core_worker/core_worker_process.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ class CoreWorkerProcessImpl {
192192
std::unique_ptr<ray::stats::Gauge> task_by_state_gauge_;
193193
std::unique_ptr<ray::stats::Gauge> actor_by_state_gauge_;
194194
std::unique_ptr<ray::stats::Gauge> total_lineage_bytes_gauge_;
195-
std::unique_ptr<ray::stats::Histogram> scheduler_placement_time_s_histogram_;
195+
std::unique_ptr<ray::stats::Histogram> scheduler_placement_time_ms_histogram_;
196196
};
197197
} // namespace core
198198
} // namespace ray

src/ray/core_worker/task_submission/normal_task_submitter.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ void NormalTaskSubmitter::OnWorkerIdle(
177177
scheduling_key_entry.num_busy_workers++;
178178

179179
task_spec.GetMutableMessage().set_lease_grant_timestamp_ms(current_sys_time_ms());
180-
task_spec.EmitTaskMetrics(scheduler_placement_time_s_histogram_);
180+
task_spec.EmitTaskMetrics(scheduler_placement_time_ms_histogram_);
181181

182182
executing_tasks_.emplace(task_spec.TaskId(), addr);
183183
PushNormalTask(

src/ray/core_worker/task_submission/normal_task_submitter.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ class NormalTaskSubmitter {
9696
std::shared_ptr<LeaseRequestRateLimiter> lease_request_rate_limiter,
9797
const TensorTransportGetter &tensor_transport_getter,
9898
boost::asio::steady_timer cancel_timer,
99-
ray::observability::MetricInterface &scheduler_placement_time_s_histogram)
99+
ray::observability::MetricInterface &scheduler_placement_time_ms_histogram)
100100
: rpc_address_(std::move(rpc_address)),
101101
local_raylet_client_(std::move(local_raylet_client)),
102102
raylet_client_pool_(std::move(raylet_client_pool)),
@@ -111,7 +111,7 @@ class NormalTaskSubmitter {
111111
job_id_(job_id),
112112
lease_request_rate_limiter_(std::move(lease_request_rate_limiter)),
113113
cancel_retry_timer_(std::move(cancel_timer)),
114-
scheduler_placement_time_s_histogram_(scheduler_placement_time_s_histogram) {}
114+
scheduler_placement_time_ms_histogram_(scheduler_placement_time_ms_histogram) {}
115115

116116
/// Schedule a task for direct submission to a worker.
117117
void SubmitTask(TaskSpecification task_spec);
@@ -365,7 +365,7 @@ class NormalTaskSubmitter {
365365
// Retries cancelation requests if they were not successful.
366366
boost::asio::steady_timer cancel_retry_timer_ ABSL_GUARDED_BY(mu_);
367367

368-
ray::observability::MetricInterface &scheduler_placement_time_s_histogram_;
368+
ray::observability::MetricInterface &scheduler_placement_time_ms_histogram_;
369369
};
370370

371371
} // namespace core

src/ray/core_worker/task_submission/tests/normal_task_submitter_test.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -496,7 +496,7 @@ class NormalTaskSubmitterTest : public testing::Test {
496496
rate_limiter,
497497
[](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; },
498498
boost::asio::steady_timer(io_context),
499-
fake_scheduler_placement_time_s_histogram_);
499+
fake_scheduler_placement_time_ms_histogram_);
500500
}
501501

502502
NodeID local_node_id;
@@ -513,7 +513,7 @@ class NormalTaskSubmitterTest : public testing::Test {
513513
std::unique_ptr<MockLeasePolicy> lease_policy;
514514
MockLeasePolicy *lease_policy_ptr = nullptr;
515515
instrumented_io_context io_context;
516-
ray::observability::FakeHistogram fake_scheduler_placement_time_s_histogram_;
516+
ray::observability::FakeHistogram fake_scheduler_placement_time_ms_histogram_;
517517
};
518518

519519
TEST_F(NormalTaskSubmitterTest, TestLocalityAwareSubmitOneTask) {
@@ -1433,7 +1433,7 @@ void TestSchedulingKey(const std::shared_ptr<CoreWorkerMemoryStore> store,
14331433
const TaskSpecification &same2,
14341434
const TaskSpecification &different) {
14351435
rpc::Address address;
1436-
ray::observability::FakeHistogram fake_scheduler_placement_time_s_histogram_;
1436+
ray::observability::FakeHistogram fake_scheduler_placement_time_ms_histogram_;
14371437
auto local_node_id = NodeID::FromRandom();
14381438
auto raylet_client = std::make_shared<MockRayletClient>();
14391439
auto raylet_client_pool = std::make_shared<rpc::RayletClientPool>(
@@ -1462,7 +1462,7 @@ void TestSchedulingKey(const std::shared_ptr<CoreWorkerMemoryStore> store,
14621462
std::make_shared<StaticLeaseRequestRateLimiter>(1),
14631463
[](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; },
14641464
boost::asio::steady_timer(io_context),
1465-
fake_scheduler_placement_time_s_histogram_);
1465+
fake_scheduler_placement_time_ms_histogram_);
14661466

14671467
submitter.SubmitTask(same1);
14681468
submitter.SubmitTask(same2);

src/ray/core_worker/tests/core_worker_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ class CoreWorkerTest : public ::testing::Test {
224224
lease_request_rate_limiter,
225225
[](const ObjectID &object_id) { return rpc::TensorTransport::OBJECT_STORE; },
226226
boost::asio::steady_timer(io_service_),
227-
fake_scheduler_placement_time_s_histogram_);
227+
fake_scheduler_placement_time_ms_histogram_);
228228

229229
auto actor_task_submitter = std::make_unique<ActorTaskSubmitter>(
230230
*core_worker_client_pool,
@@ -300,7 +300,7 @@ class CoreWorkerTest : public ::testing::Test {
300300
ray::observability::FakeGauge fake_task_by_state_gauge_;
301301
ray::observability::FakeGauge fake_actor_by_state_gauge_;
302302
ray::observability::FakeGauge fake_total_lineage_bytes_gauge_;
303-
ray::observability::FakeHistogram fake_scheduler_placement_time_s_histogram_;
303+
ray::observability::FakeHistogram fake_scheduler_placement_time_ms_histogram_;
304304
std::unique_ptr<FakePeriodicalRunner> fake_periodical_runner_;
305305

306306
// Controllable time for testing publisher timeouts

0 commit comments

Comments
 (0)