Skip to content

Commit c680137

Browse files
fix(ray): runtime_env can be None leading to a job error [backport 3.16] (#14903)
Backport f11c438 from #14837 to 3.16. When submitting a job, `runtime_env` could be `None` in `kwargs`, leading to an `AttributeError`; see this [link](https://dd.datad0g.com/llm/distributed-ai/jobs?query=%40component%3Aray%20parent_id%3A0%20job-113464b6-33d0-47&agg_m=count&agg_m_source=base&agg_t=count&colorLegendSort=time&fromUser=false&spanId=5240087393744960581&traceId=68e818d800000000e5341ed29f2823d9&start=1759437349664&end=1760042149664&paused=false). This PR fixes this issue. It also updates two snapshots that were missing a span. The tests were marked as xfail, so the missing span was not caught in CI. Co-authored-by: Louis Tricot <[email protected]>
1 parent 792d493 commit c680137

File tree

4 files changed

+112
-34
lines changed

4 files changed

+112
-34
lines changed

ddtrace/contrib/internal/ray/patch.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,11 @@ def traced_submit_job(wrapped, instance, args, kwargs):
232232
submit_span.set_tag_str(RAY_SUBMISSION_ID_TAG, submission_id)
233233

234234
# Inject the context of the job so that ray.job.run is its child
235-
env_vars = kwargs.setdefault("runtime_env", {}).setdefault("env_vars", {})
235+
runtime_env = kwargs.get("runtime_env") or {}
236+
kwargs["runtime_env"] = runtime_env
237+
env_vars = runtime_env.get("env_vars") or {}
238+
runtime_env["env_vars"] = env_vars
239+
236240
_TraceContext._inject(job_span.context, env_vars)
237241
env_vars[RAY_SUBMISSION_ID] = submission_id
238242
if job_name:
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
ray: This fix resolves an issue where submitting a Ray job raised an ``AttributeError`` when ``runtime_env`` (or its ``env_vars`` entry) was ``None``.

tests/snapshots/tests.contrib.ray.test_ray.test_simple_put.json

Lines changed: 52 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,17 @@
1010
"error": 0,
1111
"meta": {
1212
"_dd.base_service": "tests.contrib.ray",
13-
"_dd.hostname": "docker-desktop",
1413
"_dd.p.dm": "-0",
15-
"_dd.p.tid": "68de3cb400000000",
14+
"_dd.p.tid": "68e8d2ce00000000",
1615
"component": "ray",
1716
"language": "python",
1817
"ray.hostname": "docker-desktop",
1918
"ray.job_id": "01000000",
20-
"ray.node_id": "6e08a6b14aa1db44ba918cb17ed0223d61e75c5193e0df2b881dd2e2",
19+
"ray.node_id": "84adfe319dc863f16fa08d39416a29717ea886f5c78ec87297358957",
2120
"ray.put.value_size_bytes": "28",
2221
"ray.put.value_type": "int",
2322
"ray.worker_id": "01000000ffffffffffffffffffffffffffffffffffffffffffffffff",
24-
"runtime-id": "5e2c80f345d34a8082da382e68b097bc",
23+
"runtime-id": "5e417ffae56949ccbc063a6f4c923f97",
2524
"span.kind": "producer"
2625
},
2726
"metrics": {
@@ -34,8 +33,8 @@
3433
"_sampling_priority_v1": 2,
3534
"process_id": 543
3635
},
37-
"duration": 190167,
38-
"start": 1759394996599461798
36+
"duration": 214833,
37+
"start": 1760088782207075883
3938
}],
4039
[
4140
{
@@ -48,17 +47,16 @@
4847
"type": "ray",
4948
"error": 0,
5049
"meta": {
51-
"_dd.hostname": "docker-desktop",
5250
"_dd.p.dm": "-0",
53-
"_dd.p.tid": "68de3cb400000000",
51+
"_dd.p.tid": "68e8d2ce00000000",
5452
"component": "ray",
5553
"language": "python",
5654
"ray.hostname": "docker-desktop",
5755
"ray.job_id": "01000000",
58-
"ray.node_id": "6e08a6b14aa1db44ba918cb17ed0223d61e75c5193e0df2b881dd2e2",
56+
"ray.node_id": "84adfe319dc863f16fa08d39416a29717ea886f5c78ec87297358957",
5957
"ray.task.submit_status": "success",
6058
"ray.worker_id": "01000000ffffffffffffffffffffffffffffffffffffffffffffffff",
61-
"runtime-id": "5e2c80f345d34a8082da382e68b097bc",
59+
"runtime-id": "5e417ffae56949ccbc063a6f4c923f97",
6260
"span.kind": "producer"
6361
},
6462
"metrics": {
@@ -71,8 +69,8 @@
7169
"_sampling_priority_v1": 2,
7270
"process_id": 543
7371
},
74-
"duration": 2298667,
75-
"start": 1759394996599837756
72+
"duration": 2818542,
73+
"start": 1760088782207505258
7674
},
7775
{
7876
"name": "task.execute",
@@ -87,10 +85,10 @@
8785
"component": "ray",
8886
"ray.hostname": "docker-desktop",
8987
"ray.job_id": "01000000",
90-
"ray.node_id": "6e08a6b14aa1db44ba918cb17ed0223d61e75c5193e0df2b881dd2e2",
88+
"ray.node_id": "84adfe319dc863f16fa08d39416a29717ea886f5c78ec87297358957",
9189
"ray.task.status": "success",
9290
"ray.worker_id": "01000000ffffffffffffffffffffffffffffffffffffffffffffffff",
93-
"runtime-id": "5e2c80f345d34a8082da382e68b097bc",
91+
"runtime-id": "5e417ffae56949ccbc063a6f4c923f97",
9492
"span.kind": "consumer"
9593
},
9694
"metrics": {
@@ -102,6 +100,43 @@
102100
"_sampling_priority_v1": 2,
103101
"process_id": 543
104102
},
105-
"duration": 196875,
106-
"start": 1759394996601777715
107-
}]]
103+
"duration": 259500,
104+
"start": 1760088782209866550
105+
}],
106+
[
107+
{
108+
"name": "ray.get",
109+
"service": "unnamed.ray.job",
110+
"resource": "ray.get",
111+
"trace_id": 2,
112+
"span_id": 1,
113+
"parent_id": 0,
114+
"type": "ray",
115+
"error": 0,
116+
"meta": {
117+
"_dd.base_service": "tests.contrib.ray",
118+
"_dd.p.dm": "-0",
119+
"_dd.p.tid": "68e8d2ce00000000",
120+
"component": "ray",
121+
"language": "python",
122+
"ray.get.value_size_bytes": "64",
123+
"ray.hostname": "docker-desktop",
124+
"ray.job_id": "01000000",
125+
"ray.node_id": "84adfe319dc863f16fa08d39416a29717ea886f5c78ec87297358957",
126+
"ray.worker_id": "01000000ffffffffffffffffffffffffffffffffffffffffffffffff",
127+
"runtime-id": "5e417ffae56949ccbc063a6f4c923f97",
128+
"span.kind": "producer"
129+
},
130+
"metrics": {
131+
"_dd.ai_obs.enabled": 1,
132+
"_dd.djm.enabled": 1,
133+
"_dd.filter.kept": 1,
134+
"_dd.measured": 1,
135+
"_dd.top_level": 1,
136+
"_dd.tracer_kr": 1.0,
137+
"_sampling_priority_v1": 2,
138+
"process_id": 543
139+
},
140+
"duration": 347000,
141+
"start": 1760088782210520300
142+
}]]

tests/snapshots/tests.contrib.ray.test_ray.test_simple_wait.json

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,16 @@
99
"type": "ray",
1010
"error": 0,
1111
"meta": {
12-
"_dd.hostname": "docker-desktop",
1312
"_dd.p.dm": "-0",
14-
"_dd.p.tid": "68dd410c00000000",
13+
"_dd.p.tid": "68e8d2ce00000000",
1514
"component": "ray",
1615
"language": "python",
1716
"ray.hostname": "docker-desktop",
1817
"ray.job_id": "01000000",
19-
"ray.node_id": "53c13c58eb47ac2803bc79a5dc776959895f9f822320fdf1ab6a4f41",
18+
"ray.node_id": "84adfe319dc863f16fa08d39416a29717ea886f5c78ec87297358957",
2019
"ray.task.submit_status": "success",
2120
"ray.worker_id": "01000000ffffffffffffffffffffffffffffffffffffffffffffffff",
22-
"runtime-id": "7fb280a5458e4eecaeae186719e61896",
21+
"runtime-id": "5e417ffae56949ccbc063a6f4c923f97",
2322
"span.kind": "producer"
2423
},
2524
"metrics": {
@@ -32,8 +31,8 @@
3231
"_sampling_priority_v1": 2,
3332
"process_id": 543
3433
},
35-
"duration": 3002916,
36-
"start": 1759330572760058212
34+
"duration": 3052292,
35+
"start": 1760088782235622758
3736
},
3837
{
3938
"name": "task.execute",
@@ -48,10 +47,10 @@
4847
"component": "ray",
4948
"ray.hostname": "docker-desktop",
5049
"ray.job_id": "01000000",
51-
"ray.node_id": "53c13c58eb47ac2803bc79a5dc776959895f9f822320fdf1ab6a4f41",
50+
"ray.node_id": "84adfe319dc863f16fa08d39416a29717ea886f5c78ec87297358957",
5251
"ray.task.status": "success",
5352
"ray.worker_id": "01000000ffffffffffffffffffffffffffffffffffffffffffffffff",
54-
"runtime-id": "7fb280a5458e4eecaeae186719e61896",
53+
"runtime-id": "5e417ffae56949ccbc063a6f4c923f97",
5554
"span.kind": "consumer"
5655
},
5756
"metrics": {
@@ -63,8 +62,8 @@
6362
"_sampling_priority_v1": 2,
6463
"process_id": 543
6564
},
66-
"duration": 291083,
67-
"start": 1759330572762500212
65+
"duration": 248958,
66+
"start": 1760088782238245050
6867
}],
6968
[
7069
{
@@ -77,18 +76,17 @@
7776
"type": "ray",
7877
"error": 0,
7978
"meta": {
80-
"_dd.hostname": "docker-desktop",
8179
"_dd.p.dm": "-0",
82-
"_dd.p.tid": "68dd410c00000000",
80+
"_dd.p.tid": "68e8d2ce00000000",
8381
"component": "ray",
8482
"language": "python",
8583
"ray.hostname": "docker-desktop",
8684
"ray.job_id": "01000000",
87-
"ray.node_id": "53c13c58eb47ac2803bc79a5dc776959895f9f822320fdf1ab6a4f41",
85+
"ray.node_id": "84adfe319dc863f16fa08d39416a29717ea886f5c78ec87297358957",
8886
"ray.wait.num_returns": "1",
8987
"ray.wait.timeout_s": "60",
9088
"ray.worker_id": "01000000ffffffffffffffffffffffffffffffffffffffffffffffff",
91-
"runtime-id": "7fb280a5458e4eecaeae186719e61896",
89+
"runtime-id": "5e417ffae56949ccbc063a6f4c923f97",
9290
"span.kind": "producer"
9391
},
9492
"metrics": {
@@ -101,6 +99,43 @@
10199
"_sampling_priority_v1": 2,
102100
"process_id": 543
103101
},
104-
"duration": 414292,
105-
"start": 1759330572763224920
102+
"duration": 235042,
103+
"start": 1760088782238835341
104+
}],
105+
[
106+
{
107+
"name": "ray.get",
108+
"service": "unnamed.ray.job",
109+
"resource": "ray.get",
110+
"trace_id": 2,
111+
"span_id": 1,
112+
"parent_id": 0,
113+
"type": "ray",
114+
"error": 0,
115+
"meta": {
116+
"_dd.base_service": "tests.contrib.ray",
117+
"_dd.p.dm": "-0",
118+
"_dd.p.tid": "68e8d2ce00000000",
119+
"component": "ray",
120+
"language": "python",
121+
"ray.get.value_size_bytes": "88",
122+
"ray.hostname": "docker-desktop",
123+
"ray.job_id": "01000000",
124+
"ray.node_id": "84adfe319dc863f16fa08d39416a29717ea886f5c78ec87297358957",
125+
"ray.worker_id": "01000000ffffffffffffffffffffffffffffffffffffffffffffffff",
126+
"runtime-id": "5e417ffae56949ccbc063a6f4c923f97",
127+
"span.kind": "producer"
128+
},
129+
"metrics": {
130+
"_dd.ai_obs.enabled": 1,
131+
"_dd.djm.enabled": 1,
132+
"_dd.filter.kept": 1,
133+
"_dd.measured": 1,
134+
"_dd.top_level": 1,
135+
"_dd.tracer_kr": 1.0,
136+
"_sampling_priority_v1": 2,
137+
"process_id": 543
138+
},
139+
"duration": 327208,
140+
"start": 1760088782239202258
106141
}]]

0 commit comments

Comments
 (0)