Skip to content

Commit 598b077

Browse files
committed
Adding code
1 parent 2959f47 commit 598b077

10 files changed

+16902
-0
lines changed
1.69 MB
Binary file not shown.

AWS+training_+Ray+cluster+deployment.doc

Lines changed: 13831 additions & 0 deletions
Large diffs are not rendered by default.

cloudwatch-basic.yaml

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,32 @@
1+
provider:
2+
type: aws
3+
region: us-west-2
4+
availability_zone: us-west-2a
5+
# Start by defining a `cloudwatch` section to enable CloudWatch integration with your Ray cluster.
6+
cloudwatch:
7+
agent:
8+
# Path to Unified CloudWatch Agent config file
9+
config: "cloudwatch/example-cloudwatch-agent-config.json"
10+
dashboard:
11+
# CloudWatch Dashboard name
12+
name: "example-dashboard-name"
13+
# Path to the CloudWatch Dashboard config file
14+
config: "cloudwatch/example-cloudwatch-dashboard-config.json"
15+
16+
auth:
17+
ssh_user: ubuntu
18+
19+
available_node_types:
20+
ray.head.default:
21+
node_config:
22+
InstanceType: c5a.large
23+
ImageId: ami-0d88d9cbe28fac870 # Unified CloudWatch agent pre-installed AMI, us-west-2
24+
resources: {}
25+
ray.worker.default:
26+
node_config:
27+
InstanceType: c5a.large
28+
ImageId: ami-0d88d9cbe28fac870 # Unified CloudWatch agent pre-installed AMI, us-west-2
29+
IamInstanceProfile:
30+
Name: ray-autoscaler-cloudwatch-v1
31+
resources: {}
32+
min_workers: 0
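(File name not captured in this extract. The JSON below appears to be a CloudWatch agent configuration, possibly the `cloudwatch/example-cloudwatch-agent-config.json` referenced in the YAML above; confirm against the commit.)

Lines changed: 186 additions & 0 deletions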
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,186 @@
1+
{
2+
"agent":{
3+
"metrics_collection_interval":60,
4+
"run_as_user":"root"
5+
},
6+
"logs":{
7+
"metrics_collected": {
8+
"prometheus": {
9+
"log_group_name": "{cluster_name}-ray-prometheus",
10+
"prometheus_config_path": "/opt/aws/amazon-cloudwatch-agent/etc/prometheus.yml",
11+
"emf_processor": {
12+
"metric_declaration_dedup": true,
13+
"metric_namespace": "{cluster_name}-ray-prometheus",
14+
"metric_unit":{
15+
"python_gc_collections_total": "Count",
16+
"python_gc_objects": "Count",
17+
"python_gc_objects_uncollectable_total": "Count",
18+
"python_gc_objects_collected_total": "Count",
19+
"ray_cluster_active_nodes": "Count",
20+
"ray_cluster_pending_nodes": "Count",
21+
"ray_node_cpu_count": "Count",
22+
"ray_node_cpu_utilization": "Percent",
23+
"ray_node_disk_free": "Bytes",
24+
"ray_node_disk_usage": "Bytes",
25+
"ray_node_disk_utilization_percentage": "Percent",
26+
"ray_node_mem_available": "Bytes",
27+
"ray_node_mem_total": "Bytes",
28+
"ray_node_mem_used": "Bytes",
29+
"ray_node_network_receive_speed": "Bytes",
30+
"ray_node_network_received": "Bytes",
31+
"ray_node_network_send_speed": "Bytes",
32+
"ray_node_network_sent": "Bytes",
33+
"ray_avg_num_executed_tasks": "Count",
34+
"ray_avg_num_scheduled_tasks": "Count",
35+
"ray_avg_num_spilled_back_tasks": "Count",
36+
"ray_object_manager_num_pull_requests": "Count",
37+
"ray_object_store_available_memory": "Bytes",
38+
"ray_object_store_used_memory": "Bytes",
39+
"ray_object_store_fallback_memory":"Bytes",
40+
"ray_object_store_num_local_objects": "Count",
41+
"ray_object_directory_subscriptions": "Count",
42+
"ray_object_directory_added_locations": "Count",
43+
"ray_object_directory_removed_locations": "Count",
44+
"ray_object_directory_lookups": "Count",
45+
"ray_object_directory_updates": "Count",
46+
"ray_pending_actors": "Count",
47+
"ray_pending_placement_groups": "Count",
48+
"ray_raylet_cpu": "Count",
49+
"ray_raylet_mem": "Bytes",
50+
"ray_raylet_mem_uss": "Bytes",
51+
"ray_workers_mem": "Bytes",
52+
"ray_workers_mem_uss": "Bytes",
53+
"ray_internal_num_spilled_tasks": "Count",
54+
"ray_internal_num_infeasible_tasks": "Count",
55+
"ray_internal_num_processes_started": "Count",
56+
"ray_internal_num_received_tasks": "Count",
57+
"ray_internal_num_dispatched_tasks": "Count",
58+
"process_max_fds": "Count",
59+
"process_open_fds": "Count",
60+
"process_resident_memory_bytes": "Bytes",
61+
"process_virtual_memory_bytes": "Bytes",
62+
"process_start_time_seconds": "Seconds",
63+
"process_cpu_seconds_total": "Seconds",
64+
"autoscaler_config_validation_exceptions": "Count",
65+
"autoscaler_node_launch_exceptions": "Count",
66+
"autoscaler_pending_nodes": "Count",
67+
"autoscaler_reset_exceptions": "Count",
68+
"autoscaler_running_workers": "Count",
69+
"autoscaler_started_nodes": "Count",
70+
"autoscaler_stopped_nodes": "Count",
71+
"autoscaler_update_loop_exceptions": "Count",
72+
"autoscaler_worker_create_node_time": "Seconds",
73+
"autoscaler_worker_update_time": "Seconds",
74+
"autoscaler_updating_nodes": "Count",
75+
"autoscaler_successful_updates": "Count",
76+
"autoscaler_failed_updates": "Count",
77+
"autoscaler_failed_create_nodes": "Count",
78+
"autoscaler_recovering_nodes": "Count",
79+
"autoscaler_successful_recoveries": "Count",
80+
"autoscaler_failed_recoveries": "Count"
81+
},
82+
"metric_declaration": [
83+
{
84+
"source_labels": [
85+
"job"
86+
],
87+
"label_matcher": "ray",
88+
"dimensions": [
89+
[
90+
"instance"
91+
]
92+
],
93+
"metric_selectors": [
94+
""
95+
]
96+
}
97+
]
98+
}
99+
}
100+
},
101+
"logs_collected":{
102+
"files":{
103+
"collect_list":[
104+
{
105+
"file_path":"/tmp/ray/session_*/logs/**.out",
106+
"log_group_name":"{cluster_name}-ray_logs_out",
107+
"log_stream_name":"{instance_id}"
108+
},
109+
{
110+
"file_path":"/tmp/ray/session_*/logs/**.err",
111+
"log_group_name":"{cluster_name}-ray_logs_err",
112+
"log_stream_name":"{instance_id}"
113+
}
114+
]
115+
}
116+
}
117+
},
118+
"metrics": {
119+
"namespace": "{cluster_name}-ray-CWAgent",
120+
"aggregation_dimensions": [
121+
[
122+
"InstanceId"
123+
]
124+
],
125+
"append_dimensions": {
126+
"AutoScalingGroupName": "${aws:AutoScalingGroupName}",
127+
"InstanceId": "${aws:InstanceId}"
128+
},
129+
"metrics_collected": {
130+
"collectd": {
131+
"metrics_aggregation_interval": 60
132+
},
133+
"cpu": {
134+
"measurement": [
135+
"usage_active",
136+
"usage_system",
137+
"usage_user",
138+
"usage_idle",
139+
"time_active",
140+
"time_system",
141+
"time_user",
142+
"time_idle"
143+
],
144+
"resources": [
145+
"*"
146+
]
147+
},
148+
"processes": {
149+
"measurement": [
150+
"processes_running",
151+
"processes_sleeping",
152+
"processes_zombies",
153+
"processes_dead",
154+
"processes_total"
155+
],
156+
"metrics_collection_interval": 60,
157+
"resources": [
158+
"*"
159+
]
160+
},
161+
"disk": {
162+
"measurement": [
163+
"disk_used_percent"
164+
],
165+
"metrics_collection_interval": 60,
166+
"resources": [
167+
"/"
168+
]
169+
},
170+
"mem": {
171+
"measurement": [
172+
"mem_used_percent"
173+
],
174+
"metrics_collection_interval": 60,
175+
"resources": [
176+
"*"
177+
]
178+
},
179+
"statsd": {
180+
"metrics_aggregation_interval": 60,
181+
"metrics_collection_interval": 10,
182+
"service_address": ":8125"
183+
}
184+
}
185+
}
186+
}

0 commit comments

Comments
 (0)