Skip to content

Commit 14340fb

Browse files
Add 25.9.0
1 parent 8dd82ad commit 14340fb

File tree

88 files changed

+12902
-4685
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

88 files changed

+12902
-4685
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# Running RDMA (remote direct memory access) GPU workloads on OKE
2-
Oracle Cloud Infrastructure Kubernetes Engine (OKE)[https://docs.oracle.com/en-us/iaas/Content/ContEng/Concepts/contengoverview.htm] is a fully-managed, scalable, and highly available service that you can use to deploy your containerized applications to the cloud.
2+
Oracle Cloud Infrastructure Kubernetes Engine (OKE) is a fully-managed, scalable, and highly available service that you can use to deploy your containerized applications to the cloud.
33

44
### Supported Operating Systems
55
- Ubuntu 22.04

terraform/oci-bv.tf renamed to terraform/bv.tf

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024 Oracle Corporation and/or its affiliates.
1+
# Copyright (c) 2025 Oracle Corporation and/or its affiliates.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
33

44
resource "kubernetes_storage_class_v1" "oci_high_vpu_20" {

terraform/datasources.tf

Lines changed: 49 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,49 @@
1+
# Copyright (c) 2025 Oracle Corporation and/or its affiliates.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
3+
4+
data "kubernetes_service" "nginx_lb" {
5+
count = alltrue([var.install_monitoring, var.install_node_problem_detector_kube_prometheus_stack, var.preferred_kubernetes_services == "public", local.deploy_from_local || local.deploy_from_orm]) ? 1 : 0
6+
7+
depends_on = [time_sleep.wait_for_nginx_lb]
8+
9+
metadata {
10+
name = format("%s-controller", one(helm_release.nginx.*.name))
11+
namespace = one(helm_release.nginx.*.namespace)
12+
}
13+
}
14+
15+
data "oci_load_balancer_load_balancers" "lbs" {
16+
count = alltrue([var.install_monitoring, var.install_node_problem_detector_kube_prometheus_stack, var.preferred_kubernetes_services == "public", local.deploy_from_operator]) ? 1 : 0
17+
18+
compartment_id = var.compartment_ocid
19+
20+
filter {
21+
name = "freeform_tags.state_id"
22+
values = [local.state_id]
23+
}
24+
25+
filter {
26+
name = "freeform_tags.application"
27+
values = ["nginx"]
28+
}
29+
30+
depends_on = [module.nginx]
31+
}
32+
33+
data "oci_load_balancer_load_balancers" "internal_lbs" {
34+
count = alltrue([var.install_monitoring, var.install_node_problem_detector_kube_prometheus_stack, var.preferred_kubernetes_services == "internal", local.deploy_from_operator]) ? 1 : 0
35+
36+
compartment_id = var.compartment_ocid
37+
38+
filter {
39+
name = "freeform_tags.state_id"
40+
values = [local.state_id]
41+
}
42+
43+
filter {
44+
name = "freeform_tags.application"
45+
values = ["grafana"]
46+
}
47+
48+
depends_on = [module.kube_prometheus_stack]
49+
}

terraform/files/.DS_Store

-6 KB
Binary file not shown.
Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,20 @@
1+
---
2+
apiVersion: cert-manager.io/v1
3+
kind: ClusterIssuer
4+
metadata:
5+
name: le-clusterissuer
6+
spec:
7+
acme:
8+
# You must replace this email address with your own.
9+
# Let's Encrypt will use this to contact you about expiring
10+
# certificates, and issues related to your account.
11+
12+
server: https://acme-v02.api.letsencrypt.org/directory #https://acme-staging-v02.api.letsencrypt.org/directory
13+
privateKeySecretRef:
14+
# Secret resource that will be used to store the account's private key.
15+
name: le-clusterissuer-secret
16+
# Add a single challenge solver, HTTP01 using nginx
17+
solvers:
18+
- http01:
19+
ingress:
20+
ingressClassName: nginx
Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,20 @@
1+
apiVersion: 1
2+
contactPoints:
3+
- orgId: 1
4+
name: ons-webhook
5+
receivers:
6+
- uid: ons
7+
type: webhook
8+
settings:
9+
url: http://oke-ons-webhook/grafana-webhook
10+
httpMethod: POST
11+
policies:
12+
- orgId: 1
13+
receiver: ons-webhook
14+
group_by:
15+
- grafana_folder
16+
- alertname
17+
- hostname
18+
group_wait: 30s
19+
group_interval: 5m
20+
repeat_interval: 12h

terraform/files/grafana/alerts/cpu-profile.yaml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ apiVersion: 1
22
groups:
33
- orgId: 1
44
name: Node Problem Detector
5-
folder: OKE
5+
folder: Alerts
66
interval: 1m
77
rules:
88
- uid: oke_npd_cpu_profile
@@ -54,3 +54,6 @@ groups:
5454
annotations: {}
5555
labels: {}
5656
isPaused: false
57+
notification_settings:
58+
receiver: ons-webhook
59+
repeat_interval: 24h

terraform/files/grafana/alerts/gpu-bad-pages.yaml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ apiVersion: 1
22
groups:
33
- orgId: 1
44
name: Node Problem Detector
5-
folder: OKE
5+
folder: Alerts
66
interval: 1m
77
rules:
88
- uid: oke_npd_gpu_bad_pages
@@ -54,3 +54,6 @@ groups:
5454
annotations: {}
5555
labels: {}
5656
isPaused: false
57+
notification_settings:
58+
receiver: ons-webhook
59+
repeat_interval: 24h

terraform/files/grafana/alerts/gpu-bus.yaml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ apiVersion: 1
22
groups:
33
- orgId: 1
44
name: Node Problem Detector
5-
folder: OKE
5+
folder: Alerts
66
interval: 1m
77
rules:
88
- uid: oke_npd_gpu_bus
@@ -54,3 +54,6 @@ groups:
5454
annotations: {}
5555
labels: {}
5656
isPaused: false
57+
notification_settings:
58+
receiver: ons-webhook
59+
repeat_interval: 24h

terraform/files/grafana/alerts/gpu-count.yaml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ apiVersion: 1
22
groups:
33
- orgId: 1
44
name: Node Problem Detector
5-
folder: OKE
5+
folder: Alerts
66
interval: 1m
77
rules:
88
- uid: oke_npd_gpu_count
@@ -54,3 +54,6 @@ groups:
5454
annotations: {}
5555
labels: {}
5656
isPaused: false
57+
notification_settings:
58+
receiver: ons-webhook
59+
repeat_interval: 24h

0 commit comments

Comments
 (0)