Skip to content

Commit cb69d9a

Browse files
Sunil Thaha authored
Merge pull request #455 from sthaha/chore-compose-kepler-metal
chore(compose): add monitoring and kepler metal comparison dashboard
2 parents: 27a13ed + dc3cc4a · commit cb69d9a

File tree

23 files changed: 2979 additions (+2979) and 218 deletions (-218).

manifests/compose/dev/compose.yaml

Lines changed: 65 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
name: model-dev
2+
include:
3+
- path:
4+
- ../monitoring/compose.yaml
5+
- ./overrides.yaml
6+
27
services:
3-
kepler:
8+
kepler-models:
49
image: quay.io/sustainable_computing_io/kepler:latest
510
ports:
6-
- 9100:9100
11+
- 19100:9100
712
privileged: true
813
pid: host
914
networks:
10-
- kepler-network
15+
- kepler-models-network
1116
- model-server-network
1217
volumes:
1318
- type: bind
@@ -17,19 +22,16 @@ services:
1722
source: /sys
1823
target: /sys
1924
- type: bind
20-
source: ./kepler/etc/kepler
25+
source: ./kepler/models/etc/kepler
2126
target: /etc/kepler
2227

2328
# NOTE: use the models from the local repo
2429
- type: bind
25-
source: ./kepler/var/lib/kepler/data/model_weight/
30+
source: ./kepler/common/var/lib/kepler/data
2631
target: /var/lib/kepler/data
27-
- type: bind
28-
source: ./kepler/var/lib/kepler/data/cpus.yaml
29-
target: /var/lib/kepler/data/cpus.yaml
3032

3133
# NOTE: for estimator - kepler communication
32-
- kepler-tmp:/tmp
34+
- estimator-tmp:/tmp
3335

3436
healthcheck:
3537
test: curl -f http://localhost:9100/metrics || exit 1
@@ -66,6 +68,52 @@ services:
6668
-disable-power-meter \
6769
-v="8"
6870
71+
kepler-metal:
72+
image: quay.io/sustainable_computing_io/kepler:latest
73+
ports:
74+
- 19200:9100
75+
privileged: true
76+
pid: host
77+
networks:
78+
- kepler-metal-network
79+
volumes:
80+
- type: bind
81+
source: /proc
82+
target: /proc
83+
- type: bind
84+
source: /sys
85+
target: /sys
86+
- type: bind
87+
source: ./kepler/metal/etc/kepler
88+
target: /etc/kepler
89+
90+
- type: bind
91+
source: ./kepler/common/var/lib/kepler/data
92+
target: /var/lib/kepler/data
93+
94+
healthcheck:
95+
test: curl -f http://localhost:9100/metrics || exit 1
96+
interval: ${HEALTHCHECK_INTERVAL:-50s}
97+
timeout: ${HEALTHCHECK_TIMEOUT:-30s}
98+
retries: ${HEALTHCHECK_RETRIES:-3}
99+
start_period: ${HEALTHCHECK_START_PERIOD:-1m}
100+
101+
cap_add:
102+
- ALL
103+
104+
entrypoint:
105+
- /usr/bin/bash
106+
- -c
107+
108+
command:
109+
- |
110+
echo "starting kepler metal";
111+
set -x;
112+
/usr/bin/kepler \
113+
-address="0.0.0.0:9100" \
114+
-v="8"
115+
116+
69117
estimator:
70118
command: [estimator, -l, debug]
71119
build: &build
@@ -74,37 +122,38 @@ services:
74122

75123
volumes:
76124
- type: bind
77-
source: ./kepler/etc/kepler
125+
source: ./kepler/models/etc/kepler
78126
target: /etc/kepler
79127

80-
- kepler-tmp:/tmp
128+
- estimator-tmp:/tmp
81129
- estimator-mnt:/mnt
82130
networks:
83-
- kepler-network
131+
- kepler-models-network
84132
- model-server-network
85133

86134
model-server:
87135
ports:
88-
- 8100:8100
136+
- 18100:8100
89137
command: [model-server, -l, debug]
90138
build:
91139
<<: *build
92140
volumes:
93141
- type: bind
94-
source: ./kepler/etc/kepler
142+
source: ./kepler/models/etc/kepler
95143
target: /etc/kepler
96144
- model-server-mnt:/mnt
97145
networks:
98146
- model-server-network
99147

100148
volumes:
101149
# for kepler - estimator sock
102-
kepler-tmp:
150+
estimator-tmp:
103151

104152
# for downloading models
105153
estimator-mnt:
106154
model-server-mnt:
107155

108156
networks:
109-
kepler-network:
157+
kepler-models-network:
158+
kepler-metal-network:
110159
model-server-network:

0 commit comments

Comments
 (0)