Skip to content

Commit 13d1773

Browse files
author
Mohammed Abdi
authored
Merge pull request #40 from project-codeflare/fix-backend-promise-errors
Fix backend promise errors for cluster with no gpu
2 parents 4f5ffca + 3a35805 commit 13d1773

File tree

7 files changed

+45
-8
lines changed

7 files changed

+45
-8
lines changed

backend/src/routes/api/mcad-prometheus/metricsData.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ const fetchPrometheusDataRange = async (
4747

4848
const getMetric = async (host: string, query: string, axiosInstance: AxiosInstance) => {
4949
const fetchedData: any = await fetchPrometheusData(host, query, axiosInstance);
50-
const valueAsDecimal = parseFloat(fetchedData[0].value[1]);
50+
//const valueAsDecimal = parseFloat(fetchedData[0].value[1]);
5151
return fetchedData;
5252
};
5353

backend/src/routes/api/metrics-data/metricsData.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ const fetchPrometheusDataRange = async (
4646

4747
const getMetric = async (host: string, query: string, axiosInstance: AxiosInstance) => {
4848
const fetchedData: any = await fetchPrometheusData(host, query, axiosInstance);
49-
const valueAsDecimal = parseFloat(fetchedData[0].value[1]);
49+
//const valueAsDecimal = parseFloat(fetchedData[0].value[1]);
5050
return fetchedData;
5151
};
5252

frontend/src/pages/MCADashboard/MCADashboard.tsx

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ export const MCADashboardInner: React.FC<MCADashboardInnerProps> = React.memo(
4141
Dispatched: '-',
4242
Queued: '-',
4343
'Re-enqueued': '-',
44+
Failed: '-',
4445
Other: '-',
4546
},
4647
},

frontend/src/pages/MCADashboard/Metrics/MetricCard.tsx

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -19,10 +19,15 @@ const MetricCard: React.FC<MetricCardProps> = ({
1919
unit,
2020
}: MetricCardProps): React.ReactElement => {
2121
const [percentage, setPercentage] = React.useState(0.0);
22+
const [noGpu, setNoGpu] = React.useState("");
2223

2324
const getData = async () => {
2425
const data = await getMetricData(query);
25-
setPercentage(Math.round(data * 100) / 100);
26+
if (data === 'No GPU In Cluster') {
27+
setNoGpu(data);
28+
} else {
29+
setPercentage(Math.round(data * 100) / 100);
30+
}
2631
};
2732

2833
React.useEffect(() => {
@@ -41,8 +46,16 @@ const MetricCard: React.FC<MetricCardProps> = ({
4146
<Card className="metric-card">
4247
<CardHeader className="metric-card-header">{name}</CardHeader>
4348
<CardBody className="metric-card-data">
44-
{Math.round(percentage * 100) / 100}
45-
{unit === Unit.PERCENT && unit}
49+
{noGpu === "" ? (
50+
<>
51+
{Math.round(percentage * 100) / 100}
52+
{unit === Unit.PERCENT && unit}
53+
</>
54+
) : (
55+
<>
56+
{noGpu}
57+
</>
58+
)}
4659
</CardBody>
4760
</Card>
4861
);

frontend/src/pages/MCADashboard/Metrics/Metrics.tsx

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ const emptyDataObject: Data = {
2222
Dispatched: '-',
2323
Queued: '-',
2424
'Re-enqueued': '-',
25+
Failed: '-',
2526
Other: '-',
2627
},
2728
},

frontend/src/pages/MCADashboard/Metrics/Resources.tsx

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ const emptyDataObject: Data = {
2222
Dispatched: '-',
2323
Queued: '-',
2424
'Re-enqueued': '-',
25+
Failed: '-',
2526
Other: '-',
2627
},
2728
},

frontend/src/pages/MCADashboard/Metrics/api/metricsData.ts

Lines changed: 24 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,30 @@ import React from 'react';
44
import { timeStringToSeconds } from '~/pages/MCADashboard/Metrics/metrics-utils';
55

66
export const getMetricData = async (query: string) => {
7-
const body = { query: query };
8-
const res: { data: { value: [string, number] }[] } = await axios.post('/api/metrics-data', body);
9-
return res.data[0].value[1];
7+
const noGpu = "No GPU In Cluster"
8+
const utilizedGPUQuery = 'count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{exported_pod=~".+"})) or vector(0)'
9+
const utilizedGPUMemoryQuery = 'count(count by (UUID,GPU_I_ID) (DCGM_FI_DEV_MEM_COPY_UTIL))'
10+
try {
11+
const body = { query: query };
12+
const res: { data: { value: [string, number] }[] } = await axios.post('/api/metrics-data', body);
13+
if (query === utilizedGPUQuery) { // since vector(0) in query, even if no gpu returns 0
14+
const gpubody = { query: utilizedGPUMemoryQuery }; // use the utilizedGPUMemoryQuery to verify gpu is present in the cluster
15+
const gpures: { data: { value: [string, number] }[] } = await axios.post('/api/metrics-data', gpubody);
16+
if (gpures.data && gpures.data[0] && gpures.data[0].value && gpures.data[0].value[1] !== undefined) {
17+
return res.data[0].value[1];
18+
} else {
19+
return noGpu;
20+
}
21+
} else {
22+
if (res.data && res.data[0] && res.data[0].value && res.data[0].value[1] !== undefined) {
23+
return res.data[0].value[1];
24+
} else {
25+
return noGpu;
26+
}
27+
}
28+
} catch (error) {
29+
return 0;
30+
}
1031
};
1132

1233
export const getMetricTableData = async (query: string) => {

0 commit comments

Comments
 (0)