Skip to content

Commit e5cef16

Browse files
committed
[CELEBORN-2209] Introduce ReadBufferUsageRatio metric to monitor credit stream read buffer usage
### What changes were proposed in this pull request? Introduce the `ReadBufferUsageRatio` metric to monitor credit stream read buffer usage. ### Why are the changes needed? The `BufferStreamReadBuffer` metric reports the memory used by the credit stream read buffer, which is not enough to monitor the ratio of credit stream read buffer used to max direct memory. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? CI and a manual Grafana test with the [celeborn dashboard](https://stenicholas.grafana.net/public-dashboards/12f47ac2ba2f4c0c88f761f98ffcf51c). Closes #3545 from SteNicholas/CELEBORN-2209. Authored-by: SteNicholas <programgeek@163.com> Signed-off-by: SteNicholas <programgeek@163.com>
1 parent 36cdc29 commit e5cef16

File tree

5 files changed

+105
-1
lines changed

5 files changed

+105
-1
lines changed

assets/grafana/celeborn-dashboard.json

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12710,6 +12710,101 @@
1271012710
],
1271112711
"title": "AvailableReadBuffer",
1271212712
"type": "timeseries"
12713+
},
12714+
{
12715+
"datasource": {
12716+
"type": "prometheus",
12717+
"uid": "${DS_PROMETHEUS}"
12718+
},
12719+
"description": "The ratio of read buffer used and max direct memory.",
12720+
"fieldConfig": {
12721+
"defaults": {
12722+
"color": {
12723+
"mode": "palette-classic"
12724+
},
12725+
"custom": {
12726+
"axisCenteredZero": false,
12727+
"axisColorMode": "text",
12728+
"axisLabel": "",
12729+
"axisPlacement": "auto",
12730+
"barAlignment": 0,
12731+
"drawStyle": "line",
12732+
"fillOpacity": 0,
12733+
"gradientMode": "none",
12734+
"hideFrom": {
12735+
"legend": false,
12736+
"tooltip": false,
12737+
"viz": false
12738+
},
12739+
"lineInterpolation": "linear",
12740+
"lineWidth": 1,
12741+
"pointSize": 5,
12742+
"scaleDistribution": {
12743+
"type": "linear"
12744+
},
12745+
"showPoints": "auto",
12746+
"spanNulls": false,
12747+
"stacking": {
12748+
"group": "A",
12749+
"mode": "none"
12750+
},
12751+
"thresholdsStyle": {
12752+
"mode": "off"
12753+
}
12754+
},
12755+
"mappings": [],
12756+
"thresholds": {
12757+
"mode": "absolute",
12758+
"steps": [
12759+
{
12760+
"color": "green"
12761+
},
12762+
{
12763+
"color": "red",
12764+
"value": 35
12765+
}
12766+
]
12767+
},
12768+
"unit": "percentunit"
12769+
},
12770+
"overrides": []
12771+
},
12772+
"gridPos": {
12773+
"h": 8,
12774+
"w": 12,
12775+
"x": 0,
12776+
"y": 289
12777+
},
12778+
"id": 109,
12779+
"options": {
12780+
"legend": {
12781+
"calcs": [],
12782+
"displayMode": "list",
12783+
"placement": "bottom",
12784+
"showLegend": true
12785+
},
12786+
"tooltip": {
12787+
"maxHeight": 600,
12788+
"mode": "single",
12789+
"sort": "none"
12790+
}
12791+
},
12792+
"targets": [
12793+
{
12794+
"datasource": {
12795+
"type": "prometheus",
12796+
"uid": "${DS_PROMETHEUS}"
12797+
},
12798+
"editorMode": "builder",
12799+
"expr": "metrics_ReadBufferUsageRatio_Value{instance=~\"${instance}\"}",
12800+
"instant": false,
12801+
"legendFormat": "${baseLegend}",
12802+
"range": true,
12803+
"refId": "A"
12804+
}
12805+
],
12806+
"title": "ReadBufferUsageRatio",
12807+
"type": "timeseries"
1271312808
}
1271412809
],
1271512810
"title": "MemoryRelatives",

docs/monitoring.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,8 @@ These metrics are exposed by Celeborn worker.
244244
| BufferStreamReadBuffer | The memory used by credit stream read buffer. |
245245
| ReadBufferDispatcherRequestsLength | The queue size of read buffer allocation requests. |
246246
| ReadBufferAllocatedCount | Allocated read buffer count. |
247-
- | AvailableReadBuffer | The available memory for credit stream read buffer. |
247+
| AvailableReadBuffer | The available memory for credit stream read buffer. |
248+
| ReadBufferUsageRatio | The ratio of memory used by credit stream read buffer to the read buffer threshold. |
248249
| ActiveCreditStreamCount | Active stream count for map partition reading streams. |
249250
| ActiveMapPartitionCount | The count of active map partition reading streams. |
250251
| SorterCacheHitRate | The cache hit rate for worker partition sorter index. |

worker/src/main/java/org/apache/celeborn/service/deploy/worker/memory/MemoryManager.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,10 @@ public long availableReadBuffer() {
506506
return Math.max(0, readBufferThreshold - readBufferCounter.get());
507507
}
508508

509+
public double readBufferUsageRatio() {
510+
return readBufferCounter.get() / (double) readBufferThreshold;
511+
}
512+
509513
public long getPausePushDataAndReplicateCounter() {
510514
return pausePushDataAndReplicateCounter.sum();
511515
}

worker/src/main/scala/org/apache/celeborn/service/deploy/worker/Worker.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,9 @@ private[celeborn] class Worker(
409409
workerSource.addGauge(WorkerSource.AVAILABLE_READ_BUFFER) { () =>
410410
memoryManager.availableReadBuffer
411411
}
412+
workerSource.addGauge(WorkerSource.READ_BUFFER_USAGE_RATIO) { () =>
413+
memoryManager.readBufferUsageRatio
414+
}
412415
workerSource.addGauge(WorkerSource.MEMORY_FILE_STORAGE_SIZE) { () =>
413416
memoryManager.getMemoryFileStorageCounter
414417
}

worker/src/main/scala/org/apache/celeborn/service/deploy/worker/WorkerSource.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ object WorkerSource {
243243
val READ_BUFFER_DISPATCHER_REQUESTS_LENGTH = "ReadBufferDispatcherRequestsLength"
244244
val READ_BUFFER_ALLOCATED_COUNT = "ReadBufferAllocatedCount"
245245
val AVAILABLE_READ_BUFFER = "AvailableReadBuffer"
246+
val READ_BUFFER_USAGE_RATIO = "ReadBufferUsageRatio"
246247
val MEMORY_FILE_STORAGE_SIZE = "MemoryFileStorageSize"
247248
val DIRECT_MEMORY_USAGE_RATIO = "DirectMemoryUsageRatio"
248249
val EVICTED_FILE_COUNT = "EvictedFileCount"

0 commit comments

Comments
 (0)