Skip to content

Commit b690319

Browse files
[monitoring] Remove HostReportEvent from Kafka/Elasticsearch pipeline
Host reports are too large to store in Kafka/Elasticsearch due to their high frequency (~60s intervals) and data volume. Host metrics should use Prometheus instead. Changes: - Remove HostReportEvent and RunningFrameSummary from monitoring.proto - Remove publishHostReportEvent from HostReportHandler, KafkaEventPublisher - Remove host report indexing from ElasticsearchClient, KafkaEventConsumer - Remove buildHostReportEvent from MonitoringEventBuilder - Update documentation to note host metrics use Prometheus - Keep HostEvent for state change audit trail (up/down/locked)
1 parent 2f26a91 commit b690319

File tree

9 files changed

+4
-178
lines changed

9 files changed

+4
-178
lines changed

cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@
7272
import com.imageworks.spcue.monitoring.MonitoringEventBuilder;
7373
import com.imageworks.spcue.grpc.monitoring.EventType;
7474
import com.imageworks.spcue.grpc.monitoring.HostEvent;
75-
import com.imageworks.spcue.grpc.monitoring.HostReportEvent;
7675

7776
public class HostReportHandler {
7877

@@ -170,9 +169,6 @@ public void queueHostReport(HostReport report) {
170169
public void handleHostReport(HostReport report, boolean isBoot) {
171170
long startTime = System.currentTimeMillis();
172171
try {
173-
// Publish host report event to Kafka for monitoring
174-
publishHostReportEvent(report, isBoot);
175-
176172
// Record Prometheus metric for host report
177173
if (prometheusMetrics != null) {
178174
String facility = report.getHost().getFacility();
@@ -1125,23 +1121,6 @@ public void setMonitoringEventBuilder(MonitoringEventBuilder monitoringEventBuil
11251121
this.monitoringEventBuilder = monitoringEventBuilder;
11261122
}
11271123

1128-
/**
1129-
* Publishes a host report event to Kafka for monitoring purposes. This method is called
1130-
* asynchronously to avoid blocking the report queue.
1131-
*/
1132-
private void publishHostReportEvent(HostReport report, boolean isBoot) {
1133-
if (kafkaEventPublisher == null || !kafkaEventPublisher.isEnabled()) {
1134-
return;
1135-
}
1136-
1137-
try {
1138-
HostReportEvent event = monitoringEventBuilder.buildHostReportEvent(report, isBoot);
1139-
kafkaEventPublisher.publishHostReportEvent(event);
1140-
} catch (Exception e) {
1141-
logger.trace("Failed to publish host report event: {}", e.getMessage());
1142-
}
1143-
}
1144-
11451124
/**
11461125
* Publishes a host state change event to Kafka for monitoring purposes.
11471126
*/

cuebot/src/main/java/com/imageworks/spcue/monitoring/ElasticsearchClient.java

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ public class ElasticsearchClient {
7575
private static final String INDEX_LAYER_EVENTS = "opencue-layer-events";
7676
private static final String INDEX_FRAME_EVENTS = "opencue-frame-events";
7777
private static final String INDEX_HOST_EVENTS = "opencue-host-events";
78-
private static final String INDEX_HOST_REPORTS = "opencue-host-reports";
7978
private static final String INDEX_PROC_EVENTS = "opencue-proc-events";
8079

8180
@Autowired
@@ -125,7 +124,7 @@ private void initializeThreadPool() {
125124
*/
126125
private void createIndexTemplates() {
127126
String[] indexPrefixes = {INDEX_JOB_EVENTS, INDEX_LAYER_EVENTS, INDEX_FRAME_EVENTS,
128-
INDEX_HOST_EVENTS, INDEX_HOST_REPORTS, INDEX_PROC_EVENTS};
127+
INDEX_HOST_EVENTS, INDEX_PROC_EVENTS};
129128

130129
for (String prefix : indexPrefixes) {
131130
try {
@@ -241,15 +240,6 @@ public void indexHostEvent(String eventId, String jsonDocument) {
241240
indexDocument(INDEX_HOST_EVENTS, eventId, jsonDocument);
242241
}
243242

244-
/**
245-
* Indexes a host report document.
246-
*/
247-
public void indexHostReport(String eventId, String jsonDocument) {
248-
if (!enabled)
249-
return;
250-
indexDocument(INDEX_HOST_REPORTS, eventId, jsonDocument);
251-
}
252-
253243
/**
254244
* Indexes a proc event document.
255245
*/

cuebot/src/main/java/com/imageworks/spcue/monitoring/KafkaEventConsumer.java

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ public class KafkaEventConsumer {
4747
private static final String TOPIC_LAYER_EVENTS = "opencue.layer.events";
4848
private static final String TOPIC_FRAME_EVENTS = "opencue.frame.events";
4949
private static final String TOPIC_HOST_EVENTS = "opencue.host.events";
50-
private static final String TOPIC_HOST_REPORTS = "opencue.host.reports";
5150
private static final String TOPIC_PROC_EVENTS = "opencue.proc.events";
5251

5352
@Autowired
@@ -108,7 +107,7 @@ private void initializeConsumer() {
108107

109108
// Subscribe to all event topics
110109
consumer.subscribe(Arrays.asList(TOPIC_JOB_EVENTS, TOPIC_LAYER_EVENTS, TOPIC_FRAME_EVENTS,
111-
TOPIC_HOST_EVENTS, TOPIC_HOST_REPORTS, TOPIC_PROC_EVENTS));
110+
TOPIC_HOST_EVENTS, TOPIC_PROC_EVENTS));
112111
}
113112

114113
private void startConsumerThread() {
@@ -165,9 +164,6 @@ private void processRecord(ConsumerRecord<String, String> record) {
165164
case TOPIC_HOST_EVENTS:
166165
elasticsearchClient.indexHostEvent(eventId, value);
167166
break;
168-
case TOPIC_HOST_REPORTS:
169-
elasticsearchClient.indexHostReport(eventId, value);
170-
break;
171167
case TOPIC_PROC_EVENTS:
172168
elasticsearchClient.indexProcEvent(eventId, value);
173169
break;

cuebot/src/main/java/com/imageworks/spcue/monitoring/KafkaEventPublisher.java

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
import com.imageworks.spcue.grpc.monitoring.EventType;
4242
import com.imageworks.spcue.grpc.monitoring.FrameEvent;
4343
import com.imageworks.spcue.grpc.monitoring.HostEvent;
44-
import com.imageworks.spcue.grpc.monitoring.HostReportEvent;
4544
import com.imageworks.spcue.grpc.monitoring.JobEvent;
4645
import com.imageworks.spcue.grpc.monitoring.LayerEvent;
4746
import com.imageworks.spcue.grpc.monitoring.ProcEvent;
@@ -67,7 +66,6 @@ public class KafkaEventPublisher extends ThreadPoolExecutor {
6766
private static final String TOPIC_LAYER_EVENTS = "opencue.layer.events";
6867
private static final String TOPIC_FRAME_EVENTS = "opencue.frame.events";
6968
private static final String TOPIC_HOST_EVENTS = "opencue.host.events";
70-
private static final String TOPIC_HOST_REPORTS = "opencue.host.reports";
7169
private static final String TOPIC_PROC_EVENTS = "opencue.proc.events";
7270

7371
@Autowired
@@ -205,16 +203,6 @@ public void publishHostEvent(HostEvent event) {
205203
event.getHeader().getEventType().name());
206204
}
207205

208-
/**
209-
* Publishes a host report event to Kafka.
210-
*/
211-
public void publishHostReportEvent(HostReportEvent event) {
212-
if (!enabled)
213-
return;
214-
publishEvent(TOPIC_HOST_REPORTS, event.getHostName(), event,
215-
event.getHeader().getEventType().name());
216-
}
217-
218206
/**
219207
* Publishes a proc event to Kafka.
220208
*/

cuebot/src/main/java/com/imageworks/spcue/monitoring/MonitoringEventBuilder.java

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,10 @@
3636
import com.imageworks.spcue.grpc.monitoring.EventType;
3737
import com.imageworks.spcue.grpc.monitoring.FrameEvent;
3838
import com.imageworks.spcue.grpc.monitoring.HostEvent;
39-
import com.imageworks.spcue.grpc.monitoring.HostReportEvent;
4039
import com.imageworks.spcue.grpc.monitoring.JobEvent;
4140
import com.imageworks.spcue.grpc.monitoring.LayerEvent;
4241
import com.imageworks.spcue.grpc.monitoring.ProcEvent;
43-
import com.imageworks.spcue.grpc.monitoring.RunningFrameSummary;
4442
import com.imageworks.spcue.grpc.report.FrameCompleteReport;
45-
import com.imageworks.spcue.grpc.report.HostReport;
46-
import com.imageworks.spcue.grpc.report.RenderHost;
47-
import com.imageworks.spcue.grpc.report.RunningFrameInfo;
4843

4944
import java.util.Arrays;
5045
import java.util.List;
@@ -300,42 +295,6 @@ public HostEvent buildHostEvent(EventType eventType, DispatchHost host,
300295
return builder.build();
301296
}
302297

303-
/**
304-
* Builds a HostReportEvent from a host report.
305-
*/
306-
public HostReportEvent buildHostReportEvent(HostReport report, boolean isBoot) {
307-
EventType eventType = isBoot ? EventType.HOST_BOOT : EventType.HOST_REPORT;
308-
EventHeader header = publisher.createEventHeader(eventType).build();
309-
310-
RenderHost rhost = report.getHost();
311-
312-
HostReportEvent.Builder builder = HostReportEvent.newBuilder().setHeader(header)
313-
.setHostName(rhost.getName()).setFacility(rhost.getFacility()).setHostData(rhost)
314-
.setCoreInfo(report.getCoreInfo()).setIsBootReport(isBoot);
315-
316-
// Add running frame summaries
317-
List<RunningFrameSummary> frameSummaries = report.getFramesList().stream()
318-
.map(this::buildRunningFrameSummary).collect(Collectors.toList());
319-
builder.addAllRunningFrames(frameSummaries);
320-
321-
return builder.build();
322-
}
323-
324-
/**
325-
* Builds a running frame summary from RunningFrameInfo.
326-
*/
327-
private RunningFrameSummary buildRunningFrameSummary(RunningFrameInfo frame) {
328-
return RunningFrameSummary.newBuilder().setFrameId(frame.getFrameId())
329-
.setFrameName(frame.getFrameName()).setJobId(frame.getJobId())
330-
.setJobName(frame.getJobName()).setLayerId(frame.getLayerId())
331-
.setNumCores(frame.getNumCores()).setStartTime(frame.getStartTime())
332-
.setRss(frame.getRss()).setMaxRss(frame.getMaxRss()).setVsize(frame.getVsize())
333-
.setMaxVsize(frame.getMaxVsize()).setUsedGpuMemory(frame.getUsedGpuMemory())
334-
.setMaxUsedGpuMemory(frame.getMaxUsedGpuMemory())
335-
.setUsedSwapMemory(frame.getUsedSwapMemory()).setLluTime(frame.getLluTime())
336-
.build();
337-
}
338-
339298
/**
340299
* Builds a ProcEvent for a proc booking/unbooking.
341300
*/

docs/_docs/concepts/render-farm-monitoring.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ Cuebot publishes events to Apache Kafka topics when significant state changes oc
4141
| `opencue.layer.events` | Layer state changes |
4242
| `opencue.frame.events` | Frame execution events (started, completed, failed, retried) |
4343
| `opencue.host.events` | Host state changes (up, down, locked, nimby) |
44-
| `opencue.host.reports` | Periodic host status reports with resource utilization |
4544
| `opencue.proc.events` | Process allocation and deallocation events |
4645

4746
Events are published asynchronously to avoid impacting render farm performance. A bounded queue ensures the system remains responsive even under high load.

docs/_docs/reference/monitoring-reference.md

Lines changed: 1 addition & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ This reference provides comprehensive documentation for all monitoring system co
4949
| `opencue.layer.events` | `layerId` | Layer state changes |
5050
| `opencue.frame.events` | `frameId` | Frame execution events |
5151
| `opencue.host.events` | `hostId` | Host state changes |
52-
| `opencue.host.reports` | `hostId` | Periodic host status reports |
5352
| `opencue.proc.events` | `procId` | Process allocation events |
5453

5554
![Kafka UI for Apache Kafka](/assets/images/opencue_monitoring/opencue_monitoring_ui_for_apache_kafka.png)
@@ -208,38 +207,6 @@ All events include a `header` field with common metadata, plus event-specific fi
208207
}
209208
```
210209

211-
### Host report payload
212-
213-
```json
214-
{
215-
"header": {
216-
"event_id": "550e8400-e29b-41d4-a716-446655440004",
217-
"event_type": "HOST_REPORT",
218-
"timestamp": "1732446600000",
219-
"source_cuebot": "cuebot-01",
220-
"correlation_id": ""
221-
},
222-
"host_id": "550e8400-e29b-41d4-a716-446655440004",
223-
"host_name": "render-node-01",
224-
"facility": "cloud",
225-
"alloc_name": "render.general",
226-
"state": "UP",
227-
"lock_state": "OPEN",
228-
"nimby_enabled": false,
229-
"total_cores": 64,
230-
"idle_cores": 32,
231-
"total_memory": "137438953472",
232-
"free_memory": "68719476736",
233-
"total_gpu_memory": "25769803776",
234-
"free_gpu_memory": "25769803776",
235-
"load": 1250,
236-
"ping_time": "1732443000000",
237-
"boot_time": "1732300000000",
238-
"os": "Linux",
239-
"running_procs": 4
240-
}
241-
```
242-
243210
## Prometheus metrics
244211

245212
### Job and frame metrics
@@ -357,7 +324,7 @@ metrics.prometheus.frame.memory.buckets=1073741824,2147483648,4294967296,8589934
357324
Examples:
358325
- `opencue-job-events-2024.11.24`
359326
- `opencue-frame-events-2024.11.24`
360-
- `opencue-host-reports-2024.11.24`
327+
- `opencue-host-events-2024.11.24`
361328

362329
### Index mappings
363330

proto/src/monitoring.proto

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@ enum EventType {
4444
FRAME_CHECKPOINT = 27;
4545

4646
// Host events
47-
HOST_REPORT = 30;
4847
HOST_BOOT = 31;
4948
HOST_STATE_CHANGED = 32;
5049
HOST_LOCKED = 33;
@@ -158,44 +157,6 @@ message HostEvent {
158157
string reason = 7;
159158
}
160159

161-
// -------- Host Report Event --------
162-
163-
message HostReportEvent {
164-
EventHeader header = 1;
165-
166-
// Host information
167-
string host_name = 2;
168-
string facility = 3;
169-
170-
// Full host report data
171-
report.RenderHost host_data = 4;
172-
report.CoreDetail core_info = 5;
173-
174-
// Running frame summaries
175-
repeated RunningFrameSummary running_frames = 6;
176-
177-
// Whether this is a boot report
178-
bool is_boot_report = 7;
179-
}
180-
181-
message RunningFrameSummary {
182-
string frame_id = 1;
183-
string frame_name = 2;
184-
string job_id = 3;
185-
string job_name = 4;
186-
string layer_id = 5;
187-
int32 num_cores = 6;
188-
int64 start_time = 7;
189-
int64 rss = 8;
190-
int64 max_rss = 9;
191-
int64 vsize = 10;
192-
int64 max_vsize = 11;
193-
int64 used_gpu_memory = 12;
194-
int64 max_used_gpu_memory = 13;
195-
int64 used_swap_memory = 14;
196-
int64 llu_time = 15;
197-
}
198-
199160
// -------- Proc Events --------
200161

201162
message ProcEvent {

sandbox/kibana-queries.md

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ GET /opencue-job-events-*/_count
1616
GET /opencue-layer-events-*/_count
1717
GET /opencue-proc-events-*/_count
1818
GET /opencue-host-events-*/_count
19-
GET /opencue-host-reports-*/_count
2019
```
2120

2221
## Pickup Time Tracking
@@ -478,7 +477,7 @@ GET /opencue-layer-events-*/_search
478477
}
479478
```
480479

481-
## Host Events and Reports
480+
## Host Events
482481

483482
```json
484483
# Recent host events
@@ -492,18 +491,6 @@ GET /opencue-host-events-*/_search
492491
],
493492
"size": 10
494493
}
495-
496-
# Recent host reports
497-
GET /opencue-host-reports-*/_search
498-
{
499-
"query": {
500-
"match_all": {}
501-
},
502-
"sort": [
503-
{ "header.timestamp": { "order": "desc" } }
504-
],
505-
"size": 10
506-
}
507494
```
508495

509496
## Time-Based Analytics

0 commit comments

Comments
 (0)