Skip to content

Commit 1199f20

Browse files
authored
fix(interactive): Introduce MetricsTool to profile memory usage, pending tasks and qps (#4332)
<!-- Thanks for your contribution! Please review https://github.com/alibaba/GraphScope/blob/main/CONTRIBUTING.md before opening an issue. --> ## What do these changes do? The `MetricsTool` periodically prints the following metrics (the interval is set by `metrics.tool.interval.ms`; the default is 5 minutes): 1. `memory.usage`: JVM memory and direct memory usage 2. `rpc.channels.executor.queue`: pending tasks in each gRPC (Netty) channel 3. `gremlin.executor.queue`: pending tasks in the Gremlin executor 4. `gremlin.qps`: Gremlin queries per second (QPS) These metrics are written to a separate log file, configured by the `PerfMetricLog` appender in [logback.xml](https://github.com/shirly121/GraphScope/blob/ir_metric_tool/interactive_engine/assembly/src/conf/groot/logback.xml#L37) <!-- Please give a short brief about these changes. --> ## Related issue number <!-- Are there any issues opened that will be resolved by merging this change? --> Fixes
1 parent 1d118e9 commit 1199f20

File tree

28 files changed

+761
-75
lines changed

28 files changed

+761
-75
lines changed

interactive_engine/assembly/src/conf/groot/logback.xml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,23 @@
3434
</encoder>
3535
</appender>
3636

37+
<appender name="PerfMetricLog" class="ch.qos.logback.core.rolling.RollingFileAppender">
38+
<file>${log_dir}/perf_metric.log</file>
39+
<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
40+
<fileNamePattern>${log_dir}/perf_metric.%d{yyyy-MM-dd}.%i.gz</fileNamePattern>
41+
<maxHistory>7</maxHistory>
42+
<maxFileSize>100MB</maxFileSize>
43+
<totalSizeCap>500MB</totalSizeCap>
44+
</rollingPolicy>
45+
<encoder>
46+
<pattern>[%d{ISO8601}][%p][%t][%c:%L] %m%n</pattern>
47+
</encoder>
48+
</appender>
49+
50+
<Logger name="PerfMetricLog" level="INFO" additivity="false">
51+
<appender-ref ref="PerfMetricLog"/>
52+
</Logger>
53+
3754
<logger name="org.apache.zookeeper" level="ERROR" />
3855
<logger name="org.apache.kafka" level="ERROR" />
3956
<logger name="kafka" level="ERROR" />

interactive_engine/compiler/src/main/java/com/alibaba/graphscope/GraphServer.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
import com.alibaba.graphscope.common.ir.tools.QueryCache;
3636
import com.alibaba.graphscope.common.ir.tools.QueryIdGenerator;
3737
import com.alibaba.graphscope.common.manager.IrMetaQueryCallback;
38+
import com.alibaba.graphscope.common.metric.MemoryMetric;
39+
import com.alibaba.graphscope.common.metric.MetricsTool;
3840
import com.alibaba.graphscope.cypher.service.CypherBootstrapper;
3941
import com.alibaba.graphscope.gremlin.integration.result.GraphProperties;
4042
import com.alibaba.graphscope.gremlin.integration.result.TestGraphFactory;
@@ -62,6 +64,7 @@ public class GraphServer {
6264
private final IrMetaQueryCallback metaQueryCallback;
6365
private final GraphProperties testGraph;
6466
private final GraphRelOptimizer optimizer;
67+
private final MetricsTool metricsTool;
6568

6669
private IrGremlinServer gremlinServer;
6770
private CypherBootstrapper cypherBootstrapper;
@@ -77,10 +80,13 @@ public GraphServer(
7780
this.metaQueryCallback = metaQueryCallback;
7881
this.testGraph = testGraph;
7982
this.optimizer = optimizer;
83+
this.metricsTool = new MetricsTool(configs);
84+
this.metricsTool.registerMetric(new MemoryMetric());
8085
}
8186

8287
public void start() throws Exception {
83-
ExecutionClient executionClient = ExecutionClient.Factory.create(configs, channelFetcher);
88+
ExecutionClient executionClient =
89+
ExecutionClient.Factory.create(configs, channelFetcher, metricsTool);
8490
QueryIdGenerator idGenerator = new QueryIdGenerator(configs);
8591
QueryCache queryCache = new QueryCache(configs);
8692
if (!FrontendConfig.GREMLIN_SERVER_DISABLED.get(configs)) {
@@ -95,7 +101,8 @@ public void start() throws Exception {
95101
executionClient,
96102
channelFetcher,
97103
metaQueryCallback,
98-
testGraph);
104+
testGraph,
105+
metricsTool);
99106
this.gremlinServer.start();
100107
}
101108
if (!FrontendConfig.NEO4J_BOLT_SERVER_DISABLED.get(configs)) {

interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/client/ExecutionClient.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import com.alibaba.graphscope.common.client.type.ExecutionResponseListener;
2222
import com.alibaba.graphscope.common.config.Configs;
2323
import com.alibaba.graphscope.common.config.QueryTimeoutConfig;
24+
import com.alibaba.graphscope.common.metric.MetricsTool;
2425
import com.alibaba.graphscope.gremlin.plugin.QueryLogger;
2526

2627
/**
@@ -45,10 +46,11 @@ public abstract void submit(
4546
public abstract void close() throws Exception;
4647

4748
public static class Factory {
48-
public static ExecutionClient create(Configs configs, ChannelFetcher channelFetcher) {
49+
public static ExecutionClient create(
50+
Configs configs, ChannelFetcher channelFetcher, MetricsTool metricsTool) {
4951
switch (channelFetcher.getType()) {
5052
case RPC:
51-
return new RpcExecutionClient(configs, channelFetcher);
53+
return new RpcExecutionClient(configs, channelFetcher, metricsTool);
5254
case HTTP:
5355
return new HttpExecutionClient(configs, channelFetcher);
5456
default:

interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/client/RpcExecutionClient.java

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,38 +17,41 @@
1717
package com.alibaba.graphscope.common.client;
1818

1919
import com.alibaba.graphscope.common.client.channel.ChannelFetcher;
20+
import com.alibaba.graphscope.common.client.metric.RpcExecutorMetric;
2021
import com.alibaba.graphscope.common.client.type.ExecutionRequest;
2122
import com.alibaba.graphscope.common.client.type.ExecutionResponseListener;
2223
import com.alibaba.graphscope.common.config.Configs;
2324
import com.alibaba.graphscope.common.config.PegasusConfig;
2425
import com.alibaba.graphscope.common.config.QueryTimeoutConfig;
26+
import com.alibaba.graphscope.common.metric.MetricsTool;
2527
import com.alibaba.graphscope.gaia.proto.IrResult;
2628
import com.alibaba.graphscope.gremlin.plugin.QueryLogger;
2729
import com.alibaba.pegasus.RpcChannel;
2830
import com.alibaba.pegasus.RpcClient;
2931
import com.alibaba.pegasus.intf.ResultProcessor;
3032
import com.alibaba.pegasus.service.protocol.PegasusClient;
33+
import com.google.common.collect.ImmutableMap;
3134
import com.google.protobuf.ByteString;
3235

36+
import io.grpc.ClientInterceptors;
3337
import io.grpc.Status;
3438

35-
import org.slf4j.Logger;
36-
import org.slf4j.LoggerFactory;
37-
38-
import java.util.concurrent.atomic.AtomicReference;
39+
import java.util.List;
40+
import java.util.stream.Collectors;
3941

4042
/**
4143
* rpc client to send request to pegasus engine service
4244
*/
4345
public class RpcExecutionClient extends ExecutionClient<RpcChannel> {
44-
Logger logger = LoggerFactory.getLogger(RpcExecutionClient.class);
4546
private final Configs graphConfig;
46-
private final AtomicReference<RpcClient> rpcClientRef;
4747

48-
public RpcExecutionClient(Configs graphConfig, ChannelFetcher<RpcChannel> channelFetcher) {
48+
public RpcExecutionClient(
49+
Configs graphConfig,
50+
ChannelFetcher<RpcChannel> channelFetcher,
51+
MetricsTool metricsTool) {
4952
super(channelFetcher);
5053
this.graphConfig = graphConfig;
51-
this.rpcClientRef = new AtomicReference<>();
54+
metricsTool.registerMetric(new RpcExecutorMetric(channelFetcher));
5255
}
5356

5457
@Override
@@ -58,10 +61,18 @@ public void submit(
5861
QueryTimeoutConfig timeoutConfig,
5962
QueryLogger queryLogger)
6063
throws Exception {
61-
if (rpcClientRef.get() == null) {
62-
rpcClientRef.compareAndSet(null, new RpcClient(channelFetcher.fetch()));
63-
}
64-
RpcClient rpcClient = rpcClientRef.get();
64+
List<RpcChannel> interceptChannels =
65+
channelFetcher.fetch().stream()
66+
.map(
67+
k ->
68+
new RpcChannel(
69+
ClientInterceptors.intercept(
70+
k.getChannel(), new RpcInterceptor())))
71+
.collect(Collectors.toList());
72+
RpcClient rpcClient =
73+
new RpcClient(
74+
interceptChannels,
75+
ImmutableMap.of(RpcInterceptor.QUERY_LOGGER_OPTION, queryLogger));
6576
PegasusClient.JobRequest jobRequest =
6677
PegasusClient.JobRequest.newBuilder()
6778
.setPlan(
@@ -99,7 +110,8 @@ public void process(PegasusClient.JobResponse jobResponse) {
99110
@Override
100111
public void finish() {
101112
listener.onCompleted();
102-
queryLogger.info("[compile]: received results from engine");
113+
queryLogger.info(
114+
"[query][response]: received all responses from all servers");
103115
}
104116

105117
@Override
@@ -113,8 +125,17 @@ public void error(Status status) {
113125

114126
@Override
115127
public void close() throws Exception {
116-
if (rpcClientRef.get() != null) {
117-
rpcClientRef.get().shutdown();
118-
}
128+
channelFetcher
129+
.fetch()
130+
.forEach(
131+
k -> {
132+
try {
133+
if (k != null) {
134+
k.shutdown();
135+
}
136+
} catch (Exception e) {
137+
throw new RuntimeException(e);
138+
}
139+
});
119140
}
120141
}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
*
3+
* * Copyright 2020 Alibaba Group Holding Limited.
4+
* *
5+
* * Licensed under the Apache License, Version 2.0 (the "License");
6+
* * you may not use this file except in compliance with the License.
7+
* * You may obtain a copy of the License at
8+
* *
9+
* * http://www.apache.org/licenses/LICENSE-2.0
10+
* *
11+
* * Unless required by applicable law or agreed to in writing, software
12+
* * distributed under the License is distributed on an "AS IS" BASIS,
13+
* * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* * See the License for the specific language governing permissions and
15+
* * limitations under the License.
16+
*
17+
*/
18+
19+
package com.alibaba.graphscope.common.client;
20+
21+
import com.alibaba.graphscope.gremlin.plugin.QueryLogger;
22+
23+
import io.grpc.*;
24+
25+
import java.time.Instant;
26+
import java.util.concurrent.atomic.AtomicBoolean;
27+
28+
public class RpcInterceptor implements ClientInterceptor {
29+
public static final CallOptions.Key<QueryLogger> QUERY_LOGGER_OPTION =
30+
CallOptions.Key.create("query-logger");
31+
32+
@Override
33+
public <ReqT, RespT> ClientCall<ReqT, RespT> interceptCall(
34+
MethodDescriptor<ReqT, RespT> methodDescriptor,
35+
CallOptions callOptions,
36+
Channel channel) {
37+
return new ForwardingClientCall.SimpleForwardingClientCall<ReqT, RespT>(
38+
channel.newCall(methodDescriptor, callOptions)) {
39+
private Instant requestStartTime;
40+
41+
@Override
42+
public void start(Listener<RespT> responseListener, Metadata headers) {
43+
requestStartTime = Instant.now();
44+
QueryLogger queryLogger = callOptions.getOption(QUERY_LOGGER_OPTION);
45+
super.start(
46+
new ForwardingClientCallListener.SimpleForwardingClientCallListener<RespT>(
47+
responseListener) {
48+
private final AtomicBoolean firstResponseLogged =
49+
new AtomicBoolean(false);
50+
51+
@Override
52+
public void onMessage(RespT message) {
53+
if (firstResponseLogged.compareAndSet(false, true)) {
54+
long firstResponseTime =
55+
Instant.now().toEpochMilli()
56+
- requestStartTime.toEpochMilli();
57+
if (queryLogger != null) {
58+
queryLogger.info(
59+
"[query][response]: receive the first response from"
60+
+ " the channel {} in {} ms",
61+
channel.authority(),
62+
firstResponseTime);
63+
}
64+
}
65+
super.onMessage(message);
66+
}
67+
68+
@Override
69+
public void onClose(Status status, Metadata trailers) {
70+
long endTime = Instant.now().toEpochMilli();
71+
long totalTime = endTime - requestStartTime.toEpochMilli();
72+
if (queryLogger != null) {
73+
queryLogger.info(
74+
"[query][response]: receive the last response from the"
75+
+ " channel {} with status {} in {} ms",
76+
channel.authority(),
77+
status,
78+
totalTime);
79+
}
80+
super.onClose(status, trailers);
81+
}
82+
},
83+
headers);
84+
if (queryLogger != null) {
85+
queryLogger.info(
86+
"[query][submitted]: submit the query to the task queue of channel {}",
87+
channel.authority());
88+
}
89+
}
90+
};
91+
}
92+
}

interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/client/channel/HostURIChannelFetcher.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@
1818

1919
import com.alibaba.graphscope.common.config.Configs;
2020
import com.alibaba.graphscope.common.config.HiactorConfig;
21+
import com.alibaba.graphscope.common.config.Utils;
2122

2223
import java.net.URI;
23-
import java.util.Arrays;
24+
import java.util.Collections;
2425
import java.util.List;
2526
import java.util.stream.Collectors;
2627

@@ -29,19 +30,18 @@
2930
*/
3031
public class HostURIChannelFetcher implements ChannelFetcher<URI> {
3132
private static final String schema = "http";
32-
private Configs graphConfig;
33+
private final List<URI> uriChannels;
3334

3435
public HostURIChannelFetcher(Configs graphConfig) {
35-
this.graphConfig = graphConfig;
36+
this.uriChannels =
37+
Utils.convertDotString(HiactorConfig.HIACTOR_HOSTS.get(graphConfig)).stream()
38+
.map(k -> URI.create(schema + "://" + k))
39+
.collect(Collectors.toList());
3640
}
3741

3842
@Override
3943
public List<URI> fetch() {
40-
String hosts = HiactorConfig.HIACTOR_HOSTS.get(graphConfig);
41-
String[] hostsArr = hosts.split(",");
42-
return Arrays.asList(hostsArr).stream()
43-
.map(k -> URI.create(schema + "://" + k))
44-
.collect(Collectors.toList());
44+
return Collections.unmodifiableList(uriChannels);
4545
}
4646

4747
@Override

interactive_engine/compiler/src/main/java/com/alibaba/graphscope/common/client/channel/HostsRpcChannelFetcher.java

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,30 +21,30 @@
2121
import com.alibaba.graphscope.common.config.Utils;
2222
import com.alibaba.pegasus.RpcChannel;
2323

24-
import java.util.ArrayList;
24+
import java.util.Collections;
2525
import java.util.List;
26+
import java.util.stream.Collectors;
2627

2728
/**
2829
* rpc implementation of {@link ChannelFetcher}, init rpc from local config
2930
*/
3031
public class HostsRpcChannelFetcher implements ChannelFetcher<RpcChannel> {
31-
private Configs config;
32+
private final List<RpcChannel> rpcChannels;
3233

3334
public HostsRpcChannelFetcher(Configs config) {
34-
this.config = config;
35+
this.rpcChannels =
36+
Utils.convertDotString(PegasusConfig.PEGASUS_HOSTS.get(config)).stream()
37+
.map(
38+
k -> {
39+
String[] host = k.split(":");
40+
return new RpcChannel(host[0], Integer.valueOf(host[1]));
41+
})
42+
.collect(Collectors.toList());
3543
}
3644

3745
@Override
3846
public List<RpcChannel> fetch() {
39-
List<String> hostAddresses =
40-
Utils.convertDotString(PegasusConfig.PEGASUS_HOSTS.get(config));
41-
List<RpcChannel> rpcChannels = new ArrayList<>();
42-
hostAddresses.forEach(
43-
k -> {
44-
String[] host = k.split(":");
45-
rpcChannels.add(new RpcChannel(host[0], Integer.valueOf(host[1])));
46-
});
47-
return rpcChannels;
47+
return Collections.unmodifiableList(rpcChannels);
4848
}
4949

5050
@Override

0 commit comments

Comments
 (0)