Skip to content

Commit eabddf2

Browse files
authored
Support pprof profiling feature (#13502)
1 parent be000e6 commit eabddf2

File tree

101 files changed

+4958
-8
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

101 files changed

+4958
-8
lines changed

.github/workflows/skywalking.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -723,6 +723,13 @@ jobs:
723723
config: test/e2e-v2/cases/profiling/async-profiler/banyandb/e2e.yaml
724724
- name: Async Profiler MySQL
725725
config: test/e2e-v2/cases/profiling/async-profiler/mysql/e2e.yaml
726+
727+
- name: Pprof ES
728+
config: test/e2e-v2/cases/profiling/pprof/es/e2e.yaml
729+
- name: Pprof BanyanDB
730+
config: test/e2e-v2/cases/profiling/pprof/banyandb/e2e.yaml
731+
- name: Pprof MySQL
732+
config: test/e2e-v2/cases/profiling/pprof/mysql/e2e.yaml
726733
steps:
727734
- uses: actions/checkout@v4
728735
with:

apm-protocol/apm-network/src/main/java/org/apache/skywalking/oap/server/network/trace/component/command/CommandDeserializer.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ public static BaseCommand deserialize(final Command command) {
2929
return ConfigurationDiscoveryCommand.DESERIALIZER.deserialize(command);
3030
} else if (AsyncProfilerTaskCommand.NAME.equals(commandName)) {
3131
return AsyncProfilerTaskCommand.DESERIALIZER.deserialize(command);
32+
} else if (PprofTaskCommand.NAME.equals(commandName)) {
33+
return PprofTaskCommand.DESERIALIZER.deserialize(command);
3234
}
3335
throw new UnsupportedCommandException(command);
3436
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*
17+
*/
18+
19+
package org.apache.skywalking.oap.server.network.trace.component.command;
20+
21+
import org.apache.skywalking.apm.network.common.v3.Command;
22+
import org.apache.skywalking.apm.network.common.v3.KeyStringValuePair;
23+
import java.util.List;
24+
import lombok.Getter;
25+
26+
/**
 * Command that carries a pprof profiling task. It is identified by the command
 * name {@link #NAME} and holds the task id, the event type, the duration, the
 * creation time and the dump period.
 *
 * <p>{@link #serialize()} writes the task fields as string key/value arguments and
 * {@link #deserialize(Command)} rebuilds a command from such arguments.
 */
@Getter
27+
public class PprofTaskCommand extends BaseCommand implements Serializable, Deserializable<PprofTaskCommand> {
28+
// Placeholder instance (empty strings, zero numbers) that exists only so callers can invoke deserialize(Command).
public static final Deserializable<PprofTaskCommand> DESERIALIZER = new PprofTaskCommand("", "", "", 0, 0, 0);
29+
// Command name passed to the BaseCommand constructor and matched by CommandDeserializer.
public static final String NAME = "PprofTaskQuery";
30+
/**
31+
* Identifier of the pprof task.
32+
*/
33+
private String taskId;
34+
/**
35+
* Event type to profile (CPU/Heap/Block/Mutex/Goroutine/Threadcreate/Allocs).
36+
*/
37+
private String events;
38+
/**
39+
* How long profiling should run, in minutes.
40+
*/
41+
private long duration;
42+
/**
43+
* Creation time of the task. NOTE(review): presumably an epoch timestamp; the unit is not visible here, confirm with the producer.
44+
*/
45+
private long createTime;
46+
/**
47+
* pprof dump period parameter. Its meaning depends on the profiled event type.
48+
* The supported cases are listed below.
49+
*
50+
* <p>for Block - sample an average of one blocking event per rate nanoseconds spent blocked. (default: 0)</p>
51+
* <p>for Mutex - on average, 1/rate of the mutex contention events are reported. (default: 0)</p>
52+
* For details, see <a href="https://pkg.go.dev/runtime/pprof">pprof argument</a>
53+
*/
54+
private int dumpPeriod;
55+
56+
/**
 * Creates a pprof task command.
 *
 * @param serialNumber serial number handed to the {@code BaseCommand} constructor together with {@link #NAME}
 * @param taskId       identifier of the pprof task
 * @param events       event type to profile
 * @param duration     profiling duration, in minutes
 * @param createTime   creation time of the task
 * @param dumpPeriod   dump period, meaningful for Block and Mutex events
 */
public PprofTaskCommand(String serialNumber, String taskId, String events,
57+
long duration, long createTime, int dumpPeriod) {
58+
super(NAME, serialNumber);
59+
this.taskId = taskId;
60+
this.duration = duration;
61+
this.createTime = createTime;
62+
this.dumpPeriod = dumpPeriod;
63+
this.events = events;
64+
}
65+
66+
/**
 * Builds a new {@code PprofTaskCommand} from the key/value arguments of the given command.
 *
 * <p>Keys that are not recognized are ignored. A key that is absent leaves its field at the
 * local default: {@code null} for the strings, {@code 0} for the numbers.
 *
 * @param command the command whose argument list is read
 * @return a new command populated from the arguments
 * @throws NumberFormatException if "Duration", "CreateTime" or "DumpPeriod" holds a non-numeric value
 */
@Override
67+
public PprofTaskCommand deserialize(Command command) {
68+
final List<KeyStringValuePair> argsList = command.getArgsList();
69+
String taskId = null;
70+
String events = null;
71+
long duration = 0;
72+
long createTime = 0;
73+
int dumpPeriod = 0;
74+
String serialNumber = null;
75+
// Single pass over the arguments; each known key fills the matching local.
for (final KeyStringValuePair pair : argsList) {
76+
if ("SerialNumber".equals(pair.getKey())) {
77+
serialNumber = pair.getValue();
78+
} else if ("TaskId".equals(pair.getKey())) {
79+
taskId = pair.getValue();
80+
} else if ("Events".equals(pair.getKey())) {
81+
events = pair.getValue();
82+
} else if ("Duration".equals(pair.getKey())) {
83+
duration = Long.parseLong(pair.getValue());
84+
} else if ("CreateTime".equals(pair.getKey())) {
85+
createTime = Long.parseLong(pair.getValue());
86+
} else if ("DumpPeriod".equals(pair.getKey())) {
87+
dumpPeriod = Integer.parseInt(pair.getValue());
88+
}
89+
}
90+
return new PprofTaskCommand(serialNumber, taskId, events, duration, createTime, dumpPeriod);
91+
}
92+
93+
/**
 * Serializes this command into a {@code Command.Builder}, appending "TaskId", "Events",
 * "Duration", "CreateTime" and "DumpPeriod" as string arguments.
 *
 * <p>NOTE(review): "SerialNumber" is not added here although {@link #deserialize(Command)}
 * reads it; presumably {@code commandBuilder()} in {@code BaseCommand} supplies it. Confirm there.
 *
 * @return the builder holding the serialized arguments
 */
@Override
94+
public Command.Builder serialize() {
95+
final Command.Builder builder = commandBuilder();
96+
builder.addArgs(KeyStringValuePair.newBuilder().setKey("TaskId").setValue(taskId))
97+
.addArgs(KeyStringValuePair.newBuilder().setKey("Events").setValue(events))
98+
.addArgs(KeyStringValuePair.newBuilder().setKey("Duration").setValue(String.valueOf(duration)))
99+
.addArgs(KeyStringValuePair.newBuilder().setKey("CreateTime").setValue(String.valueOf(createTime)))
100+
.addArgs(KeyStringValuePair.newBuilder().setKey("DumpPeriod").setValue(String.valueOf(dumpPeriod)));
101+
return builder;
102+
}
103+
}

docs/en/api/query-protocol.md

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ extend type Query {
215215
Event query fetches the event list based on given sources and time range conditions.
216216

217217
### Profiling
218-
SkyWalking offers two types of [profiling](../concepts-and-designs/profiling.md), in-process(tracing profiling and async-profiler) and out-process(ebpf profiling), allowing users to create tasks and check their execution status.
218+
SkyWalking offers two types of [profiling](../concepts-and-designs/profiling.md), in-process(tracing profiling, async-profiler and pprof) and out-process(ebpf profiling), allowing users to create tasks and check their execution status.
219219

220220
#### In-process profiling
221221

@@ -256,6 +256,25 @@ extend type Query {
256256
}
257257
```
258258

259+
##### pprof
260+
261+
```graphql
262+
extend type Mutation {
263+
# Create a new pprof task
264+
createPprofTask(pprofTaskCreationRequest: PprofTaskCreationRequest!): PprofTaskCreationResult!
265+
}
266+
267+
extend type Query {
268+
# Query all task lists and sort them in descending order by create time
269+
queryPprofTaskList(request: PprofTaskListRequest!): PprofTaskListResult!
270+
# Query task progress, including task logs
271+
queryPprofTaskProgress(taskId: String!): PprofTaskProgress!
272+
# Query the flame graph produced by pprof
273+
queryPprofAnalyze(request: PprofAnalyzationRequest!): PprofAnalyzation!
274+
}
275+
```
276+
277+
259278
#### Out-process profiling
260279

261280
```graphql

docs/en/changes/changes.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@
111111
* Make MAL percentile align with OAL percentile calculation.
112112
* Update Grafana dashboards for OAP observability.
113113
* BanyanDB: fix query `getInstance` by instance ID.
114+
* Support the go agent(0.7.0 release) bundled pprof profiling feature.
115+
114116

115117
#### UI
116118

docs/en/concepts-and-designs/profiling.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,20 @@ Async Profiler can trace the following kinds of events:
4747

4848
Only Java agent support this.
4949

50+
### Go App Profiling
51+
52+
Go App Profiling uses [pprof](https://github.com/google/pprof) for sampling.
53+
54+
pprof is a profiling tool by Google for visualizing and analyzing sampled performance data.
55+
It reads samples in profile.proto format and generates text or graphical reports (via the dot visualization) to highlight performance hotspots.
56+
57+
pprof supports profiling of:
58+
59+
- CPU.
60+
- Memory allocs / heap.
61+
- Block / mutex.
62+
- Goroutine / threadcreate.
63+
5064
## Out-of-process profiling
5165

5266
Out-of-process profiling leverage [eBPF](https://ebpf.io/) technology with origins in the Linux kernel.
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# Go App Profiling
2+
3+
Go App Profiling uses pprof for sampling.
4+
5+
pprof is bundled within the auto-instrument agent and corresponds to [In-Process Profiling](../../concepts-and-designs/profiling.md#in-process-profiling).
6+
7+
It is delivered to the agent in the form of a task, allowing it to be enabled or disabled dynamically.
8+
When a service encounters performance issues (CPU usage, memory allocation, etc.), a pprof task can be created.
9+
When the agent receives a task, it enables pprof for sampling.
10+
After sampling is completed, the sampling results are analyzed by requesting the server to render a flame graph for performance
11+
analysis to determine the specific business code lines that cause performance problems.
12+
Note, tracing profiling in the Go agent relies on the Go runtime’s global CPU sampling used by pprof.
13+
Since only one CPU profiler can run at a time within the same instance, tracing and pprof CPU profiling cannot be enabled simultaneously.
14+
If both are activated on the same instance, one task may fail to start.
15+
16+
## Activate pprof in the OAP
17+
OAP and the agent use a brand-new protocol to exchange pprof data, so it is necessary to start OAP with the following configuration:
18+
19+
```yaml
20+
receiver-pprof:
21+
selector: ${SW_RECEIVER_PPROF:default}
22+
default:
23+
# Used to manage the maximum size of the pprof file that can be received, the unit is Byte, default is 30M
24+
pprofMaxSize: ${SW_RECEIVER_PPROF_MAX_SIZE:31457280}
25+
# Used to determine whether to receive pprof in memory file or physical file mode
26+
#
27+
# The memory file mode has fewer local file system limitations, so it is the default. But it costs more memory.
28+
#
29+
# The physical file mode will use less memory when parsing and is more friendly to parsing large files.
30+
# However, if the storage of the tmp directory in the container is insufficient, the oap server instance may crash.
31+
# It is recommended to use physical file mode when volume mounting is used or the tmp directory has sufficient storage.
32+
memoryParserEnabled: ${SW_RECEIVER_PPROF_MEMORY_PARSER_ENABLED:true}
33+
```
34+
35+
## pprof Task with Analysis
36+
37+
To use the pprof feature, please follow these steps:
38+
39+
1. **Create pprof task**: Use the UI or CLI tool to create a task.
40+
2. **Wait for the agent to collect data and upload**: Wait for the agent to run pprof, collect the data, and report it.
41+
3. **Query task progress**: Query the progress of tasks, including analyzing successful and failed instances and task logs.
42+
4. **Analyze the data**: Analyze the pprof data to determine where performance bottlenecks exist in the service.
43+
44+
### Create a pprof task
45+
46+
Create a pprof task to notify some go-agent instances in the execution service to start pprof for data collection.
47+
48+
When creating a task, the following configuration fields are required:
49+
50+
1. **serviceId**: Define the service to execute the task.
51+
2. **serviceInstanceIds**: Define which instances need to execute tasks.
52+
3. **duration**: Define the duration of this task in minutes, required for CPU, BLOCK, MUTEX events.
53+
4. **events**: Define which event types this task needs to collect.
54+
5. **dumpPeriod**: Define the period of the pprof dump, required for BLOCK, MUTEX events.
55+
56+
When the Agent receives a pprof task from OAP, it automatically generates a log to notify that the task has been acknowledged. The log contains the following field information:
57+
58+
1. **Instance**: The name of the instance where the Agent is located.
59+
2. **Type**: Supports "NOTIFIED", "EXECUTION_FINISHED", "PPROF_UPLOAD_FILE_TOO_LARGE_ERROR", and "EXECUTION_TASK_ERROR"; the current log displays "NOTIFIED".
60+
3. **Time**: The time when the Agent received the task.
61+
62+
### Wait for the agent to collect data and upload
63+
64+
At this point, pprof will trace the events you selected when you created the task:
65+
66+
1. CPU: samples CPU usage over time to show which functions consume the most processing time.
67+
2. ALLOC, HEAP:
68+
- HEAP: a sampling of memory allocations of live objects.
69+
- ALLOC: a sampling of all past memory allocations.
70+
3. BLOCK, MUTEX:
71+
- BLOCK: stack traces that led to blocking on synchronization primitives.
72+
- MUTEX: stack traces of holders of contended mutexes.
73+
4. GOROUTINE, THREADCREATE:
74+
- GOROUTINE: stack traces of all current goroutines.
75+
- THREADCREATE: stack traces that led to the creation of new OS threads.
76+
77+
Finally, the agent will upload the pprof file produced by pprof to the oap server for online performance analysis.
78+
79+
### Query the profiling task progresses
80+
81+
Wait for pprof to complete data collection and upload successfully.
82+
We can query the execution logs of the pprof task and the task status, which includes the following information:
83+
84+
1. **successInstanceIds**: SuccessInstanceIds gives instances that have executed the task successfully.
85+
2. **errorInstanceIds**: ErrorInstanceIds gives instances that failed to execute the task.
86+
3. **logs**: All task execution logs of the current task.
87+
1. **id**: The task id.
88+
2. **instanceId**: InstanceId is the id of the instance which reported this task log.
89+
3. **instanceName**: InstanceName is the name of the instance which reported this task log.
90+
4. **operationType**: One of "NOTIFIED", "EXECUTION_FINISHED", "PPROF_UPLOAD_FILE_TOO_LARGE_ERROR", or "EXECUTION_TASK_ERROR".
91+
5. **operationTime**: operationTime is the time when the operation occurs.
92+
93+
### Analyze the profiling data
94+
95+
Once some agents have completed the task, we can analyze the data through the following query:
96+
97+
1. **taskId**: The task id.
98+
2. **instanceIds**: InstanceIds defines the instances to be included for analysis
99+
100+
After the query, the following data would be returned to render a flame graph:
101+
1. **taskId**: The task id.
102+
2. **elements**: Combined with "id" to determine the hierarchical relationship.
103+
1. **Id**: Id is the identity of the stack element.
104+
2. **parentId**: Parent element ID. The dependency relationship between elements can be determined using the element ID and parent element ID.
105+
3. **codeSignature**: Method signatures in tree nodes.
106+
4. **total**: The total number of samples of the current tree node, including child nodes.
107+
5. **self**: The sampling number of the current tree node, excluding samples of the children.

docs/menu.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,8 @@ catalog:
270270
path: "/en/setup/backend/backend-continuous-profiling"
271271
- name: "Java App Profiling"
272272
path: "/en/setup/backend/backend-java-app-profiling"
273+
- name: "Go App Profiling"
274+
path: "/en/setup/backend/backend-go-app-profiling"
273275
- name: "Event"
274276
path: "/en/concepts-and-designs/event/"
275277
- name: "Extension"

oap-server/server-core/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@
4444
<artifactId>library-async-profiler-jfr-parser</artifactId>
4545
<version>${project.version}</version>
4646
</dependency>
47+
<dependency>
48+
<groupId>org.apache.skywalking</groupId>
49+
<artifactId>library-pprof-parser</artifactId>
50+
<version>${project.version}</version>
51+
</dependency>
4752
<dependency>
4853
<groupId>org.apache.skywalking</groupId>
4954
<artifactId>telemetry-api</artifactId>

oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/CoreModule.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.util.List;
2323
import org.apache.skywalking.oap.server.core.analysis.meter.MeterSystem;
2424
import org.apache.skywalking.oap.server.core.cache.AsyncProfilerTaskCache;
25+
import org.apache.skywalking.oap.server.core.cache.PprofTaskCache;
2526
import org.apache.skywalking.oap.server.core.cache.NetworkAddressAliasCache;
2627
import org.apache.skywalking.oap.server.core.cache.ProfileTaskCache;
2728
import org.apache.skywalking.oap.server.core.command.CommandService;
@@ -41,6 +42,8 @@
4142
import org.apache.skywalking.oap.server.core.profiling.continuous.ContinuousProfilingQueryService;
4243
import org.apache.skywalking.oap.server.core.profiling.ebpf.EBPFProfilingMutationService;
4344
import org.apache.skywalking.oap.server.core.profiling.ebpf.EBPFProfilingQueryService;
45+
import org.apache.skywalking.oap.server.core.profiling.pprof.PprofMutationService;
46+
import org.apache.skywalking.oap.server.core.profiling.pprof.PprofQueryService;
4447
import org.apache.skywalking.oap.server.core.profiling.trace.ProfileTaskMutationService;
4548
import org.apache.skywalking.oap.server.core.profiling.trace.ProfileTaskQueryService;
4649
import org.apache.skywalking.oap.server.core.query.AggregationQueryService;
@@ -106,6 +109,7 @@ public Class[] services() {
106109
addManagementService(classes);
107110
addEBPFProfilingService(classes);
108111
addAsyncProfilerService(classes);
112+
addPprofService(classes);
109113

110114
classes.add(CommandService.class);
111115
classes.add(HierarchyService.class);
@@ -137,6 +141,12 @@ private void addAsyncProfilerService(List<Class> classes) {
137141
classes.add(AsyncProfilerTaskCache.class);
138142
}
139143

144+
// Adds the pprof-related service classes (mutation service, query service, task cache) to the given list.
private void addPprofService(List<Class> classes) {
145+
classes.add(PprofMutationService.class);
146+
classes.add(PprofQueryService.class);
147+
classes.add(PprofTaskCache.class);
148+
}
149+
140150
private void addOALService(List<Class> classes) {
141151
classes.add(OALEngineLoaderService.class);
142152
}

0 commit comments

Comments
 (0)