Skip to content

Commit 02681c1

Browse files
authored
Merge pull request #8 from coroot/ebpf_l7
Capturing L7 protocols at the eBPF level
2 parents 18c1af3 + e27ef59 commit 02681c1

File tree

26 files changed

+1092
-257
lines changed

26 files changed

+1092
-257
lines changed

.dockerignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
**/.git/
2+
**/.idea/

containers/container.go

Lines changed: 109 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package containers
33
import (
44
"github.com/coroot/coroot-node-agent/cgroup"
55
"github.com/coroot/coroot-node-agent/common"
6+
"github.com/coroot/coroot-node-agent/ebpftracer"
67
"github.com/coroot/coroot-node-agent/flags"
78
"github.com/coroot/coroot-node-agent/logs"
89
"github.com/coroot/coroot-node-agent/node"
@@ -14,6 +15,7 @@ import (
1415
"inet.af/netaddr"
1516
"k8s.io/klog/v2"
1617
"os"
18+
"strconv"
1719
"strings"
1820
"sync"
1921
"time"
@@ -57,6 +59,17 @@ type AddrPair struct {
5759
dst netaddr.IPPort
5860
}
5961

62+
type ActiveConnection struct {
63+
ActualDest netaddr.IPPort
64+
Pid uint32
65+
Fd uint64
66+
}
67+
68+
type L7Stats struct {
69+
Requests *prometheus.CounterVec
70+
Latency prometheus.Histogram
71+
}
72+
6073
type Container struct {
6174
cgroup *cgroup.Cgroup
6275
metadata *ContainerMetadata
@@ -73,11 +86,13 @@ type Container struct {
7386

7487
listens map[netaddr.IPPort]map[uint32]time.Time // listen addr -> pid -> close time
7588

76-
connectsSuccessful map[AddrPair]int // dst:actual_dst -> count
77-
connectsFailed map[netaddr.IPPort]int // dst -> count
89+
connectsSuccessful map[AddrPair]int64 // dst:actual_dst -> count
90+
connectsFailed map[netaddr.IPPort]int64 // dst -> count
7891
connectLastAttempt map[netaddr.IPPort]time.Time // dst -> time
79-
connectionsActive map[AddrPair]netaddr.IPPort // src:dst -> actual_dst
80-
retransmits map[AddrPair]int // dst:actual_dst -> count
92+
connectionsActive map[AddrPair]ActiveConnection
93+
retransmits map[AddrPair]int64 // dst:actual_dst -> count
94+
95+
l7Stats map[ebpftracer.L7Protocol]map[AddrPair]*L7Stats // protocol -> dst:actual_dst -> stats
8196

8297
oomKills int
8398

@@ -101,11 +116,12 @@ func NewContainer(cg *cgroup.Cgroup, md *ContainerMetadata) *Container {
101116

102117
listens: map[netaddr.IPPort]map[uint32]time.Time{},
103118

104-
connectsSuccessful: map[AddrPair]int{},
105-
connectsFailed: map[netaddr.IPPort]int{},
119+
connectsSuccessful: map[AddrPair]int64{},
120+
connectsFailed: map[netaddr.IPPort]int64{},
106121
connectLastAttempt: map[netaddr.IPPort]time.Time{},
107-
connectionsActive: map[AddrPair]netaddr.IPPort{},
108-
retransmits: map[AddrPair]int{},
122+
connectionsActive: map[AddrPair]ActiveConnection{},
123+
retransmits: map[AddrPair]int64{},
124+
l7Stats: map[ebpftracer.L7Protocol]map[AddrPair]*L7Stats{},
109125

110126
mountIds: map[string]struct{}{},
111127

@@ -147,6 +163,12 @@ func (c *Container) Describe(ch chan<- *prometheus.Desc) {
147163
for _, m := range metricsList {
148164
ch <- m
149165
}
166+
for _, protoStats := range c.l7Stats {
167+
for _, s := range protoStats {
168+
s.Requests.Describe(ch)
169+
s.Latency.Describe(ch)
170+
}
171+
}
150172
}
151173

152174
func (c *Container) Collect(ch chan<- prometheus.Metric) {
@@ -242,8 +264,8 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
242264
}
243265

244266
connections := map[AddrPair]int{}
245-
for c, actualDst := range c.connectionsActive {
246-
connections[AddrPair{src: c.dst, dst: actualDst}]++
267+
for c, conn := range c.connectionsActive {
268+
connections[AddrPair{src: c.dst, dst: conn.ActualDest}]++
247269
}
248270
for d, count := range connections {
249271
ch <- gauge(metrics.NetConnectionsActive, float64(count), d.src.String(), d.dst.String())
@@ -271,7 +293,14 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
271293
ch <- gauge(metrics.ApplicationType, 1, appType)
272294
}
273295

274-
if !*flags.NoPingUpstreams {
296+
for _, protoStats := range c.l7Stats {
297+
for _, s := range protoStats {
298+
s.Requests.Collect(ch)
299+
s.Latency.Collect(ch)
300+
}
301+
}
302+
303+
if !*flags.DisablePinger {
275304
for ip, rtt := range c.ping(netNs) {
276305
ch <- gauge(metrics.NetLatency, rtt, ip.String())
277306
}
@@ -316,7 +345,7 @@ func (c *Container) onProcessExit(pid uint32, oomKill bool) {
316345
}
317346
}
318347

319-
func (c *Container) onFileOpen(pid uint32, fd uint32) {
348+
func (c *Container) onFileOpen(pid uint32, fd uint64) {
320349
mntId, logPath := resolveFd(pid, fd)
321350
c.lock.Lock()
322351
defer c.lock.Unlock()
@@ -349,7 +378,7 @@ func (c *Container) onListenClose(pid uint32, addr netaddr.IPPort) {
349378
}
350379
}
351380

352-
func (c *Container) onConnectionOpen(pid uint32, src, dst netaddr.IPPort, failed bool) {
381+
func (c *Container) onConnectionOpen(pid uint32, fd uint64, src, dst netaddr.IPPort, failed bool) {
353382
if dst.IP().IsLoopback() {
354383
netNs, err := proc.GetNetNs(pid)
355384
isHostNs := err == nil && hostNetNsId == netNs.UniqueId()
@@ -376,7 +405,11 @@ func (c *Container) onConnectionOpen(pid uint32, src, dst netaddr.IPPort, failed
376405
} else {
377406
actualDst := ConntrackGetActualDestination(src, dst)
378407
c.connectsSuccessful[AddrPair{src: dst, dst: actualDst}]++
379-
c.connectionsActive[AddrPair{src: src, dst: dst}] = actualDst
408+
c.connectionsActive[AddrPair{src: src, dst: dst}] = ActiveConnection{
409+
ActualDest: actualDst,
410+
Pid: pid,
411+
Fd: fd,
412+
}
380413
}
381414
c.connectLastAttempt[dst] = time.Now()
382415
}
@@ -391,14 +424,65 @@ func (c *Container) onConnectionClose(srcDst AddrPair) bool {
391424
return true
392425
}
393426

427+
func (c *Container) onL7Request(pid uint32, fd uint64, r *ebpftracer.L7Request) {
428+
for dest, conn := range c.connectionsActive {
429+
if conn.Pid == pid && conn.Fd == fd {
430+
key := AddrPair{src: dest.dst, dst: conn.ActualDest}
431+
stats := c.l7Stats[r.Protocol]
432+
if stats == nil {
433+
stats = map[AddrPair]*L7Stats{}
434+
c.l7Stats[r.Protocol] = stats
435+
}
436+
s := stats[key]
437+
if s == nil {
438+
constLabels := map[string]string{"destination": key.src.String(), "actual_destination": key.dst.String()}
439+
cOpts, ok := L7Requests[r.Protocol]
440+
if !ok {
441+
klog.Warningln("cannot find metric description for L7 protocol: %s", r.Protocol.String())
442+
return
443+
}
444+
hOpts, ok := L7Latency[r.Protocol]
445+
if !ok {
446+
klog.Warningln("cannot find metric description for L7 protocol: %s", r.Protocol.String())
447+
return
448+
}
449+
s = &L7Stats{
450+
Requests: prometheus.NewCounterVec(
451+
prometheus.CounterOpts{Name: cOpts.Name, Help: cOpts.Help, ConstLabels: constLabels},
452+
[]string{"status"},
453+
),
454+
Latency: prometheus.NewHistogram(
455+
prometheus.HistogramOpts{Name: hOpts.Name, Help: hOpts.Help, ConstLabels: constLabels},
456+
),
457+
}
458+
stats[key] = s
459+
}
460+
status := ""
461+
switch r.Protocol {
462+
case ebpftracer.L7ProtocolHTTP:
463+
status = strconv.Itoa(r.Status)
464+
default:
465+
if r.Status == 500 {
466+
status = "failed"
467+
} else {
468+
status = "ok"
469+
}
470+
}
471+
s.Requests.WithLabelValues(status).Inc()
472+
s.Latency.Observe(r.Duration.Seconds())
473+
return
474+
}
475+
}
476+
}
477+
394478
func (c *Container) onRetransmit(srcDst AddrPair) bool {
395479
c.lock.Lock()
396480
defer c.lock.Unlock()
397-
actualDst, ok := c.connectionsActive[srcDst]
481+
conn, ok := c.connectionsActive[srcDst]
398482
if !ok {
399483
return false
400484
}
401-
c.retransmits[AddrPair{src: srcDst.dst, dst: actualDst}]++
485+
c.retransmits[AddrPair{src: srcDst.dst, dst: conn.ActualDest}]++
402486
return true
403487
}
404488

@@ -566,7 +650,7 @@ func (c *Container) ping(netNs netns.NsHandle) map[netaddr.IP]float64 {
566650
}
567651

568652
func (c *Container) runLogParser(logPath string) {
569-
if *flags.NoParseLogs {
653+
if *flags.DisableLogParsing {
570654
return
571655
}
572656

@@ -667,6 +751,13 @@ func (c *Container) gc(now time.Time) {
667751
delete(c.retransmits, d)
668752
}
669753
}
754+
for _, protoStats := range c.l7Stats {
755+
for d := range protoStats {
756+
if d.src == dst {
757+
delete(protoStats, d)
758+
}
759+
}
760+
}
670761
}
671762
}
672763
}
@@ -738,7 +829,7 @@ func (c *Container) revalidateListens(now time.Time, actualListens map[netaddr.I
738829
}
739830
}
740831

741-
func resolveFd(pid uint32, fd uint32) (mntId string, logPath string) {
832+
func resolveFd(pid uint32, fd uint64) (mntId string, logPath string) {
742833
info := proc.GetFdInfo(pid, fd)
743834
if info == nil {
744835
return

containers/metrics.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package containers
22

33
import (
4+
"github.com/coroot/coroot-node-agent/ebpftracer"
45
"github.com/prometheus/client_golang/prometheus"
56
"reflect"
67
)
@@ -71,6 +72,25 @@ var metrics = struct {
7172
ApplicationType: metric("container_application_type", "Type of the application running in the container (e.g. memcached, postgres, mysql)", "application_type"),
7273
}
7374

75+
var (
76+
L7Requests = map[ebpftracer.L7Protocol]prometheus.CounterOpts{
77+
ebpftracer.L7ProtocolHTTP: {Name: "container_http_requests_total", Help: "Total number of outbound HTTP requests"},
78+
ebpftracer.L7ProtocolPostgres: {Name: "container_postgres_queries_total", Help: "Total number of outbound Postgres queries"},
79+
ebpftracer.L7ProtocolRedis: {Name: "container_redis_queries_total", Help: "Total number of outbound Redis queries"},
80+
ebpftracer.L7ProtocolMemcached: {Name: "container_memcached_queries_total", Help: "Total number of outbound Memcached queries"},
81+
ebpftracer.L7ProtocolMysql: {Name: "container_mysql_queries_total", Help: "Total number of outbound Mysql queries"},
82+
ebpftracer.L7ProtocolMongo: {Name: "container_mongo_queries_total", Help: "Total number of outbound Mongo queries"},
83+
}
84+
L7Latency = map[ebpftracer.L7Protocol]prometheus.HistogramOpts{
85+
ebpftracer.L7ProtocolHTTP: {Name: "container_http_request_duration_seconds_total", Help: "Histogram of the response time for each outbound HTTP request"},
86+
ebpftracer.L7ProtocolPostgres: {Name: "container_postgres_queries_duration_seconds_total", Help: "Histogram of the execution time for each outbound Postgres query"},
87+
ebpftracer.L7ProtocolRedis: {Name: "container_redis_queries_duration_seconds_total", Help: "Histogram of the execution time for each outbound Redis query"},
88+
ebpftracer.L7ProtocolMemcached: {Name: "container_memcached_queries_duration_seconds_total", Help: "Histogram of the execution time for each outbound Memcached query"},
89+
ebpftracer.L7ProtocolMysql: {Name: "container_mysql_queries_duration_seconds_total", Help: "Histogram of the execution time for each outbound Mysql query"},
90+
ebpftracer.L7ProtocolMongo: {Name: "container_mongo_queries_duration_seconds_total", Help: "Histogram of the execution time for each outbound Mongo query"},
91+
}
92+
)
93+
7494
func metric(name, help string, labels ...string) *prometheus.Desc {
7595
return prometheus.NewDesc(name, help, labels, nil)
7696
}

containers/registry.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"github.com/coroot/coroot-node-agent/cgroup"
66
"github.com/coroot/coroot-node-agent/common"
77
"github.com/coroot/coroot-node-agent/ebpftracer"
8+
"github.com/coroot/coroot-node-agent/flags"
89
"github.com/coroot/coroot-node-agent/proc"
910
"github.com/prometheus/client_golang/prometheus"
1011
"github.com/vishvananda/netns"
@@ -78,7 +79,7 @@ func NewRegistry(reg prometheus.Registerer, kernelVersion string) (*Registry, er
7879
}
7980

8081
go cs.handleEvents(cs.events)
81-
t, err := ebpftracer.NewTracer(cs.events, kernelVersion)
82+
t, err := ebpftracer.NewTracer(cs.events, kernelVersion, *flags.DisableL7Tracing)
8283
if err != nil {
8384
close(cs.events)
8485
return nil, err
@@ -177,13 +178,13 @@ func (r *Registry) handleEvents(ch <-chan ebpftracer.Event) {
177178

178179
case ebpftracer.EventTypeConnectionOpen:
179180
if c := r.getOrCreateContainer(e.Pid); c != nil {
180-
c.onConnectionOpen(e.Pid, e.SrcAddr, e.DstAddr, false)
181+
c.onConnectionOpen(e.Pid, e.Fd, e.SrcAddr, e.DstAddr, false)
181182
} else {
182183
klog.Infoln("TCP connection from unknown container", e)
183184
}
184185
case ebpftracer.EventTypeConnectionError:
185186
if c := r.getOrCreateContainer(e.Pid); c != nil {
186-
c.onConnectionOpen(e.Pid, e.SrcAddr, e.DstAddr, true)
187+
c.onConnectionOpen(e.Pid, e.Fd, e.SrcAddr, e.DstAddr, true)
187188
} else {
188189
klog.Infoln("TCP connection error from unknown container", e)
189190
}
@@ -201,6 +202,13 @@ func (r *Registry) handleEvents(ch <-chan ebpftracer.Event) {
201202
break
202203
}
203204
}
205+
case ebpftracer.EventTypeL7Request:
206+
if e.L7Request == nil {
207+
continue
208+
}
209+
if c := r.containersByPid[e.Pid]; c != nil {
210+
c.onL7Request(e.Pid, e.Fd, e.L7Request)
211+
}
204212
}
205213
}
206214
}

ebpftracer/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ FROM alpine:3.13
22

33
RUN apk add llvm clang libbpf-dev linux-headers
44

5-
COPY ebpf/* /tmp/
6-
WORKDIR /tmp
5+
COPY ebpf /tmp/ebpf
6+
WORKDIR /tmp/ebpf
77

88
RUN clang -g -O2 -target bpf -D__KERNEL=416 -c ebpf.c -o ebpf416.o && llvm-strip --strip-debug ebpf416.o
99
RUN clang -g -O2 -target bpf -D__KERNEL=420 -c ebpf.c -o ebpf420.o && llvm-strip --strip-debug ebpf420.o

ebpftracer/Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
build:
22
@echo ===BUILDING===
3-
docker build -t ebpftracer .
4-
docker cp $(shell docker create --rm ebpftracer):/tmp/ebpf.go ./ebpf.go
3+
docker rmi -f ebpftracer
4+
docker build -t ebpftracer --progress plain .
5+
docker run --rm --name ebpftracer ebpftracer cat /tmp/ebpf/ebpf.go > ./ebpf.go
56
@echo
67

78
test: test_vm1 test_vm2 test_vm3 test_vm4

ebpftracer/ebpf.go

Lines changed: 9 additions & 5 deletions
Large diffs are not rendered by default.

ebpftracer/ebpf/ebpf.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include <bpf/bpf_helpers.h>
33
#include <bpf/bpf_core_read.h>
44
#include <bpf/bpf_tracing.h>
5+
#include <bpf/bpf_endian.h>
56

67
#define EVENT_TYPE_PROCESS_START 1
78
#define EVENT_TYPE_PROCESS_EXIT 2
@@ -17,7 +18,8 @@
1718

1819
#include "proc.c"
1920
#include "file.c"
20-
#include "tcp_state.c"
21-
#include "tcp_retransmit.c"
21+
#include "tcp/state.c"
22+
#include "tcp/retransmit.c"
23+
#include "l7/l7.c"
2224

2325
char _license[] SEC("license") = "GPL";

ebpftracer/ebpf/file.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
struct file_event {
44
__u32 type;
55
__u32 pid;
6-
__u32 fd;
6+
__u64 fd;
77
};
88

99
struct {

0 commit comments

Comments
 (0)