Skip to content

Commit 30aa9a8

Browse files
authored
Merge pull request moby#50104 from robmry/outline_nftabler
nftables: add initial/outline nftabler
2 parents cab4ac8 + c66abe4 commit 30aa9a8

File tree

7 files changed

+343
-0
lines changed

7 files changed

+343
-0
lines changed

libnetwork/drivers/bridge/bridge_linux.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ import (
1919
"github.com/docker/docker/libnetwork/driverapi"
2020
"github.com/docker/docker/libnetwork/drivers/bridge/internal/firewaller"
2121
"github.com/docker/docker/libnetwork/drivers/bridge/internal/iptabler"
22+
"github.com/docker/docker/libnetwork/drivers/bridge/internal/nftabler"
2223
"github.com/docker/docker/libnetwork/drivers/bridge/internal/rlkclient"
2324
"github.com/docker/docker/libnetwork/internal/netiputil"
25+
"github.com/docker/docker/libnetwork/internal/nftables"
2426
"github.com/docker/docker/libnetwork/iptables"
2527
"github.com/docker/docker/libnetwork/netlabel"
2628
"github.com/docker/docker/libnetwork/netutils"
@@ -544,6 +546,9 @@ func (d *driver) configure(option map[string]interface{}) error {
544546
}
545547

546548
var newFirewaller = func(ctx context.Context, config firewaller.Config) (firewaller.Firewaller, error) {
549+
if nftables.Enabled() {
550+
return nftabler.NewNftabler(ctx, config)
551+
}
547552
return iptabler.NewIptabler(ctx, config)
548553
}
549554

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
//go:build linux
2+
3+
package nftabler
4+
5+
import (
6+
"context"
7+
"net/netip"
8+
)
9+
10+
// AddEndpoint is a placeholder in this outline implementation: it ignores its
// arguments, makes no changes, and always returns nil.
func (n *network) AddEndpoint(ctx context.Context, epIPv4, epIPv6 netip.Addr) error {
11+
return nil
12+
}
13+
14+
// DelEndpoint is a placeholder in this outline implementation: it ignores its
// arguments, makes no changes, and always returns nil.
func (n *network) DelEndpoint(ctx context.Context, epIPv4, epIPv6 netip.Addr) error {
15+
return nil
16+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
//go:build linux
2+
3+
package nftabler
4+
5+
import (
6+
"context"
7+
"net/netip"
8+
9+
"github.com/docker/docker/libnetwork/types"
10+
)
11+
12+
// AddLink is a placeholder in this outline implementation: it ignores the
// parent/child addresses and ports, and always returns nil.
func (n *network) AddLink(ctx context.Context, parentIP, childIP netip.Addr, ports []types.TransportPort) error {
13+
return nil
14+
}
15+
16+
// DelLink is a placeholder in this outline implementation: its body is empty
// and, unlike AddLink, it has no error result.
func (n *network) DelLink(ctx context.Context, parentIP, childIP netip.Addr, ports []types.TransportPort) {
17+
}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
//go:build linux
2+
3+
package nftabler
4+
5+
import (
6+
"context"
7+
8+
"github.com/containerd/log"
9+
"github.com/docker/docker/libnetwork/drivers/bridge/internal/firewaller"
10+
)
11+
12+
// network is the per-network object returned by (*nftabler).NewNetwork.
type network struct {
13+
// config is the network configuration passed to NewNetwork.
config firewaller.NetworkConfig
14+
// fw is the nftabler that created this network.
fw *nftabler
15+
}
16+
17+
func (nft *nftabler) NewNetwork(ctx context.Context, nc firewaller.NetworkConfig) (_ firewaller.Network, retErr error) {
18+
n := &network{
19+
fw: nft,
20+
config: nc,
21+
}
22+
return n, nil
23+
}
24+
25+
// ReapplyNetworkLevelRules is not implemented yet: it logs a warning saying so
// and returns nil without changing any rules.
func (n *network) ReapplyNetworkLevelRules(ctx context.Context) error {
26+
log.G(ctx).Warn("ReapplyNetworkLevelRules is not implemented for nftables")
27+
return nil
28+
}
29+
30+
// DelNetworkLevelRules is a placeholder in this outline implementation: it
// makes no changes and always returns nil.
func (n *network) DelNetworkLevelRules(ctx context.Context) error {
31+
return nil
32+
}
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
//go:build linux
2+
3+
package nftabler
4+
5+
import (
6+
"context"
7+
"fmt"
8+
9+
"github.com/containerd/log"
10+
"github.com/docker/docker/libnetwork/drivers/bridge/internal/firewaller"
11+
"github.com/docker/docker/libnetwork/internal/nftables"
12+
"go.opentelemetry.io/otel"
13+
)
14+
15+
// Prefix for OTEL span names.
16+
const spanPrefix = "libnetwork.drivers.bridge.nftabler"
17+
18+
// Names of the nftables objects set up by (*nftabler).init: the table, its
// base chains, the regular chain that the NAT prerouting and output base
// chains jump to (natChain), and the interface verdict maps referenced by the
// filter-forward and NAT postrouting base chains.
const (
19+
dockerTable = "docker-bridges"
20+
forwardChain = "filter-FORWARD"
21+
postroutingChain = "nat-POSTROUTING"
22+
preroutingChain = "nat-PREROUTING"
23+
outputChain = "nat-OUTPUT"
24+
natChain = "nat-prerouting-and-output"
25+
rawPreroutingChain = "raw-PREROUTING"
26+
filtFwdInVMap = "filter-forward-in-jumps"
27+
filtFwdOutVMap = "filter-forward-out-jumps"
28+
natPostroutingOutVMap = "nat-postrouting-out-jumps"
29+
natPostroutingInVMap = "nat-postrouting-in-jumps"
30+
)
31+
32+
// Rule groups, passed as the first argument to AppendRule.
const (
33+
// initialRuleGroup is the only group defined so far; every rule appended by
// init and mirroredWSL2Workaround uses it.
initialRuleGroup nftables.RuleGroup = iota
34+
)
35+
36+
// nftabler is the nftables-based firewaller returned by NewNftabler.
type nftabler struct {
37+
// config is the firewaller configuration passed to NewNftabler.
config firewaller.Config
38+
// table4 is the IPv4 table; NewNftabler only sets it when config.IPv4 is true.
table4 nftables.TableRef
39+
// table6 is the IPv6 table; NewNftabler only sets it when config.IPv6 is true.
table6 nftables.TableRef
40+
}
41+
42+
func NewNftabler(ctx context.Context, config firewaller.Config) (firewaller.Firewaller, error) {
43+
nft := &nftabler{config: config}
44+
45+
if nft.config.IPv4 {
46+
var err error
47+
nft.table4, err = nft.init(ctx, nftables.IPv4)
48+
if err != nil {
49+
return nil, err
50+
}
51+
if err := nftApply(ctx, nft.table4); err != nil {
52+
return nil, fmt.Errorf("IPv4 initialisation: %w", err)
53+
}
54+
}
55+
56+
if nft.config.IPv6 {
57+
var err error
58+
nft.table6, err = nft.init(ctx, nftables.IPv6)
59+
if err != nil {
60+
return nil, err
61+
}
62+
63+
if err := nftApply(ctx, nft.table6); err != nil {
64+
// Perhaps the kernel has no IPv6 support. It won't be possible to create IPv6
65+
// networks without enabling ip6_tables in the kernel, or disabling ip6tables in
66+
// the daemon config. But, allow the daemon to start because IPv4 will work. So,
67+
// log the problem, and continue.
68+
log.G(ctx).WithError(err).Warn("ip6tables is enabled, but cannot set up IPv6 nftables table")
69+
}
70+
}
71+
72+
return nft, nil
73+
}
74+
75+
func (nft *nftabler) getTable(ipv firewaller.IPVersion) nftables.TableRef {
76+
if ipv == firewaller.IPv4 {
77+
return nft.table4
78+
}
79+
return nft.table6
80+
}
81+
82+
func (nft *nftabler) FilterForwardDrop(ctx context.Context, ipv firewaller.IPVersion) error {
83+
table := nft.getTable(ipv)
84+
if err := table.Chain(forwardChain).SetPolicy("drop"); err != nil {
85+
return err
86+
}
87+
return nftApply(ctx, table)
88+
}
89+
90+
// init creates the bridge driver's nftables table for IPv4 or IPv6.
91+
func (nft *nftabler) init(ctx context.Context, family nftables.Family) (nftables.TableRef, error) {
92+
// Instantiate the table.
93+
table, err := nftables.NewTable(family, dockerTable)
94+
if err != nil {
95+
return table, err
96+
}
97+
98+
// Set up the filter forward chain.
99+
//
100+
// This base chain only contains two rules that use verdict maps:
101+
// - if a packet is entering a bridge network, jump to that network's filter-forward ingress chain.
102+
// - if a packet is leaving a bridge network, jump to that network's filter-forward egress chain.
103+
//
104+
// So, packets that aren't related to docker don't need to traverse any per-network filter forward
105+
// rules - and packets that are entering or leaving docker networks only need to traverse rules
106+
// related to those networks.
107+
fwdChain, err := table.BaseChain(forwardChain,
108+
nftables.BaseChainTypeFilter,
109+
nftables.BaseChainHookForward,
110+
nftables.BaseChainPriorityFilter)
111+
if err != nil {
112+
return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
113+
}
114+
// Instantiate the verdict maps and add the jumps.
115+
_ = table.InterfaceVMap(filtFwdInVMap)
116+
if err := fwdChain.AppendRule(initialRuleGroup, "oifname vmap @"+filtFwdInVMap); err != nil {
117+
return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
118+
}
119+
_ = table.InterfaceVMap(filtFwdOutVMap)
120+
if err := fwdChain.AppendRule(initialRuleGroup, "iifname vmap @"+filtFwdOutVMap); err != nil {
121+
return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
122+
}
123+
124+
// Set up the NAT postrouting base chain.
125+
//
126+
// Like the filter-forward chain, its only rules are jumps to network-specific ingress and egress chains.
127+
natPostRtChain, err := table.BaseChain(postroutingChain,
128+
nftables.BaseChainTypeNAT,
129+
nftables.BaseChainHookPostrouting,
130+
nftables.BaseChainPrioritySrcNAT)
131+
if err != nil {
132+
return nftables.TableRef{}, err
133+
}
134+
_ = table.InterfaceVMap(natPostroutingOutVMap)
135+
if err := natPostRtChain.AppendRule(initialRuleGroup, "iifname vmap @"+natPostroutingOutVMap); err != nil {
136+
return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
137+
}
138+
_ = table.InterfaceVMap(natPostroutingInVMap)
139+
if err := natPostRtChain.AppendRule(initialRuleGroup, "oifname vmap @"+natPostroutingInVMap); err != nil {
140+
return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
141+
}
142+
143+
// Instantiate natChain, for the NAT prerouting and output base chains to jump to.
144+
_ = table.Chain(natChain)
145+
146+
// Set up the NAT prerouting base chain.
147+
natPreRtChain, err := table.BaseChain(preroutingChain,
148+
nftables.BaseChainTypeNAT,
149+
nftables.BaseChainHookPrerouting,
150+
nftables.BaseChainPriorityDstNAT)
151+
if err != nil {
152+
return nftables.TableRef{}, err
153+
}
154+
if err := natPreRtChain.AppendRule(initialRuleGroup, "fib daddr type local counter jump "+natChain); err != nil {
155+
return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
156+
}
157+
158+
// Set up the NAT output base chain
159+
natOutputChain, err := table.BaseChain(outputChain,
160+
nftables.BaseChainTypeNAT,
161+
nftables.BaseChainHookOutput,
162+
nftables.BaseChainPriorityDstNAT)
163+
if err != nil {
164+
return nftables.TableRef{}, err
165+
}
166+
// For output, don't jump to the NAT chain if hairpin is enabled (no userland proxy).
167+
var skipLoopback string
168+
if !nft.config.Hairpin {
169+
if family == nftables.IPv4 {
170+
skipLoopback = "ip daddr != 127.0.0.1/8 "
171+
} else {
172+
skipLoopback = "ip6 daddr != ::1 "
173+
}
174+
}
175+
if err := natOutputChain.AppendRule(initialRuleGroup, skipLoopback+"fib daddr type local counter jump "+natChain); err != nil {
176+
return nftables.TableRef{}, fmt.Errorf("initialising nftables: %w", err)
177+
}
178+
179+
// Set up the raw prerouting base chain
180+
if _, err := table.BaseChain(rawPreroutingChain,
181+
nftables.BaseChainTypeFilter,
182+
nftables.BaseChainHookPrerouting,
183+
nftables.BaseChainPriorityRaw); err != nil {
184+
return nftables.TableRef{}, err
185+
}
186+
187+
if !nft.config.Hairpin && nft.config.WSL2Mirrored {
188+
if err := mirroredWSL2Workaround(ctx, table); err != nil {
189+
return nftables.TableRef{}, err
190+
}
191+
}
192+
193+
return table, nil
194+
}
195+
196+
func nftApply(ctx context.Context, table nftables.TableRef) error {
197+
ctx, span := otel.Tracer("").Start(ctx, spanPrefix+".nftApply."+string(table.Family()))
198+
defer span.End()
199+
if err := table.Apply(ctx); err != nil {
200+
return fmt.Errorf("applying nftables rules: %w", err)
201+
}
202+
return nil
203+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
2+
//go:build go1.22 && linux
3+
4+
package nftabler
5+
6+
import (
7+
"context"
8+
9+
"github.com/docker/docker/libnetwork/types"
10+
)
11+
12+
// AddPorts delegates to modPorts with enable set to true and returns its result.
func (n *network) AddPorts(ctx context.Context, pbs []types.PortBinding) error {
13+
return n.modPorts(ctx, pbs, true)
14+
}
15+
16+
// DelPorts delegates to modPorts with enable set to false and returns its result.
func (n *network) DelPorts(ctx context.Context, pbs []types.PortBinding) error {
17+
return n.modPorts(ctx, pbs, false)
18+
}
19+
20+
// modPorts is the shared implementation behind AddPorts (enable=true) and
// DelPorts (enable=false). It is a placeholder in this outline implementation:
// it ignores its arguments and always returns nil.
func (n *network) modPorts(ctx context.Context, pbs []types.PortBinding, enable bool) error {
21+
return nil
22+
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
//go:build linux
2+
3+
package nftabler
4+
5+
import (
6+
"context"
7+
8+
"github.com/docker/docker/libnetwork/internal/nftables"
9+
)
10+
11+
// mirroredWSL2Workaround adds an IPv4 NAT rule if docker's host Linux appears to
12+
// be a guest running under WSL2 with mirrored mode networking.
13+
// https://learn.microsoft.com/en-us/windows/wsl/networking#mirrored-mode-networking
14+
//
15+
// Without mirrored mode networking, or for a packet sent from Linux, packets
16+
// sent to 127.0.0.1 are processed as outgoing - they hit the nat-OUTPUT chain,
17+
// which does not jump to the nat-DOCKER chain because the rule has an exception
18+
// for "-d 127.0.0.0/8". The default action on the nat-OUTPUT chain is ACCEPT (by
19+
// default), so the packet is delivered to 127.0.0.1 on lo, where docker-proxy
20+
// picks it up and acts as a man-in-the-middle; it receives the packet and
21+
// re-sends it to the container (or acks a SYN and sets up a second TCP
22+
// connection to the container). So, the container sees packets arrive with a
23+
// source address belonging to the network's bridge, and it is able to reply to
24+
// that address.
25+
//
26+
// In WSL2's mirrored networking mode, Linux has a loopback0 device as well as lo
27+
// (which owns 127.0.0.1 as normal). Packets sent to 127.0.0.1 from Windows to a
28+
// server listening on Linux's 127.0.0.1 are delivered via loopback0, and
29+
// processed as packets arriving from outside the Linux host (which they are).
30+
//
31+
// So, these packets hit the nat-PREROUTING chain instead of nat-OUTPUT. It would
32+
// normally be impossible for a packet ->127.0.0.1 to arrive from outside the
33+
// host, so the nat-PREROUTING jump to nat-DOCKER has no exception for it. The
34+
// packet is processed by a per-bridge DNAT rule in that chain, so it is
35+
// delivered directly to the container (not via docker-proxy) with source address
36+
// 127.0.0.1, so the container can't respond.
37+
//
38+
// DNAT is normally skipped by RETURN rules in the nat-DOCKER chain for packets
39+
// arriving from any other bridge network. Similarly, this function adds a rule
40+
// to RETURN early for packets delivered via loopback0 with
41+
// destination 127.0.0.0/8.
42+
func mirroredWSL2Workaround(ctx context.Context, table nftables.TableRef) error {
43+
// WSL2 does not (currently) support Windows<->Linux communication via ::1.
44+
if table.Family() != nftables.IPv4 {
45+
return nil
46+
}
47+
return table.Chain(natChain).AppendRule(initialRuleGroup, `iifname "loopback0" ip daddr 127.0.0.0/8 counter return`)
48+
}

0 commit comments

Comments
 (0)