Skip to content

Commit c12534d

Browse files
committed
kubelet, kube-proxy: unmark packets before masquerading them
It seems that if you set the packet mark on a packet and then route that packet through a kernel VXLAN interface, the VXLAN-encapsulated packet will still have the mark from the original packet. Since our NAT rules are based on the packet mark, this was causing us to double-NAT some packets, which then triggered a kernel checksumming bug. But even without the checksum bug, there are reasons to avoid double-NATting, so fix the rules to unmark the packets before masquerading them.
1 parent 2930723 commit c12534d

File tree

5 files changed

+56
-19
lines changed

5 files changed

+56
-19
lines changed

pkg/kubelet/kubelet_network_linux.go

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ func (kl *Kubelet) syncNetworkUtil() {
6262
klog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, KubeMarkDropChain, err)
6363
return
6464
}
65-
if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubeMarkDropChain, "-j", "MARK", "--set-xmark", dropMark); err != nil {
65+
if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubeMarkDropChain, "-j", "MARK", "--or-mark", dropMark); err != nil {
6666
klog.Errorf("Failed to ensure marking rule for %v: %v", KubeMarkDropChain, err)
6767
return
6868
}
@@ -72,7 +72,7 @@ func (kl *Kubelet) syncNetworkUtil() {
7272
}
7373
if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableFilter, KubeFirewallChain,
7474
"-m", "comment", "--comment", "kubernetes firewall for dropping marked packets",
75-
"-m", "mark", "--mark", dropMark,
75+
"-m", "mark", "--mark", fmt.Sprintf("%s/%s", dropMark, dropMark),
7676
"-j", "DROP"); err != nil {
7777
klog.Errorf("Failed to ensure rule to drop packet marked by %v in %v chain %v: %v", KubeMarkDropChain, utiliptables.TableFilter, KubeFirewallChain, err)
7878
return
@@ -112,7 +112,7 @@ func (kl *Kubelet) syncNetworkUtil() {
112112
klog.Errorf("Failed to ensure that %s chain %s exists: %v", utiliptables.TableNAT, KubePostroutingChain, err)
113113
return
114114
}
115-
if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubeMarkMasqChain, "-j", "MARK", "--set-xmark", masqueradeMark); err != nil {
115+
if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubeMarkMasqChain, "-j", "MARK", "--or-mark", masqueradeMark); err != nil {
116116
klog.Errorf("Failed to ensure marking rule for %v: %v", KubeMarkMasqChain, err)
117117
return
118118
}
@@ -121,12 +121,26 @@ func (kl *Kubelet) syncNetworkUtil() {
121121
klog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", utiliptables.TableNAT, utiliptables.ChainPostrouting, KubePostroutingChain, err)
122122
return
123123
}
124-
// Establish the masquerading rule.
124+
125+
// Set up KUBE-POSTROUTING to unmark and masquerade marked packets
125126
// NB: THIS MUST MATCH the corresponding code in the iptables and ipvs
126127
// modes of kube-proxy
128+
if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubePostroutingChain,
129+
"-m", "mark", "!", "--mark", fmt.Sprintf("%s/%s", masqueradeMark, masqueradeMark),
130+
"-j", "RETURN"); err != nil {
131+
klog.Errorf("Failed to ensure filtering rule for %v: %v", KubePostroutingChain, err)
132+
return
133+
}
134+
// Clear the mark to avoid re-masquerading if the packet re-traverses the network stack.
135+
// We know the mark bit is currently set so we can use --xor-mark to clear it (without needing
136+
// to Sprintf another bitmask).
137+
if _, err := kl.iptClient.EnsureRule(utiliptables.Append, utiliptables.TableNAT, KubePostroutingChain,
138+
"-j", "MARK", "--xor-mark", masqueradeMark); err != nil {
139+
klog.Errorf("Failed to ensure unmarking rule for %v: %v", KubePostroutingChain, err)
140+
return
141+
}
127142
masqRule := []string{
128143
"-m", "comment", "--comment", "kubernetes service traffic requiring SNAT",
129-
"-m", "mark", "--mark", masqueradeMark,
130144
"-j", "MASQUERADE",
131145
}
132146
if kl.iptClient.HasRandomFully() {
@@ -141,5 +155,5 @@ func (kl *Kubelet) syncNetworkUtil() {
141155
// getIPTablesMark returns the fwmark given the bit
142156
func getIPTablesMark(bit int) string {
143157
value := 1 << uint(bit)
144-
return fmt.Sprintf("%#08x/%#08x", value, value)
158+
return fmt.Sprintf("%#08x", value)
145159
}

pkg/kubelet/kubelet_network_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@ func TestGetIPTablesMark(t *testing.T) {
3131
}{
3232
{
3333
14,
34-
"0x00004000/0x00004000",
34+
"0x00004000",
3535
},
3636
{
3737
15,
38-
"0x00008000/0x00008000",
38+
"0x00008000",
3939
},
4040
}
4141
for _, tc := range tests {

pkg/proxy/iptables/proxier.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ func NewProxier(ipt utiliptables.Interface,
282282

283283
// Generate the masquerade mark to use for SNAT rules.
284284
masqueradeValue := 1 << uint(masqueradeBit)
285-
masqueradeMark := fmt.Sprintf("%#08x/%#08x", masqueradeValue, masqueradeValue)
285+
masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue)
286286
klog.V(2).Infof("iptables(%s) masquerade mark: %s", ipt.Protocol(), masqueradeMark)
287287

288288
endpointSlicesEnabled := utilfeature.DefaultFeatureGate.Enabled(features.EndpointSliceProxying)
@@ -919,10 +919,20 @@ func (proxier *Proxier) syncProxyRules() {
919919
// this so that it is easier to flush and change, for example if the mark
920920
// value should ever change.
921921
// NB: THIS MUST MATCH the corresponding code in the kubelet
922+
writeLine(proxier.natRules, []string{
923+
"-A", string(kubePostroutingChain),
924+
"-m", "mark", "!", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark),
925+
"-j", "RETURN",
926+
}...)
927+
// Clear the mark to avoid re-masquerading if the packet re-traverses the network stack.
928+
writeLine(proxier.natRules, []string{
929+
"-A", string(kubePostroutingChain),
930+
// XOR proxier.masqueradeMark to unset it
931+
"-j", "MARK", "--xor-mark", proxier.masqueradeMark,
932+
}...)
922933
masqRule := []string{
923934
"-A", string(kubePostroutingChain),
924935
"-m", "comment", "--comment", `"kubernetes service traffic requiring SNAT"`,
925-
"-m", "mark", "--mark", proxier.masqueradeMark,
926936
"-j", "MASQUERADE",
927937
}
928938
if proxier.iptables.HasRandomFully() {
@@ -935,7 +945,7 @@ func (proxier *Proxier) syncProxyRules() {
935945
// value should ever change.
936946
writeLine(proxier.natRules, []string{
937947
"-A", string(KubeMarkMasqChain),
938-
"-j", "MARK", "--set-xmark", proxier.masqueradeMark,
948+
"-j", "MARK", "--or-mark", proxier.masqueradeMark,
939949
}...)
940950

941951
// Accumulate NAT chains to keep.
@@ -1514,7 +1524,7 @@ func (proxier *Proxier) syncProxyRules() {
15141524
writeLine(proxier.filterRules,
15151525
"-A", string(kubeForwardChain),
15161526
"-m", "comment", "--comment", `"kubernetes forwarding rules"`,
1517-
"-m", "mark", "--mark", proxier.masqueradeMark,
1527+
"-m", "mark", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark),
15181528
"-j", "ACCEPT",
15191529
)
15201530

pkg/proxy/iptables/proxier_test.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,7 @@ func NewFakeProxier(ipt utiliptables.Interface, endpointSlicesEnabled bool) *Pro
357357
endpointsMap: make(proxy.EndpointsMap),
358358
endpointsChanges: proxy.NewEndpointChangeTracker(testHostname, newEndpointInfo, nil, nil, endpointSlicesEnabled),
359359
iptables: ipt,
360+
masqueradeMark: "0x4000",
360361
localDetector: detectLocal,
361362
hostname: testHostname,
362363
portsMap: make(map[utilproxy.LocalPort]utilproxy.Closeable),
@@ -2418,7 +2419,7 @@ func TestEndpointSliceE2E(t *testing.T) {
24182419
:KUBE-EXTERNAL-SERVICES - [0:0]
24192420
:KUBE-FORWARD - [0:0]
24202421
-A KUBE-FORWARD -m conntrack --ctstate INVALID -j DROP
2421-
-A KUBE-FORWARD -m comment --comment "kubernetes forwarding rules" -m mark --mark -j ACCEPT
2422+
-A KUBE-FORWARD -m comment --comment "kubernetes forwarding rules" -m mark --mark 0x4000/0x4000 -j ACCEPT
24222423
-A KUBE-FORWARD -m comment --comment "kubernetes forwarding conntrack pod source rule" -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT
24232424
-A KUBE-FORWARD -m comment --comment "kubernetes forwarding conntrack pod destination rule" -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT
24242425
COMMIT
@@ -2431,8 +2432,10 @@ COMMIT
24312432
:KUBE-SEP-3JOIVZTXZZRGORX4 - [0:0]
24322433
:KUBE-SEP-IO5XOSKPAXIFQXAJ - [0:0]
24332434
:KUBE-SEP-XGJFVO3L2O5SRFNT - [0:0]
2434-
-A KUBE-POSTROUTING -m comment --comment "kubernetes service traffic requiring SNAT" -m mark --mark -j MASQUERADE
2435-
-A KUBE-MARK-MASQ -j MARK --set-xmark
2435+
-A KUBE-POSTROUTING -m mark ! --mark 0x4000/0x4000 -j RETURN
2436+
-A KUBE-POSTROUTING -j MARK --xor-mark 0x4000
2437+
-A KUBE-POSTROUTING -m comment --comment "kubernetes service traffic requiring SNAT" -j MASQUERADE
2438+
-A KUBE-MARK-MASQ -j MARK --or-mark 0x4000
24362439
-A KUBE-SERVICES -m comment --comment "ns1/svc1 cluster IP" -m tcp -p tcp -d 172.20.1.1/32 --dport 0 ! -s 10.0.0.0/24 -j KUBE-MARK-MASQ
24372440
-A KUBE-SERVICES -m comment --comment "ns1/svc1 cluster IP" -m tcp -p tcp -d 172.20.1.1/32 --dport 0 -j KUBE-SVC-AQI2S6QIMU7PVVRP
24382441
-A KUBE-SVC-AQI2S6QIMU7PVVRP -m comment --comment ns1/svc1 -m statistic --mode random --probability 0.3333333333 -j KUBE-SEP-3JOIVZTXZZRGORX4

pkg/proxy/ipvs/proxier.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ func NewProxier(ipt utiliptables.Interface,
418418

419419
// Generate the masquerade mark to use for SNAT rules.
420420
masqueradeValue := 1 << uint(masqueradeBit)
421-
masqueradeMark := fmt.Sprintf("%#08x/%#08x", masqueradeValue, masqueradeValue)
421+
masqueradeMark := fmt.Sprintf("%#08x", masqueradeValue)
422422

423423
isIPv6 := utilnet.IsIPv6(nodeIP)
424424

@@ -1758,7 +1758,7 @@ func (proxier *Proxier) writeIptablesRules() {
17581758
writeLine(proxier.filterRules,
17591759
"-A", string(KubeForwardChain),
17601760
"-m", "comment", "--comment", `"kubernetes forwarding rules"`,
1761-
"-m", "mark", "--mark", proxier.masqueradeMark,
1761+
"-m", "mark", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark),
17621762
"-j", "ACCEPT",
17631763
)
17641764

@@ -1842,10 +1842,20 @@ func (proxier *Proxier) createAndLinkeKubeChain() {
18421842
// this so that it is easier to flush and change, for example if the mark
18431843
// value should ever change.
18441844
// NB: THIS MUST MATCH the corresponding code in the kubelet
1845+
writeLine(proxier.natRules, []string{
1846+
"-A", string(kubePostroutingChain),
1847+
"-m", "mark", "!", "--mark", fmt.Sprintf("%s/%s", proxier.masqueradeMark, proxier.masqueradeMark),
1848+
"-j", "RETURN",
1849+
}...)
1850+
// Clear the mark to avoid re-masquerading if the packet re-traverses the network stack.
1851+
writeLine(proxier.natRules, []string{
1852+
"-A", string(kubePostroutingChain),
1853+
// XOR proxier.masqueradeMark to unset it
1854+
"-j", "MARK", "--xor-mark", proxier.masqueradeMark,
1855+
}...)
18451856
masqRule := []string{
18461857
"-A", string(kubePostroutingChain),
18471858
"-m", "comment", "--comment", `"kubernetes service traffic requiring SNAT"`,
1848-
"-m", "mark", "--mark", proxier.masqueradeMark,
18491859
"-j", "MASQUERADE",
18501860
}
18511861
if proxier.iptables.HasRandomFully() {
@@ -1858,7 +1868,7 @@ func (proxier *Proxier) createAndLinkeKubeChain() {
18581868
// value should ever change.
18591869
writeLine(proxier.natRules, []string{
18601870
"-A", string(KubeMarkMasqChain),
1861-
"-j", "MARK", "--set-xmark", proxier.masqueradeMark,
1871+
"-j", "MARK", "--or-mark", proxier.masqueradeMark,
18621872
}...)
18631873
}
18641874

0 commit comments

Comments
 (0)