Skip to content

Commit 4e6cc2f

Browse files
authored
fix: wait for vnet ns to create and ensure veths are inside namespace instead of assuming (#2341)
* Make initial changes to enable forwarding Reuse existing code to enable ip forwarding Add forwarding message Add transparent vlan config to dropgz Add enable ip forward to each network namespace for redundancy * Add general run with retries function * Migrate existing code to use retry function * Add retries for checking of vnet and container veth creation * Add preliminary repair item fixes * Clarify log message * Fix unit tests, move ensure clean populate vm to add endpoints, remove ip forwarding fix as it is in other branch * Move setting link netns and confirmation to function * Add tests for setLinkNetNSAndConfirm * Address linter issues * Add vm ns to vnet namespace log statement * Fix after rebasing in swift refactoring changes from master * Address feedback and check error message * Address feedback * Address linter issue * Address log feedback * Address feedback by asserting any error is an interface not found error
1 parent b184a75 commit 4e6cc2f

File tree

4 files changed

+410
-55
lines changed

4 files changed

+410
-55
lines changed

network/endpoint_snatroute_linux.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ func AddSnatEndpointRules(snatClient *snat.Client, hostToNC, ncToHost bool, nl n
3737
return errors.Wrap(err, "failed to block ip addresses on snat bridge")
3838
}
3939
nuc := networkutils.NewNetworkUtils(nl, plc)
40-
if err := nuc.EnableIPForwarding(snat.SnatBridgeName); err != nil {
40+
if err := nuc.EnableIPForwarding(); err != nil {
4141
return errors.Wrap(err, "failed to enable ip forwarding")
4242
}
4343

network/networkutils/networkutils_linux.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,8 +200,8 @@ func BlockIPAddresses(bridgeName, action string) error {
200200
return nil
201201
}
202202

203-
// This fucntion enables ip forwarding in VM and allow forwarding packets from the interface
204-
func (nu NetworkUtils) EnableIPForwarding(ifName string) error {
203+
// This function enables ip forwarding in VM and allow forwarding packets from the interface
204+
func (nu NetworkUtils) EnableIPForwarding() error {
205205
// Enable ip forwading on linux vm.
206206
// sysctl -w net.ipv4.ip_forward=1
207207
cmd := fmt.Sprint(enableIPForwardCmd)

network/transparent_vlan_endpointclient_linux.go

Lines changed: 129 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,11 @@ const (
2626
tunnelingTable = 2 // Packets not entering on the vlan interface go to this routing table
2727
tunnelingMark = 333 // The packets that are to tunnel will be marked with this number
2828
DisableRPFilterCmd = "sysctl -w net.ipv4.conf.all.rp_filter=0" // Command to disable the rp filter for tunneling
29+
numRetries = 5
30+
sleepInMs = 100
2931
)
3032

31-
var errNamespaceCreation = fmt.Errorf("Network namespace creation error")
32-
33-
var (
34-
numRetries = 5
35-
sleepInMs = 100
36-
)
33+
var errNamespaceCreation = fmt.Errorf("network namespace creation error")
3734

3835
type netnsClient interface {
3936
Get() (fileDescriptor int, err error)
@@ -113,8 +110,10 @@ func NewTransparentVlanEndpointClient(
113110
// Adds interfaces to the vnet (created if not existing) and vm namespace
114111
func (client *TransparentVlanEndpointClient) AddEndpoints(epInfo *EndpointInfo) error {
115112
// VM Namespace
116-
err := client.PopulateVM(epInfo)
117-
if err != nil {
113+
if err := client.ensureCleanPopulateVM(); err != nil {
114+
return errors.Wrap(err, "failed to ensure both network namespace and vlan veth were present or both absent")
115+
}
116+
if err := client.PopulateVM(epInfo); err != nil {
118117
return err
119118
}
120119
if err := client.AddSnatEndpoint(); err != nil {
@@ -126,32 +125,80 @@ func (client *TransparentVlanEndpointClient) AddEndpoints(epInfo *EndpointInfo)
126125
})
127126
}
128127

129-
func (client *TransparentVlanEndpointClient) createNetworkNamespace(vmNS, numRetries int) error {
130-
var isNamespaceUnique bool
131-
132-
for i := 0; i < numRetries; i++ {
133-
vnetNS, err := client.netnsClient.NewNamed(client.vnetNSName)
134-
if err != nil {
135-
return errors.Wrap(err, "failed to create vnet ns")
136-
}
137-
logger.Info("Vnet Namespace created", zap.String("vnetNS", client.netnsClient.NamespaceUniqueID(vnetNS)))
138-
if !client.netnsClient.IsNamespaceEqual(vnetNS, vmNS) {
139-
client.vnetNSFileDescriptor = vnetNS
140-
isNamespaceUnique = true
141-
break
128+
// Called from AddEndpoints, Namespace: VM and Vnet
129+
func (client *TransparentVlanEndpointClient) ensureCleanPopulateVM() error {
130+
// Clean up vlan interface in the VM namespace and ensure the network namespace (if it exists) has a vlan interface
131+
logger.Info("Checking if NS and vlan veth exists...")
132+
var existingErr error
133+
client.vnetNSFileDescriptor, existingErr = client.netnsClient.GetFromName(client.vnetNSName)
134+
if existingErr == nil {
135+
// The namespace exists
136+
vlanIfErr := ExecuteInNS(client.nsClient, client.vnetNSName, func() error {
137+
// Ensure the vlan interface exists in the namespace
138+
_, err := client.netioshim.GetNetworkInterfaceByName(client.vlanIfName)
139+
return errors.Wrap(err, "failed to get vlan interface in namespace")
140+
})
141+
if vlanIfErr != nil {
142+
// Assume any error is the vlan interface not found
143+
logger.Info("vlan interface doesn't exist even though network namespace exists, deleting network namespace...", zap.String("message", vlanIfErr.Error()))
144+
delErr := client.netnsClient.DeleteNamed(client.vnetNSName)
145+
if delErr != nil {
146+
return errors.Wrap(delErr, "failed to cleanup/delete ns after noticing vlan veth does not exist")
147+
}
142148
}
143-
logger.Info("Vnet Namespace is the same as VM namespace. Deleting and retrying...")
144-
delErr := client.netnsClient.DeleteNamed(client.vnetNSName)
145-
if delErr != nil {
146-
logger.Error("failed to cleanup/delete ns after failing to create vlan veth", zap.Any("error:", delErr.Error()))
149+
}
150+
// Delete the vlan interface in the VM namespace if it exists
151+
_, vlanIfInVMErr := client.netioshim.GetNetworkInterfaceByName(client.vlanIfName)
152+
if vlanIfInVMErr == nil {
153+
// The vlan interface exists in the VM ns because it failed to move into the network ns previously and needs to be cleaned up
154+
logger.Info("vlan interface exists on the VM namespace, deleting", zap.String("vlanIfName", client.vlanIfName))
155+
if delErr := client.netlink.DeleteLink(client.vlanIfName); delErr != nil {
156+
return errors.Wrap(delErr, "failed to clean up vlan interface")
147157
}
148-
time.Sleep(time.Duration(sleepInMs) * time.Millisecond)
149158
}
159+
return nil
160+
}
161+
162+
// Called from PopulateVM, Namespace: VM
163+
func (client *TransparentVlanEndpointClient) createNetworkNamespace(vmNS int) error {
164+
// If this call succeeds, the namespace is set to the new namespace
165+
vnetNS, err := client.netnsClient.NewNamed(client.vnetNSName)
166+
if err != nil {
167+
return errors.Wrap(err, "failed to create vnet ns")
168+
}
169+
logger.Info("Vnet Namespace created", zap.String("vnetNS", client.netnsClient.NamespaceUniqueID(vnetNS)), zap.String("vmNS", client.netnsClient.NamespaceUniqueID(vmNS)))
170+
if !client.netnsClient.IsNamespaceEqual(vnetNS, vmNS) {
171+
client.vnetNSFileDescriptor = vnetNS
172+
return nil
173+
}
174+
// the vnet and vm namespace are the same by this point
175+
logger.Info("Vnet Namespace is the same as VM namespace. Deleting...")
176+
delErr := client.netnsClient.DeleteNamed(client.vnetNSName)
177+
if delErr != nil {
178+
logger.Error("failed to cleanup/delete ns after noticing vnet ns is the same as vm ns", zap.Any("error:", delErr.Error()))
179+
}
180+
return errors.Wrap(errNamespaceCreation, "vnet and vm namespace are the same")
181+
}
150182

151-
if !isNamespaceUnique {
152-
return errors.Wrap(errNamespaceCreation, "vnet and vm namespace are same")
183+
// Called from PopulateVM, Namespace: VM and namespace represented by fd
184+
func (client *TransparentVlanEndpointClient) setLinkNetNSAndConfirm(name string, fd uintptr) error {
185+
logger.Info("Move link to NS", zap.String("ifName", name), zap.Any("NSFileDescriptor", fd))
186+
err := client.netlink.SetLinkNetNs(name, fd)
187+
if err != nil {
188+
return errors.Wrapf(err, "failed to set %v inside namespace %v", name, fd)
153189
}
154190

191+
// confirm veth was moved successfully
192+
err = RunWithRetries(func() error {
193+
// retry checking in the namespace if the interface is not detected
194+
return ExecuteInNS(client.nsClient, client.vnetNSName, func() error {
195+
_, ifDetectedErr := client.netioshim.GetNetworkInterfaceByName(client.vlanIfName)
196+
return errors.Wrap(ifDetectedErr, "failed to get vlan veth in namespace")
197+
})
198+
}, numRetries, sleepInMs)
199+
if err != nil {
200+
return errors.Wrapf(err, "failed to detect %v inside namespace %v", name, fd)
201+
}
155202
return nil
156203
}
157204

@@ -169,10 +216,13 @@ func (client *TransparentVlanEndpointClient) PopulateVM(epInfo *EndpointInfo) er
169216
// This will also (we assume) mean the vlan veth does not exist
170217
if existingErr != nil {
171218
// We assume the only possible error is that the namespace doesn't exist
172-
logger.Info("No existing NS detected. Creating the vnet namespace and switching to it")
219+
logger.Info("No existing NS detected. Creating the vnet namespace and switching to it", zap.String("message", existingErr.Error()))
173220

174-
if err = client.createNetworkNamespace(vmNS, numRetries); err != nil {
175-
return errors.Wrap(err, "")
221+
err = RunWithRetries(func() error {
222+
return client.createNetworkNamespace(vmNS)
223+
}, numRetries, sleepInMs)
224+
if err != nil {
225+
return errors.Wrap(err, "failed to create network namespace")
176226
}
177227

178228
deleteNSIfNotNilErr := client.netnsClient.Set(vmNS)
@@ -214,6 +264,8 @@ func (client *TransparentVlanEndpointClient) PopulateVM(epInfo *EndpointInfo) er
214264
// Any failure to add the link should error (auto delete NS)
215265
return errors.Wrap(deleteNSIfNotNilErr, "failed to create vlan vnet link after making new ns")
216266
}
267+
// Prevent accidentally deleting NS and vlan interface since we ignore this error
268+
deleteNSIfNotNilErr = nil
217269
}
218270
defer func() {
219271
if deleteNSIfNotNilErr != nil {
@@ -225,12 +277,11 @@ func (client *TransparentVlanEndpointClient) PopulateVM(epInfo *EndpointInfo) er
225277
}()
226278

227279
// sometimes there is slight delay in interface creation. check if it exists
228-
for i := 0; i < numRetries; i++ {
229-
if _, err = client.netioshim.GetNetworkInterfaceByName(client.vlanIfName); err == nil {
230-
break
231-
}
232-
time.Sleep(time.Duration(sleepInMs) * time.Millisecond)
233-
}
280+
err = RunWithRetries(func() error {
281+
_, err = client.netioshim.GetNetworkInterfaceByName(client.vlanIfName)
282+
return errors.Wrap(err, "failed to get vlan veth")
283+
}, numRetries, sleepInMs)
284+
234285
if err != nil {
235286
deleteNSIfNotNilErr = errors.Wrapf(err, "failed to get vlan veth interface:%s", client.vlanIfName)
236287
return deleteNSIfNotNilErr
@@ -242,12 +293,13 @@ func (client *TransparentVlanEndpointClient) PopulateVM(epInfo *EndpointInfo) er
242293
}
243294
// vlan veth was created successfully, so move the vlan veth you created
244295
logger.Info("Move vlan link to vnet NS", zap.String("vlanIfName", client.vlanIfName), zap.Any("vnetNSFileDescriptor", uintptr(client.vnetNSFileDescriptor)))
245-
deleteNSIfNotNilErr = client.netlink.SetLinkNetNs(client.vlanIfName, uintptr(client.vnetNSFileDescriptor))
296+
deleteNSIfNotNilErr = client.setLinkNetNSAndConfirm(client.vlanIfName, uintptr(client.vnetNSFileDescriptor))
246297
if deleteNSIfNotNilErr != nil {
247-
return errors.Wrap(deleteNSIfNotNilErr, "deleting vlan veth in vm ns due to addendpoint failure")
298+
return errors.Wrap(deleteNSIfNotNilErr, "failed to move or detect vlan veth inside vnet namespace")
248299
}
300+
logger.Info("Moving vlan veth into namespace confirmed")
249301
} else {
250-
logger.Info("Existing NS detected. Assuming exists too", zap.String("vnetNSName", client.vnetNSName), zap.String("vlanIfName", client.vlanIfName))
302+
logger.Info("Existing NS detected. vlan interface should exist or namespace would've been deleted.", zap.String("vnetNSName", client.vnetNSName), zap.String("vlanIfName", client.vlanIfName))
251303
}
252304

253305
// Get the default constant host veth mac
@@ -260,6 +312,27 @@ func (client *TransparentVlanEndpointClient) PopulateVM(epInfo *EndpointInfo) er
260312
if err = client.netUtilsClient.CreateEndpoint(client.vnetVethName, client.containerVethName, mac); err != nil {
261313
return errors.Wrap(err, "failed to create veth pair")
262314
}
315+
316+
// Ensure vnet veth is created, as there may be a slight delay
317+
err = RunWithRetries(func() error {
318+
_, getErr := client.netioshim.GetNetworkInterfaceByName(client.vnetVethName)
319+
return errors.Wrap(getErr, "failed to get vnet veth")
320+
}, numRetries, sleepInMs)
321+
if err != nil {
322+
return errors.Wrap(err, "vnet veth does not exist")
323+
}
324+
325+
// Ensure container veth is created, as there may be a slight delay
326+
var containerIf *net.Interface
327+
err = RunWithRetries(func() error {
328+
var getErr error
329+
containerIf, getErr = client.netioshim.GetNetworkInterfaceByName(client.containerVethName)
330+
return errors.Wrap(getErr, "failed to get container veth")
331+
}, numRetries, sleepInMs)
332+
if err != nil {
333+
return errors.Wrap(err, "container veth does not exist")
334+
}
335+
263336
// Disable RA for veth pair, and delete if any failure
264337
if err = client.netUtilsClient.DisableRAForInterface(client.vnetVethName); err != nil {
265338
if delErr := client.netlink.DeleteLink(client.vnetVethName); delErr != nil {
@@ -274,16 +347,11 @@ func (client *TransparentVlanEndpointClient) PopulateVM(epInfo *EndpointInfo) er
274347
return errors.Wrap(err, "failed to disable RA on container veth, deleting")
275348
}
276349

277-
if err = client.netlink.SetLinkNetNs(client.vnetVethName, uintptr(client.vnetNSFileDescriptor)); err != nil {
350+
if err = client.setLinkNetNSAndConfirm(client.vnetVethName, uintptr(client.vnetNSFileDescriptor)); err != nil {
278351
if delErr := client.netlink.DeleteLink(client.vnetVethName); delErr != nil {
279352
logger.Error("Deleting vnet veth failed on addendpoint failure with", zap.Error(delErr))
280353
}
281-
return errors.Wrap(err, "failed to move vnetVethName into vnet ns, deleting")
282-
}
283-
284-
containerIf, err := client.netioshim.GetNetworkInterfaceByName(client.containerVethName)
285-
if err != nil {
286-
return errors.Wrap(err, "container veth does not exist")
354+
return errors.Wrap(err, "failed to move or detect vnetVethName in vnet ns, deleting")
287355
}
288356
client.containerMac = containerIf.HardwareAddr
289357
return nil
@@ -492,7 +560,7 @@ func (client *TransparentVlanEndpointClient) GetVnetRoutes(ipAddresses []net.IPN
492560

493561
// Helper that creates routing rules for the current NS which direct packets
494562
// to the virtual gateway ip on linkToName device interface
495-
// Route 1: 169.254.1.1 dev <linkToName>
563+
// Route 1: 169.254.2.1 dev <linkToName>
496564
// Route 2: default via 169.254.2.1 dev <linkToName>
497565
func (client *TransparentVlanEndpointClient) addDefaultRoutes(linkToName string, table int) error {
498566
// Add route for virtualgwip (ip route add 169.254.2.1/32 dev eth0)
@@ -637,3 +705,16 @@ func ExecuteInNS(nsc NamespaceClientInterface, nsName string, f func() error) er
637705
}()
638706
return f()
639707
}
708+
709+
func RunWithRetries(f func() error, maxRuns, sleepMs int) error {
710+
var err error
711+
for i := 0; i < maxRuns; i++ {
712+
err = f()
713+
if err == nil {
714+
break
715+
}
716+
logger.Info("Retrying after delay...", zap.String("error", err.Error()), zap.Int("retry", i), zap.Int("sleepMs", sleepMs))
717+
time.Sleep(time.Duration(sleepMs) * time.Millisecond)
718+
}
719+
return err
720+
}

0 commit comments

Comments
 (0)