fix network DB desync after failed connect/disconnect

Luap99 · Luap99 · commit f87ab2b7a606 · 2025-04-04T14:42:20.000+02:00
Networks are stored in two ways in the DB: first, a static network list which holds all the networks with their options for the container; second, the network status, which holds the actual network result from netavark but only when the container is running. If the container is running they must be in sync, and podman inspect has checks to ensure that as well; it errors out if there is a desync between the two. As adding to the DB and doing the actual networking configuration are different parts, it is possible that one worked while the other failed, which triggers the desync. To avoid this, make the network connect/disconnect code more robust against partial failures. When the network calls fail we update the DB again to remove/add the network back. Fixes: https://issues.redhat.com/browse/RHEL-78037 Signed-off-by: Paul Holzinger <pholzing@redhat.com>
diff --git a/libpod/networking_common.go b/libpod/networking_common.go
@@ -378,7 +378,7 @@ func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) erro
 		return err
 	}
 
-	_, nameExists := networks[netName]
+	netOpts, nameExists := networks[netName]
 	if !nameExists && len(networks) > 0 {
 		return fmt.Errorf("container %s is not connected to network %s", nameOrID, netName)
 	}
@@ -393,12 +393,20 @@ func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) erro
 		return err
 	}
 
+	// Since we removed the new network from the container db we must have to add it back during partial setup errors
+	addContainerNetworkToDB := func() {
+		if err := c.runtime.state.NetworkConnect(c, netName, netOpts); err != nil {
+			logrus.Errorf("Failed to add network %s for container %s to DB after failed network disconnect", netName, nameOrID)
+		}
+	}
+
 	c.newNetworkEvent(events.NetworkDisconnect, netName)
 	if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) {
 		return nil
 	}
 
 	if c.state.NetNS == "" {
+		addContainerNetworkToDB()
 		return fmt.Errorf("unable to disconnect %s from %s: %w", nameOrID, netName, define.ErrNoNetwork)
 	}
 
@@ -412,6 +420,7 @@ func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) erro
 	}
 
 	if err := c.runtime.teardownNetworkBackend(c.state.NetNS, opts); err != nil {
+		addContainerNetworkToDB()
 		return err
 	}
 
@@ -524,11 +533,20 @@ func (c *Container) NetworkConnect(nameOrID, netName string, netOpts types.PerNe
 
 		return err
 	}
+
+	// Since we added the new network to the container db we must have to remove it from that during partial setup errors
+	removeContainerNetworkFromDB := func() {
+		if err := c.runtime.state.NetworkDisconnect(c, netName); err != nil {
+			logrus.Errorf("Failed to remove network %s for container %s from DB after failed network connect", netName, nameOrID)
+		}
+	}
+
 	c.newNetworkEvent(events.NetworkConnect, netName)
 	if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) {
 		return nil
 	}
 	if c.state.NetNS == "" {
+		removeContainerNetworkFromDB()
 		return fmt.Errorf("unable to connect %s to %s: %w", nameOrID, netName, define.ErrNoNetwork)
 	}
 
@@ -543,6 +561,7 @@ func (c *Container) NetworkConnect(nameOrID, netName string, netOpts types.PerNe
 
 	results, err := c.runtime.setUpNetwork(c.state.NetNS, opts)
 	if err != nil {
+		removeContainerNetworkFromDB()
 		return err
 	}
 	if len(results) != 1 {
diff --git a/test/system/500-networking.bats b/test/system/500-networking.bats
@@ -583,6 +583,14 @@ load helpers.network
     run_podman network connect $netname $background_cid
     is "$output" "" "(re)connect of container with no open ports"
 
+    # connect a network with an intentional error (bad mac address)
+    run_podman 125 network connect --mac-address 00:00:00:00:00:00 $netname2 $cid
+    assert "$output" =~ "Cannot assign requested address" "mac address error"
+
+    # podman inspect must still work correctly and not error due network desync
+    run_podman inspect --format '{{ range $index, $value := .NetworkSettings.Networks }}{{$index}}{{end}}' $cid
+    assert "$output" == "$netname" "only network1 must be connected"
+
     # connect a second network
     run_podman network connect $netname2 $cid
     is "$output" "" "Output should be empty (no errors)"

Original file line number	Diff line number	Diff line change
`@@ -378,7 +378,7 @@ func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) erro`
`378`	`378`	`return err`
`379`	`379`	`}`
`380`	`380`
`381`		`- _, nameExists := networks[netName]`
	`381`	`+ netOpts, nameExists := networks[netName]`
`382`	`382`	`if !nameExists && len(networks) > 0 {`
`383`	`383`	`return fmt.Errorf("container %s is not connected to network %s", nameOrID, netName)`
`384`	`384`	`}`
`@@ -393,12 +393,20 @@ func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) erro`
`393`	`393`	`return err`
`394`	`394`	`}`
`395`	`395`
	`396`	`+ // Since we removed the new network from the container db we must have to add it back during partial setup errors`
	`397`	`+ addContainerNetworkToDB := func() {`
	`398`	`+ if err := c.runtime.state.NetworkConnect(c, netName, netOpts); err != nil {`
	`399`	`+ logrus.Errorf("Failed to add network %s for container %s to DB after failed network disconnect", netName, nameOrID)`
	`400`	`+ }`
	`401`	`+ }`
	`402`	`+`
`396`	`403`	`c.newNetworkEvent(events.NetworkDisconnect, netName)`
`397`	`404`	`if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) {`
`398`	`405`	`return nil`
`399`	`406`	`}`
`400`	`407`
`401`	`408`	`if c.state.NetNS == "" {`
	`409`	`+ addContainerNetworkToDB()`
`402`	`410`	`return fmt.Errorf("unable to disconnect %s from %s: %w", nameOrID, netName, define.ErrNoNetwork)`
`403`	`411`	`}`
`404`	`412`
`@@ -412,6 +420,7 @@ func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) erro`
`412`	`420`	`}`
`413`	`421`
`414`	`422`	`if err := c.runtime.teardownNetworkBackend(c.state.NetNS, opts); err != nil {`
	`423`	`+ addContainerNetworkToDB()`
`415`	`424`	`return err`
`416`	`425`	`}`
`417`	`426`
`@@ -524,11 +533,20 @@ func (c *Container) NetworkConnect(nameOrID, netName string, netOpts types.PerNe`
`524`	`533`
`525`	`534`	`return err`
`526`	`535`	`}`
	`536`	`+`
	`537`	`+ // Since we added the new network to the container db we must have to remove it from that during partial setup errors`
	`538`	`+ removeContainerNetworkFromDB := func() {`
	`539`	`+ if err := c.runtime.state.NetworkDisconnect(c, netName); err != nil {`
	`540`	`+ logrus.Errorf("Failed to remove network %s for container %s from DB after failed network connect", netName, nameOrID)`
	`541`	`+ }`
	`542`	`+ }`
	`543`	`+`
`527`	`544`	`c.newNetworkEvent(events.NetworkConnect, netName)`
`528`	`545`	`if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) {`
`529`	`546`	`return nil`
`530`	`547`	`}`
`531`	`548`	`if c.state.NetNS == "" {`
	`549`	`+ removeContainerNetworkFromDB()`
`532`	`550`	`return fmt.Errorf("unable to connect %s to %s: %w", nameOrID, netName, define.ErrNoNetwork)`
`533`	`551`	`}`
`534`	`552`
`@@ -543,6 +561,7 @@ func (c *Container) NetworkConnect(nameOrID, netName string, netOpts types.PerNe`
`543`	`561`
`544`	`562`	`results, err := c.runtime.setUpNetwork(c.state.NetNS, opts)`
`545`	`563`	`if err != nil {`
	`564`	`+ removeContainerNetworkFromDB()`
`546`	`565`	`return err`
`547`	`566`	`}`
`548`	`567`	`if len(results) != 1 {`