@@ -18,11 +18,11 @@ package exporter
1818import (
1919 "context"
2020 "fmt"
21- "runtime"
2221 "strings"
2322 "testing"
2423 "time"
2524
25+ pbmconnect "github.com/percona/percona-backup-mongodb/pbm/connect"
2626 "github.com/prometheus/client_golang/prometheus"
2727 "github.com/prometheus/client_golang/prometheus/testutil"
2828 "github.com/prometheus/common/promslog"
@@ -73,12 +73,7 @@ func TestPBMCollector(t *testing.T) {
7373}
7474
7575// TestPBMCollectorConnectionLeak verifies that the PBM collector does not leak
76- // MongoDB connections or goroutines across multiple scrape cycles.
77- //
78- // This test monitors both server-side connections and client-side goroutines
79- // to detect leaks. A leak in the PBM SDK's connect.MongoConnectWithOpts()
80- // function (where Ping() failure doesn't call Disconnect()) would cause
81- // goroutine growth even if server connections appear stable.
76+ // MongoDB connections across multiple scrape cycles.
8277//
8378//nolint:paralleltest
8479func TestPBMCollectorConnectionLeak (t * testing.T ) {
@@ -101,13 +96,11 @@ func TestPBMCollectorConnectionLeak(t *testing.T) {
10196 err = client .Ping (ctx , nil )
10297 require .NoError (t , err )
10398
104- // Allow GC and goroutines to stabilize before measuring
105- runtime .GC ()
99+ // Allow connections to stabilize
106100 time .Sleep (100 * time .Millisecond )
107101
108102 initialConns := getServerConnectionCount (ctx , t , client )
109- initialGoroutines := runtime .NumGoroutine ()
110- t .Logf ("Initial state: connections=%d, goroutines=%d" , initialConns , initialGoroutines )
103+ t .Logf ("Initial connections: %d" , initialConns )
111104
112105 c := newPbmCollector (ctx , client , mongoURI , promslog .New (& promslog.Config {}))
113106
@@ -121,138 +114,99 @@ func TestPBMCollectorConnectionLeak(t *testing.T) {
121114 }
122115 }
123116
124- // Allow time for cleanup and GC
117+ // Allow time for cleanup
125118 time .Sleep (1 * time .Second )
126- runtime .GC ()
127- time .Sleep (100 * time .Millisecond )
128119
129120 finalConns := getServerConnectionCount (ctx , t , client )
130- finalGoroutines := runtime .NumGoroutine ()
131121 connGrowth := finalConns - initialConns
132- goroutineGrowth := finalGoroutines - initialGoroutines
133122
134- t .Logf ("Final state: connections=%d (growth=%d), goroutines=%d (growth=%d) over %d scrapes" ,
135- finalConns , connGrowth , finalGoroutines , goroutineGrowth , scrapeCount )
123+ t .Logf ("Final connections: %d (growth=%d over %d scrapes)" , finalConns , connGrowth , scrapeCount )
136124
137- // Check for connection leaks
138125 // With a leak: expect ~3-4 connections per scrape (one per cluster member)
139126 // Without leak: should be near zero growth
140127 assert .Less (t , connGrowth , int64 (scrapeCount ),
141128 "Connection leak detected: server connections grew by %d over %d scrapes" , connGrowth , scrapeCount )
142-
143- // Check for goroutine leaks (more sensitive indicator)
144- // Each leaked mongo.Client creates ~24 goroutines for monitoring
145- // Allow some variance for GC timing, but flag major growth
146- maxGoroutineGrowth := scrapeCount * 5 // Allow 5 goroutines per scrape as buffer
147- assert .Less (t , goroutineGrowth , maxGoroutineGrowth ,
148- "Goroutine leak detected: grew by %d over %d scrapes (max allowed: %d). " +
149- "This may indicate MongoDB clients are not being properly disconnected." ,
150- goroutineGrowth , scrapeCount , maxGoroutineGrowth )
151129}
152130
153- // TestMongoClientLeakOnPingFailure demonstrates the goroutine leak that occurs
154- // when mongo.Connect succeeds but Ping fails, and Disconnect is not called .
131+ // TestPBMSDKConnectionLeakOnPingFailure tests the PBM SDK's connect.MongoConnect
132+ // for connection leaks when Ping fails after Connect succeeds .
155133//
156- // This test reproduces the bug in PBM SDK's connect.MongoConnectWithOpts()
157- // where a failed Ping does not call Disconnect on the client, causing leaked
158- // goroutines and connections.
134+ // The bug in PBM SDK's connect.MongoConnectWithOpts():
135+ // - mongo.Connect() succeeds (establishes connections)
136+ // - Ping() fails (e.g., unreachable server, wrong replica set)
137+ // - Connection is NOT disconnected -> connections leak
159138//
160- // The test uses a ReadPreference with a non-existent tag to force Ping to fail
161- // after the driver has already established monitoring connections.
162- //
163- // This test documents the leak behavior. It passes to show the leak exists.
164- // After the PBM SDK fix, this test can be modified to verify the fix.
139+ // This test uses an unsatisfiable ReadPreference to trigger Ping failure
140+ // after the driver has established connections.
165141//
166142//nolint:paralleltest
167- func TestMongoClientLeakOnPingFailure (t * testing.T ) {
143+ func TestPBMSDKConnectionLeakOnPingFailure (t * testing.T ) {
168144 ctx , cancel := context .WithTimeout (context .Background (), 60 * time .Second )
169145 defer cancel ()
170146
171147 port , err := tu .PortForContainer ("mongo-2-1" )
172148 require .NoError (t , err )
173149
174- // Allow GC and goroutines to stabilize before measuring
175- runtime .GC ()
150+ // Create a monitoring client to check server connection count
151+ monitorURI := fmt .
Sprintf (
"mongodb://admin:[email protected] :%s/" ,
port )
//nolint:gosec 152+ monitorClient , err := mongo .Connect (ctx , options .Client ().ApplyURI (monitorURI ))
153+ require .NoError (t , err )
154+ t .Cleanup (func () {
155+ monitorClient .Disconnect (ctx ) //nolint:errcheck
156+ })
157+ err = monitorClient .Ping (ctx , nil )
158+ require .NoError (t , err )
159+
160+ // Allow connections to stabilize
176161 time .Sleep (100 * time .Millisecond )
177- initialGoroutines := runtime .NumGoroutine ()
162+
163+ initialConns := getServerConnectionCount (ctx , t , monitorClient )
164+ t .Logf ("Initial connections: %d" , initialConns )
178165
179166 const iterations = 5
180- leakedClients := make ([]* mongo.Client , 0 , iterations )
181167
182168 for i := 0 ; i < iterations ; i ++ {
183- // Use a ReadPreference that can't be satisfied - this causes:
184- // 1. Connect to succeed (driver starts monitoring goroutines)
185- // 2. Ping to fail (no server matches the tag selector)
186- opts := options .Client ().
187- ApplyURI (
fmt .
Sprintf (
"mongodb://admin:[email protected] :%s/" ,
port )).
188- SetReadPreference (readpref .Nearest (readpref .WithTags ("dc" , "nonexistent" ))).
189- SetServerSelectionTimeout (300 * time .Millisecond )
190-
191- client , err := mongo .Connect (ctx , opts )
192- require .NoError (t , err , "Connect should succeed" )
193-
194- // Give the driver time to establish monitoring connections
195- time .Sleep (100 * time .Millisecond )
196-
197- // Ping will fail because no server matches the read preference
198- err = client .Ping (ctx , nil )
199- require .Error (t , err , "Ping should fail due to unsatisfiable read preference" )
200- t .Logf ("Iteration %d: Ping failed as expected: %v" , i , err )
201-
202- // Simulate the bug: NOT calling Disconnect after Ping failure
203- // This is what happens in connect.MongoConnectWithOpts when Ping fails
204- leakedClients = append (leakedClients , client )
169+ // Use a ReadPreference that cannot be satisfied to trigger the leak:
170+ // 1. mongo.Connect() succeeds and establishes connections
171+ // 2. Ping() fails because no server matches the tag selector
172+ // 3. Without proper cleanup, connections are leaked
173+ _ , err := pbmconnect .MongoConnect (ctx ,
174+ monitorURI ,
175+ pbmconnect .AppName ("leak-test" ),
176+ func (opts * options.ClientOptions ) error {
177+ opts .SetReadPreference (readpref .Nearest (readpref .WithTags ("dc" , "nonexistent" )))
178+ opts .SetServerSelectionTimeout (300 * time .Millisecond )
179+ return nil
180+ },
181+ )
182+ require .Error (t , err , "MongoConnect should fail due to unsatisfiable ReadPreference" )
183+ t .Logf ("Iteration %d: MongoConnect failed as expected" , i )
205184 }
206185
207- // Allow time for any cleanup and measure
186+ // Allow time for any cleanup
208187 time .Sleep (500 * time .Millisecond )
209- runtime .GC ()
210- time .Sleep (100 * time .Millisecond )
211188
212- goroutinesAfterLeak := runtime . NumGoroutine ( )
213- leakGrowth := goroutinesAfterLeak - initialGoroutines
189+ finalConns := getServerConnectionCount ( ctx , t , monitorClient )
190+ connGrowth := finalConns - initialConns
214191
215- t .Logf ("After %d iterations without Disconnect: goroutines=%d (growth=%d)" ,
216- iterations , goroutinesAfterLeak , leakGrowth )
192+ t .Logf ("Final connections: %d (growth=%d over %d iterations)" , finalConns , connGrowth , iterations )
217193
218- // Each mongo.Client creates ~24 goroutines for monitoring
219- // With 5 iterations , we expect ~120 leaked goroutines
220- expectedLeakPerClient := 20 // Conservative estimate
221- expectedTotalLeak := iterations * expectedLeakPerClient
194+ // Each leaked connection attempt should leave connections open
195+ // If there's a leak , we'd see growth proportional to iterations
196+ // Without leak, growth should be zero or minimal
197+ leakThreshold := int64 ( iterations * 2 ) // Allow some buffer
222198
223- // Document that the leak exists
224- if leakGrowth >= expectedTotalLeak / 2 {
225- t .Logf ("LEAK CONFIRMED: %d goroutines leaked over %d iterations (expected ~%d)" ,
226- leakGrowth , iterations , expectedTotalLeak )
199+ if connGrowth >= leakThreshold {
200+ t .Logf ("LEAK DETECTED: %d connections leaked over %d iterations" , connGrowth , iterations )
201+ t .Logf ("This confirms the bug in PBM SDK's connect.MongoConnectWithOpts()" )
227202 }
228203
229- // Now properly disconnect all clients and verify cleanup works
230- for _ , client := range leakedClients {
231- client .Disconnect (ctx ) //nolint:errcheck
232- }
233-
234- time .Sleep (500 * time .Millisecond )
235- runtime .GC ()
236- time .Sleep (100 * time .Millisecond )
237-
238- goroutinesAfterCleanup := runtime .NumGoroutine ()
239- cleanupGrowth := goroutinesAfterCleanup - initialGoroutines
240-
241- t .Logf ("After Disconnect cleanup: goroutines=%d (growth from initial=%d)" ,
242- goroutinesAfterCleanup , cleanupGrowth )
243-
244- // After proper cleanup, goroutine count should return to near initial
245- // This verifies that Disconnect() properly cleans up resources
246- assert .Less (t , cleanupGrowth , 10 ,
247- "Goroutines should return to near initial after Disconnect, but grew by %d" , cleanupGrowth )
248-
249- // Document the leak - this assertion verifies the bug exists
250- // When NOT calling Disconnect after Ping failure, goroutines leak
251- assert .Greater (t , leakGrowth , expectedTotalLeak / 2 ,
252- "Expected goroutine leak when Disconnect is not called after Ping failure. " +
253- "Growth was %d, expected at least %d. " +
254- "This documents the bug in PBM SDK's connect.MongoConnectWithOpts()" ,
255- leakGrowth , expectedTotalLeak / 2 )
204+ // This test documents the leak. Once the PBM SDK is fixed, flip this assertion:
205+ // assert.Less(t, connGrowth, leakThreshold, "Connection leak in PBM SDK")
206+ assert .GreaterOrEqual (t , connGrowth , leakThreshold ,
207+ "Expected connection leak when Ping fails. Growth was %d, expected at least %d. " +
208+ "If this fails, the PBM SDK may have been fixed!" ,
209+ connGrowth , leakThreshold )
256210}
257211
258212// getServerConnectionCount returns the current number of connections to the MongoDB server.
0 commit comments