@@ -17,14 +17,21 @@ package exporter
1717
1818import (
1919 "context"
20+ "fmt"
21+ "runtime"
2022 "strings"
2123 "testing"
2224 "time"
2325
26+ "github.com/prometheus/client_golang/prometheus"
2427 "github.com/prometheus/client_golang/prometheus/testutil"
2528 "github.com/prometheus/common/promslog"
2629 "github.com/stretchr/testify/assert"
2730 "github.com/stretchr/testify/require"
31+ "go.mongodb.org/mongo-driver/bson"
32+ "go.mongodb.org/mongo-driver/mongo"
33+ "go.mongodb.org/mongo-driver/mongo/options"
34+ "go.mongodb.org/mongo-driver/mongo/readpref"
2835
2936 "github.com/percona/mongodb_exporter/internal/tu"
3037)
@@ -64,3 +71,209 @@ func TestPBMCollector(t *testing.T) {
6471 assert .Equal (t , expectedLength , count , "PBM metrics are missing" )
6572 })
6673}
74+
75+ // TestPBMCollectorConnectionLeak verifies that the PBM collector does not leak
76+ // MongoDB connections or goroutines across multiple scrape cycles.
77+ //
78+ // This test monitors both server-side connections and client-side goroutines
79+ // to detect leaks. A leak in the PBM SDK's connect.MongoConnectWithOpts()
80+ // function (where Ping() failure doesn't call Disconnect()) would cause
81+ // goroutine growth even if server connections appear stable.
82+ //
83+ //nolint:paralleltest
84+ func TestPBMCollectorConnectionLeak (t * testing.T ) {
85+ ctx , cancel := context .WithTimeout (context .Background (), 60 * time .Second )
86+ defer cancel ()
87+
88+ port , err := tu .PortForContainer ("mongo-2-1" )
89+ require .NoError (t , err )
90+
91+ mongoURI := fmt .Sprintf (
92+ "mongodb://admin:[email protected] :%s/?connectTimeoutMS=500&serverSelectionTimeoutMS=500" ,
93+ port ,
94+ ) //nolint:gosec
95+
96+ client , err := mongo .Connect (ctx , options .Client ().ApplyURI (mongoURI ))
97+ require .NoError (t , err )
98+ t .Cleanup (func () {
99+ client .Disconnect (ctx ) //nolint:errcheck
100+ })
101+ err = client .Ping (ctx , nil )
102+ require .NoError (t , err )
103+
104+ // Allow GC and goroutines to stabilize before measuring
105+ runtime .GC ()
106+ time .Sleep (100 * time .Millisecond )
107+
108+ initialConns := getServerConnectionCount (ctx , t , client )
109+ initialGoroutines := runtime .NumGoroutine ()
110+ t .Logf ("Initial state: connections=%d, goroutines=%d" , initialConns , initialGoroutines )
111+
112+ c := newPbmCollector (ctx , client , mongoURI , promslog .New (& promslog.Config {}))
113+
114+ // Run multiple collection cycles
115+ const scrapeCount = 10
116+ for i := 0 ; i < scrapeCount ; i ++ {
117+ ch := make (chan prometheus.Metric , 100 )
118+ c .Collect (ch )
119+ close (ch )
120+ for range ch {
121+ }
122+ }
123+
124+ // Allow time for cleanup and GC
125+ time .Sleep (1 * time .Second )
126+ runtime .GC ()
127+ time .Sleep (100 * time .Millisecond )
128+
129+ finalConns := getServerConnectionCount (ctx , t , client )
130+ finalGoroutines := runtime .NumGoroutine ()
131+ connGrowth := finalConns - initialConns
132+ goroutineGrowth := finalGoroutines - initialGoroutines
133+
134+ t .Logf ("Final state: connections=%d (growth=%d), goroutines=%d (growth=%d) over %d scrapes" ,
135+ finalConns , connGrowth , finalGoroutines , goroutineGrowth , scrapeCount )
136+
137+ // Check for connection leaks
138+ // With a leak: expect ~3-4 connections per scrape (one per cluster member)
139+ // Without leak: should be near zero growth
140+ assert .Less (t , connGrowth , int64 (scrapeCount ),
141+ "Connection leak detected: server connections grew by %d over %d scrapes" , connGrowth , scrapeCount )
142+
143+ // Check for goroutine leaks (more sensitive indicator)
144+ // Each leaked mongo.Client creates ~24 goroutines for monitoring
145+ // Allow some variance for GC timing, but flag major growth
146+ maxGoroutineGrowth := scrapeCount * 5 // Allow 5 goroutines per scrape as buffer
147+ assert .Less (t , goroutineGrowth , maxGoroutineGrowth ,
148+ "Goroutine leak detected: grew by %d over %d scrapes (max allowed: %d). " +
149+ "This may indicate MongoDB clients are not being properly disconnected." ,
150+ goroutineGrowth , scrapeCount , maxGoroutineGrowth )
151+ }
152+
153+ // TestMongoClientLeakOnPingFailure demonstrates the goroutine leak that occurs
154+ // when mongo.Connect succeeds but Ping fails, and Disconnect is not called.
155+ //
156+ // This test reproduces the bug in PBM SDK's connect.MongoConnectWithOpts()
157+ // where a failed Ping does not call Disconnect on the client, causing leaked
158+ // goroutines and connections.
159+ //
160+ // The test uses a ReadPreference with a non-existent tag to force Ping to fail
161+ // after the driver has already established monitoring connections.
162+ //
163+ // This test documents the leak behavior. It passes to show the leak exists.
164+ // After the PBM SDK fix, this test can be modified to verify the fix.
165+ //
166+ //nolint:paralleltest
167+ func TestMongoClientLeakOnPingFailure (t * testing.T ) {
168+ ctx , cancel := context .WithTimeout (context .Background (), 60 * time .Second )
169+ defer cancel ()
170+
171+ port , err := tu .PortForContainer ("mongo-2-1" )
172+ require .NoError (t , err )
173+
174+ // Allow GC and goroutines to stabilize before measuring
175+ runtime .GC ()
176+ time .Sleep (100 * time .Millisecond )
177+ initialGoroutines := runtime .NumGoroutine ()
178+
179+ const iterations = 5
180+ leakedClients := make ([]* mongo.Client , 0 , iterations )
181+
182+ for i := 0 ; i < iterations ; i ++ {
183+ // Use a ReadPreference that can't be satisfied - this causes:
184+ // 1. Connect to succeed (driver starts monitoring goroutines)
185+ // 2. Ping to fail (no server matches the tag selector)
186+ opts := options .Client ().
187+ ApplyURI (
fmt .
Sprintf (
"mongodb://admin:[email protected] :%s/" ,
port )).
188+ SetReadPreference (readpref .Nearest (readpref .WithTags ("dc" , "nonexistent" ))).
189+ SetServerSelectionTimeout (300 * time .Millisecond )
190+
191+ client , err := mongo .Connect (ctx , opts )
192+ require .NoError (t , err , "Connect should succeed" )
193+
194+ // Give the driver time to establish monitoring connections
195+ time .Sleep (100 * time .Millisecond )
196+
197+ // Ping will fail because no server matches the read preference
198+ err = client .Ping (ctx , nil )
199+ require .Error (t , err , "Ping should fail due to unsatisfiable read preference" )
200+ t .Logf ("Iteration %d: Ping failed as expected: %v" , i , err )
201+
202+ // Simulate the bug: NOT calling Disconnect after Ping failure
203+ // This is what happens in connect.MongoConnectWithOpts when Ping fails
204+ leakedClients = append (leakedClients , client )
205+ }
206+
207+ // Allow time for any cleanup and measure
208+ time .Sleep (500 * time .Millisecond )
209+ runtime .GC ()
210+ time .Sleep (100 * time .Millisecond )
211+
212+ goroutinesAfterLeak := runtime .NumGoroutine ()
213+ leakGrowth := goroutinesAfterLeak - initialGoroutines
214+
215+ t .Logf ("After %d iterations without Disconnect: goroutines=%d (growth=%d)" ,
216+ iterations , goroutinesAfterLeak , leakGrowth )
217+
218+ // Each mongo.Client creates ~24 goroutines for monitoring
219+ // With 5 iterations, we expect ~120 leaked goroutines
220+ expectedLeakPerClient := 20 // Conservative estimate
221+ expectedTotalLeak := iterations * expectedLeakPerClient
222+
223+ // Document that the leak exists
224+ if leakGrowth >= expectedTotalLeak / 2 {
225+ t .Logf ("LEAK CONFIRMED: %d goroutines leaked over %d iterations (expected ~%d)" ,
226+ leakGrowth , iterations , expectedTotalLeak )
227+ }
228+
229+ // Now properly disconnect all clients and verify cleanup works
230+ for _ , client := range leakedClients {
231+ client .Disconnect (ctx ) //nolint:errcheck
232+ }
233+
234+ time .Sleep (500 * time .Millisecond )
235+ runtime .GC ()
236+ time .Sleep (100 * time .Millisecond )
237+
238+ goroutinesAfterCleanup := runtime .NumGoroutine ()
239+ cleanupGrowth := goroutinesAfterCleanup - initialGoroutines
240+
241+ t .Logf ("After Disconnect cleanup: goroutines=%d (growth from initial=%d)" ,
242+ goroutinesAfterCleanup , cleanupGrowth )
243+
244+ // After proper cleanup, goroutine count should return to near initial
245+ // This verifies that Disconnect() properly cleans up resources
246+ assert .Less (t , cleanupGrowth , 10 ,
247+ "Goroutines should return to near initial after Disconnect, but grew by %d" , cleanupGrowth )
248+
249+ // Document the leak - this assertion verifies the bug exists
250+ // When NOT calling Disconnect after Ping failure, goroutines leak
251+ assert .Greater (t , leakGrowth , expectedTotalLeak / 2 ,
252+ "Expected goroutine leak when Disconnect is not called after Ping failure. " +
253+ "Growth was %d, expected at least %d. " +
254+ "This documents the bug in PBM SDK's connect.MongoConnectWithOpts()" ,
255+ leakGrowth , expectedTotalLeak / 2 )
256+ }
257+
258+ // getServerConnectionCount returns the current number of connections to the MongoDB server.
259+ func getServerConnectionCount (ctx context.Context , t * testing.T , client * mongo.Client ) int64 {
260+ t .Helper ()
261+
262+ var result bson.M
263+ err := client .Database ("admin" ).RunCommand (ctx , bson.D {{Key : "serverStatus" , Value : 1 }}).Decode (& result )
264+ require .NoError (t , err )
265+
266+ connections , ok := result ["connections" ].(bson.M )
267+ require .True (t , ok , "serverStatus should contain connections field" )
268+
269+ current , ok := connections ["current" ].(int32 )
270+ if ok {
271+ return int64 (current )
272+ }
273+
274+ // Try int64 in case MongoDB returns it differently
275+ current64 , ok := connections ["current" ].(int64 )
276+ require .True (t , ok , "connections.current should be numeric" )
277+
278+ return current64
279+ }
0 commit comments