Skip to content

Commit 5f5b5e2

Browse files
authored
Random instance selection (#136)
* wip * revert some' * revert more * poor-man's integration test * remove test * fmt * --workspace * fix build * fix integration test * another stab * log * run after integration * cargo test after integration * revert * revert more * Refactor + clean up * more clean up
1 parent 5948fef commit 5f5b5e2

File tree

3 files changed

+46
-83
lines changed

3 files changed

+46
-83
lines changed

src/client.rs

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -499,7 +499,6 @@ where
499499
// The query router determines where the query is going to go,
500500
// e.g. primary, replica, which shard.
501501
let mut query_router = QueryRouter::new();
502-
let mut round_robin = rand::random();
503502

504503
// Our custom protocol loop.
505504
// We expect the client to either start a transaction with regular queries
@@ -631,12 +630,7 @@ where
631630

632631
// Grab a server from the pool.
633632
let connection = match pool
634-
.get(
635-
query_router.shard(),
636-
query_router.role(),
637-
self.process_id,
638-
round_robin,
639-
)
633+
.get(query_router.shard(), query_router.role(), self.process_id)
640634
.await
641635
{
642636
Ok(conn) => {
@@ -655,8 +649,6 @@ where
655649
let address = connection.1;
656650
let server = &mut *reference;
657651

658-
round_robin += 1;
659-
660652
// Server is assigned to the client in case the client wants to
661653
// cancel a query later.
662654
server.claim(self.process_id, self.secret_key);

src/config.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ pub struct Address {
6363
pub shard: usize,
6464
pub database: String,
6565
pub role: Role,
66-
pub replica_number: usize,
66+
pub instance_index: usize,
6767
pub username: String,
6868
pub poolname: String,
6969
}
@@ -75,7 +75,7 @@ impl Default for Address {
7575
host: String::from("127.0.0.1"),
7676
port: String::from("5432"),
7777
shard: 0,
78-
replica_number: 0,
78+
instance_index: 0,
7979
database: String::from("database"),
8080
role: Role::Replica,
8181
username: String::from("username"),
@@ -92,7 +92,7 @@ impl Address {
9292

9393
Role::Replica => format!(
9494
"{}_shard_{}_replica_{}",
95-
self.poolname, self.shard, self.replica_number
95+
self.poolname, self.shard, self.instance_index
9696
),
9797
}
9898
}

src/pool.rs

Lines changed: 42 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ use chrono::naive::NaiveDateTime;
66
use log::{debug, error, info, warn};
77
use once_cell::sync::Lazy;
88
use parking_lot::{Mutex, RwLock};
9+
use rand::seq::SliceRandom;
10+
use rand::thread_rng;
911
use std::collections::HashMap;
1012
use std::sync::Arc;
1113
use std::time::Instant;
@@ -118,7 +120,7 @@ impl ConnectionPool {
118120
host: server.0.clone(),
119121
port: server.1.to_string(),
120122
role: role,
121-
replica_number,
123+
instance_index: replica_number,
122124
shard: shard_idx.parse::<usize>().unwrap(),
123125
username: user_info.username.clone(),
124126
poolname: pool_name.clone(),
@@ -201,42 +203,30 @@ impl ConnectionPool {
201203
/// the pooler starts up.
202204
async fn validate(&mut self) -> Result<(), Error> {
203205
let mut server_infos = Vec::new();
204-
let stats = self.stats.clone();
205-
206206
for shard in 0..self.shards() {
207-
let mut round_robin = 0;
208-
209-
for _ in 0..self.servers(shard) {
210-
// To keep stats consistent.
211-
let fake_process_id = 0;
212-
213-
let connection = match self.get(shard, None, fake_process_id, round_robin).await {
207+
for index in 0..self.servers(shard) {
208+
let connection = match self.databases[shard][index].get().await {
214209
Ok(conn) => conn,
215210
Err(err) => {
216211
error!("Shard {} down or misconfigured: {:?}", shard, err);
217212
continue;
218213
}
219214
};
220215

221-
let proxy = connection.0;
222-
let address = connection.1;
216+
let proxy = connection;
223217
let server = &*proxy;
224218
let server_info = server.server_info();
225219

226-
stats.client_disconnecting(fake_process_id, address.id);
227-
228220
if server_infos.len() > 0 {
229221
// Compare against the last server checked.
230222
if server_info != server_infos[server_infos.len() - 1] {
231223
warn!(
232224
"{:?} has different server configuration than the last server",
233-
address
225+
proxy.address()
234226
);
235227
}
236228
}
237-
238229
server_infos.push(server_info);
239-
round_robin += 1;
240230
}
241231
}
242232

@@ -254,70 +244,46 @@ impl ConnectionPool {
254244
/// Get a connection from the pool.
255245
pub async fn get(
256246
&self,
257-
shard: usize, // shard number
258-
role: Option<Role>, // primary or replica
259-
process_id: i32, // client id
260-
mut round_robin: usize, // round robin offset
247+
shard: usize, // shard number
248+
role: Option<Role>, // primary or replica
249+
process_id: i32, // client id
261250
) -> Result<(PooledConnection<'_, ServerPool>, Address), Error> {
262251
let now = Instant::now();
263-
let addresses = &self.addresses[shard];
264-
265-
let mut allowed_attempts = match role {
266-
// Primary-specific queries get one attempt, if the primary is down,
267-
// nothing we should do about it I think. It's dangerous to retry
268-
// write queries.
269-
Some(Role::Primary) => 1,
252+
let mut candidates: Vec<Address> = self.addresses[shard]
253+
.clone()
254+
.into_iter()
255+
.filter(|address| address.role == role)
256+
.collect();
270257

271-
// Replicas get to try as many times as there are replicas
272-
// and connections in the pool.
273-
_ => addresses.len(),
274-
};
275-
276-
debug!("Allowed attempts for {:?}: {}", role, allowed_attempts);
277-
278-
let exists = match role {
279-
Some(role) => addresses.iter().filter(|addr| addr.role == role).count() > 0,
280-
None => true,
281-
};
282-
283-
if !exists {
284-
error!("Requested role {:?}, but none are configured", role);
285-
return Err(Error::BadConfig);
286-
}
258+
// Random load balancing
259+
candidates.shuffle(&mut thread_rng());
287260

288261
let healthcheck_timeout = get_config().general.healthcheck_timeout;
289262
let healthcheck_delay = get_config().general.healthcheck_delay as u128;
290263

291-
while allowed_attempts > 0 {
292-
// Round-robin replicas.
293-
round_robin += 1;
294-
295-
let index = round_robin % addresses.len();
296-
let address = &addresses[index];
297-
298-
// Make sure you're getting a primary or a replica
299-
// as per request. If no specific role is requested, the first
300-
// available will be chosen.
301-
if address.role != role {
302-
continue;
303-
}
304-
305-
allowed_attempts -= 1;
264+
while !candidates.is_empty() {
265+
// Get the next candidate
266+
let address = match candidates.pop() {
267+
Some(address) => address,
268+
None => break,
269+
};
306270

307-
// Don't attempt to connect to banned servers.
308-
if self.is_banned(address, shard, role) {
271+
if self.is_banned(&address, address.shard, role) {
309272
continue;
310273
}
311274

312275
// Indicate we're waiting on a server connection from a pool.
313276
self.stats.client_waiting(process_id, address.id);
314277

315278
// Check if we can connect
316-
let mut conn = match self.databases[shard][index].get().await {
279+
let mut conn = match self.databases[address.shard][address.instance_index]
280+
.get()
281+
.await
282+
{
317283
Ok(conn) => conn,
318284
Err(err) => {
319-
error!("Banning replica {}, error: {:?}", index, err);
320-
self.ban(address, shard, process_id);
285+
error!("Banning instance {:?}, error: {:?}", address, err);
286+
self.ban(&address, address.shard, process_id);
321287
self.stats.client_disconnecting(process_id, address.id);
322288
self.stats
323289
.checkout_time(now.elapsed().as_micros(), process_id, address.id);
@@ -359,29 +325,34 @@ impl ConnectionPool {
359325
}
360326

361327
// Health check failed.
362-
Err(_) => {
363-
error!("Banning replica {} because of failed health check", index);
328+
Err(err) => {
329+
error!(
330+
"Banning instance {:?} because of failed health check, {:?}",
331+
address, err
332+
);
364333

365334
// Don't leave a bad connection in the pool.
366335
server.mark_bad();
367336

368-
self.ban(address, shard, process_id);
337+
self.ban(&address, address.shard, process_id);
369338
continue;
370339
}
371340
},
372341

373342
// Health check timed out.
374-
Err(_) => {
375-
error!("Banning replica {} because of health check timeout", index);
343+
Err(err) => {
344+
error!(
345+
"Banning instance {:?} because of health check timeout, {:?}",
346+
address, err
347+
);
376348
// Don't leave a bad connection in the pool.
377349
server.mark_bad();
378350

379-
self.ban(address, shard, process_id);
351+
self.ban(&address, address.shard, process_id);
380352
continue;
381353
}
382354
}
383355
}
384-
385356
return Err(Error::AllServersDown);
386357
}
387358

0 commit comments

Comments
 (0)