
Commit a78d45f

store/postgres: make sure only one graph-node runs database migrations
We had the problem that if multiple graph-nodes started up simultaneously, all of them would try to run database migrations. Only one of them would succeed, and the others would have their migrations rolled back. That caused problems with slow or expensive migrations because it forced the database to do a lot of unnecessary work. With this change, only one graph-node migrates the database; all the others wait for it to finish by blocking on a new __graph_node_global_lock table.
1 parent b04fd9f commit a78d45f
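
As an aside, the locking pattern at the core of this change can be sketched in isolation. The snippet below is a minimal, hypothetical sketch assuming diesel 1.x (the `&PgConnection` style used in the diff); `run_exclusively` is an illustrative name that does not exist in graph-node, but it uses the same calls the change relies on (`batch_execute`, `transaction`, and `sql_query(...).execute(...)`).

// Hypothetical sketch, not code from this commit: serialize a piece of work
// across processes by holding an exclusive lock on a shared table from a
// dedicated connection while the work runs on a second connection.
use diesel::connection::SimpleConnection;
use diesel::pg::PgConnection;
use diesel::prelude::*;

fn run_exclusively<F>(database_url: &str, work: F) -> diesel::QueryResult<()>
where
    F: FnOnce(&PgConnection) -> diesel::QueryResult<()>,
{
    // Two connections: `blocking_conn` only holds the lock; `conn` does the
    // actual work and is free to run its own transactions (as diesel
    // migrations do).
    let blocking_conn = PgConnection::establish(database_url).expect("connect (lock)");
    let conn = PgConnection::establish(database_url).expect("connect (work)");

    // Create the lock table if needed; losing this race to another process
    // is harmless, so the error is ignored.
    let _ = blocking_conn.batch_execute(
        "create table if not exists __graph_node_global_lock(id int)",
    );

    // The exclusive table lock is held until this transaction ends, so every
    // other process that reaches this point blocks until we are finished.
    blocking_conn.transaction(|| {
        diesel::sql_query("lock table __graph_node_global_lock in exclusive mode")
            .execute(&blocking_conn)?;
        work(&conn)
    })
}

The design point the diff's comments make is visible here: the lock lives on a connection that never starts migration transactions itself, which is why two connections are needed at all.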

File tree

1 file changed: +42 -7 lines


store/postgres/src/store.rs

Lines changed: 42 additions & 7 deletions
@@ -27,23 +27,58 @@ use crate::store_events::StoreEventListener;
 
 embed_migrations!("./migrations");
 
-/// Run all initial schema migrations.
+/// Run all schema migrations.
 ///
-/// Creates the "entities" table if it doesn't already exist.
-fn initiate_schema(logger: &Logger, conn: &PgConnection) {
+/// When multiple `graph-node` processes start up at the same time, we ensure
+/// that they do not run migrations in parallel by using `blocking_conn` to
+/// serialize them. The `conn` is used to run the actual migration.
+fn initiate_schema(logger: &Logger, conn: &PgConnection, blocking_conn: &PgConnection) {
     // Collect migration logging output
     let mut output = vec![];
 
-    match embedded_migrations::run_with_output(conn, &mut output) {
+    // Make sure the locking table exists so we have something
+    // to lock. We intentionally ignore errors here, because they are most
+    // likely caused by us losing a race to create the table against another
+    // graph-node. If this truly is an error, we will trip over it when
+    // we try to lock the table and report it to the user
+    if let Err(e) = blocking_conn.batch_execute(
+        "create table if not exists \
+         __graph_node_global_lock(id int)",
+    ) {
+        debug!(
+            logger,
+            "Creating lock table failed, this is most likely harmless";
+            "error" => format!("{:?}", e)
+        );
+    }
+
+    // blocking_conn holds the lock on the migrations table for the duration
+    // of the migration on conn. Since all nodes execute this code, only one
+    // of them can run this code at the same time. We need to use two
+    // connections for this because diesel will run each migration in its
+    // own txn, which makes it impossible to hold a lock across all of them
+    // on that connection
+    info!(
+        logger,
+        "Waiting for other graph-node instances to finish migrating"
+    );
+    let result = blocking_conn.transaction(|| {
+        diesel::sql_query("lock table __graph_node_global_lock in exclusive mode")
+            .execute(blocking_conn)?;
+        info!(logger, "Running migrations");
+        embedded_migrations::run_with_output(conn, &mut output)
+    });
+    info!(logger, "Migrations finished");
+
+    match result {
         Ok(_) => info!(logger, "Completed pending Postgres schema migrations"),
         Err(e) => panic!(
            "Error setting up Postgres database: \
             You may need to drop and recreate your database to work with the \
             latest version of graph-node. Error information: {:?}",
            e
        ),
-    }
-
+    };
     // If there was any migration output, log it now
     if !output.is_empty() {
         debug!(
@@ -112,7 +147,7 @@ impl Store {
        );
 
        // Create the entities table (if necessary)
-        initiate_schema(&logger, &pool.get().unwrap());
+        initiate_schema(&logger, &pool.get().unwrap(), &pool.get().unwrap());
 
        // Listen to entity changes in Postgres
        let mut listener = StoreEventListener::new(&logger, config.postgres_url.clone());
