Skip to content

Commit 4684fd0

Browse files
committed
store: Automatically remove deployments that have been unused for a while
1 parent 9759016 commit 4684fd0

File tree

2 files changed

+82
-3
lines changed

2 files changed

+82
-3
lines changed

docs/environment-variables.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,3 +166,9 @@ those.
166166
decisions. Set to `true` to turn simulation on, defaults to `false`
167167
- `GRAPH_STORE_CONNECTION_TIMEOUT`: How long to wait to connect to a
168168
database before assuming the database is down in ms. Defaults to 5000ms.
169+
- `GRAPH_REMOVE_UNUSED_INTERVAL`: How long to wait before removing an
170+
unused deployment. The system periodically checks and marks deployments
171+
that are no longer used by any subgraph. Once a deployment has been
172+
identified as unused, `graph-node` will wait at least this long before
173+
actually deleting the data (value is in minutes, defaults to 360, i.e. 6
174+
hours).

store/postgres/src/jobs.rs

Lines changed: 76 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,25 @@
11
//! Jobs for database maintenance
22
use std::collections::HashMap;
33
use std::sync::Arc;
4-
use std::time::Duration;
4+
use std::time::{Duration, Instant};
55

66
use async_trait::async_trait;
77
use diesel::{prelude::RunQueryDsl, sql_query, sql_types::Double};
88

9-
use graph::prelude::{error, Logger, MetricsRegistry, StoreError};
9+
use graph::env::env_var;
10+
use graph::prelude::{chrono, error, lazy_static, Logger, MetricsRegistry, StoreError};
1011
use graph::prometheus::Gauge;
1112
use graph::util::jobs::{Job, Runner};
1213

1314
use crate::connection_pool::ConnectionPool;
14-
use crate::{Store, SubgraphStore};
15+
use crate::{unused, Store, SubgraphStore};
16+
17+
lazy_static! {
    /// Minimum time a deployment must have been recorded as unused before
    /// the maintenance job removes it. The value is read in minutes from
    /// `GRAPH_REMOVE_UNUSED_INTERVAL`, with 360 handed to `env_var` as the
    /// default.
    static ref UNUSED_INTERVAL: chrono::Duration = {
        let minutes: u32 = env_var("GRAPH_REMOVE_UNUSED_INTERVAL", 360);
        // `u32` always fits into `i64`, so this conversion is lossless
        chrono::Duration::minutes(i64::from(minutes))
    };
}
1523

1624
pub fn register(
1725
runner: &mut Runner,
@@ -33,6 +41,12 @@ pub fn register(
3341
Arc::new(MirrorPrimary::new(store.subgraph_store())),
3442
Duration::from_secs(15 * 60),
3543
);
44+
45+
// Remove unused deployments every 2 hours
46+
runner.register(
47+
Arc::new(UnusedJob::new(store.subgraph_store())),
48+
Duration::from_secs(2 * 60 * 60),
49+
)
3650
}
3751

3852
/// A job that vacuums `subgraphs.subgraph_deployment`. With a large number
@@ -141,3 +155,62 @@ impl Job for MirrorPrimary {
141155
self.store.mirror_primary_tables(logger).await;
142156
}
143157
}
158+
159+
struct UnusedJob {
160+
store: Arc<SubgraphStore>,
161+
}
162+
163+
impl UnusedJob {
164+
fn new(store: Arc<SubgraphStore>) -> UnusedJob {
165+
UnusedJob { store }
166+
}
167+
}
168+
169+
#[async_trait]
170+
impl Job for UnusedJob {
171+
fn name(&self) -> &str {
172+
"Record and remove unused deployments"
173+
}
174+
175+
/// Record unused deployments and remove ones that were recorded at
176+
/// least `UNUSED_INTERVAL` ago
177+
async fn run(&self, logger: &Logger) {
178+
// Work on removing about 5 minutes
179+
const REMOVAL_DEADLINE: Duration = Duration::from_secs(5 * 60);
180+
181+
let start = Instant::now();
182+
183+
if let Err(e) = self.store.record_unused_deployments() {
184+
error!(logger, "failed to record unused deployments"; "error" => e.to_string());
185+
return;
186+
}
187+
188+
let remove = match self
189+
.store
190+
.list_unused_deployments(unused::Filter::UnusedLongerThan(*UNUSED_INTERVAL))
191+
{
192+
Ok(remove) => remove,
193+
Err(e) => {
194+
error!(logger, "failed to list removable deployments"; "error" => e.to_string());
195+
return;
196+
}
197+
};
198+
199+
for deployment in remove {
200+
match self.store.remove_deployment(deployment.id) {
201+
Ok(()) => { /* ignore */ }
202+
Err(e) => {
203+
error!(logger, "failed to remove unused deployment";
204+
"sgd" => deployment.id.to_string(),
205+
"deployment" => deployment.deployment,
206+
"error" => e.to_string());
207+
}
208+
}
209+
// Stop working on removing after a while to not block other
210+
// jobs for too long
211+
if start.elapsed() > REMOVAL_DEADLINE {
212+
return;
213+
}
214+
}
215+
}
216+
}

0 commit comments

Comments
 (0)