Commit f271e0c

chore: Optimized DB queries (#580)
2 parents 1255d4d + 9277f69 commit f271e0c

File tree

3 files changed: +174 -52 lines changed


docker-compose.yaml

Lines changed: 2 additions & 1 deletion
@@ -2,6 +2,7 @@ version: '3.1'
 services:
   db:
     build:
+      context: .
       dockerfile: postgres.Dockerfile
     restart: "always"
     environment:
@@ -35,6 +36,6 @@ services:
       - ./extractors.yaml:/opt/tycho-indexer/extractors.yaml
       - ./substreams/:/opt/tycho-indexer/substreams/
     entrypoint: [ "/usr/wait-for-postgres.sh", "db" ]
-    command: [ "/opt/tycho-indexer/tycho-indexer", "--endpoint", "https://mainnet.eth.streamingfast.io:443", "index"]
+    command: [ "/opt/tycho-indexer/tycho-indexer", "--endpoint", "https://mainnet.eth.streamingfast.io:443", "index" ]
 volumes:
   postgres_data:

tycho-storage/README.md

Lines changed: 37 additions & 13 deletions
@@ -1,35 +1,47 @@
 # Tycho Storage
 
-Tycho is an indexer designed to process and store data, necessitating the saving of state. 
+Tycho is an indexer designed to process and store data, necessitating the saving of state.
 
 This tycho-storage crate handles all data storage and communication with the database.
 
 ## Architecture
 
 ### Database
 
-Tycho currently uses PostgresSQL as its storage backend. The full schema can be found in [schema.rs](./src/postgres/schema.rs).
+Tycho currently uses PostgreSQL as its storage backend. The full schema can be found
+in [schema.rs](./src/postgres/schema.rs).
 
 Below is the Entity Relationship (ER) diagram illustrating the tables used for this project:
 
 [![Entity Relation Diagram](../assets/tycho_db_er.png)](https://drive.google.com/file/d/1IQvdsfwRtg-AqtLuJjyGM2s6bqJGuciK/view?usp=sharing)
 
 ### Gateways
 
-Database interactions are managed through multiple gateways, including [cache](./src/postgres/cache.rs), [chain](./src/postgres/chain.rs), [contract](./src/postgres/contract.rs), [extraction_state](./src/postgres/extraction_state.rs) and [protocol](./src/postgres/protocol.rs).
+Database interactions are managed through multiple gateways,
+including [cache](./src/postgres/cache.rs), [chain](./src/postgres/chain.rs), [contract](./src/postgres/contract.rs), [extraction_state](./src/postgres/extraction_state.rs)
+and [protocol](./src/postgres/protocol.rs).
 
-The CachedGateway serves as the main entry point for all database communications. It is designed to efficiently manage and execute database operations by utilizing an in-memory cache and ensuring data consistency through transactional writes. Writes are batched and deduplicated to improve performance and reduce load on the database.
+The CachedGateway serves as the main entry point for all database communications. It is designed to efficiently manage
+and execute database operations by utilizing an in-memory cache and ensuring data consistency through transactional
+writes. Writes are batched and deduplicated to improve performance and reduce load on the database.
 
 ### Versioning
 
-Tycho employs a robust versioning system to track historical data within the database. The [versioning](./src/postgres/versioning.rs) module provides tools to handle historical data, ensuring that each version of an entity is tracked and stored appropriately.
+Tycho employs a robust versioning system to track historical data within the database.
+The [versioning](./src/postgres/versioning.rs) module provides tools to handle historical data, ensuring that each
+version of an entity is tracked and stored appropriately.
 
 #### Key Concepts
-- VersionedRow: A trait for structs that can be inserted into a versioned table. It automates the valid_to attribute management, facilitating batch insertions.
 
-- DeltaVersionedRow: Similar to VersionedRow, but also handles setting previous_value attributes, allowing for more complex versioning scenarios.
+- VersionedRow: A trait for structs that can be inserted into a versioned table. It automates the valid_to attribute
+  management, facilitating batch insertions.
 
-- StoredVersionedRow: A trait that enables setting the end version on currently active rows in the database based on new incoming entries. It's essential for ensuring that historical data is correctly marked as outdated when new versions are inserted.
+- DeltaVersionedRow: Similar to VersionedRow, but also handles setting previous_value attributes, allowing for more
+  complex versioning scenarios.
+
+- StoredVersionedRow: A trait that enables setting the end version on currently active rows in the database based on new
+  incoming entries. It's essential for ensuring that historical data is correctly marked as outdated when new versions
+  are inserted.
 
 # Development
 
@@ -58,8 +70,9 @@ docker-compose up -d db
 ```
 
 4. Set Environment Variables:
+
 ```
-export DATABASE_URL=postgres://postgres:mypassword@localhost:5432/tycho_indexer_0
+export DATABASE_URL=postgres://postgres:mypassword@localhost:5431/tycho_indexer_0
 export ETH_RPC_URL="url-here"
 
 ```
@@ -77,19 +90,25 @@ We use [pgFormatter](https://github.com/darold/pgFormatter) to keep SQL files co
 
 ### Setup pgFormatter with RustRover
 
 1. Ensure you have pgFormatter installed:
+
 ```bash
 brew install pgformatter
 ```
+
 2. In RustRover, search for "External Tools" and add a new tool using the "+" button.
 3. Get the path of pgFormatter installation:
+
 ```bash
 which pg_format
 ```
+
 4. Set the "Program" field to this path.
-5. Set the "Arguments" field to:
+5. Set the "Arguments" field to:
+
 ```bash
 --no-space-function -i $FilePath$
 ```
+
 6. Leave working directory empty.
 7. Save the tool under "pgFormat" and add a shortcut if desired.
 
@@ -108,7 +127,8 @@ If you have to change the database schema, please make sure the down migration i
 diesel migration redo --migration-dir ./tycho-storage/migrations
 ```
 
-If the schema.rs file does not automatically update after you've run a migration with table changes, you can trigger the update manually by executing:
+If the schema.rs file does not automatically update after you've run a migration with table changes, you can trigger the
+update manually by executing:
 
 ```bash
 diesel print-schema --config-file ./tycho-storage/diesel.toml > ./tycho-storage/src/postgres/schema.rs
@@ -118,6 +138,10 @@ diesel print-schema --config-file ./tycho-storage/diesel.toml > ./tycho-storage/
 
 Currently Tycho exposes a single special [test-group](https://nexte.st/book/test-groups.html) via nextest:
 
-1. `test(serial-db)`: These are tests against the database that need to commit data. To not intefere with other test that require a empty db but do not commit, we run these tests separately. Most of these tests use the `run_against_db` test harness. Test within that group are run sequentially, the remaining tests run in parallel. To add a test to this group simply ensure its name or its test package name includes the string `serial_db`.
+1. `test(serial-db)`: These are tests against the database that need to commit data. To not interfere with other tests
+   that require an empty db but do not commit, we run these tests separately. Most of these tests use
+   the `run_against_db` test harness. Tests within that group are run sequentially; the remaining tests run in parallel.
+   To add a test to this group simply ensure its name or its test package name includes the string `serial_db`.
 
-If your test does not require committing to the database and has no special resource requirements, create the test as usual.
+If your test does not require committing to the database and has no special resource requirements, create the test as
+usual.
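The versioning traits listed under Key Concepts can be illustrated with a minimal, self-contained sketch. This is not the actual tycho-storage API: `AttributeRow` and `apply_versioning` are hypothetical stand-ins showing the `VersionedRow` idea of automated `valid_to` management, where each row in a batch is closed at the `valid_from` of its successor and the newest row stays open-ended (here modeled with `i64::MAX` as a stand-in for `MAX_TS`):

```rust
// Sketch only: illustrative stand-in for MAX_TS ("still valid").
const MAX_TS: i64 = i64::MAX;

#[derive(Debug)]
struct AttributeRow {
    value: String,    // the versioned payload
    valid_from: i64,  // version start
    valid_to: i64,    // version end, filled in by apply_versioning
}

/// Close each row at the start of its successor; the last row remains open.
fn apply_versioning(rows: &mut [AttributeRow]) {
    let n = rows.len();
    for i in 0..n {
        let next_from = if i + 1 < n { rows[i + 1].valid_from } else { MAX_TS };
        rows[i].valid_to = next_from;
    }
}

fn main() {
    let mut rows = vec![
        AttributeRow { value: "a".into(), valid_from: 10, valid_to: 0 },
        AttributeRow { value: "b".into(), valid_from: 20, valid_to: 0 },
    ];
    apply_versioning(&mut rows);
    assert_eq!(rows[0].valid_to, 20);
    assert_eq!(rows[1].valid_to, MAX_TS);
    println!("{:?}", rows);
}
```

A `StoredVersionedRow`-style implementation would additionally close out rows already persisted in the database; this sketch only covers the in-batch case.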

tycho-storage/src/postgres/orm.rs

Lines changed: 135 additions & 38 deletions
@@ -460,29 +460,59 @@ impl PartitionedVersionedRow for NewComponentBalance {
             .zip(token_ids.iter())
             .collect::<HashSet<_>>();
 
-        // PERF: The removal of the filter 'valid_to = MAX_TS' means we now search in archived
-        // tables as well. A possible optimisation would be to add the valid_to filter back
-        // and then use a second query for balances still missing that will access the
-        // archived tables. Therefore, performance is not impacted in the common case
-        // (balances are rarely deleted).
-        Ok(component_balance::table
+        let mut results: Vec<ComponentBalance> = component_balance::table
             .select(ComponentBalance::as_select())
+            .into_boxed()
             .filter(
                 component_balance::protocol_component_id
                     .eq_any(&component_ids)
-                    .and(component_balance::token_id.eq_any(&token_ids)),
+                    .and(component_balance::token_id.eq_any(&token_ids))
+                    .and(component_balance::valid_to.eq(MAX_TS)),
             )
-            .distinct_on((component_balance::protocol_component_id, component_balance::token_id))
-            .order_by((
-                component_balance::protocol_component_id,
-                component_balance::token_id,
-                component_balance::valid_to.desc(),
-            ))
             .get_results(conn)
             .await
-            .map_err(PostgresError::from)?
+            .map_err(PostgresError::from)?;
+
+        let found_ids: HashSet<_> = results
+            .iter()
+            .map(|cb| (&cb.protocol_component_id, &cb.token_id))
+            .collect();
+
+        let missing_ids: Vec<_> = tuple_ids
+            .clone()
             .into_iter()
-            .filter(|cs| tuple_ids.contains(&(&cs.protocol_component_id, &cs.token_id)))
+            .filter(|id| !found_ids.contains(id))
+            .collect();
+
+        // If we have missing ids, we need to query the archived tables as well. This is necessary
+        // when entries are deleted.
+        if !missing_ids.is_empty() {
+            let (missing_component_ids, missing_token_ids): (Vec<&i64>, Vec<&i64>) =
+                missing_ids.into_iter().unzip();
+            let deleted_results = component_balance::table
+                .select(ComponentBalance::as_select())
+                .filter(
+                    component_balance::protocol_component_id
+                        .eq_any(&missing_component_ids)
+                        .and(component_balance::token_id.eq_any(&missing_token_ids)),
+                )
+                .distinct_on((
+                    component_balance::protocol_component_id,
+                    component_balance::token_id,
+                ))
+                .order_by((
+                    component_balance::protocol_component_id,
+                    component_balance::token_id,
+                    component_balance::valid_to.desc(),
+                ))
+                .get_results(conn)
+                .await
+                .map_err(PostgresError::from)?;
+            results.extend(deleted_results);
+        }
+        Ok(results
+            .into_iter()
+            .filter(|cb| tuple_ids.contains(&(&cb.protocol_component_id, &cb.token_id)))
             .map(NewComponentBalance::from)
             .collect())
     }
@@ -1115,28 +1145,62 @@ impl PartitionedVersionedRow for NewProtocolState {
             .zip(attr_name.iter())
             .collect::<HashSet<_>>();
 
-        // PERF: The removal of the filter 'valid_to = MAX_TS' means we now search in archived
-        // tables as well. A possible optimisation would be to add the valid_to filter back
-        // and then use a second query for states still missing that will access the
-        // archived tables. Therefore, performance is not impacted in the common case.
-        Ok(protocol_state::table
+        let mut results: Vec<ProtocolState> = protocol_state::table
             .select(ProtocolState::as_select())
+            .into_boxed()
             .filter(
                 protocol_state::protocol_component_id
                     .eq_any(&pc_id)
-                    .and(protocol_state::attribute_name.eq_any(&attr_name)),
+                    .and(protocol_state::attribute_name.eq_any(&attr_name))
+                    .and(protocol_state::valid_to.eq(MAX_TS)),
             )
-            .distinct_on((protocol_state::protocol_component_id, protocol_state::attribute_name))
-            .order_by((
-                protocol_state::protocol_component_id,
-                protocol_state::attribute_name,
-                protocol_state::valid_to.desc(),
-            ))
             .get_results(conn)
             .await
-            .map_err(PostgresError::from)?
+            .map_err(PostgresError::from)?;
+
+        let found_ids: HashSet<_> = results
+            .iter()
+            .map(|ps| (&ps.protocol_component_id, &ps.attribute_name))
+            .collect();
+
+        let missing_ids: Vec<_> = tuple_ids
+            .clone()
             .into_iter()
-            .filter(|cs| tuple_ids.contains(&(&cs.protocol_component_id, &cs.attribute_name)))
+            .filter(|id| !found_ids.contains(id))
+            .collect();
+
+        // If we have missing ids, we need to query the archived tables as well. This is necessary
+        // when entries are deleted.
+        if !missing_ids.is_empty() {
+            let (missing_protocol_component_ids, missing_attribute_names): (
+                Vec<&i64>,
+                Vec<&String>,
+            ) = missing_ids.into_iter().unzip();
+            let deleted_results: Vec<ProtocolState> = protocol_state::table
+                .select(ProtocolState::as_select())
+                .filter(
+                    protocol_state::protocol_component_id
+                        .eq_any(&missing_protocol_component_ids)
+                        .and(protocol_state::attribute_name.eq_any(&missing_attribute_names)),
+                )
+                .distinct_on((
+                    protocol_state::protocol_component_id,
+                    protocol_state::attribute_name,
+                ))
+                .order_by((
+                    protocol_state::protocol_component_id,
+                    protocol_state::attribute_name,
+                    protocol_state::valid_to.desc(),
+                ))
+                .get_results(conn)
+                .await
+                .map_err(PostgresError::from)?;
+            results.extend(deleted_results);
+        }
+
+        Ok(results
+            .into_iter()
+            .filter(|ps| tuple_ids.contains(&(&ps.protocol_component_id, &ps.attribute_name)))
             .map(NewProtocolState::from)
             .collect())
     }
@@ -1644,22 +1708,55 @@ impl PartitionedVersionedRow for NewSlot {
         // tables as well. A possible optimisation would be to add the valid_to filter back
         // and then use a second query for storage still missing that will access the
         // archived tables. Therefore, performance is not impacted in the common case.
-        Ok(contract_storage::table
+        let mut results: Vec<ContractStorage> = contract_storage::table
             .select(ContractStorage::as_select())
+            .into_boxed()
             .filter(
                 contract_storage::account_id
                     .eq_any(&accounts)
-                    .and(contract_storage::slot.eq_any(&slots)),
+                    .and(contract_storage::slot.eq_any(&slots))
+                    .and(contract_storage::valid_to.eq(MAX_TS)),
             )
-            .distinct_on((contract_storage::account_id, contract_storage::slot))
-            .order_by((
-                contract_storage::account_id,
-                contract_storage::slot,
-                contract_storage::valid_to.desc(),
-            ))
             .get_results(conn)
             .await
-            .map_err(PostgresError::from)?
+            .map_err(PostgresError::from)?;
+
+        let found_ids: HashSet<_> = results
+            .iter()
+            .map(|cs| (&cs.account_id, &cs.slot))
+            .collect();
+
+        let missing_ids: Vec<_> = tuple_ids
+            .clone()
+            .into_iter()
+            .filter(|id| !found_ids.contains(id))
+            .collect();
+
+        // If we have missing ids, we need to query the archived tables as well. This is necessary
+        // when entries are deleted.
+        if !missing_ids.is_empty() {
+            let (missing_accounts, missing_slots): (Vec<&i64>, Vec<&Bytes>) =
+                missing_ids.into_iter().unzip();
+            let deleted_results: Vec<ContractStorage> = contract_storage::table
+                .select(ContractStorage::as_select())
+                .filter(
+                    contract_storage::account_id
+                        .eq_any(&missing_accounts)
+                        .and(contract_storage::slot.eq_any(&missing_slots)),
+                )
+                .distinct_on((contract_storage::account_id, contract_storage::slot))
+                .order_by((
+                    contract_storage::account_id,
+                    contract_storage::slot,
+                    contract_storage::valid_to.desc(),
+                ))
+                .get_results(conn)
+                .await
+                .map_err(PostgresError::from)?;
+            results.extend(deleted_results);
+        }
+
+        Ok(results
             .into_iter()
             .filter(|cs| tuple_ids.contains(&(&cs.account_id, &cs.slot)))
             .map(NewSlot::from)
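All three hunks apply the same two-phase lookup: a cheap first query restricted to currently valid rows (`valid_to = MAX_TS`), then a fallback query over the archived partitions only for keys the first query did not return (i.e. deleted entries). Stripped of diesel, the control flow can be sketched as below; `active` and `archived` are hypothetical in-memory stand-ins for the two database queries:

```rust
use std::collections::{HashMap, HashSet};

// Two-phase lookup sketch: `active` plays the hot partition (valid_to = MAX_TS),
// `archived` the partitioned history tables consulted only as a fallback.
fn fetch_latest(
    keys: &[(i64, i64)],
    active: &HashMap<(i64, i64), String>,
    archived: &HashMap<(i64, i64), String>,
) -> Vec<((i64, i64), String)> {
    // Phase 1: cheap query against currently valid rows only.
    let mut results: Vec<_> = keys
        .iter()
        .filter_map(|k| active.get(k).map(|v| (*k, v.clone())))
        .collect();

    // Phase 2: only keys with no active row (deleted entries) pay for the
    // expensive scan over archived partitions.
    let found: HashSet<(i64, i64)> = results.iter().map(|(k, _)| *k).collect();
    let missing: Vec<_> = keys
        .iter()
        .filter(|k| !found.contains(*k))
        .collect();
    if !missing.is_empty() {
        results.extend(
            missing
                .into_iter()
                .filter_map(|k| archived.get(k).map(|v| (*k, v.clone()))),
        );
    }
    results
}

fn main() {
    let active = HashMap::from([((1, 1), "live".to_string())]);
    let archived = HashMap::from([((2, 2), "deleted".to_string())]);
    let out = fetch_latest(&[(1, 1), (2, 2)], &active, &archived);
    assert_eq!(out.len(), 2);
    println!("{:?}", out);
}
```

This is why the common case (nothing deleted) never touches the archive: `missing` is empty and phase 2 is skipped entirely, which is the point of restoring the `valid_to = MAX_TS` filter in the first query.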
