diff --git a/sql/2024-12-16-00-00_entity_depths.sql b/sql/2024-12-16-00-00_entity_depths.sql
new file mode 100644
index 00000000..fb56de97
--- /dev/null
+++ b/sql/2024-12-16-00-00_entity_depths.sql
@@ -0,0 +1,254 @@
+-- We can track the maximum dependency depth of any sub-dag rooted at each entity.
+-- The depth of any entity is simply the maximum depth of any of its children plus one.
+-- This allows us to trivially sort entities into a valid dependency order without needing a complex topological
+-- sort at query time.
+
+-- Unfortunately we can't use triggers for most of these. For some entities the depth depends on
+-- references which, due to foreign keys, must be inserted AFTER the entity itself, so the depth
+-- calculation must run after all of the entity's local references are inserted. There's no way
+-- for us to trigger only when the LAST one of those is done, so we'd need to run the calculation
+-- on every local reference insert and remove the optimistic exit in the case where the row
+-- already exists, which is a big waste.
+--
+-- Instead we just run these functions manually after an entity's references are all inserted.
+--
+-- NOTE: each function only consults depth rows that already exist; a dependency with no depth row
+-- contributes nothing, so the depths of an entity's dependencies must be recorded before its own.
+
+CREATE TABLE causal_depth (
+  causal_id INTEGER PRIMARY KEY REFERENCES causals (id) ON DELETE CASCADE,
+  depth INTEGER NOT NULL
+);
+
+CREATE TABLE component_depth (
+  component_hash_id INTEGER PRIMARY KEY REFERENCES component_hashes (id) ON DELETE CASCADE,
+  depth INTEGER NOT NULL
+);
+
+CREATE TABLE namespace_depth (
+  namespace_hash_id INTEGER PRIMARY KEY REFERENCES branch_hashes (id) ON DELETE CASCADE,
+  depth INTEGER NOT NULL
+);
+
+CREATE TABLE patch_depth (
+  patch_id INTEGER PRIMARY KEY REFERENCES patches (id) ON DELETE CASCADE,
+  depth INTEGER NOT NULL
+);
+
+
+-- Depth functions. These are plain functions, not triggers: they are invoked explicitly
+-- (SELECT update_..._depth(id)) once an entity and all of its references have been inserted.
+
+-- Records the depth of a causal: one more than the larger of its namespace's depth and the
+-- deepest of its ancestors in causal_ancestors. Does nothing if a depth row already exists.
+CREATE OR REPLACE FUNCTION update_causal_depth(the_causal_id integer) RETURNS VOID AS $$
+DECLARE
+  max_namespace_depth INTEGER;
+  max_child_causal_depth INTEGER;
+  the_namespace_hash_id INTEGER;
+BEGIN
+  -- If there's already a depth entry for this causal, we're done.
+  IF EXISTS (SELECT FROM causal_depth cd WHERE cd.causal_id = the_causal_id) THEN
+    RETURN;
+  END IF;
+
+  SELECT c.namespace_hash_id INTO the_namespace_hash_id
+    FROM causals c
+    WHERE c.id = the_causal_id;
+  -- Depth of the associated namespace (-1 when it has no depth row).
+  SELECT COALESCE(MAX(nd.depth), -1) INTO max_namespace_depth
+    FROM namespace_depth nd
+    WHERE nd.namespace_hash_id = the_namespace_hash_id;
+  -- Max depth of any ancestor causal (-1 when there are none with a depth row).
+  SELECT COALESCE(MAX(cd.depth), -1) INTO max_child_causal_depth
+    FROM causal_depth cd
+      JOIN causal_ancestors ca ON cd.causal_id = ca.ancestor_id
+    WHERE ca.causal_id = the_causal_id;
+  -- The depth of this causal is the max of those two plus one.
+  -- ON CONFLICT keeps this a no-op if another transaction recorded the depth after the check above.
+  INSERT INTO causal_depth (causal_id, depth)
+    VALUES (the_causal_id, GREATEST(max_namespace_depth, max_child_causal_depth) + 1)
+    ON CONFLICT (causal_id) DO NOTHING;
+
+  RETURN;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Records the depth of a component: one more than the deepest component referenced by any of its
+-- terms or types. Does nothing if a depth row already exists.
+CREATE OR REPLACE FUNCTION update_component_depth(the_component_hash_id integer) RETURNS VOID AS $$
+DECLARE
+  max_referenced_component_depth INTEGER;
+BEGIN
+  -- If there's already a depth entry for this component, we're done.
+  IF EXISTS (SELECT FROM component_depth cd WHERE cd.component_hash_id = the_component_hash_id) THEN
+    RETURN;
+  END IF;
+  -- Find the max depth of any component referenced by this component (-1 when there are none).
+  -- UNION ALL is enough here: MAX is unaffected by duplicate depths, so de-duplicating is wasted work.
+  SELECT COALESCE(MAX(refs.depth), -1) INTO max_referenced_component_depth
+    FROM (
+      ( SELECT cd.depth AS depth
+          FROM terms t
+            JOIN term_local_component_references cr
+              ON cr.term_id = t.id
+            JOIN component_depth cd
+              ON cd.component_hash_id = cr.component_hash_id
+          WHERE t.component_hash_id = the_component_hash_id
+      ) UNION ALL
+      ( SELECT cd.depth AS depth
+          FROM types t
+            JOIN type_local_component_references cr
+              ON cr.type_id = t.id
+            JOIN component_depth cd
+              ON cd.component_hash_id = cr.component_hash_id
+          WHERE t.component_hash_id = the_component_hash_id
+      )
+    ) AS refs;
+  -- Set the depth of this component to that plus one.
+  -- ON CONFLICT keeps this a no-op if another transaction recorded the depth after the check above.
+  INSERT INTO component_depth (component_hash_id, depth)
+    VALUES (the_component_hash_id, max_referenced_component_depth + 1)
+    ON CONFLICT (component_hash_id) DO NOTHING;
+  RETURN;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Records the depth of a namespace: one more than the deepest of its child causals, its patches,
+-- and the components behind its terms, constructors, types and their metadata terms.
+-- Does nothing if a depth row already exists.
+CREATE OR REPLACE FUNCTION update_namespace_depth(the_namespace_hash_id integer) RETURNS VOID AS $$
+DECLARE
+  max_child_causal_depth INTEGER;
+  max_patch_depth INTEGER;
+  max_referenced_component_depth INTEGER;
+BEGIN
+  -- If there's already a depth entry for this namespace, we're done.
+  IF EXISTS (SELECT FROM namespace_depth nd WHERE nd.namespace_hash_id = the_namespace_hash_id) THEN
+    RETURN;
+  END IF;
+  -- Find the max depth of any child causal (-1 when there are none with a depth row).
+  SELECT COALESCE(MAX(cd.depth), -1) INTO max_child_causal_depth
+    FROM causal_depth cd
+      JOIN namespace_children nc ON cd.causal_id = nc.child_causal_id
+    WHERE nc.parent_namespace_hash_id = the_namespace_hash_id;
+  -- Find the max depth of any patch (-1 when there are none with a depth row).
+  SELECT COALESCE(MAX(pd.depth), -1) INTO max_patch_depth
+    FROM patch_depth pd
+      JOIN namespace_patches np ON pd.patch_id = np.patch_id
+    WHERE np.namespace_hash_id = the_namespace_hash_id;
+  -- Find the max depth of any component referenced by a term, constructor, type, or metadata term in this namespace.
+  -- UNION ALL is enough here: MAX is unaffected by duplicate depths, so de-duplicating is wasted work.
+  SELECT COALESCE(MAX(refs.depth), -1) INTO max_referenced_component_depth
+    FROM (
+      -- direct term references
+      ( SELECT cd.depth AS depth
+          FROM namespace_terms nt
+            JOIN terms t
+              ON nt.term_id = t.id
+            JOIN component_depth cd
+              ON t.component_hash_id = cd.component_hash_id
+          WHERE nt.namespace_hash_id = the_namespace_hash_id
+      ) UNION ALL
+      -- term metadata references
+      ( SELECT cd.depth AS depth
+          FROM namespace_terms nt
+            JOIN namespace_term_metadata ntm
+              ON ntm.named_term = nt.id
+            JOIN terms t
+              ON ntm.metadata_term_id = t.id
+            JOIN component_depth cd
+              ON t.component_hash_id = cd.component_hash_id
+          WHERE nt.namespace_hash_id = the_namespace_hash_id
+      ) UNION ALL
+      -- direct constructor references
+      ( SELECT cd.depth AS depth
+          FROM namespace_terms nt
+            JOIN constructors c
+              ON c.id = nt.constructor_id
+            JOIN types t
+              ON c.type_id = t.id
+            JOIN component_depth cd
+              ON t.component_hash_id = cd.component_hash_id
+          WHERE nt.namespace_hash_id = the_namespace_hash_id
+      ) UNION ALL
+      -- direct type references
+      ( SELECT cd.depth AS depth
+          FROM namespace_types nt
+            JOIN types t
+              ON nt.type_id = t.id
+            JOIN component_depth cd
+              ON t.component_hash_id = cd.component_hash_id
+          WHERE nt.namespace_hash_id = the_namespace_hash_id
+      ) UNION ALL
+      -- type metadata references
+      ( SELECT cd.depth AS depth
+          FROM namespace_types nt
+            JOIN namespace_type_metadata ntm
+              ON ntm.named_type = nt.id
+            JOIN terms t
+              ON ntm.metadata_term_id = t.id
+            JOIN component_depth cd
+              ON t.component_hash_id = cd.component_hash_id
+          WHERE nt.namespace_hash_id = the_namespace_hash_id
+      )
+    ) AS refs;
+  -- Set the depth of this namespace to the max of those plus one.
+  -- ON CONFLICT keeps this a no-op if another transaction recorded the depth after the check above.
+  INSERT INTO namespace_depth (namespace_hash_id, depth)
+    VALUES (the_namespace_hash_id, GREATEST(max_child_causal_depth, max_patch_depth, max_referenced_component_depth) + 1)
+    ON CONFLICT (namespace_hash_id) DO NOTHING;
+
+  RETURN;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Records the depth of a patch: one more than the deepest component that any of its term,
+-- constructor, or type mappings maps to. Does nothing if a depth row already exists.
+CREATE OR REPLACE FUNCTION update_patch_depth(the_patch_id integer) RETURNS VOID AS $$
+DECLARE
+  max_referenced_component_depth INTEGER;
+BEGIN
+  -- If there's already a depth entry for this patch, we're done.
+  IF EXISTS (SELECT FROM patch_depth pd WHERE pd.patch_id = the_patch_id) THEN
+    RETURN;
+  END IF;
+
+  -- Find the max depth of any component referenced by the patch (-1 when there are none).
+  -- UNION (not UNION ALL) de-duplicates the component ids before they are joined to component_depth.
+  SELECT COALESCE(MAX(cd.depth), -1) INTO max_referenced_component_depth
+    FROM (
+      -- term references
+      ( SELECT t.component_hash_id AS component_hash_id
+          FROM patch_term_mappings ptm
+            JOIN terms t
+              ON ptm.to_term_id = t.id
+          WHERE ptm.patch_id = the_patch_id
+      ) UNION
+      -- constructor mappings
+      ( SELECT t.component_hash_id AS component_hash_id
+          FROM patch_constructor_mappings pcm
+            JOIN constructors c
+              ON pcm.to_constructor_id = c.id
+            JOIN types t
+              ON c.type_id = t.id
+          WHERE pcm.patch_id = the_patch_id
+      ) UNION
+      -- type references
+      ( SELECT t.component_hash_id AS component_hash_id
+          FROM patch_type_mappings ptm
+            JOIN types t
+              ON ptm.to_type_id = t.id
+          WHERE ptm.patch_id = the_patch_id
+      )
+    ) AS refs JOIN component_depth cd
+      ON cd.component_hash_id = refs.component_hash_id;
+  -- Set the depth of this patch to that plus one.
+  -- ON CONFLICT keeps this a no-op if another transaction recorded the depth after the check above.
+  INSERT INTO patch_depth (patch_id, depth)
+    VALUES (the_patch_id, max_referenced_component_depth + 1)
+    ON CONFLICT (patch_id) DO NOTHING;
+
+  RETURN;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/sql/migration-helpers/2025-02-25_causal_depth_migration.sql b/sql/migration-helpers/2025-02-25_causal_depth_migration.sql
new file mode 100644
index 00000000..810e858e
--- /dev/null
+++ b/sql/migration-helpers/2025-02-25_causal_depth_migration.sql
@@ -0,0 +1,60 @@
+-- Helper tables for the depth migration. The unfinished_*_depths tables are seeded below with the
+-- ids that have no row yet in the matching *_depth table.
+-- NOTE(review): unfinished_namespaces_working_set is created but not seeded in this file; presumably
+-- the migration job fills it while it runs -- confirm.
+
+CREATE TABLE IF NOT EXISTS unfinished_causal_depths (
+  id INTEGER PRIMARY KEY REFERENCES causals (id) ON DELETE CASCADE
+);
+
+CREATE TABLE IF NOT EXISTS unfinished_namespace_depths (
+  id INTEGER PRIMARY KEY REFERENCES branch_hashes (id) ON DELETE CASCADE
+);
+
+CREATE TABLE IF NOT EXISTS unfinished_patch_depths (
+  id INTEGER PRIMARY KEY REFERENCES patches (id) ON DELETE CASCADE
+);
+
+CREATE TABLE IF NOT EXISTS unfinished_component_depths (
+  id INTEGER PRIMARY KEY REFERENCES component_hashes (id) ON DELETE CASCADE
+);
+
+CREATE TABLE IF NOT EXISTS unfinished_namespaces_working_set (
+  id INTEGER PRIMARY KEY REFERENCES branch_hashes (id) ON DELETE CASCADE
+);
+
+-- Seed the queues. ON CONFLICT DO NOTHING lets these statements be re-run safely.
+INSERT INTO unfinished_causal_depths (id)
+  SELECT c.id
+    FROM causals c
+    WHERE NOT EXISTS (
+      SELECT FROM causal_depth cd WHERE cd.causal_id = c.id
+    ) ON CONFLICT DO NOTHING;
+
+INSERT INTO unfinished_namespace_depths (id)
+  SELECT n.namespace_hash_id
+    FROM namespaces n
+    WHERE NOT EXISTS (
+      SELECT FROM namespace_depth nd WHERE nd.namespace_hash_id = n.namespace_hash_id
+    ) ON CONFLICT DO NOTHING;
+
+INSERT INTO unfinished_patch_depths (id)
+  SELECT p.id
+    FROM patches p
+    WHERE NOT EXISTS (
+      SELECT FROM patch_depth pd WHERE pd.patch_id = p.id
+    ) ON CONFLICT DO NOTHING;
+
+INSERT INTO unfinished_component_depths (id)
+  SELECT ch.id
+    FROM component_hashes ch
+    WHERE NOT EXISTS (
+      SELECT FROM component_depth cd WHERE cd.component_hash_id = ch.id
+    ) ON CONFLICT DO NOTHING;
+
+-- Afterwards: clean up (presumably only once the migration has processed every queued id -- confirm).
+DROP TABLE IF EXISTS unfinished_causal_depths;
+DROP TABLE IF EXISTS unfinished_namespace_depths;
+DROP TABLE IF EXISTS unfinished_patch_depths;
+DROP TABLE IF EXISTS unfinished_component_depths;
+DROP TABLE IF EXISTS unfinished_namespaces_working_set;
diff --git a/src/Share/BackgroundJobs.hs
b/src/Share/BackgroundJobs.hs index f5a533d3..dec54bb0 100644 --- a/src/Share/BackgroundJobs.hs +++ b/src/Share/BackgroundJobs.hs @@ -3,12 +3,12 @@ module Share.BackgroundJobs (startWorkers) where import Ki.Unlifted qualified as Ki import Share.BackgroundJobs.Monad (Background) import Share.BackgroundJobs.Search.DefinitionSync qualified as DefnSearch -import Share.BackgroundJobs.SerializedEntitiesMigration.Worker qualified as SerializedEntitiesMigration -- | Kicks off all background workers. startWorkers :: Ki.Scope -> Background () startWorkers scope = do DefnSearch.worker scope - -- Temporary disable background diff jobs until the new diffing logic is done. - -- ContributionDiffs.worker scope - SerializedEntitiesMigration.worker scope + +-- Temporary disable background diff jobs until the new diffing logic is done. +-- ContributionDiffs.worker scope +-- SerializedEntitiesMigration.worker scope diff --git a/src/Share/Postgres/Causal/Queries.hs b/src/Share/Postgres/Causal/Queries.hs index 98d9cc1a..6157ebbd 100644 --- a/src/Share/Postgres/Causal/Queries.hs +++ b/src/Share/Postgres/Causal/Queries.hs @@ -642,6 +642,7 @@ savePgNamespace maySerialized mayBh b@(BranchFull.Branch {terms, types, patches, |] -- Note: this must be run AFTER inserting the namespace and all its children. 
execute_ [sql| SELECT save_namespace(#{bhId}) |] + execute_ [sql| SELECT update_namespace_depth(#{bhId}) |] saveSerializedNamespace :: (QueryM m) => BranchHashId -> CBORBytes TempEntity -> m () saveSerializedNamespace bhId (CBORBytes bytes) = do @@ -785,6 +786,7 @@ saveCausal maySerializedCausal mayCh bhId ancestorIds = do SELECT #{cId}, a.ancestor_id FROM ancestors a |] + execute_ [sql| SELECT update_causal_depth(#{cId}) |] pure cId saveSerializedCausal :: (QueryM m) => CausalId -> CBORBytes TempEntity -> m () diff --git a/src/Share/Postgres/Definitions/Queries.hs b/src/Share/Postgres/Definitions/Queries.hs index cb3a9be9..ef3666c6 100644 --- a/src/Share/Postgres/Definitions/Queries.hs +++ b/src/Share/Postgres/Definitions/Queries.hs @@ -863,6 +863,10 @@ saveEncodedTermComponent componentHash maySerialized elements = do SELECT defn_mappings.term_id, defn_mappings.local_index, defn_mappings.component_hash_id FROM defn_mappings |] + execute_ + [sql| + SELECT update_component_depth(#{componentHashId}) + |] pure termIds saveTypeComponent :: ComponentHash -> Maybe TempEntity -> [(PgLocalIds, DeclFormat.Decl Symbol)] -> CodebaseM e () @@ -1013,6 +1017,10 @@ saveTypeComponent componentHash maySerialized elements = do FROM defn_mappings |] saveConstructors (zip (toList typeIds) elements) + execute_ + [sql| + SELECT update_component_depth(#{componentHashId}) + |] pure typeIds -- | Efficiently resolve all pg Ids across selected Local Ids. 
diff --git a/src/Share/Postgres/Patches/Queries.hs b/src/Share/Postgres/Patches/Queries.hs index 4a1f1f14..8c454eae 100644 --- a/src/Share/Postgres/Patches/Queries.hs +++ b/src/Share/Postgres/Patches/Queries.hs @@ -233,6 +233,7 @@ savePatch maySerialized patchHash PatchFull.Patch {termEdits, typeEdits} = do LEFT JOIN types to_type ON to_type.component_hash_id = to_type_component_hash_id AND to_type.component_index = to_type_component_index |] + execute_ [sql| SELECT update_patch_depth(#{patchId}) |] pure patchId termsTable :: [(Maybe ComponentHashId, Maybe Int64 {- from comp index -}, Maybe TextId, Maybe ComponentHashId, Maybe Int64 {- to comp index -}, Maybe TextId, Maybe PatchFullTermEdit.Typing, Bool)] constructorsTable :: [(ComponentHashId, Int64 {- from comp index -}, Int64 {- from constr index -}, Maybe ComponentHashId, Maybe Int64 {- to comp index-}, Maybe Int64 {- to constr index -}, Maybe PatchFullTermEdit.Typing, Bool)] diff --git a/src/Share/Web/UCM/SyncV2/Impl.hs b/src/Share/Web/UCM/SyncV2/Impl.hs index 71ddaccf..f1a025e7 100644 --- a/src/Share/Web/UCM/SyncV2/Impl.hs +++ b/src/Share/Web/UCM/SyncV2/Impl.hs @@ -48,7 +48,7 @@ batchSize :: Int32 batchSize = 1000 streamSettings :: Hash32 -> Maybe SyncV2.BranchRef -> StreamInitInfo -streamSettings rootCausalHash rootBranchRef = StreamInitInfo {version = SyncV2.Version 1, entitySorting = SyncV2.Unsorted, numEntities = Nothing, rootCausalHash, rootBranchRef} +streamSettings rootCausalHash rootBranchRef = StreamInitInfo {version = SyncV2.Version 1, entitySorting = SyncV2.DependenciesFirst, numEntities = Nothing, rootCausalHash, rootBranchRef} server :: Maybe UserId -> SyncV2.Routes WebAppServer server mayUserId = diff --git a/src/Share/Web/UCM/SyncV2/Queries.hs b/src/Share/Web/UCM/SyncV2/Queries.hs index 02264982..61d5d034 100644 --- a/src/Share/Web/UCM/SyncV2/Queries.hs +++ b/src/Share/Web/UCM/SyncV2/Queries.hs @@ -50,7 +50,7 @@ allSerializedDependenciesOfCausalCursor cid exceptCausalHashes = do JOIN 
component_hashes ch ON dh.hash = ch.base32 |] cursor <- - PGCursor.newRowCursor + PGCursor.newRowCursor @(CBORBytes TempEntity, Hash32, Maybe Int32) "serialized_entities" [sql| WITH RECURSIVE transitive_causals(causal_id, causal_hash, causal_namespace_hash_id) AS ( @@ -173,32 +173,45 @@ allSerializedDependenciesOfCausalCursor cid exceptCausalHashes = do WHERE NOT EXISTS (SELECT FROM except_components ec WHERE ec.component_hash_id = ref.component_hash_id) ) ) - (SELECT bytes.bytes, ch.base32 + (SELECT bytes.bytes, ch.base32, cd.depth FROM transitive_components tc JOIN serialized_components sc ON sc.user_id = #{ownerUserId} AND tc.component_hash_id = sc.component_hash_id JOIN bytes ON sc.bytes_id = bytes.id JOIN component_hashes ch ON tc.component_hash_id = ch.id + LEFT JOIN component_depth cd ON ch.id = cd.component_hash_id ) UNION ALL - (SELECT bytes.bytes, ap.patch_hash + (SELECT bytes.bytes, ap.patch_hash, pd.depth FROM all_patches ap JOIN serialized_patches sp ON ap.patch_id = sp.patch_id JOIN bytes ON sp.bytes_id = bytes.id + LEFT JOIN patch_depth pd ON ap.patch_id = pd.patch_id ) UNION ALL - (SELECT bytes.bytes, an.namespace_hash + (SELECT bytes.bytes, an.namespace_hash, nd.depth FROM all_namespaces an JOIN serialized_namespaces sn ON an.namespace_hash_id = sn.namespace_hash_id JOIN bytes ON sn.bytes_id = bytes.id + LEFT JOIN namespace_depth nd ON an.namespace_hash_id = nd.namespace_hash_id ) UNION ALL - (SELECT bytes.bytes, tc.causal_hash + (SELECT bytes.bytes, tc.causal_hash, cd.depth FROM transitive_causals tc JOIN serialized_causals sc ON tc.causal_id = sc.causal_id JOIN bytes ON sc.bytes_id = bytes.id + LEFT JOIN causal_depth cd ON tc.causal_id = cd.causal_id ) + -- Put them in dependency order, nulls come first because we want to bail and + -- report an error if we are somehow missing a depth. 
+ ORDER BY depth ASC NULLS FIRST |] - pure cursor + pure + ( cursor <&> \(bytes, hash, depth) -> case depth of + -- This should never happen, but is a sanity check in case we're missing a depth. + -- Better than silently omitting a required result. + Nothing -> error $ "allSerializedDependenciesOfCausalCursor: Missing depth for entity: " <> show hash + Just _ -> (bytes, hash) + ) spineAndLibDependenciesOfCausalCursor :: CausalId -> CodebaseM e (PGCursor (Hash32, IsCausalSpine, IsLibRoot)) spineAndLibDependenciesOfCausalCursor cid = do diff --git a/transcripts/sql/inserts.sql b/transcripts/sql/inserts.sql index c7591a3f..510a8722 100644 --- a/transcripts/sql/inserts.sql +++ b/transcripts/sql/inserts.sql @@ -107,12 +107,19 @@ INSERT INTO namespaces(namespace_hash_id, contained_terms, deep_contained_terms, VALUES (0, 0, 0, 0, 0, 0, 0) ON CONFLICT DO NOTHING; +INSERT INTO namespace_depth(namespace_hash_id, depth) + VALUES (0, 0) + ON CONFLICT DO NOTHING; -- Initialize the empty causal INSERT INTO causals(id, hash, namespace_hash_id) VALUES (0, 'sg60bvjo91fsoo7pkh9gejbn0qgc95vra87ap6l5d35ri0lkaudl7bs12d71sf3fh6p23teemuor7mk1i9n567m50ibakcghjec5ajg', 0) ON CONFLICT DO NOTHING; +INSERT INTO causal_depth(causal_id, depth) + VALUES (0, 0) + ON CONFLICT DO NOTHING; + -- Projects INSERT INTO projects ( id,