Skip to content

Commit 5476677

Browse files
committed
Fix ever growing db tables
1 parent 6d60e93 commit 5476677

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+12838
-4960
lines changed

migrations/000033_update_raw_visits_trigger.up.sql

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ BEGIN;
44
DROP TRIGGER insert_raw_visit ON raw_visits;
55
DROP FUNCTION normalize_raw_visit();
66

7-
87
CREATE FUNCTION normalize_raw_visit() RETURNS TRIGGER AS
98
$normalize_raw_visit$
109
DECLARE
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
-- Begin the transaction
2+
BEGIN;
3+
4+
-- Undo the protocol-set normalisation. For each referencing table the foreign
-- key is dropped before its column; the lookup tables are removed afterwards.
ALTER TABLE peers
    DROP CONSTRAINT fk_peers_protocols_set_id;
ALTER TABLE peers
    DROP COLUMN protocols_set_id;

ALTER TABLE visits
    DROP CONSTRAINT fk_visits_protocols_set_id;
ALTER TABLE visits
    DROP COLUMN protocols_set_id;

-- protocols_sets is dropped first, then protocols.
DROP TABLE protocols_sets;
DROP TABLE protocols;
12+
13+
-- End the transaction
14+
COMMIT;
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
-- Begin the transaction
2+
BEGIN;
3+
4+
-- Activate intarray extension for efficient array operations
5+
CREATE EXTENSION IF NOT EXISTS intarray;
6+
7+
-- Lookup table: one row per distinct protocol the crawler came across.
CREATE TABLE protocols
(
    -- Surrogate key of the protocol
    id         SERIAL PRIMARY KEY,
    -- Time of the last update; used to retrieve the ID after an upsert operation
    updated_at TIMESTAMPTZ   NOT NULL,
    -- Time the row was created
    created_at TIMESTAMPTZ   NOT NULL,
    -- The protocol string itself
    protocol   VARCHAR(1000) NOT NULL,

    -- Each protocol string may be stored only once
    CONSTRAINT uq_protocols_protocol UNIQUE (protocol)
);
25+
26+
-- Seed protocols with every 'protocol' row of the properties table, carrying
-- over the original timestamps.
INSERT INTO protocols (protocol, updated_at, created_at)
SELECT
    src.value,
    src.updated_at,
    src.created_at
FROM properties AS src
WHERE src.property = 'protocol';
31+
32+
-- The set of protocols of a particular peer doesn't change very often between
-- crawls, so the visits_x_properties table grows quickly. This table stores
-- each particular set of protocols once; every visit is then linked to just
-- one of these sets.
CREATE TABLE protocols_sets
(
    -- Surrogate key of the set
    id           SERIAL PRIMARY KEY,
    -- IDs of the protocols that make up this set
    protocol_ids INT ARRAY NOT NULL,

    -- Two rows must never hold an identical array
    EXCLUDE USING GIST (protocol_ids WITH =)
);

-- GIN index for efficient lookups of particular protocol sets.
CREATE INDEX idx_protocols_sets_protocol_ids
    ON protocols_sets
    USING GIN (protocol_ids);
50+
51+
-- Transaction-scoped scratch table: for every visit, the sorted and
-- de-duplicated array of the protocol IDs recorded for it.
CREATE TEMP TABLE visits_agg_protocols ON COMMIT DROP AS (
    SELECT
        vxp.visit_id,
        uniq(sort(array_agg(prot.id))) AS protocols_set
    FROM visits_x_properties AS vxp
    INNER JOIN properties AS prop
        ON prop.id = vxp.property_id
    INNER JOIN protocols AS prot
        ON prot.protocol = prop.value
    WHERE prop.property = 'protocol'
    GROUP BY vxp.visit_id
);
59+
60+
-- Transaction-scoped scratch table listing each protocol set exactly once.
CREATE TEMP TABLE distinct_visits_agg_protocols ON COMMIT DROP AS (
    SELECT DISTINCT vap.protocols_set
    FROM visits_agg_protocols AS vap
);

-- Persist every distinct set as a row of protocols_sets.
INSERT INTO protocols_sets (protocol_ids)
SELECT dvap.protocols_set
FROM distinct_visits_agg_protocols AS dvap;
70+
71+
-- New nullable column linking a visit to its set of protocols.
ALTER TABLE visits
    ADD COLUMN protocols_set_id INT;

-- For every visit in visits_agg_protocols, look up the protocols_sets row that
-- holds the identical array and store that row's ID on the visit.
UPDATE visits
SET protocols_set_id = ps.id
FROM visits_agg_protocols AS ag
INNER JOIN protocols_sets AS ps
    ON ps.protocol_ids = ag.protocols_set
WHERE ag.visit_id = visits.id;

-- Add the foreign key once the column has been populated.
ALTER TABLE visits
    ADD CONSTRAINT fk_visits_protocols_set_id
        FOREIGN KEY (protocols_set_id)
        REFERENCES protocols_sets (id)
        ON DELETE NO ACTION;
90+
91+
92+
-- Transaction-scoped scratch table: for every peer, the sorted and
-- de-duplicated array of the protocol IDs recorded for it.
CREATE TEMP TABLE peers_agg_protocols ON COMMIT DROP AS (
    SELECT
        pxp.peer_id,
        uniq(sort(array_agg(prot.id))) AS protocols_set
    FROM peers_x_properties AS pxp
    INNER JOIN properties AS prop
        ON prop.id = pxp.property_id
    INNER JOIN protocols AS prot
        ON prot.protocol = prop.value
    WHERE prop.property = 'protocol'
    GROUP BY pxp.peer_id
);

-- Up to this point protocols_sets has only been filled from the sets found on
-- visits. Add every set that occurs on a peer but is not stored yet; without
-- this, the UPDATE below finds no match for such a peer and leaves its
-- protocols_set_id NULL, losing the peer's protocols.
INSERT INTO protocols_sets (protocol_ids)
SELECT DISTINCT pap.protocols_set
FROM peers_agg_protocols AS pap
WHERE NOT EXISTS (
    SELECT 1
    FROM protocols_sets AS ps
    WHERE ps.protocol_ids = pap.protocols_set
);

-- New nullable column linking a peer to its set of protocols.
ALTER TABLE peers
    ADD COLUMN protocols_set_id INT;

-- For every peer in peers_agg_protocols, look up the protocols_sets row that
-- holds the identical array and store that row's ID on the peer.
WITH peers_x_protocols_sets AS (
    SELECT
        ag.peer_id,
        ps.id AS protocols_set_id
    FROM protocols_sets AS ps
    INNER JOIN peers_agg_protocols AS ag
        ON ag.protocols_set = ps.protocol_ids
)
UPDATE peers
SET protocols_set_id = pxps.protocols_set_id
FROM peers_x_protocols_sets AS pxps
WHERE pxps.peer_id = peers.id;

-- Add the foreign key once the column has been populated.
ALTER TABLE peers
    ADD CONSTRAINT fk_peers_protocols_set_id
        FOREIGN KEY (protocols_set_id)
        REFERENCES protocols_sets (id)
        ON DELETE NO ACTION;
117+
118+
-- End the transaction
119+
COMMIT;
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
-- Begin the transaction
2+
BEGIN;
3+
4+
-- Undo the agent-version normalisation. For each referencing table the foreign
-- key is dropped before its column; the lookup table is removed last.
ALTER TABLE peers
    DROP CONSTRAINT fk_peers_agent_version_id;
ALTER TABLE peers
    DROP COLUMN agent_version_id;

ALTER TABLE visits
    DROP CONSTRAINT fk_visits_agent_version_id;
ALTER TABLE visits
    DROP COLUMN agent_version_id;

DROP TABLE agent_versions;
11+
12+
-- End the transaction
13+
COMMIT;
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
-- Begin the transaction
2+
BEGIN;
3+
4+
-- Lookup table: one row per distinct agent version string.
CREATE TABLE agent_versions
(
    -- Surrogate key of the agent version
    id            SERIAL PRIMARY KEY,
    -- Time of the last update; used to retrieve the ID after an upsert operation
    updated_at    TIMESTAMPTZ   NOT NULL,
    -- Time the row was created
    created_at    TIMESTAMPTZ   NOT NULL,
    -- The agent version string itself
    agent_version VARCHAR(1000) NOT NULL,

    -- Each agent version string may be stored only once
    CONSTRAINT uq_agent_versions_agent_version UNIQUE (agent_version)
);
22+
23+
-- Seed agent_versions with every 'agent_version' row of the properties table,
-- carrying over the original timestamps.
INSERT INTO agent_versions (agent_version, updated_at, created_at)
SELECT
    src.value,
    src.updated_at,
    src.created_at
FROM properties AS src
WHERE src.property = 'agent_version';
28+
29+
-- New nullable column linking a visit to its agent version.
ALTER TABLE visits
    ADD COLUMN agent_version_id INT;

-- Resolve each visit's 'agent_version' property to the matching
-- agent_versions row and store that row's ID on the visit.
WITH visit_agent_versions AS (
    SELECT
        vxp.visit_id,
        av.id AS agent_version_id
    FROM visits_x_properties AS vxp
    INNER JOIN properties AS prop
        ON prop.id = vxp.property_id
    INNER JOIN agent_versions AS av
        ON av.agent_version = prop.value
    WHERE prop.property = 'agent_version'
)
UPDATE visits
SET agent_version_id = vav.agent_version_id
FROM visit_agent_versions AS vav
WHERE visits.id = vav.visit_id;

-- Add the foreign key once the column has been populated.
ALTER TABLE visits
    ADD CONSTRAINT fk_visits_agent_version_id
        FOREIGN KEY (agent_version_id)
        REFERENCES agent_versions (id)
        ON DELETE NO ACTION;
48+
49+
50+
-- New nullable column linking a peer to its agent version.
ALTER TABLE peers
    ADD COLUMN agent_version_id INT;

-- Resolve each peer's 'agent_version' property to the matching agent_versions
-- row and store that row's ID on the peer.
WITH peer_agent_versions AS (
    SELECT
        pxp.peer_id,
        av.id AS agent_version_id
    FROM peers_x_properties AS pxp
    INNER JOIN properties AS prop
        ON prop.id = pxp.property_id
    INNER JOIN agent_versions AS av
        ON av.agent_version = prop.value
    WHERE prop.property = 'agent_version'
)
UPDATE peers
SET agent_version_id = pav.agent_version_id
FROM peer_agent_versions AS pav
WHERE peers.id = pav.peer_id;

-- Add the foreign key once the column has been populated.
ALTER TABLE peers
    ADD CONSTRAINT fk_peers_agent_version_id
        FOREIGN KEY (agent_version_id)
        REFERENCES agent_versions (id)
        ON DELETE NO ACTION;
68+
69+
-- End the transaction
70+
COMMIT;

migrations/000039_migrate_crawl_properties.down.sql

Whitespace-only changes.
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
-- Begin the transaction
2+
BEGIN;
3+
4+
-- Move the existing table out of the way; its rows are copied into the new
-- layout further down in this migration.
ALTER TABLE crawl_properties
    RENAME TO crawl_properties_old;

-- New layout: instead of a generic property reference, a row carries a
-- nullable protocol_id, agent_version_id and error column alongside the count.
CREATE TABLE crawl_properties
(
    id               SERIAL PRIMARY KEY,
    -- Plain INT rather than SERIAL: this column references crawls (id), so its
    -- value must be supplied by the writer and never generated from a sequence.
    crawl_id         INT         NOT NULL,
    protocol_id      INT,
    agent_version_id INT,
    error            dial_error,
    count            INT         NOT NULL,
    created_at       TIMESTAMPTZ NOT NULL,
    updated_at       TIMESTAMPTZ NOT NULL
);
18+
19+
-- Foreign keys of the new crawl_properties table. Rows are deleted together
-- with their crawl (CASCADE); the references to protocols and agent_versions
-- use NO ACTION.
ALTER TABLE crawl_properties
    ADD CONSTRAINT fk_crawl_properties_crawl_id
        FOREIGN KEY (crawl_id)
        REFERENCES crawls (id)
        ON DELETE CASCADE,
    ADD CONSTRAINT fk_crawl_properties_protocol_id
        FOREIGN KEY (protocol_id)
        REFERENCES protocols (id)
        ON DELETE NO ACTION,
    ADD CONSTRAINT fk_crawl_properties_agent_version_id
        FOREIGN KEY (agent_version_id)
        REFERENCES agent_versions (id)
        ON DELETE NO ACTION;
36+
37+
-- Copy the old rows into the new layout, one statement per property kind.

-- 'protocol' properties become rows with protocol_id set.
INSERT INTO crawl_properties (crawl_id, protocol_id, count, created_at, updated_at)
SELECT
    cpo.crawl_id,
    prot.id AS protocol_id,
    count,
    cpo.created_at,
    cpo.updated_at
FROM crawl_properties_old AS cpo
INNER JOIN properties AS prop
    ON cpo.property_id = prop.id
INNER JOIN protocols AS prot
    ON prot.protocol = prop.value
WHERE prop.property = 'protocol';

-- 'agent_version' properties become rows with agent_version_id set.
INSERT INTO crawl_properties (crawl_id, agent_version_id, count, created_at, updated_at)
SELECT
    cpo.crawl_id,
    av.id AS agent_version_id,
    count,
    cpo.created_at,
    cpo.updated_at
FROM crawl_properties_old AS cpo
INNER JOIN properties AS prop
    ON cpo.property_id = prop.id
INNER JOIN agent_versions AS av
    ON av.agent_version = prop.value
WHERE prop.property = 'agent_version';

-- 'error' properties become rows whose error column holds the property value
-- cast to the dial_error type.
INSERT INTO crawl_properties (crawl_id, error, count, created_at, updated_at)
SELECT
    cpo.crawl_id,
    prop.value::dial_error,
    count,
    cpo.created_at,
    cpo.updated_at
FROM crawl_properties_old AS cpo
INNER JOIN properties AS prop
    ON cpo.property_id = prop.id
WHERE prop.property = 'error';

-- The renamed original is no longer needed once its rows have been copied.
DROP TABLE crawl_properties_old;
58+
59+
-- End the transaction
60+
COMMIT;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-- No down migration is provided for this step.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
-- Begin the transaction
2+
BEGIN;
3+
4+
-- Remove the generic property tables in one statement: the two *_x_properties
-- tables and the properties table itself.
DROP TABLE
    visits_x_properties,
    peers_x_properties,
    properties;
7+
8+
-- End the transaction
9+
COMMIT;
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-- Begin the transaction
2+
BEGIN;
3+
4+
-- Detach visits from multi_addresses_sets (foreign key first, then the
-- column) and remove the table afterwards.
ALTER TABLE visits
    DROP CONSTRAINT fk_visits_multi_addresses_set_id;
ALTER TABLE visits
    DROP COLUMN multi_addresses_set_id;

DROP TABLE multi_addresses_sets;
8+
9+
-- End the transaction
10+
COMMIT;

0 commit comments

Comments
 (0)