Skip to content

Commit 398b977

Browse files
Added ClickHouse DDL statements.
1 parent 2664ee9 commit 398b977

File tree

3 files changed

+866
-0
lines changed

3 files changed

+866
-0
lines changed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Bronze layer
2+
3+
## bluesky_raw table
4+
5+
```sql
6+
-- Bronze landing table for Bluesky events.
-- `data` and `_file` are the stored input columns; every other column is
-- MATERIALIZED, i.e. computed from `data` when a row is written.
CREATE TABLE bluesky.bluesky_raw
(
    -- Event payload. The two SKIP'd paths are excluded from the JSON column.
    `data` JSON(
        SKIP `commit.record.reply.root.record`,
        SKIP `commit.record.value.value`
    ),
    `_file` LowCardinality(String),
    `kind` LowCardinality(String) MATERIALIZED getSubcolumn(data, 'kind'),
    -- `time_us` is interpreted as microseconds since the Unix epoch.
    `scrape_ts` DateTime64(6) MATERIALIZED fromUnixTimestamp64Micro(
        CAST(getSubcolumn(data, 'time_us'), 'UInt64')
    ),
    -- The timestamp path depends on the event kind. Any other kind falls
    -- through to the zero timestamp, and the *OrZero parser also returns
    -- zero when the string cannot be parsed.
    `bluesky_ts` DateTime64(6) MATERIALIZED multiIf(
        getSubcolumn(data, 'kind') = 'commit',
            parseDateTime64BestEffortOrZero(CAST(getSubcolumn(data, 'commit.record.createdAt'), 'String')),
        getSubcolumn(data, 'kind') = 'identity',
            parseDateTime64BestEffortOrZero(CAST(getSubcolumn(data, 'identity.time'), 'String')),
        getSubcolumn(data, 'kind') = 'account',
            parseDateTime64BestEffortOrZero(CAST(getSubcolumn(data, 'account.time'), 'String')),
        toDateTime64(0, 6)
    ),
    -- Hash over the key/value pairs returned by JSONExtractKeysAndValues,
    -- with the `time_us` pair removed so it does not influence the hash.
    `dedup_hash` String MATERIALIZED cityHash64(
        arrayFilter(
            p -> ((p.1) != 'time_us'),
            JSONExtractKeysAndValues(CAST(data, 'String'), 'String')
        )
    )
)
ENGINE = ReplacingMergeTree
PRIMARY KEY (kind, bluesky_ts)
ORDER BY (kind, bluesky_ts, dedup_hash)
18+
```
19+
20+
## S3Queue table
21+
22+
```sql
23+
-- S3Queue source table: reads the objects matching the *.gz glob below and
-- exposes each record as a single nullable string column.
-- '<HMAC_KEY>' and '<HMAC_SECRET>' are placeholders for the real credentials.
CREATE TABLE bluesky.bluesky_queue
(
    `data` Nullable(String)
)
ENGINE = S3Queue(
    'https://storage.googleapis.com/pme-internal/bluesky/*.gz',
    '<HMAC_KEY>',
    '<HMAC_SECRET>',
    'CSVWithNames'
)
SETTINGS
    mode = 'ordered',
    s3queue_buckets = 30,
    s3queue_processing_threads_num = 10;
29+
```
30+
31+
32+
## Materialized view for S3Queue table
33+
```sql
34+
-- Forwards rows from the S3Queue table into bluesky.bluesky_raw, keeping only
-- rows whose payload is valid JSON (a NULL payload is filtered out as well,
-- because the comparison then evaluates to NULL).
--
-- The declared column list mirrors the SELECT list: both `data` and `_file`
-- are emitted. `_file` is not a declared column of bluesky_queue, so it
-- resolves to the engine-provided virtual column of that table.
CREATE MATERIALIZED VIEW bluesky.bluesky_mv TO bluesky.bluesky_raw
(
    `data` Nullable(String),
    `_file` LowCardinality(String)
)
AS SELECT
    data,
    _file
FROM bluesky.bluesky_queue
WHERE isValidJSON(data) = 1
43+
```
44+
45+
46+
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# Silver layer
2+
3+
## bluesky_dedup table
4+
5+
```sql
6+
-- Silver table holding events keyed by `dedup_hash`.
-- ReplacingMergeTree collapses rows that share the same sorting key
-- (`dedup_hash`) when parts of the same partition are merged.
CREATE TABLE bluesky.bluesky_dedup
(
    `data` JSON(
        SKIP `commit.record.reply.root.record`,
        SKIP `commit.record.value.value`
    ),
    `kind` LowCardinality(String),
    `scrape_ts` DateTime64(6),
    `bluesky_ts` DateTime64(6),
    `dedup_hash` String
)
ENGINE = ReplacingMergeTree
-- One partition per 20-minute window of `bluesky_ts`.
PARTITION BY toStartOfInterval(bluesky_ts, toIntervalMinute(20))
ORDER BY dedup_hash
-- Rows expire 1440 minutes (24 hours) after the start of their minute.
TTL toStartOfMinute(bluesky_ts) + toIntervalMinute(1440)
-- Expired data is removed by dropping whole parts only.
SETTINGS ttl_only_drop_parts = 1
18+
```
19+
20+
# Transfer from Bronze to Silver
21+
22+
## Materialized view for bluesky_dedup table
23+
24+
```sql
25+
-- Copies rows from bluesky.bluesky_raw into bluesky.bluesky_dedup when the
-- scrape timestamp and the Bluesky timestamp are less than 1200 seconds
-- (20 minutes) apart. timeDiff returns the difference in seconds.
--
-- `data` is declared with the same JSON(SKIP ...) type as the source and
-- target tables so the view's declared structure matches both ends.
CREATE MATERIALIZED VIEW bluesky.bluesky_dedup_mv TO bluesky.bluesky_dedup
(
    `data` JSON(
        SKIP `commit.record.reply.root.record`,
        SKIP `commit.record.value.value`
    ),
    `kind` LowCardinality(String),
    `scrape_ts` DateTime64(6),
    `bluesky_ts` DateTime64(6),
    `dedup_hash` String
)
AS SELECT
    data,
    kind,
    scrape_ts,
    bluesky_ts,
    dedup_hash
FROM bluesky.bluesky_raw
WHERE abs(timeDiff(scrape_ts, bluesky_ts)) < 1200
41+
```
42+
43+
## Dead-letter queue table
44+
45+
```sql
46+
-- Dead-letter table: same column set as bluesky.bluesky_dedup, but a plain
-- MergeTree (no replacing, partitioning or TTL), sorted by kind and scrape time.
CREATE TABLE bluesky.bluesky_dlq
(
    `data` JSON(
        SKIP `commit.record.reply.root.record`,
        SKIP `commit.record.value.value`
    ),
    `kind` LowCardinality(String),
    `scrape_ts` DateTime64(6),
    `bluesky_ts` DateTime64(6),
    `dedup_hash` String
)
ENGINE = MergeTree
ORDER BY (kind, scrape_ts)
56+
```
57+
58+
59+
## Materialized view for dead-letter queue table
60+
61+
```sql
62+
-- Copies rows from bluesky.bluesky_raw into bluesky.bluesky_dlq when the
-- scrape timestamp and the Bluesky timestamp are 1200 seconds (20 minutes)
-- or more apart. timeDiff returns the difference in seconds. Rows whose
-- `bluesky_ts` is the zero timestamp also satisfy this predicate.
--
-- `data` is declared with the same JSON(SKIP ...) type as the source and
-- target tables so the view's declared structure matches both ends.
CREATE MATERIALIZED VIEW bluesky.bluesky_dlq_mv TO bluesky.bluesky_dlq
(
    `data` JSON(
        SKIP `commit.record.reply.root.record`,
        SKIP `commit.record.value.value`
    ),
    `kind` LowCardinality(String),
    `scrape_ts` DateTime64(6),
    `bluesky_ts` DateTime64(6),
    `dedup_hash` String
)
AS SELECT
    data,
    kind,
    scrape_ts,
    bluesky_ts,
    dedup_hash
FROM bluesky.bluesky_raw
WHERE abs(timeDiff(scrape_ts, bluesky_ts)) >= 1200
78+
```
79+
80+
81+

0 commit comments

Comments
 (0)