Skip to content

Commit e90f0bf

Browse files
authored
feat: add distributed embedding table (#42258)
1 parent 68c5797 commit e90f0bf

File tree

8 files changed

+220
-463
lines changed

8 files changed

+220
-463
lines changed

posthog/clickhouse/migrations/0174_add_content_column_to_document_embeddings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33

44
from products.error_tracking.backend.embedding import (
55
DOCUMENT_EMBEDDING_WRITABLE,
6-
DOCUMENT_EMBEDDINGS,
76
DOCUMENT_EMBEDDINGS_MV,
87
DOCUMENT_EMBEDDINGS_MV_SQL,
98
KAFKA_DOCUMENT_EMBEDDINGS,
109
KAFKA_DOCUMENT_EMBEDDINGS_TABLE_SQL,
10+
SHARDED_DOCUMENT_EMBEDDINGS,
1111
)
1212

1313
ADD_CONTENT_COLUMN_SQL = """
@@ -25,7 +25,7 @@
2525
node_roles=[NodeRole.INGESTION_SMALL],
2626
),
2727
run_sql_with_exceptions(
28-
ADD_CONTENT_COLUMN_SQL.format(table_name=DOCUMENT_EMBEDDINGS),
28+
ADD_CONTENT_COLUMN_SQL.format(table_name=SHARDED_DOCUMENT_EMBEDDINGS),
2929
node_roles=[NodeRole.DATA, NodeRole.COORDINATOR],
3030
sharded=False,
3131
is_alter_on_replicated_table=True,
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from posthog.clickhouse.client.connection import NodeRole
2+
from posthog.clickhouse.client.migration_tools import run_sql_with_exceptions
3+
4+
from products.error_tracking.backend.embedding import (
5+
DISTRIBUTED_DOCUMENT_EMBEDDINGS_TABLE_SQL,
6+
DOCUMENT_EMBEDDING_WRITABLE,
7+
DOCUMENT_EMBEDDINGS_MV,
8+
DOCUMENT_EMBEDDINGS_MV_SQL,
9+
DOCUMENT_EMBEDDINGS_TABLE_SQL,
10+
DOCUMENT_EMBEDDINGS_WRITABLE_TABLE_SQL,
11+
KAFKA_DOCUMENT_EMBEDDINGS,
12+
KAFKA_DOCUMENT_EMBEDDINGS_TABLE_SQL,
13+
)
14+
15+
# The only tricky part of this migration is that the behaviour of `DOCUMENT_EMBEDDINGS_TABLE_SQL` has changed -
16+
# it now creates sharded tables rather than replicated tables. We don't drop the replicated table, as in production
17+
# we want to keep the historical data around, at least until we get around to migrating it, but this does mean
18+
# all future "rebuild the world" runs of the migration set will never create that old table, only the new sharded one.
19+
20+
operations = [
21+
# 1. Drop MV to stop processing messages from kafka
22+
run_sql_with_exceptions(
23+
f"DROP TABLE IF EXISTS {DOCUMENT_EMBEDDINGS_MV}",
24+
node_roles=[NodeRole.INGESTION_SMALL],
25+
),
26+
# 2. Drop Kafka table
27+
run_sql_with_exceptions(
28+
f"DROP TABLE IF EXISTS {KAFKA_DOCUMENT_EMBEDDINGS}",
29+
node_roles=[NodeRole.INGESTION_SMALL],
30+
),
31+
# 3. Drop old writable table (but not the old "main" table, since we want to keep the data around)
32+
run_sql_with_exceptions(
33+
f"DROP TABLE IF EXISTS {DOCUMENT_EMBEDDING_WRITABLE}",
34+
node_roles=[NodeRole.INGESTION_SMALL],
35+
),
36+
# 4. Create new sharded data tables (`DOCUMENT_EMBEDDINGS_TABLE_SQL` used to create "posthog_document_embeddings" directly, but now creates the sharded version of that table)
37+
run_sql_with_exceptions(
38+
DOCUMENT_EMBEDDINGS_TABLE_SQL(),
39+
node_roles=[NodeRole.DATA],
40+
sharded=True,
41+
),
42+
# 5. Create distributed read table for the sharded data
43+
run_sql_with_exceptions(
44+
DISTRIBUTED_DOCUMENT_EMBEDDINGS_TABLE_SQL(),
45+
node_roles=[NodeRole.DATA, NodeRole.COORDINATOR],
46+
),
47+
# 6. Create new writable distributed table pointing to sharded tables
48+
run_sql_with_exceptions(
49+
DOCUMENT_EMBEDDINGS_WRITABLE_TABLE_SQL(),
50+
node_roles=[NodeRole.INGESTION_SMALL],
51+
),
52+
# 7. Recreate Kafka table
53+
run_sql_with_exceptions(
54+
KAFKA_DOCUMENT_EMBEDDINGS_TABLE_SQL(),
55+
node_roles=[NodeRole.INGESTION_SMALL],
56+
),
57+
# 8. Recreate MV writing to writable table (which now writes to sharded tables)
58+
run_sql_with_exceptions(
59+
DOCUMENT_EMBEDDINGS_MV_SQL(),
60+
node_roles=[NodeRole.INGESTION_SMALL],
61+
),
62+
]
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0182_endpoints_cluster_query_log_prod
1+
0183_shard_document_embeddings

posthog/clickhouse/schema.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,10 @@
171171
)
172172

173173
from products.error_tracking.backend.embedding import (
174+
DISTRIBUTED_DOCUMENT_EMBEDDINGS_TABLE_SQL,
174175
DOCUMENT_EMBEDDINGS_MV_SQL,
175176
DOCUMENT_EMBEDDINGS_TABLE_SQL,
177+
DOCUMENT_EMBEDDINGS_WRITABLE_TABLE_SQL,
176178
KAFKA_DOCUMENT_EMBEDDINGS_TABLE_SQL,
177179
)
178180
from products.error_tracking.backend.sql import (
@@ -264,6 +266,8 @@
264266
WRITABLE_APP_METRICS2_TABLE_SQL,
265267
WRITABLE_ERROR_TRACKING_ISSUE_FINGERPRINT_OVERRIDES_TABLE_SQL,
266268
WRITABLE_EVENTS_RECENT_TABLE_SQL,
269+
DISTRIBUTED_DOCUMENT_EMBEDDINGS_TABLE_SQL,
270+
DOCUMENT_EMBEDDINGS_WRITABLE_TABLE_SQL,
267271
)
268272
CREATE_KAFKA_TABLE_QUERIES = (
269273
KAFKA_LOG_ENTRIES_TABLE_SQL,

0 commit comments

Comments
 (0)