Skip to content

Commit ab31fcf

Browse files
dpage and claude committed
Support custom primary key column names for vectorized tables
Add a source_pk parameter to enable_vectorization() (defaults to 'id') that is passed through as a trigger argument. The vectorization_trigger() and recreate_chunks() functions now use this value instead of hardcoding 'id', allowing tables with any primary key column name to be vectorized.

Fixes #7

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent d9e7253 commit ab31fcf

File tree

3 files changed

+107
-12
lines changed

3 files changed

+107
-12
lines changed

sql/pgedge_vectorizer--1.0-beta2.sql

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,8 @@ CREATE FUNCTION pgedge_vectorizer.enable_vectorization(
8181
chunk_size INT DEFAULT NULL,
8282
chunk_overlap INT DEFAULT NULL,
8383
embedding_dimension INT DEFAULT 1536,
84-
chunk_table_name TEXT DEFAULT NULL
84+
chunk_table_name TEXT DEFAULT NULL,
85+
source_pk NAME DEFAULT 'id'
8586
) RETURNS VOID AS $$
8687
DECLARE
8788
chunk_table TEXT;
@@ -133,10 +134,10 @@ BEGIN
133134
CREATE OR REPLACE TRIGGER %I
134135
AFTER INSERT OR UPDATE ON %s
135136
FOR EACH ROW
136-
EXECUTE FUNCTION pgedge_vectorizer.vectorization_trigger(%L, %L, %L, %L, %L)',
137+
EXECUTE FUNCTION pgedge_vectorizer.vectorization_trigger(%L, %L, %L, %L, %L, %L)',
137138
trigger_name, source_table,
138139
source_column, chunk_table, actual_strategy,
139-
actual_chunk_size, actual_chunk_overlap);
140+
actual_chunk_size, actual_chunk_overlap, source_pk);
140141

141142
RAISE NOTICE 'Vectorization enabled: % -> %', source_table, chunk_table;
142143
RAISE NOTICE 'Strategy: %, chunk_size: %, overlap: %',
@@ -155,8 +156,8 @@ BEGIN
155156
BEGIN
156157
RAISE NOTICE 'Processing existing rows...';
157158

158-
FOR row_record IN EXECUTE format('SELECT id, %I as content FROM %s WHERE %I IS NOT NULL AND %I != ''''',
159-
source_column, source_table, source_column, source_column)
159+
FOR row_record IN EXECUTE format('SELECT %I as pk_val, %I as content FROM %s WHERE %I IS NOT NULL AND %I != ''''',
160+
source_pk, source_column, source_table, source_column, source_column)
160161
LOOP
161162
doc_content := row_record.content;
162163

@@ -182,7 +183,7 @@ BEGIN
182183
RETURNING id,
183184
(embedding IS NULL) AS needs_embedding',
184185
chunk_table, chunk_table, chunk_table)
185-
USING row_record.id, i, chunk_text,
186+
USING row_record.pk_val, i, chunk_text,
186187
length(chunk_text) / 4 -- Approximate token count
187188
INTO chunk_id, needs_embedding;
188189

@@ -273,6 +274,7 @@ DECLARE
273274
strategy TEXT;
274275
chunk_sz INT;
275276
overlap INT;
277+
pk_col TEXT;
276278
doc_content TEXT;
277279
chunks TEXT[];
278280
chunk_text TEXT;
@@ -286,9 +288,10 @@ BEGIN
286288
strategy := TG_ARGV[2];
287289
chunk_sz := TG_ARGV[3]::INT;
288290
overlap := TG_ARGV[4]::INT;
291+
pk_col := COALESCE(TG_ARGV[5], 'id');
289292

290-
-- Get source document ID (assumes 'id' column)
291-
EXECUTE format('SELECT $1.id') USING NEW INTO source_id_val;
293+
-- Get source document ID
294+
EXECUTE format('SELECT ($1).%I', pk_col) USING NEW INTO source_id_val;
292295

293296
-- Get document content
294297
EXECUTE format('SELECT $1.%I', content_col) USING NEW INTO doc_content;
@@ -554,6 +557,7 @@ BEGIN
554557
actual_strategy TEXT;
555558
actual_chunk_size INT;
556559
actual_chunk_overlap INT;
560+
pk_col TEXT;
557561
BEGIN
558562
-- Get chunking configuration from trigger arguments
559563
-- In PostgreSQL 17+, tgargs is bytea and needs to be decoded
@@ -567,18 +571,19 @@ BEGIN
567571
WHERE c.oid = source_table_name
568572
AND t.tgname = trigger_name;
569573

570-
-- Arguments are 0-indexed: 0=chunk_table, 1=source_column, 2=strategy, 3=size, 4=overlap
574+
-- Arguments: 1=content_col, 2=chunk_table, 3=strategy, 4=size, 5=overlap, 6=pk_col
571575
actual_strategy := tgargs_array[3];
572576
actual_chunk_size := tgargs_array[4]::INT;
573577
actual_chunk_overlap := tgargs_array[5]::INT;
578+
pk_col := COALESCE(tgargs_array[6], 'id');
574579
END;
575580

576581
RAISE NOTICE 'Re-chunking with strategy=%, size=%, overlap=%',
577582
actual_strategy, actual_chunk_size, actual_chunk_overlap;
578583

579584
FOR row_record IN EXECUTE format(
580-
'SELECT id, %I as content FROM %s WHERE %I IS NOT NULL AND %I != ''''',
581-
source_column_name, source_table_name, source_column_name, source_column_name
585+
'SELECT %I as pk_val, %I as content FROM %s WHERE %I IS NOT NULL AND %I != ''''',
586+
pk_col, source_column_name, source_table_name, source_column_name, source_column_name
582587
)
583588
LOOP
584589
doc_content := row_record.content;
@@ -595,7 +600,7 @@ BEGIN
595600
INSERT INTO %I (source_id, chunk_index, content, token_count)
596601
VALUES ($1, $2, $3, $4)
597602
RETURNING id', chunk_table_name)
598-
USING row_record.id, i, chunk_text,
603+
USING row_record.pk_val, i, chunk_text,
599604
length(chunk_text) / 4 -- Approximate token count
600605
INTO chunk_id;
601606

test/expected/edge_cases.out

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,57 @@ NOTICE: Vectorization disabled and chunk table dropped: edge_test_content_chunk
9090
(1 row)
9191

9292
DROP TABLE edge_test;
93+
-- Test 8: Custom primary key column name
94+
CREATE TABLE custom_pk_test (
95+
doc_id BIGSERIAL PRIMARY KEY,
96+
body TEXT
97+
);
98+
SELECT pgedge_vectorizer.enable_vectorization(
99+
'custom_pk_test'::regclass,
100+
'body',
101+
'token_based',
102+
100,
103+
10,
104+
1536,
105+
NULL,
106+
'doc_id'
107+
);
108+
NOTICE: Vectorization enabled: custom_pk_test -> custom_pk_test_body_chunks
109+
NOTICE: Strategy: token_based, chunk_size: 100, overlap: 10
110+
NOTICE: Processing existing rows...
111+
NOTICE: Processed 0 existing rows
112+
enable_vectorization
113+
----------------------
114+
115+
(1 row)
116+
117+
-- Insert a document using the custom PK
118+
INSERT INTO custom_pk_test (body)
119+
VALUES ('This document uses a custom primary key column.');
120+
-- Verify chunks were created with correct source_id
121+
SELECT COUNT(*) > 0 AS chunks_created FROM custom_pk_test_body_chunks WHERE source_id = 1;
122+
chunks_created
123+
----------------
124+
t
125+
(1 row)
126+
127+
-- Verify queue entries exist
128+
SELECT COUNT(*) > 0 AS queue_entries FROM pgedge_vectorizer.queue
129+
WHERE chunk_table = 'custom_pk_test_body_chunks';
130+
queue_entries
131+
---------------
132+
t
133+
(1 row)
134+
135+
-- Update the document (should recreate chunks)
136+
UPDATE custom_pk_test SET body = 'Updated body text.' WHERE doc_id = 1;
137+
-- Clean up
138+
DELETE FROM pgedge_vectorizer.queue WHERE chunk_table = 'custom_pk_test_body_chunks';
139+
SELECT pgedge_vectorizer.disable_vectorization('custom_pk_test'::regclass, 'body', true);
140+
NOTICE: Vectorization disabled and chunk table dropped: custom_pk_test_body_chunks
141+
disable_vectorization
142+
-----------------------
143+
144+
(1 row)
145+
146+
DROP TABLE custom_pk_test;

test/sql/edge_cases.sql

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,39 @@ SELECT COUNT(*) > 0 AS chunks_after_null_update FROM edge_test_content_chunks WH
5050
DELETE FROM pgedge_vectorizer.queue WHERE chunk_table = 'edge_test_content_chunks';
5151
SELECT pgedge_vectorizer.disable_vectorization('edge_test'::regclass, 'content', true);
5252
DROP TABLE edge_test;
53+
54+
-- Test 8: Custom primary key column name
55+
CREATE TABLE custom_pk_test (
56+
doc_id BIGSERIAL PRIMARY KEY,
57+
body TEXT
58+
);
59+
60+
SELECT pgedge_vectorizer.enable_vectorization(
61+
'custom_pk_test'::regclass,
62+
'body',
63+
'token_based',
64+
100,
65+
10,
66+
1536,
67+
NULL,
68+
'doc_id'
69+
);
70+
71+
-- Insert a document using the custom PK
72+
INSERT INTO custom_pk_test (body)
73+
VALUES ('This document uses a custom primary key column.');
74+
75+
-- Verify chunks were created with correct source_id
76+
SELECT COUNT(*) > 0 AS chunks_created FROM custom_pk_test_body_chunks WHERE source_id = 1;
77+
78+
-- Verify queue entries exist
79+
SELECT COUNT(*) > 0 AS queue_entries FROM pgedge_vectorizer.queue
80+
WHERE chunk_table = 'custom_pk_test_body_chunks';
81+
82+
-- Update the document (should recreate chunks)
83+
UPDATE custom_pk_test SET body = 'Updated body text.' WHERE doc_id = 1;
84+
85+
-- Clean up
86+
DELETE FROM pgedge_vectorizer.queue WHERE chunk_table = 'custom_pk_test_body_chunks';
87+
SELECT pgedge_vectorizer.disable_vectorization('custom_pk_test'::regclass, 'body', true);
88+
DROP TABLE custom_pk_test;

0 commit comments

Comments
 (0)