@@ -81,7 +81,8 @@ CREATE FUNCTION pgedge_vectorizer.enable_vectorization(
8181 chunk_size INT DEFAULT NULL ,
8282 chunk_overlap INT DEFAULT NULL ,
8383 embedding_dimension INT DEFAULT 1536 ,
84- chunk_table_name TEXT DEFAULT NULL
84+ chunk_table_name TEXT DEFAULT NULL ,
85+ source_pk NAME DEFAULT ' id'
8586) RETURNS VOID AS $$
8687DECLARE
8788 chunk_table TEXT ;
@@ -133,10 +134,10 @@ BEGIN
133134 CREATE OR REPLACE TRIGGER %I
134135 AFTER INSERT OR UPDATE ON %s
135136 FOR EACH ROW
136- EXECUTE FUNCTION pgedge_vectorizer.vectorization_trigger(%L, %L, %L, %L, %L)' ,
137+ EXECUTE FUNCTION pgedge_vectorizer.vectorization_trigger(%L, %L, %L, %L, %L, %L )' ,
137138 trigger_name, source_table,
138139 source_column, chunk_table, actual_strategy,
139- actual_chunk_size, actual_chunk_overlap);
140+ actual_chunk_size, actual_chunk_overlap, source_pk );
140141
141142 RAISE NOTICE ' Vectorization enabled: % -> %' , source_table, chunk_table;
142143 RAISE NOTICE ' Strategy: %, chunk_size: %, overlap: %' ,
@@ -155,8 +156,8 @@ BEGIN
155156 BEGIN
156157 RAISE NOTICE ' Processing existing rows...' ;
157158
158- FOR row_record IN EXECUTE format(' SELECT id , %I as content FROM %s WHERE %I IS NOT NULL AND %I != ' ' ' ' ' ,
159- source_column, source_table, source_column, source_column)
159+ FOR row_record IN EXECUTE format(' SELECT %I as pk_val , %I as content FROM %s WHERE %I IS NOT NULL AND %I != ' ' ' ' ' ,
160+ source_pk, source_column, source_table, source_column, source_column)
160161 LOOP
161162 doc_content := row_record .content ;
162163
@@ -182,7 +183,7 @@ BEGIN
182183 RETURNING id,
183184 (embedding IS NULL) AS needs_embedding' ,
184185 chunk_table, chunk_table, chunk_table)
185- USING row_record .id , i, chunk_text,
186+ USING row_record .pk_val , i, chunk_text,
186187 length(chunk_text) / 4 -- Approximate token count
187188 INTO chunk_id, needs_embedding;
188189
@@ -273,6 +274,7 @@ DECLARE
273274 strategy TEXT ;
274275 chunk_sz INT ;
275276 overlap INT ;
277+ pk_col TEXT ;
276278 doc_content TEXT ;
277279 chunks TEXT [];
278280 chunk_text TEXT ;
@@ -286,9 +288,10 @@ BEGIN
286288 strategy := TG_ARGV[2 ];
287289 chunk_sz := TG_ARGV[3 ]::INT ;
288290 overlap := TG_ARGV[4 ]::INT ;
291+ pk_col := COALESCE(TG_ARGV[5 ], ' id' );
289292
290- -- Get source document ID (assumes 'id' column)
291- EXECUTE format(' SELECT $1.id ' ) USING NEW INTO source_id_val;
293+ -- Get source document ID
294+ EXECUTE format(' SELECT ($1).%I ' , pk_col ) USING NEW INTO source_id_val;
292295
293296 -- Get document content
294297 EXECUTE format(' SELECT $1.%I' , content_col) USING NEW INTO doc_content;
@@ -554,6 +557,7 @@ BEGIN
554557 actual_strategy TEXT ;
555558 actual_chunk_size INT ;
556559 actual_chunk_overlap INT ;
560+ pk_col TEXT ;
557561 BEGIN
558562 -- Get chunking configuration from trigger arguments
559563 -- In PostgreSQL 17+, tgargs is bytea and needs to be decoded
@@ -567,18 +571,19 @@ BEGIN
567571 WHERE c .oid = source_table_name
568572 AND t .tgname = trigger_name;
569573
570- -- Arguments are 0-indexed: 0=chunk_table, 1=source_column, 2 =strategy, 3 =size, 4 =overlap
574+ -- Arguments (1-based positions in tgargs_array): 1=content_col, 2=chunk_table, 3=strategy, 4=size, 5=overlap, 6=pk_col
571575 actual_strategy := tgargs_array[3 ];
572576 actual_chunk_size := tgargs_array[4 ]::INT ;
573577 actual_chunk_overlap := tgargs_array[5 ]::INT ;
578+ pk_col := COALESCE(tgargs_array[6 ], ' id' );
574579 END;
575580
576581 RAISE NOTICE ' Re-chunking with strategy=%, size=%, overlap=%' ,
577582 actual_strategy, actual_chunk_size, actual_chunk_overlap;
578583
579584 FOR row_record IN EXECUTE format(
580- ' SELECT id , %I as content FROM %s WHERE %I IS NOT NULL AND %I != ' ' ' ' ' ,
581- source_column_name, source_table_name, source_column_name, source_column_name
585+ ' SELECT %I as pk_val , %I as content FROM %s WHERE %I IS NOT NULL AND %I != ' ' ' ' ' ,
586+ pk_col, source_column_name, source_table_name, source_column_name, source_column_name
582587 )
583588 LOOP
584589 doc_content := row_record .content ;
@@ -595,7 +600,7 @@ BEGIN
595600 INSERT INTO %I (source_id, chunk_index, content, token_count)
596601 VALUES ($1, $2, $3, $4)
597602 RETURNING id' , chunk_table_name)
598- USING row_record .id , i, chunk_text,
603+ USING row_record .pk_val , i, chunk_text,
599604 length(chunk_text) / 4 -- Approximate token count
600605 INTO chunk_id;
601606
0 commit comments