I noticed the discussion in Issue #94 (Unexpectedly large BM25 index size) and ran into a similar problem when using the jieba tokenizer: the index size is unexpectedly large, especially when processing Chinese text.
With a vocabulary of 324,725 terms, the index comes out at 10153 MB.
When can we expect the release of the new Docker image or code?
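For scale, a rough back-of-the-envelope calculation from the two numbers above (only an estimate, not a measurement of any internal structure): 10153 MB spread over 324,725 vocabulary entries is on the order of 32 kB of index per term.
-- Rough estimate only, using the figures reported above
-- (10153 MB index, 324,725 vocabulary terms): ~32 kB per term.
select pg_size_pretty((10153::bigint * 1024 * 1024) / 324725) as approx_index_bytes_per_term;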
Reproduce:
-- Docker Image: tensorchord/vchord-suite:pg15-20251201
-- vchord: 1.0.0
-- pg_tokenizer: 0.1.1
-- vchord_bm25: 0.2.2
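(Not strictly needed for the repro: the installed extension versions listed above can be confirmed from the standard pg_extension catalog.)
-- Confirm installed extension versions (standard catalog query).
select extname, extversion
from pg_extension
where extname in ('vchord', 'pg_tokenizer', 'vchord_bm25');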
create extension if not exists pg_tokenizer;
create extension if not exists vchord_bm25;
create table documents (
id serial primary key,
passage text,
embedding int[]
);
\copy documents(id, passage) from 'collection.tsv' with (format csv, delimiter e'\t') where id < 100000;
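(Optional sanity check after the \copy: the load should report just under 100,000 rows, since only ids below 100000 are imported. Plain SQL, nothing extension-specific.)
-- Should return just under 100,000 rows.
select count(*) from documents;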
SELECT create_text_analyzer('jieba_analyzer', $$
[pre_tokenizer.jieba]
$$);
select create_custom_model_tokenizer_and_trigger(
tokenizer_name => 'tokenizer1',
model_name => 'model1',
    text_analyzer_name => 'jieba_analyzer',
table_name => 'documents',
source_column => 'passage',
target_column => 'embedding'
);
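(Optional sanity check, not part of the minimal repro: assuming pg_tokenizer's tokenize() helper is available as described in its README, the custom tokenizer can be exercised directly on a short Chinese sentence and should return a bm25vector over model1's vocabulary.)
-- Assumes pg_tokenizer exposes tokenize(text, tokenizer_name); the sample sentence is arbitrary.
select tokenize('今天天气很好，我们去公园散步。', 'tokenizer1');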
select count(*) from model_model1; -- 324,725 vocab size
alter table documents alter column embedding type bm25vector;
create index if not exists documents_embedding_idx on documents using bm25 (embedding bm25_ops);
-- Result: documents_embedding_idx 10153 MB
select indexname, pg_size_pretty(pg_relation_size(indexname::regclass)) as size
from pg_indexes where tablename = 'documents';
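For context, the heap size of the table can be reported alongside the index size; these are standard PostgreSQL size functions, nothing specific to vchord_bm25.
-- Table heap vs. bm25 index size, for comparison (standard PostgreSQL functions).
select pg_size_pretty(pg_table_size('documents'))                  as table_size,
       pg_size_pretty(pg_relation_size('documents_embedding_idx')) as index_size;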