diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c55b6b8..d4e9661 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -37,7 +37,7 @@ jobs: loadtest: strategy: matrix: - kind: ['csv_agg', 'csv_agg_delim', 'postgrest'] + kind: ['csv_agg', 'csv_agg_delim', 'csv_agg_delim_bom', 'postgrest'] name: Loadtest runs-on: ubuntu-24.04 steps: diff --git a/Makefile b/Makefile index 4d65e33..fa72836 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,7 @@ else endif EXTENSION = pg_csv -EXTVERSION = 0.2 +EXTVERSION = 0.3 DATA = $(wildcard sql/*--*.sql) diff --git a/README.md b/README.md index cdf7b39..06bf374 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,12 @@ select csv_agg(x) from projects x; (1 row) ``` -It also supports adding a custom delimiter. +### Custom Delimiter + +You can use a custom delimiter. ```psql -select csv_agg(x, '|') from projects x; +select csv_agg(x, csv_options(delimiter := '|')) from projects x; csv_agg ------------------- id|name|client_id+ @@ -50,5 +52,22 @@ select csv_agg(x, '|') from projects x; (1 row) ``` -> [!IMPORTANT] +> [!NOTE] > Newline, carriage return and double quotes are not supported as delimiters to maintain the integrity of the separated values format. + +### BOM + +You can include a byte-order mark (BOM) to make the CSV compatible with Excel. 
+ +```psql +select csv_agg(x, csv_options(bom := true)) from projects x; + csv_agg +------------------- +id,name,client_id+ + 1,Windows 7,1 + + 2,Windows 10,1 + + 3,IOS,2 + + 4,OSX,2 + + 5,Orphan, +(1 row) +``` diff --git a/bench/csv_agg_delim_bom.sql b/bench/csv_agg_delim_bom.sql new file mode 100644 index 0000000..ffdec38 --- /dev/null +++ b/bench/csv_agg_delim_bom.sql @@ -0,0 +1,5 @@ +\set lim random(1000, 2000) + +select csv_agg(t, csv_options(delimiter:=',', bom:=true)) from ( + select * from student_emotion_assessments limit :lim +) as t; diff --git a/sql/pg_csv--0.2--0.3.sql b/sql/pg_csv--0.2--0.3.sql new file mode 100644 index 0000000..6be414b --- /dev/null +++ b/sql/pg_csv--0.2--0.3.sql @@ -0,0 +1,10 @@ +alter type csv_options add attribute bom bool; + +-- "create or replace" only replaces a function whose argument types are identical, so the +-- 0.2 csv_options("char") would survive next to the new overload and make calls such as +-- csv_options() or csv_options(delimiter := '|') ambiguous; drop it before creating the new one. +drop function csv_options("char"); + +create or replace function csv_options(delimiter "char" default NULL, bom bool default NULL) returns csv_options as $$ + select row(delimiter, bom)::csv_options; +$$ language sql; diff --git a/sql/pg_csv.sql b/sql/pg_csv.sql index b48fbc3..9932321 100644 --- a/sql/pg_csv.sql +++ b/sql/pg_csv.sql @@ -1,9 +1,10 @@ create type csv_options as ( delimiter "char" +, bom bool ); -create function csv_options(delimiter "char" default ',') returns csv_options as $$ - select row(delimiter)::csv_options; +create or replace function csv_options(delimiter "char" default NULL, bom bool default NULL) returns csv_options as $$ + select row(delimiter, bom)::csv_options; $$ language sql; create function csv_agg_transfn(internal, anyelement) @@ -34,4 +35,3 @@ create aggregate csv_agg(anyelement, csv_options) ( finalfunc = csv_agg_finalfn, parallel = safe ); - diff --git a/src/pg_csv.c b/src/pg_csv.c index 59574e4..21535c2 100644 --- a/src/pg_csv.c +++ b/src/pg_csv.c @@ -5,9 +5,11 @@ PG_MODULE_MAGIC; static const char NEWLINE = '\n'; static const char DQUOTE = '"'; static const char CR = '\r'; +static const char BOM[3] = "\xEF\xBB\xBF"; typedef struct { char delim; + bool with_bom; } CsvOptions; typedef struct { @@ -55,15 +57,16 
@@ static char *datum_to_cstring(Datum datum, Oid typeoid) { static void parse_csv_options(HeapTupleHeader opts_hdr, CsvOptions *csv_opts) { // defaults - csv_opts->delim = ','; + csv_opts->delim = ','; + csv_opts->with_bom = false; if (opts_hdr == NULL) return; TupleDesc desc = lookup_rowtype_tupdesc(HeapTupleHeaderGetTypeId(opts_hdr), HeapTupleHeaderGetTypMod(opts_hdr)); - Datum values[1]; - bool nulls[1]; + Datum values[2]; + bool nulls[2]; heap_deform_tuple( &(HeapTupleData){.t_len = HeapTupleHeaderGetDatumLength(opts_hdr), .t_data = opts_hdr}, desc, @@ -77,6 +80,10 @@ static void parse_csv_options(HeapTupleHeader opts_hdr, CsvOptions *csv_opts) { "double quote"))); } + if (!nulls[1]) { + csv_opts->with_bom = DatumGetBool(values[1]); + } + ReleaseTupleDesc(desc); } @@ -118,6 +125,8 @@ Datum csv_agg_transfn(PG_FUNCTION_ARGS) { TupleDesc tdesc = lookup_rowtype_tupdesc(HeapTupleHeaderGetTypeId(next), HeapTupleHeaderGetTypMod(next)); + if (state->options->with_bom) appendBinaryStringInfo(&state->accum_buf, BOM, sizeof(BOM)); + // build header row for (int i = 0; i < tdesc->natts; i++) { Form_pg_attribute att = TupleDescAttr(tdesc, i); diff --git a/test/expected/bom.out b/test/expected/bom.out new file mode 100644 index 0000000..f339569 --- /dev/null +++ b/test/expected/bom.out @@ -0,0 +1,38 @@ +-- this is done to avoid failing on a pure psql change that happened on postgres 16 +-- on pg <= 15 the BOM output adds one extra space, on pg 16 it doesn't +\pset format unaligned +\pset tuples_only on +\echo + +-- include BOM (byte-order mark) +SELECT csv_agg(x, csv_options(bom := true)) AS body +FROM projects x; +id,name,client_id +1,Windows 7,1 +2,"has,comma",1 +,, +4,OSX,2 +,"has""quote", +5,"has,comma and ""quote""",7 +6,"has + LF",7 +7,"has CR",8 +8,"has + CRLF""",8 +\echo + +-- include BOM with custom delimiter +SELECT csv_agg(x, csv_options(delimiter := ';', bom := true)) AS body +FROM projects x; +id;name;client_id +1;Windows 7;1 +2;has,comma;1 +;; +4;OSX;2 
+;"has""quote"; +5;"has,comma and ""quote""";7 +6;"has + LF";7 +7;"has CR";8 +8;"has + CRLF""";8 diff --git a/test/sql/bom.sql b/test/sql/bom.sql new file mode 100644 index 0000000..60f086e --- /dev/null +++ b/test/sql/bom.sql @@ -0,0 +1,14 @@ +-- this is done to avoid failing on a pure psql change that happened on postgres 16 +-- on pg <= 15 the BOM output adds one extra space, on pg 16 it doesn't +\pset format unaligned +\pset tuples_only on +\echo + +-- include BOM (byte-order mark) +SELECT csv_agg(x, csv_options(bom := true)) AS body +FROM projects x; +\echo + +-- include BOM with custom delimiter +SELECT csv_agg(x, csv_options(delimiter := ';', bom := true)) AS body +FROM projects x;