Skip to content

Commit 02d3e18

Browse files
authored
Support Parquet ZSTD and BigInt JSON (#54)
## Summary
- Register ZSTD support for Parquet via zlib
- Normalize BigInt values to JSON-safe numbers or strings
- Update Parquet tests and README

## Testing
- `yarn test --runTestsByPath __tests__/parquet_reader.test.js`
1 parent bbce9e6 commit 02d3e18

File tree

7 files changed

+190
-9
lines changed

7 files changed

+190
-9
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
---
"node-es-transformer": patch
---

Add ZSTD support for Parquet ingestion and normalize BigInt values during indexing

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,8 @@ Choose **one** of these sources:
344344
- **`sourceFormat`** (`'ndjson' | 'csv' | 'parquet' | 'arrow'`): Format for file/stream sources. Default: `'ndjson'`.
345345
- `arrow` expects Arrow IPC file/stream payloads.
346346
- `parquet` stream sources are currently buffered in memory before row iteration (file sources remain streaming by row cursor).
347+
- `parquet` supports ZSTD-compressed files when running on Node.js 22+ (uses the built-in zlib zstd implementation).
348+
- `parquet` INT64 values are normalized for JSON: safe-range values become numbers, larger values become strings.
347349
- **`csvOptions`** (object): CSV parser options (delimiter, quote, columns, etc.) used when `sourceFormat: 'csv'`.
348350

349351
#### Client Configuration

__tests__/parquet_reader.test.js

Lines changed: 123 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,32 @@ const path = require('path');
44

55
const retry = require('async-retry');
66
const parquet = require('@dsnp/parquetjs');
7+
const zlib = require('zlib');
const { PARQUET_COMPRESSION_METHODS } = require('@dsnp/parquetjs/dist/lib/compression');

// parquetjs ships without a ZSTD codec; wire one up from Node's built-in
// zlib zstd bindings so the tests can write and read ZSTD parquet files.
function registerZstdCompression() {
  if (PARQUET_COMPRESSION_METHODS.ZSTD) {
    // A codec is already registered (e.g. by another module) — nothing to do.
    return;
  }

  const hasNativeZstd =
    typeof zlib.zstdCompressSync === 'function' &&
    typeof zlib.zstdDecompressSync === 'function';

  if (!hasNativeZstd) {
    // Fail fast: these tests cannot run without a zstd-capable Node.js build.
    throw new Error('ZSTD compression requires Node.js with zstd support.');
  }

  PARQUET_COMPRESSION_METHODS.ZSTD = {
    deflate: (value) => zlib.zstdCompressSync(value),
    inflate: (value) => zlib.zstdDecompressSync(value),
  };
}

registerZstdCompression();
733

834
const transformer = require('../dist/node-es-transformer.cjs');
935
const deleteIndex = require('./utils/delete_index');
@@ -13,13 +39,16 @@ const client = getElasticsearchClient();
1339

1440
// Target Elasticsearch index names, one per test scenario in this suite.
const indexes = {
  single: 'parquet_file_reader_single',
  noTransform: 'parquet_file_reader_no_transform',
  wildcard: 'parquet_file_reader_wildcard',
  stream: 'parquet_stream_reader',
  zstd: 'parquet_file_reader_zstd',
};

// Scratch directory for generated parquet fixtures; removed in afterAll.
const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'node-es-transformer-parquet-'));
const sampleFile1 = path.join(tempDir, 'sample_data_parquet_1.parquet');
const sampleFile2 = path.join(tempDir, 'sample_data_parquet_2.parquet');
// ZSTD-compressed fixture used by the dedicated compression test.
const sampleFileZstd = path.join(tempDir, 'sample_data_parquet_zstd.parquet');
2352

2453
async function runTransformerAndWait(options) {
2554
const { events } = await transformer(options);
@@ -30,12 +59,16 @@ async function runTransformerAndWait(options) {
3059
});
3160
}
3261

33-
async function createParquetFile(filePath, rows) {
62+
// Builds a parquet schema field definition, attaching a compression codec
// only when one is provided (falsy values leave the field uncompressed).
function buildField(type, compression) {
  const field = { type };
  if (compression) {
    field.compression = compression;
  }
  return field;
}
65+
66+
async function createParquetFile(filePath, rows, compression) {
3467
const schema = new parquet.ParquetSchema({
35-
the_index: { type: 'INT64' },
36-
code: { type: 'INT64' },
37-
url: { type: 'UTF8' },
38-
text: { type: 'UTF8' },
68+
the_index: buildField('INT64', compression),
69+
code: buildField('INT64', compression),
70+
url: buildField('UTF8', compression),
71+
text: buildField('UTF8', compression),
3972
});
4073

4174
const writer = await parquet.ParquetWriter.openFile(schema, filePath);
@@ -87,12 +120,33 @@ describe('indexes parquet sources', () => {
87120
text: 'parquet-five',
88121
},
89122
]);
123+
124+
await createParquetFile(
125+
sampleFileZstd,
126+
[
127+
{
128+
the_index: 6,
129+
code: 600,
130+
url: 'https://example.com/p6',
131+
text: 'parquet-zstd-one',
132+
},
133+
{
134+
the_index: 7,
135+
code: 700,
136+
url: 'https://example.com/p7',
137+
text: 'parquet-zstd-two',
138+
},
139+
],
140+
'ZSTD',
141+
);
90142
});
91143

92144
afterAll(async () => {
  // Drop every index this suite created, then release the client connection
  // and the temporary fixture directory.
  for (const indexName of Object.values(indexes)) {
    await deleteIndex(client, indexName)();
  }
  await client.close();
  fs.rmSync(tempDir, { recursive: true, force: true });
});
@@ -132,6 +186,69 @@ describe('indexes parquet sources', () => {
132186
});
133187
});
134188

189+
it('should index a parquet file without a transform', async () => {
  // Ingest the first sample file without a transform callback so documents
  // pass through the BigInt-normalization path untouched by user code.
  await runTransformerAndWait({
    fileName: sampleFile1,
    sourceFormat: 'parquet',
    targetIndexName: indexes.noTransform,
    mappings: {
      properties: {
        the_index: { type: 'integer' },
        code: { type: 'integer' },
        url: { type: 'keyword' },
        text: { type: 'keyword' },
      },
    },
    verbose: false,
  });

  await client.indices.refresh({ index: indexes.noTransform });

  // Retry the search until the indexed row is visible.
  await retry(async () => {
    const searchUrl = `${elasticsearchUrl}/${indexes.noTransform}/_search?q=the_index:2`;
    const res = await fetch(searchUrl);
    expect(res.status).toBe(200);

    const body = await res.json();
    expect(body?.hits?.total?.value).toBe(1);
    expect(body?.hits?.hits?.[0]?._source?.text).toBe('parquet-two');
  });
});
216+
217+
it('should index a ZSTD-compressed parquet file', async () => {
  // The ZSTD fixture was written with per-field ZSTD compression in
  // beforeAll; this verifies the reader decompresses it end-to-end.
  await runTransformerAndWait({
    fileName: sampleFileZstd,
    sourceFormat: 'parquet',
    targetIndexName: indexes.zstd,
    mappings: {
      properties: {
        the_index: { type: 'integer' },
        code: { type: 'integer' },
        url: { type: 'keyword' },
        text: { type: 'keyword' },
      },
    },
    // Parquet INT64 columns surface as BigInt; coerce them so the integer
    // mappings accept the values.
    transform: (doc) => ({
      ...doc,
      the_index: Number(doc.the_index),
      code: Number(doc.code),
    }),
    verbose: false,
  });

  await client.indices.refresh({ index: indexes.zstd });

  // Retry the search until the indexed row is visible.
  await retry(async () => {
    const searchUrl = `${elasticsearchUrl}/${indexes.zstd}/_search?q=the_index:6`;
    const res = await fetch(searchUrl);
    expect(res.status).toBe(200);

    const body = await res.json();
    expect(body?.hits?.total?.value).toBe(1);
    expect(body?.hits?.hits?.[0]?._source?.text).toBe('parquet-zstd-one');
  });
});
251+
135252
it('should index parquet files through wildcard patterns', async () => {
136253
await runTransformerAndWait({
137254
fileName: path.join(tempDir, 'sample_data_parquet_*.parquet'),
@@ -162,7 +279,7 @@ describe('indexes parquet sources', () => {
162279
expect(res.status).toBe(200);
163280

164281
const body = await res.json();
165-
expect(body?.count).toBe(5);
282+
expect(body?.count).toBe(7);
166283
});
167284
});
168285

src/_file-reader.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import parquet from '@dsnp/parquetjs';
1+
import parquet from './_parquet';
22
import * as arrow from 'apache-arrow';
33
import fs from 'fs';
44
import { parse } from 'csv-parse';

src/_index-queue.js

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,26 @@ import { DEFAULT_BUFFER_SIZE } from './_constants';
55
const EventEmitter = require('events');
66

77
const parallelCalls = 5;
8+
// JSON.stringify throws on BigInt; these helpers make parquet INT64 values
// JSON-safe before documents are written to the bulk-indexing stream.
const MAX_SAFE_BIGINT = BigInt(Number.MAX_SAFE_INTEGER);
const MIN_SAFE_BIGINT = BigInt(Number.MIN_SAFE_INTEGER);

// Convert a BigInt to a plain number when it fits the IEEE-754 safe integer
// range, otherwise to a decimal string so no precision is lost.
function coerceBigInt(value) {
  const fitsInNumber = value >= MIN_SAFE_BIGINT && value <= MAX_SAFE_BIGINT;
  return fitsInNumber ? Number(value) : value.toString();
}

// JSON.stringify with a replacer that rewrites every BigInt via coerceBigInt;
// all other values serialize unchanged.
function safeStringify(doc) {
  const replacer = (_key, value) => (typeof value === 'bigint' ? coerceBigInt(value) : value);
  return JSON.stringify(doc, replacer);
}
828

929
// a simple helper queue to bulk index documents
1030
export default function indexQueueFactory({
@@ -126,7 +146,7 @@ export default function indexQueueFactory({
126146
throw new Error('Unexpected doc added after indexer should finish.');
127147
}
128148

129-
const canContinue = stream.write(`${JSON.stringify(doc)}\n`);
149+
const canContinue = stream.write(`${safeStringify(doc)}\n`);
130150
if (!canContinue) {
131151
queueEmitter.emit('pause');
132152

src/_parquet.js

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import parquet from '@dsnp/parquetjs';
import zlib from 'zlib';
import { PARQUET_COMPRESSION_METHODS } from '@dsnp/parquetjs/dist/lib/compression.js';

// parquetjs has no built-in ZSTD codec. Register one backed by Node's zlib
// zstd bindings (Node 22+). On runtimes without zstd we still register the
// codec, but each call throws a descriptive error — so importing this module
// stays safe, and the failure surfaces only when a ZSTD file is actually
// read or written.
function registerZstdCompression() {
  if (PARQUET_COMPRESSION_METHODS.ZSTD) {
    // A codec is already present (registered elsewhere) — leave it alone.
    return;
  }

  const hasNativeZstd =
    typeof zlib.zstdCompressSync === 'function' &&
    typeof zlib.zstdDecompressSync === 'function';

  if (!hasNativeZstd) {
    const unsupported = () => {
      throw new Error('ZSTD compression requires Node.js with zstd support.');
    };
    PARQUET_COMPRESSION_METHODS.ZSTD = {
      deflate: unsupported,
      inflate: unsupported,
    };
    return;
  }

  PARQUET_COMPRESSION_METHODS.ZSTD = {
    deflate: (value) => zlib.zstdCompressSync(value),
    inflate: (value) => zlib.zstdDecompressSync(value),
  };
}

registerZstdCompression();

// Re-export the patched parquetjs instance for the file/stream readers.
export default parquet;

src/_stream-reader.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import parquet from '@dsnp/parquetjs';
1+
import parquet from './_parquet';
22
import * as arrow from 'apache-arrow';
33
import { parse } from 'csv-parse';
44
import es from 'event-stream';

0 commit comments

Comments
 (0)