Commit 7b9b5eb

roelentlessjheer authored and committed
Fix null field metadata encoding
Tables with null columns crashed PyArrow/Polars with "ran out of field metadata", in both the stream and file IPC formats. The bug: a schema might define 3 columns, but the record batch wrote field metadata for only 2 of them because it skipped null columns, while PyArrow expects the counts to match. Fixed by writing field metadata for all columns, including null ones (they just don't get any data buffers). Also added a test, test data files (generated by PyArrow), and encode/decode tests that use them.
1 parent d95bee3 commit 7b9b5eb
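
For context, an Arrow IPC record batch message carries one field node (length and null count) per schema field, and readers consume those nodes in schema order. The sketch below is illustrative only, not this library's or PyArrow's actual decoder (decodeFieldNodes and its arguments are hypothetical names); it shows why an encoder that skips Null columns starves a spec-conforming reader:

function decodeFieldNodes(schema, fieldNodes) {
  let i = 0;
  for (const field of schema.fields) {
    // Every field consumes exactly one node, Null fields included.
    const node = fieldNodes[i++];
    if (node === undefined) {
      // The old encoder shipped 2 nodes for a 3-column schema, so a
      // conforming reader fails here: "ran out of field metadata".
      throw new Error('ran out of field metadata');
    }
    // A Null field stops after its node: no validity or value buffers follow.
  }
}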

File tree

4 files changed: +105 −4 lines


src/encode/table-to-ipc.js

Lines changed: 4 additions & 4 deletions

@@ -226,12 +226,12 @@ function assembleRecordBatch(columns, batchIndex = 0) {
 function visit(type, batch, ctx) {
   const { typeId } = type;
 
-  // no field node, no buffers
-  if (typeId === Type.Null) return;
-
-  // record field node info
+  // record field node info - ALL fields need field nodes, including nulls
   ctx.node(batch.length, batch.nullCount);
 
+  // null fields have field nodes but no data buffers
+  if (typeId === Type.Null) return;
+
   switch (typeId) {
     // validity and value buffers
     // backing dictionaries handled elsewhere
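
With the change above, every column gets a field node and Null columns simply contribute no buffers. A minimal round-trip sketch using the same public API as the new tests below (the all-null column is inferred as Type.Null, as those tests assert):

import { tableFromColumns, columnFromArray, tableFromIPC, tableToIPC } from '../src/index.js';

// A two-column table whose second column is all nulls (Type.Null).
const table = tableFromColumns({
  numbers: columnFromArray([1, 2, 3]),
  nulls: columnFromArray([null, null, null])
});

// Before the fix, the encoded batch carried one field node instead of two,
// which is what crashed PyArrow/Polars readers.
const bytes = tableToIPC(table, { format: 'stream' });
const roundTrip = tableFromIPC(bytes); // both columns decode intact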

test/data/null_test.arrow

810 Bytes (binary file not shown)

test/data/null_test.arrows

536 Bytes (binary file not shown)

test/null-field-test.js (new file)

Lines changed: 101 additions & 0 deletions

import assert from 'node:assert';
import { readFile } from 'node:fs/promises';
import { tableFromColumns, columnFromArray, tableFromIPC, tableToIPC } from '../src/index.js';

describe('Null field compatibility', () => {
  it('reads PyArrow-generated files with null columns', async () => {
    // Test file format
    const fileBytes = new Uint8Array(await readFile('test/data/null_test.arrow'));
    const fileTable = tableFromIPC(fileBytes);

    assert.strictEqual(fileTable.numRows, 4);
    assert.strictEqual(fileTable.numCols, 3);

    // Check schema
    const fields = fileTable.schema.fields;
    assert.strictEqual(fields.length, 3);
    assert.strictEqual(fields[0].name, 'numbers');
    assert.strictEqual(fields[1].name, 'text');
    assert.strictEqual(fields[2].name, 'nulls');
    assert.strictEqual(fields[2].type.typeId, 1); // Type.Null

    // Check data
    const numbers = fileTable.getChildAt(0).toArray();
    const text = fileTable.getChildAt(1).toArray();
    const nulls = fileTable.getChildAt(2).toArray();

    assert.deepStrictEqual(Array.from(numbers), [1, 2, 3, 4]);
    assert.deepStrictEqual(Array.from(text), ['hello', 'world', 'test', 'null']);
    assert.deepStrictEqual(Array.from(nulls), [null, null, null, null]);

    // Test stream format
    const streamBytes = new Uint8Array(await readFile('test/data/null_test.arrows'));
    const streamTable = tableFromIPC(streamBytes);

    assert.strictEqual(streamTable.numRows, 4);
    assert.strictEqual(streamTable.numCols, 3);
  });

  it('creates files with null columns that PyArrow can read', () => {
    // Create table with null column
    const intCol = columnFromArray([10, 20, 30]);
    const strCol = columnFromArray(['a', 'b', 'c']);
    const nullCol = columnFromArray([null, null, null]);

    const table = tableFromColumns({
      integers: intCol,
      strings: strCol,
      nulls: nullCol
    });

    // Test both formats
    const fileBytes = tableToIPC(table, { format: 'file' });
    const streamBytes = tableToIPC(table, { format: 'stream' });

    // Round-trip test
    const fileTable = tableFromIPC(fileBytes);
    const streamTable = tableFromIPC(streamBytes);

    // Verify structure
    assert.strictEqual(fileTable.numRows, 3);
    assert.strictEqual(fileTable.numCols, 3);
    assert.strictEqual(streamTable.numRows, 3);
    assert.strictEqual(streamTable.numCols, 3);

    // Verify null column is preserved
    const nullField = fileTable.schema.fields.find(f => f.name === 'nulls');
    assert.strictEqual(nullField.type.typeId, 1); // Type.Null

    const nullData = fileTable.getChild('nulls').toArray();
    assert.deepStrictEqual(Array.from(nullData), [null, null, null]);
  });

  it('handles mixed null and non-null data correctly', () => {
    // Test columns with some null values (different from pure null columns)
    const mixedCol = columnFromArray([1, null, 3, null, 5]);
    const pureNullCol = columnFromArray([null, null, null, null, null]);

    const table = tableFromColumns({
      mixed: mixedCol,
      pure_nulls: pureNullCol
    });

    const bytes = tableToIPC(table, { format: 'file' });
    const roundTrip = tableFromIPC(bytes);

    // Verify the mixed column (should not be Type.Null)
    const mixedField = roundTrip.schema.fields.find(f => f.name === 'mixed');
    assert.notStrictEqual(mixedField.type.typeId, 1); // Should be Int type, not Null

    // Verify the pure null column (should be Type.Null)
    const nullField = roundTrip.schema.fields.find(f => f.name === 'pure_nulls');
    assert.strictEqual(nullField.type.typeId, 1); // Should be Type.Null

    // Verify data integrity
    const mixedData = roundTrip.getChild('mixed').toArray();
    const nullData = roundTrip.getChild('pure_nulls').toArray();

    assert.deepStrictEqual(Array.from(mixedData), [1, null, 3, null, 5]);
    assert.deepStrictEqual(Array.from(nullData), [null, null, null, null, null]);
  });
});
