1010import com .fasterxml .jackson .core .JsonParseException ;
1111import com .fasterxml .jackson .core .JsonParser ;
1212import com .fasterxml .jackson .core .JsonToken ;
13+ import com .fasterxml .jackson .core .exc .InputCoercionException ;
1314import com .fasterxml .jackson .core .io .JsonEOFException ;
1415
1516import org .apache .lucene .util .BytesRef ;
3334import java .io .Closeable ;
3435import java .io .IOException ;
3536import java .io .InputStream ;
36- import java .time .Instant ;
3737import java .util .BitSet ;
3838import java .util .HashMap ;
3939import java .util .List ;
4040import java .util .Map ;
4141
4242public class NdJsonPageDecoder implements Closeable {
4343
44- private static final Logger LOGGER = LogManager .getLogger (NdJsonPageDecoder .class );
44+ private static final Logger logger = LogManager .getLogger (NdJsonPageDecoder .class );
4545
4646 private InputStream input ;
4747 private final BlockDecoder decoder ;
@@ -64,7 +64,7 @@ public class NdJsonPageDecoder implements Closeable {
6464 this .input = input ;
6565
6666 var projectedAttributes = attributes ;
67- if (projectedColumns .isEmpty () == false ) {
67+ if (projectedColumns != null && projectedColumns .isEmpty () == false ) {
6868 // Keep projected columns in order, adding NULL for missing columns
6969 projectedAttributes = projectedColumns .stream ()
7070 .map (
@@ -99,9 +99,9 @@ Page decodePage() throws IOException {
9999 }
100100 } catch (JsonParseException e ) {
101101 if (e instanceof JsonEOFException ) {
102- LOGGER .debug ("Truncated NDJSON at line {} (expected at split boundaries): {}" , lineCount , e .getOriginalMessage ());
102+ logger .debug ("Truncated NDJSON at line {} (expected at split boundaries): {}" , lineCount , e .getOriginalMessage ());
103103 } else {
104- LOGGER . warn ("Malformed NDJSON at line {}: {}" , lineCount , e .getOriginalMessage ());
104+ logger . debug ("Malformed NDJSON at line {}: {}" , lineCount , e .getOriginalMessage ());
105105 }
106106 this .input = NdJsonUtils .moveToNextLine (parser , this .input );
107107 parser = NdJsonUtils .JSON_FACTORY .createParser (this .input );
@@ -112,12 +112,12 @@ Page decodePage() throws IOException {
112112 this .blockTracker .clear ();
113113
114114 try {
115- decoder .decodeObject (parser );
115+ decoder .decodeObject (parser , false );
116116 } catch (JsonParseException e ) {
117117 if (e instanceof JsonEOFException ) {
118- LOGGER .debug ("Truncated NDJSON at line {} (expected at split boundaries): {}" , lineCount , e .getOriginalMessage ());
118+ logger .debug ("Truncated NDJSON at line {} (expected at split boundaries): {}" , lineCount , e .getOriginalMessage ());
119119 } else {
120- LOGGER . warn ("Malformed NDJSON at line {}: {}" , lineCount , e .getOriginalMessage ());
120+ logger . debug ("Malformed NDJSON at line {}: {}" , lineCount , e .getOriginalMessage ());
121121 }
122122 this .input = NdJsonUtils .moveToNextLine (parser , this .input );
123123 parser = NdJsonUtils .JSON_FACTORY .createParser (this .input );
@@ -185,20 +185,22 @@ public void close() throws IOException {
185185 // A tree of decoders. Avoids path reconstruction when traversing nested objects.
186186 private class BlockDecoder {
187187 @ Nullable
188- Attribute attribute ;
189- Block . Builder blockBuilder ;
188+ DataType dataType ;
189+ String name ;
190190 int blockIdx ;
191+ Block .Builder blockBuilder ;
191192 Map <String , BlockDecoder > children ;
192193
// Binds this decoder node to a column. The added (+) lines copy the attribute's data type and
// name into the node's own fields; the removed (-) line kept a reference to the Attribute itself.
// blockIdx is the slot used elsewhere in this class to index the blockBuilders array
// (setupBuilders) and to mark the column in blockTracker (decodeValue).
// A node on which this method is never called keeps dataType == null, which setupBuilders
// checks before creating a builder.
193194 void setAttribute (Attribute attribute , int blockIdx ) {
194- this .attribute = attribute ;
195+ this .dataType = attribute .dataType ();
196+ this .name = attribute .name ();
195197 this .blockIdx = blockIdx ;
196198 }
197199
198200 // Builders setup independently as we need to create new ones for each page.
199201 void setupBuilders (Block .Builder [] blockBuilders ) {
200- if (attribute != null ) {
201- blockBuilder = switch (attribute . dataType () ) {
202+ if (dataType != null ) {
203+ blockBuilder = switch (dataType ) {
202204 // Keep in sync with NdJsonSchemaInferrer.inferValueSchema
203205 case BOOLEAN -> blockFactory .newBooleanBlockBuilder (batchSize );
204206 case NULL -> new ConstantNullBlock .Builder (blockFactory );
@@ -207,7 +209,7 @@ void setupBuilders(Block.Builder[] blockBuilders) {
207209 case DOUBLE -> blockFactory .newDoubleBlockBuilder (batchSize );
208210 case KEYWORD -> blockFactory .newBytesRefBlockBuilder (batchSize );
209211 case DATETIME -> blockFactory .newLongBlockBuilder (batchSize ); // milliseconds since epoch
210- default -> throw new IllegalArgumentException ("Unsupported data type: " + attribute . dataType () );
212+ default -> throw new IllegalArgumentException ("Unsupported data type: " + dataType );
211213 };
212214 blockBuilders [blockIdx ] = blockBuilder ;
213215 }
@@ -219,7 +221,7 @@ void setupBuilders(Block.Builder[] blockBuilders) {
219221 }
220222 }
221223
222- private void decodeObject (JsonParser parser ) throws IOException {
224+ private void decodeObject (JsonParser parser , boolean inArray ) throws IOException {
223225 JsonToken token = parser .currentToken ();
224226 if (token != JsonToken .START_OBJECT ) {
225227 throw new NdJsonParseException (parser , "Expected JSON object" );
@@ -234,69 +236,125 @@ private void decodeObject(JsonParser parser) throws IOException {
234236 // Unknown field, skip it
235237 parser .skipChildren ();
236238 } else {
237- childDecoder .decodeValue (parser );
239+ childDecoder .decodeValue (parser , inArray );
238240 }
239241 }
240242 }
241243
242- private void decodeValue (JsonParser parser ) throws IOException {
// Opens a position entry on this node's own builder (if it has one and the column is not
// DataType.NULL), then does the same recursively on every child decoder.
// decodeValue calls this when it meets a START_ARRAY token while not already inside an array.
244+ private void beginPositionEntry () {
245+ // We may have DataType.NULL for unknown columns. And NullBlock.Builder throws on beginPositionEntry()
246+ if (blockBuilder != null && dataType != DataType .NULL ) {
247+ blockBuilder .beginPositionEntry ();
248+ }
// Propagate to nested decoders; children is null for nodes without nested fields.
249+ if (children != null ) {
250+ for (var child : children .values ()) {
251+ child .beginPositionEntry ();
252+ }
253+ }
254+ }
255+
// Counterpart of beginPositionEntry(): closes the position entry on this node's builder and
// then recursively on every child decoder.
// decodeValue calls this after consuming the END_ARRAY of an outermost array.
256+ private void endPositionEntry () {
// Same guard as beginPositionEntry(): nodes without a builder and DataType.NULL columns are skipped.
257+ if (blockBuilder != null && dataType != DataType .NULL ) {
258+ blockBuilder .endPositionEntry ();
259+ }
260+ if (children != null ) {
261+ for (var child : children .values ()) {
262+ child .endPositionEntry ();
263+ }
264+ }
265+ }
266+
267+ private void decodeValue (JsonParser parser , boolean inArray ) throws IOException {
243268 JsonToken token = parser .currentToken ();
244- blockTracker .set (blockIdx );
269+
270+ if (dataType == DataType .NULL ) {
271+ // Don't do anything. We must do a single appendNull() on null blocks, this will be done
272+ // at the end of decodePage() when we check that all blocks have moved forward.
273+ parser .skipChildren ();
274+ return ;
275+ }
276+
245277 if (token == JsonToken .START_ARRAY ) {
246- this .blockBuilder .beginPositionEntry ();
278+ // Start a multi-value entry on this decoder and all its children (nested arrays are flattened).
279+ // Note: the `inArray` flag is needed because blockBuilder.beginPositionEntry() is not idempotent.
280+ // Calling it twice implicitly calls endPositionEntry().
281+ if (!inArray ) {
282+ beginPositionEntry ();
283+ }
247284 while (parser .nextToken () != JsonToken .END_ARRAY ) {
248- decodeValue (parser );
285+ decodeValue (parser , true );
249286 }
250- this .blockBuilder .endPositionEntry ();
287+ if (!inArray ) {
288+ endPositionEntry ();
289+ }
290+ return ;
291+ }
292+
293+ if (token == JsonToken .START_OBJECT ) {
294+ decodeObject (parser , inArray );
251295 return ;
252296 }
253297
254- if (token == JsonToken .VALUE_NULL ) {
298+ blockTracker .set (blockIdx );
299+ if (token == JsonToken .VALUE_NULL && inArray == false ) {
300+ // Nulls in arrays aren't supported. Furthermore, appendNull will implicitly call endPositionEntry()
255301 blockBuilder .appendNull ();
256302 return ;
257303 }
258304
259- switch (attribute . dataType () ) {
305+ switch (dataType ) {
260306 case BOOLEAN -> {
261307 if (token == JsonToken .VALUE_TRUE ) {
262308 ((BooleanBlock .Builder ) blockBuilder ).appendBoolean (true );
263309 } else if (token == JsonToken .VALUE_FALSE ) {
264310 ((BooleanBlock .Builder ) blockBuilder ).appendBoolean (false );
265311 } else {
266- unexpectedValue (parser );
312+ unexpectedValue (blockBuilder , parser , inArray );
267313 }
268314 }
269315 case NULL -> {
270316 // NULL handled above
271- unexpectedValue (parser );
317+ unexpectedValue (blockBuilder , parser , inArray );
272318 }
273319 case INTEGER -> {
274320 if (token == JsonToken .VALUE_NUMBER_INT || token == JsonToken .VALUE_NUMBER_FLOAT ) {
275- ((IntBlock .Builder ) blockBuilder ).appendInt (parser .getIntValue ());
321+ try {
322+ ((IntBlock .Builder ) blockBuilder ).appendInt (parser .getIntValue ());
323+ } catch (InputCoercionException e ) {
324+ unexpectedValue (blockBuilder , parser , inArray );
325+ }
276326 } else {
277- unexpectedValue (parser );
327+ unexpectedValue (blockBuilder , parser , inArray );
278328 }
279329 }
280330 case LONG -> {
281331 if (token == JsonToken .VALUE_NUMBER_INT || token == JsonToken .VALUE_NUMBER_FLOAT ) {
282- ((LongBlock .Builder ) blockBuilder ).appendLong (parser .getLongValue ());
332+ try {
333+ ((LongBlock .Builder ) blockBuilder ).appendLong (parser .getLongValue ());
334+ } catch (InputCoercionException e ) {
335+ unexpectedValue (blockBuilder , parser , inArray );
336+ }
283337 } else {
284- unexpectedValue (parser );
338+ unexpectedValue (blockBuilder , parser , inArray );
285339 }
286340 }
287341 case DOUBLE -> {
288342 if (token == JsonToken .VALUE_NUMBER_INT || token == JsonToken .VALUE_NUMBER_FLOAT ) {
289- ((DoubleBlock .Builder ) blockBuilder ).appendDouble (parser .getDoubleValue ());
343+ try {
344+ ((DoubleBlock .Builder ) blockBuilder ).appendDouble (parser .getDoubleValue ());
345+ } catch (InputCoercionException e ) {
346+ unexpectedValue (blockBuilder , parser , inArray );
347+ }
290348 } else {
291- unexpectedValue (parser );
349+ unexpectedValue (blockBuilder , parser , inArray );
292350 }
293351 }
294352 case DATETIME -> {
295353 try {
296- var millis = Instant . parse (parser .getValueAsString ()). toEpochMilli ( );
354+ var millis = NdJsonSchemaInferrer . DATE_FORMATTER . parseMillis (parser .getValueAsString ());
297355 ((LongBlock .Builder ) blockBuilder ).appendLong (millis );
298356 } catch (Exception e ) {
299- unexpectedValue (parser );
357+ unexpectedValue (blockBuilder , parser , inArray );
300358 }
301359 }
302360 case KEYWORD -> {
@@ -305,20 +363,21 @@ private void decodeValue(JsonParser parser) throws IOException {
305363 if (str != null ) {
306364 ((BytesRefBlock .Builder ) blockBuilder ).appendBytesRef (new BytesRef (str ));
307365 } else {
308- unexpectedValue (parser );
366+ unexpectedValue (blockBuilder , parser , inArray );
309367 }
310368 }
311- default -> throw new IllegalArgumentException ("Unsupported data type: " + attribute . dataType () );
369+ default -> throw new IllegalArgumentException ("Unsupported data type: " + dataType );
312370 }
313371 }
314372
// Handles a JSON value that cannot be stored in this column.
// Removed (-) version: logged a warning and skipped the value's children.
// Added (+) version: appends a null to `builder` unless the value sits inside an array, logs at
// debug level using the node's `name` field, and still skips the children.
// Every visible call site in decodeValue passes the node's own blockBuilder as `builder`.
315- private void unexpectedValue (JsonParser parser ) throws IOException {
316- LOGGER .warn (
317- "Unexpected token type: {} for attribute: {} at {}" ,
318- parser .currentToken (),
319- attribute .name (),
320- parser .getTokenLocation ()
321- );
373+ private void unexpectedValue (Block .Builder builder , JsonParser parser , boolean inArray ) throws IOException {
374+ // Append a null and log the problem
375+ if (inArray == false ) {
376+ // See previous comment about nulls and arrays
// (That comment is in decodeValue: nulls in arrays aren't supported, and appendNull
// implicitly calls endPositionEntry().)
377+ builder .appendNull ();
378+ }
379+
380+ logger .debug ("Unexpected token type: {} for attribute: {} at {}" , parser .currentToken (), name , parser .getTokenLocation ());
322381 // Ignore any children to keep reading other values
323382 parser .skipChildren ();
324383 }
0 commit comments