|
35 | 35 | #include "arrow/util/bit_util.h" |
36 | 36 | #include "arrow/util/io_util.h" |
37 | 37 | #include "arrow/util/rle_encoding_internal.h" |
| 38 | +#include "arrow/util/span.h" |
38 | 39 |
|
39 | 40 | namespace arrow::util { |
40 | 41 |
|
@@ -458,6 +459,29 @@ void TestRleBitPackedParser(std::vector<uint8_t> bytes, rle_size_t bit_width, |
458 | 459 | EXPECT_EQ(decoded, expected); |
459 | 460 | } |
460 | 461 |
|
// Test helper: checks that parsing `bytes` at the given `bit_width` ends in an
// error. The parser is driven by a handler that never asks to stop, and is
// expected to remain non-exhausted afterwards.
| 462 | +void TestRleBitPackedParserError(span<const uint8_t> bytes, rle_size_t bit_width) { |
| 463 | + auto parser = |
| 464 | + RleBitPackedParser(bytes.data(), static_cast<rle_size_t>(bytes.size()), bit_width); |
// Checked before parsing: a freshly built parser must not already report
// exhaustion, otherwise the final check below would not tell us anything.
| 465 | + EXPECT_FALSE(parser.exhausted()); |
| 466 | + |
// Handler that ignores the contents of every run and always returns
// ControlFlow::Continue, so the handler itself never stops the parse.
| 467 | + struct { |
| 468 | + auto OnRleRun(RleRun run) { return RleBitPackedParser::ControlFlow::Continue; } |
| 469 | + auto OnBitPackedRun(BitPackedRun run) { |
| 470 | + return RleBitPackedParser::ControlFlow::Continue; |
| 471 | + } |
| 472 | + } handler; |
| 473 | + |
| 474 | + // Iterate over all runs |
| 475 | + parser.Parse(handler); |
| 476 | + // Non-exhaustion despite ControlFlow::Continue signals an error occurred. |
| 477 | + EXPECT_FALSE(parser.exhausted()); |
| 478 | +} |
| 479 | + |
// Overload taking a std::vector: wraps the vector in a span and forwards to the
// span-based overload with the same `bit_width`.
// NOTE(review): presumably this is the overload selected when tests pass a
// brace-enclosed byte list -- confirm against span's constructors.
| 480 | +void TestRleBitPackedParserError(const std::vector<uint8_t>& bytes, |
| 481 | + rle_size_t bit_width) { |
| 482 | + TestRleBitPackedParserError(span(bytes), bit_width); |
| 483 | +} |
| 484 | + |
461 | 485 | TEST(RleBitPacked, RleBitPackedParser) { |
462 | 486 | TestRleBitPackedParser<uint16_t>( |
463 | 487 | /* bytes= */ |
@@ -500,6 +524,108 @@ TEST(RleBitPacked, RleBitPackedParser) { |
500 | 524 | } |
501 | 525 | } |
502 | 526 |
|
// Exercises trailing bit-packed runs whose payload is shorter than the number of
// values declared in the header: first the inputs expected to decode, then the
// inputs expected to be reported as errors.
| 527 | +TEST(RleBitPacked, RleBitPackedParserInvalidNonPadded) { |
| 528 | + // GH-47981: a non-padded trailing bit-packed run, produced by some non-compliant |
| 529 | + // encoders, should still be decoded successfully. |
| 530 | + |
// 2 payload bytes = 16 bits: room for 5 complete 3-bit values.
| 531 | + TestRleBitPackedParser<uint16_t>( |
| 532 | + /* bytes= */ |
| 533 | + {/* LEB128 for 8 values bit packed marker */ 0x3, |
| 534 | + /* Bitpacked run */ 0x88, 0xc6}, |
| 535 | + /* bit_width= */ 3, |
| 536 | + /* expected= */ {0, 1, 2, 3, 4}); |
// 1 payload byte = 8 bits: room for 2 complete 3-bit values.
| 537 | + TestRleBitPackedParser<uint16_t>( |
| 538 | + /* bytes= */ |
| 539 | + {/* LEB128 for 8 values bit packed marker */ 0x3, |
| 540 | + /* Bitpacked run */ 0x88}, |
| 541 | + /* bit_width= */ 3, |
| 542 | + /* expected= */ {0, 1}); |
// bit_width 8: one value per payload byte, 3 bytes present out of 8 declared.
| 543 | + TestRleBitPackedParser<uint16_t>( |
| 544 | + /* bytes= */ |
| 545 | + {/* LEB128 for 8 values bit packed marker */ 0x3, |
| 546 | + /* Bitpacked run */ 0x1, 0x2, 0x3}, |
| 547 | + /* bit_width= */ 8, |
| 548 | + /* expected= */ {1, 2, 3}); |
// bit_width 8: 7 bytes present out of 8 declared (one value short).
| 549 | + TestRleBitPackedParser<uint16_t>( |
| 550 | + /* bytes= */ |
| 551 | + {/* LEB128 for 8 values bit packed marker */ 0x3, |
| 552 | + /* Bitpacked run */ 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7}, |
| 553 | + /* bit_width= */ 8, |
| 554 | + /* expected= */ {1, 2, 3, 4, 5, 6, 7}); |
// 16 values declared, 9 bytes present at bit_width 8: 9 values are expected.
| 555 | + TestRleBitPackedParser<uint16_t>( |
| 556 | + /* bytes= */ |
| 557 | + {/* LEB128 for 16 values bit packed marker */ 0x5, |
| 558 | + /* Bitpacked run */ 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9}, |
| 559 | + /* bit_width= */ 8, |
| 560 | + /* expected= */ {1, 2, 3, 4, 5, 6, 7, 8, 9}); |
| 561 | + |
| 562 | + // If the trailing bit-packed run declares more values than padding allows, |
| 563 | + // it's an error. |
| 564 | + |
| 565 | + // 2 values encoded, 16 values declared (8 would be ok) |
| 566 | + TestRleBitPackedParserError( |
| 567 | + /* bytes= */ |
| 568 | + {/* LEB128 for 16 values bit packed marker */ 0x5, |
| 569 | + /* Bitpacked run */ 0x88}, |
| 570 | + /* bit_width= */ 3); |
| 571 | + // 8 values encoded, 16 values declared (8 would be ok) |
| 572 | + TestRleBitPackedParserError( |
| 573 | + /* bytes= */ |
| 574 | + {/* LEB128 for 16 values bit packed marker */ 0x5, |
| 575 | + /* Bitpacked run */ 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8}, |
| 576 | + /* bit_width= */ 8); |
| 577 | + |
| 578 | + // If the trailing bit-packed run does not have room for at least 1 value, |
| 579 | + // it's an error. |
| 580 | + |
// Header only: there are no payload bytes at all.
| 581 | + TestRleBitPackedParserError( |
| 582 | + /* bytes= */ |
| 583 | + {/* LEB128 for 8 values bit packed marker */ 0x3}, |
| 584 | + /* bit_width= */ 3); |
// One payload byte (8 bits) cannot hold a single 9-bit value.
| 585 | + TestRleBitPackedParserError( |
| 586 | + /* bytes= */ |
| 587 | + {/* LEB128 for 8 values bit packed marker */ 0x3, |
| 588 | + /* Bitpacked run */ 0x1}, |
| 589 | + /* bit_width= */ 9); |
| 590 | +} |
| 591 | + |
// Feeds malformed run headers to the parser and expects each input to be
// reported as an error by TestRleBitPackedParserError.
| 592 | +TEST(RleBitPacked, RleBitPackedParserErrors) { |
| 593 | + // Truncated LEB128 header |
// (0x81 has its high bit set, which in LEB128 announces a following byte, but
// the input ends there.)
| 594 | + TestRleBitPackedParserError( |
| 595 | + /* bytes= */ |
| 596 | + {0x81}, |
| 597 | + /* bit_width= */ 3); |
| 598 | + |
| 599 | + // Invalid LEB128 header for a 32-bit value |
// (five bytes carry 5 * 7 = 35 payload bits, all set here, which is more than
// 32 bits can represent.)
| 600 | + TestRleBitPackedParserError( |
| 601 | + /* bytes= */ |
| 602 | + {0xFF, 0xFF, 0xFF, 0xFF, 0x7f}, |
| 603 | + /* bit_width= */ 3); |
| 604 | + |
| 605 | + // Zero-length repeated run |
// (header value 0, encoded first on one byte and then on two bytes.)
| 606 | + TestRleBitPackedParserError( |
| 607 | + /* bytes= */ |
| 608 | + {0x00}, |
| 609 | + /* bit_width= */ 3); |
| 610 | + TestRleBitPackedParserError( |
| 611 | + /* bytes= */ |
| 612 | + {0x80, 0x00}, |
| 613 | + /* bit_width= */ 3); |
| 614 | + |
| 615 | + // Zero-length bit-packed run |
// (header value 1: only the low marker bit is set, the remaining bits are 0.)
| 616 | + TestRleBitPackedParserError( |
| 617 | + /* bytes= */ |
| 618 | + {0x01}, |
| 619 | + /* bit_width= */ 3); |
| 620 | + |
| 621 | + // Bit-packed run too large |
| 622 | + // (we pass a span<> on invalid memory, but only the reachable part should be read) |
// The header bytes decode as LEB128 to 0x20000001. `bytes` only owns these 5
// bytes, while the span built below claims a length of 1 << 30.
// NOTE(review): presumably the oversized span keeps the declared run from being
// rejected merely for exceeding the buffer length -- confirm against the parser.
| 623 | + std::vector<uint8_t> bytes = {0x81, 0x80, 0x80, 0x80, 0x02}; |
| 624 | + TestRleBitPackedParserError( |
| 625 | + /* bytes= */ span(bytes.data(), 1ULL << 30), |
| 626 | + /* bit_width= */ 1); |
| 627 | +} |
| 628 | + |
503 | 629 | // Validates encoding of values by encoding and decoding them. If |
504 | 630 | // expected_encoding != NULL, also validates that the encoded buffer is |
505 | 631 | // exactly 'expected_encoding'. |
|
0 commit comments