Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use bytes::Bytes;
use parquet::arrow::arrow_reader::{
ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder,
};
use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};

/// Internal function to read a buffer with Parquet data into a buffer with Arrow IPC Stream data
pub fn read_parquet(parquet_file: Vec<u8>, options: JsReaderOptions) -> Result<Table> {
Expand Down Expand Up @@ -58,6 +59,15 @@ pub fn read_schema(parquet_file: Vec<u8>) -> Result<Schema> {
Ok(schema.into())
}

/// Internal function to read a buffer with Parquet data into Parquet file metadata.
///
/// The buffer may hold an entire Parquet file or only its footer (metadata plus
/// the 8-byte tail), since only the footer is parsed.
///
/// # Errors
///
/// Returns an error if the buffer does not contain a valid Parquet footer.
pub fn read_metadata(parquet_file: Vec<u8>) -> Result<ParquetMetaData> {
    // Wrap the raw bytes in `Bytes` so the metadata reader can slice them cheaply.
    let cursor: Bytes = parquet_file.into();
    // `parse_and_finish` locates the footer, decodes the Thrift-encoded
    // metadata, and returns the fully parsed `ParquetMetaData`.
    let metadata = ParquetMetaDataReader::new().parse_and_finish(&cursor)?;
    Ok(metadata)
}

/// Cast any view types in the metadata's schema to non-view types
pub(crate) fn cast_metadata_view_types(
metadata: &ArrowReaderMetadata,
Expand Down
10 changes: 10 additions & 0 deletions src/wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,16 @@ pub fn read_schema(parquet_file: Vec<u8>) -> WasmResult<Schema> {
Ok(crate::reader::read_schema(parquet_file)?)
}

/// Read Parquet metadata from a Parquet file (or footer-only) bytes in memory.
#[wasm_bindgen(js_name = readMetadata)]
#[cfg(feature = "reader")]
pub fn read_metadata(parquet_file: Vec<u8>) -> WasmResult<crate::metadata::ParquetMetaData> {
    // Reject empty buffers up front with a clear error.
    assert_parquet_file_not_empty(parquet_file.as_slice())?;
    // Parse the footer, then wrap the parquet-rs metadata in the wasm-exposed type.
    let parsed = crate::reader::read_metadata(parquet_file)?;
    Ok(crate::metadata::ParquetMetaData::from(parsed))
}

/// Write Arrow data to a Parquet file.
///
/// For example, to create a Parquet file with Snappy compression:
Expand Down
44 changes: 43 additions & 1 deletion tests/js/schema.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import * as wasm from "../../pkg/node/parquet_wasm";
import { readFileSync } from "fs";
import * as arrow from "apache-arrow";
import { readExpectedArrowData } from "./utils";
import { readExpectedArrowData, extractFooterBytes } from "./utils";
import { parseSchema } from "arrow-js-ffi";
import { it, expect } from "vitest";

Expand Down Expand Up @@ -39,3 +39,45 @@ it("read schema via IPC", async (t) => {
schema.fields.length
);
});

it("read metadata from full file bytes", async (t) => {
const dataPath = `${dataDir}/1-partition-brotli.parquet`;
const buffer = readFileSync(dataPath);
const arr = new Uint8Array(buffer);
// TODO: test with footer bytes alone as well
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment is outdated? It looks like you test against footer bytes below?

const metadata = wasm.readMetadata(arr);

// Convert the parquet file buffer from readFileSync to a Blob.
const blob = new Blob([buffer], { type: "application/octet-stream" });
const pqFile = await wasm.ParquetFile.fromFile(blob);
// Test against the existing ParquetFile.metadata method.
const expectedMetadata = pqFile.metadata();

expect(metadata.fileMetadata().createdBy()).toStrictEqual(expectedMetadata.fileMetadata().createdBy());
expect(metadata.fileMetadata().numRows()).toStrictEqual(expectedMetadata.fileMetadata().numRows());
expect(metadata.fileMetadata().version()).toStrictEqual(expectedMetadata.fileMetadata().version());
expect(metadata.numRowGroups()).toStrictEqual(1);
expect(metadata.numRowGroups()).toStrictEqual(expectedMetadata.numRowGroups());
expect(metadata.rowGroup(0).numRows()).toStrictEqual(expectedMetadata.rowGroup(0).numRows());
});

it("read metadata from footer bytes only", async (t) => {
const dataPath = `${dataDir}/1-partition-brotli.parquet`;
const buffer = readFileSync(dataPath);
const arr = new Uint8Array(buffer);
const footerBytes = extractFooterBytes(arr);
const metadata = wasm.readMetadata(footerBytes);

// Convert the parquet file buffer from readFileSync to a Blob.
const blob = new Blob([buffer], { type: "application/octet-stream" });
const pqFile = await wasm.ParquetFile.fromFile(blob);
// Test against the existing ParquetFile.metadata method.
const expectedMetadata = pqFile.metadata();

expect(metadata.fileMetadata().createdBy()).toStrictEqual(expectedMetadata.fileMetadata().createdBy());
expect(metadata.fileMetadata().numRows()).toStrictEqual(expectedMetadata.fileMetadata().numRows());
expect(metadata.fileMetadata().version()).toStrictEqual(expectedMetadata.fileMetadata().version());
expect(metadata.numRowGroups()).toStrictEqual(1);
expect(metadata.numRowGroups()).toStrictEqual(expectedMetadata.numRowGroups());
expect(metadata.rowGroup(0).numRows()).toStrictEqual(expectedMetadata.rowGroup(0).numRows());
});
28 changes: 28 additions & 0 deletions tests/js/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,31 @@ export async function temporaryServer() {
});
return server as FastifyInstance;
}

/**
 * Extract the footer (metadata + 8-byte tail) from a Parquet file's bytes.
 *
 * The end of a Parquet file is laid out as: [metadata bytes][4-byte
 * little-endian metadata length]["PAR1" magic]. The returned buffer contains
 * the metadata plus that 8-byte tail.
 *
 * @param parquetFile Full contents of a Parquet file.
 * @returns A fresh Uint8Array holding only the footer bytes.
 * @throws If the buffer is too short, the PAR1 magic is missing, or the
 *   encoded footer length does not fit within the buffer.
 */
export function extractFooterBytes(parquetFile: Uint8Array): Uint8Array {
  // Step 1: Obtain the last 8 bytes to get footer length and magic number.
  const TAIL_LENGTH = 8;
  if (parquetFile.length < TAIL_LENGTH) {
    throw new Error('Failed to load the Parquet footer length.');
  }
  const tailBytes = parquetFile.subarray(parquetFile.length - TAIL_LENGTH);

  // Step 2: Parse the footer length and magic number.
  // The length is an *unsigned* 32-bit little-endian integer; getUint32
  // avoids misreading lengths >= 2^31 as negative.
  const tailView = new DataView(tailBytes.buffer, tailBytes.byteOffset, tailBytes.byteLength);
  const footerLength = tailView.getUint32(0, true);
  const magic = new TextDecoder().decode(tailBytes.slice(4, 8));
  if (magic !== 'PAR1') {
    throw new Error('Invalid Parquet file: missing PAR1 magic number.');
  }

  // Step 3: Extract the footer bytes, rejecting lengths that would start
  // before the beginning of the buffer.
  const footerStartIndex = parquetFile.length - (footerLength + TAIL_LENGTH);
  if (footerStartIndex < 0) {
    throw new Error('Failed to load the Parquet footer bytes.');
  }
  // Use .slice here to ensure a fresh ArrayBuffer is created,
  // so that downstream usage is not "seeing" the full parquetFile buffer.
  return parquetFile.slice(footerStartIndex);
}