diff --git a/package.json b/package.json index 9457a746..bc9f0472 100644 --- a/package.json +++ b/package.json @@ -54,7 +54,7 @@ }, "dependencies": { "hightable": "0.16.0", - "hyparquet": "1.13.5", + "hyparquet": "1.14.0", "hyparquet-compressors": "1.1.1", "icebird": "0.3.0", "react": "18.3.1", @@ -79,7 +79,7 @@ "eslint-plugin-react-hooks": "5.2.0", "eslint-plugin-react-refresh": "0.4.20", "eslint-plugin-storybook": "0.12.0", - "globals": "16.1.0", + "globals": "16.2.0", "jsdom": "26.1.0", "nodemon": "3.1.10", "npm-run-all": "4.1.5", diff --git a/src/lib/getParquetColumn.ts b/src/lib/getParquetColumn.ts deleted file mode 100644 index 5d7028c0..00000000 --- a/src/lib/getParquetColumn.ts +++ /dev/null @@ -1,65 +0,0 @@ -import { ColumnData, ParquetReadOptions, parquetRead } from 'hyparquet' - -type GetColumnOptions = Omit & {column: string} - -export async function getParquetColumn({ metadata, file, column, compressors }: GetColumnOptions): Promise { - const numRows = Number(metadata?.num_rows) - if (isNaN(numRows)) { - throw new Error('metadata.num_rows is undefined') - } - if (numRows === 0) { - return [] - } - const lastError: {error?: Error} = {} - const values: unknown[] = Array(numRows).fill(undefined) - const ranges: [number, number][] = [] - function onChunk({ columnName, columnData, rowStart, rowEnd }: ColumnData) { - if (columnName !== column) { - lastError.error = new Error(`unexpected column name ${columnName}`) - } - for (let i = rowStart; i < rowEnd; i++) { - values[i] = columnData[i - rowStart] - } - ranges.push([rowStart, rowEnd]) - } - - // this awaits all the promises. When it returns, all the data should have already been sent using onChunk - await parquetRead({ metadata, file, columns: [column], compressors, onChunk }) - - // Do some checks before returning the data - - // check for errors - if (lastError.error !== undefined) { - throw lastError.error - } - - // check for missing data (should be faster than checking for undefined values in the array) - const sortedRanges = ranges.sort((a, b) => a[0] - b[0]) - for (let i = 0; i < sortedRanges.length - 1; i++) { - const range = sortedRanges[i] - const nextRange = sortedRanges[i + 1] - if (!range || !nextRange) { - throw new Error('The ranges should not be undefined') - } - if (range[1] !== nextRange[0]) { - throw new Error(`missing data between rows ${range[1]} and ${nextRange[0]}`) - } - } - const firstRange = sortedRanges[0] - if (!firstRange) { - throw new Error('The first range should not be undefined') - } - if (firstRange[0] !== 0) { - throw new Error(`missing data before row ${firstRange[0]}`) - } - const lastRange = sortedRanges[sortedRanges.length - 1] - if (!lastRange) { - throw new Error('The last range should not be undefined') - } - if (lastRange[1] !== numRows) { - throw new Error(`missing data after row ${lastRange[1]}`) - } - - // return the values - return values -} diff --git a/src/lib/workers/parquetWorker.ts b/src/lib/workers/parquetWorker.ts index cfc06dc6..a3f29874 100644 --- a/src/lib/workers/parquetWorker.ts +++ b/src/lib/workers/parquetWorker.ts @@ -1,6 +1,6 @@ import { ColumnData, parquetQuery } from 'hyparquet' import { compressors } from 'hyparquet-compressors' -import { getParquetColumn } from '../getParquetColumn.js' +import { parquetReadColumn } from 'hyparquet/src/read.js' import { asyncBufferFrom } from '../utils.js' import type { ChunkMessage, ClientMessage, ColumnRanksMessage, ErrorMessage, ResultMessage } from './types.js' @@ -28,10 +28,9 @@ self.onmessage = async ({ data }: { data: ClientMessage }) => { // purpose of sorting. try { - const sortColumn = await getParquetColumn({ metadata, file, column, compressors }) + const sortColumn: unknown[] = Array.from(await parquetReadColumn({ file, metadata, columns: [column], compressors })) const valuesWithIndex = sortColumn.map((value, index) => ({ value, index })) - const sortedValuesWithIndex = Array.from(valuesWithIndex).sort(({ value: a }, { value: b }) => compare(a, b)) - const numRows = sortedValuesWithIndex.length + const sortedValuesWithIndex = valuesWithIndex.sort(({ value: a }, { value: b }) => compare(a, b)) const columnRanks = sortedValuesWithIndex.reduce((accumulator, currentValue, rank) => { const { lastValue, lastRank, ranks } = accumulator const { value, index } = currentValue @@ -43,7 +42,7 @@ self.onmessage = async ({ data }: { data: ClientMessage }) => { return { ranks, lastValue: value, lastRank: rank } } }, { - ranks: Array(numRows).fill(-1) as number[], + ranks: Array(sortColumn.length).fill(-1) as number[], lastValue: undefined as unknown, lastRank: 0, }).ranks