|
| 1 | +import { ColumnData, ParquetReadOptions, parquetRead } from 'hyparquet' |
| 2 | + |
/**
 * Options accepted by `getParquetColumn`.
 *
 * These are the `ParquetReadOptions` without `columns`, `rowStart`, `rowEnd`,
 * `onChunk` and `onComplete`, plus the required name of the single column to read.
 */
type GetColumnOptions =
  & Omit<ParquetReadOptions, 'columns' | 'rowStart' | 'rowEnd' | 'onChunk' | 'onComplete'>
  & { column: string }
| 4 | + |
/**
 * Read every value of a single column through `parquetRead`.
 *
 * The column is requested with `columns: [column]`; each chunk delivered to the
 * `onChunk` callback is copied into a pre-sized array at the row positions it
 * reports. After the read completes, the reported row ranges are verified to
 * cover `[0, num_rows)` without gaps before the array is returned.
 *
 * All remaining options are forwarded to `parquetRead` unchanged.
 *
 * @param options - Read options plus `column`, the name of the column to read.
 * @returns The column values, indexed by row. An empty array when `num_rows` is 0.
 * @throws Error when `metadata.num_rows` is missing, when a chunk for another
 *   column is received, when no chunk is received at all, or when the received
 *   row ranges do not cover every row.
 */
export async function getParquetColumn({ column, ...readOptions }: GetColumnOptions): Promise<unknown[]> {
  // Number(undefined) is NaN, so this also covers a missing `metadata` object.
  const numRows = Number(readOptions.metadata?.num_rows)
  if (isNaN(numRows)) {
    throw new Error('metadata.num_rows is undefined')
  }
  if (numRows === 0) {
    return []
  }

  // Errors detected inside the callback are recorded here and rethrown once the read has finished.
  const lastError: {error?: Error} = {}
  const values: unknown[] = Array(numRows).fill(undefined)
  // [rowStart, rowEnd) of every accepted chunk, used afterwards to detect missing rows.
  const ranges: [number, number][] = []

  function onChunk({ columnName, columnData, rowStart, rowEnd }: ColumnData) {
    if (columnName !== column) {
      lastError.error = new Error(`unexpected column name ${columnName}`)
      // Do not copy data that belongs to another column into the result.
      return
    }
    for (let i = rowStart; i < rowEnd; i++) {
      values[i] = columnData[i - rowStart]
    }
    ranges.push([rowStart, rowEnd])
  }

  // When this resolves, all the data should already have been delivered through onChunk.
  // `columns` and `onChunk` come last so they always take precedence over forwarded options.
  await parquetRead({ ...readOptions, columns: [column], onChunk })

  // Do some checks before returning the data

  // check for errors
  if (lastError.error !== undefined) {
    throw lastError.error
  }

  // check for missing data (should be faster than checking for undefined values in the array)
  ranges.sort((a, b) => a[0] - b[0])
  const firstRange = ranges[0]
  const lastRange = ranges[ranges.length - 1]
  if (!firstRange || !lastRange) {
    throw new Error(`no data received for column ${column}`)
  }
  // Chunks may arrive in any order; once sorted, each one must start where the previous one ended.
  let previousRange = firstRange
  for (const range of ranges.slice(1)) {
    if (previousRange[1] !== range[0]) {
      throw new Error(`missing data between rows ${previousRange[1]} and ${range[0]}`)
    }
    previousRange = range
  }
  if (firstRange[0] !== 0) {
    throw new Error(`missing data before row ${firstRange[0]}`)
  }
  if (lastRange[1] !== numRows) {
    throw new Error(`missing data after row ${lastRange[1]}`)
  }

  // return the values
  return values
}
0 commit comments