-import { DataFrame, DataFrameEvents, ResolvedValue, UnsortableDataFrame, createEventTarget, sortableDataFrame } from 'hightable'
+import { DataFrame, DataFrameEvents, ResolvedValue, checkSignal, createEventTarget, sortableDataFrame, validateFetchParams, validateGetCellParams, validateGetRowNumberParams } from 'hightable'
 import type { ColumnData } from 'hyparquet'
 import { FileMetaData, ParquetReadOptions, parquetSchema } from 'hyparquet'
 import { parquetReadWorker } from './workers/parquetWorkerClient.js'
@@ -20,13 +20,15 @@ interface VirtualRowGroup {

 /**
  * Convert a parquet file into a dataframe.
+ *
+ * It's sortable on all the columns, and fetches data on demand in chunks of 1000 rows.
  */
-export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData, options?: Pick<ParquetReadOptions, 'utf8'>): DataFrame {
+export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData, options?: Pick<ParquetReadOptions, 'utf8'>): DataFrame<{ parquet: FileMetaData }> {
   const { children } = parquetSchema(metadata)
-  const header = children.map(child => child.element.name)
+  const columnDescriptors = children.map(child => ({ name: child.element.name }))
   const eventTarget = createEventTarget<DataFrameEvents>()

-  const cellCache = new Map<string, ResolvedValue<unknown>[]>(header.map(name => [name, []]))
+  const cellCache = new Map<string, ResolvedValue<unknown>[]>(columnDescriptors.map(({ name }) => [name, []]))

   // virtual row groups are up to 1000 rows within row group boundaries
   const groups: VirtualRowGroup[] = []
@@ -39,7 +41,7 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData,
     groups.push({
       groupStart,
       groupEnd,
-      state: new Map(header.map(name => [name, { kind: 'unfetched' }])),
+      state: new Map(columnDescriptors.map(({ name }) => [name, { kind: 'unfetched' }])),
     })
     groupStart = groupEnd
   }
@@ -84,22 +86,21 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData,

   const numRows = Number(metadata.num_rows)

-  const unsortableDataFrame: UnsortableDataFrame = {
-    header,
+  const unsortableDataFrame: DataFrame<{ parquet: FileMetaData }> = {
+    columnDescriptors,
     numRows,
-    metadata,
+    metadata: { parquet: metadata },
     eventTarget,
-    getRowNumber({ row }) {
-      validateRow({ row, data: { numRows } })
+    getRowNumber({ row, orderBy }) {
+      validateGetRowNumberParams({ row, orderBy, data: { numRows, columnDescriptors } })
       return { value: row }
     },
-    getCell({ row, column }) {
-      validateRow({ row, data: { numRows } })
-      validateColumn({ column, data: { header } })
+    getCell({ row, column, orderBy }) {
+      validateGetCellParams({ row, column, orderBy, data: { numRows, columnDescriptors } })
       return cellCache.get(column)?.[row]
     },
     fetch: async ({ rowStart, rowEnd, columns, signal }) => {
-      validateFetchParams({ rowStart, rowEnd, columns, data: { numRows, header } })
+      validateFetchParams({ rowStart, rowEnd, columns, data: { numRows, columnDescriptors } })
       checkSignal(signal)

       if (!columns || columns.length === 0) {
@@ -128,27 +129,3 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData,

   return sortableDataFrame(unsortableDataFrame)
 }
-
-function validateFetchParams({ rowStart, rowEnd, columns, data: { numRows, header } }: { rowStart: number, rowEnd: number, columns?: string[], data: Pick<DataFrame, 'numRows' | 'header'> }): void {
-  if (rowStart < 0 || rowEnd > numRows || !Number.isInteger(rowStart) || !Number.isInteger(rowEnd) || rowStart > rowEnd) {
-    throw new Error(`Invalid row range: ${rowStart}-${rowEnd}, numRows: ${numRows}`)
-  }
-  if (columns?.some(column => !header.includes(column))) {
-    throw new Error(`Invalid columns: ${columns.join(', ')}. Available columns: ${header.join(', ')}`)
-  }
-}
-function validateRow({ row, data: { numRows } }: { row: number, data: Pick<DataFrame, 'numRows'> }): void {
-  if (row < 0 || row >= numRows || !Number.isInteger(row)) {
-    throw new Error(`Invalid row index: ${row}, numRows: ${numRows}`)
-  }
-}
-function validateColumn({ column, data: { header } }: { column: string, data: Pick<DataFrame, 'header'> }): void {
-  if (!header.includes(column)) {
-    throw new Error(`Invalid column: ${column}. Available columns: ${header.join(', ')}`)
-  }
-}
-function checkSignal(signal?: AbortSignal): void {
-  if (signal?.aborted) {
-    throw new DOMException('The operation was aborted.', 'AbortError')
-  }
-}