1- import { DataFrame , DataFrameEvents , ResolvedValue , UnsortableDataFrame , createEventTarget , sortableDataFrame } from 'hightable'
1+ import { DataFrame , DataFrameEvents , ResolvedValue , checkSignal , createEventTarget , validateFetchParams , validateGetCellParams , validateGetRowNumberParams } from 'hightable'
22import type { ColumnData } from 'hyparquet'
33import { FileMetaData , ParquetReadOptions , parquetSchema } from 'hyparquet'
44import { parquetReadWorker } from './workers/parquetWorkerClient.js'
@@ -20,13 +20,16 @@ interface VirtualRowGroup {
2020
2121/**
2222 * Convert a parquet file into a dataframe.
23+ *
24+ * It fetches data on demand in chunks of 1000 rows within each row group.
25+ * It's not sortable. You can use sortableDataFrame from hightable to make it sortable.
2326 */
24- export function parquetDataFrame ( from : AsyncBufferFrom , metadata : FileMetaData , options ?: Pick < ParquetReadOptions , 'utf8' > ) : DataFrame {
27+ export function parquetDataFrame ( from : AsyncBufferFrom , metadata : FileMetaData , options ?: Pick < ParquetReadOptions , 'utf8' > ) : DataFrame < { parquet : FileMetaData } > {
2528 const { children } = parquetSchema ( metadata )
26- const header = children . map ( child => child . element . name )
29+ const columnDescriptors = children . map ( child => ( { name : child . element . name } ) )
2730 const eventTarget = createEventTarget < DataFrameEvents > ( )
2831
29- const cellCache = new Map < string , ResolvedValue < unknown > [ ] > ( header . map ( name => [ name , [ ] ] ) )
32+ const cellCache = new Map < string , ResolvedValue < unknown > [ ] > ( columnDescriptors . map ( ( { name } ) => [ name , [ ] ] ) )
3033
3134 // virtual row groups are up to 1000 rows within row group boundaries
3235 const groups : VirtualRowGroup [ ] = [ ]
@@ -39,7 +42,7 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData,
3942 groups . push ( {
4043 groupStart,
4144 groupEnd,
42- state : new Map ( header . map ( name => [ name , { kind : 'unfetched' } ] ) ) ,
45+ state : new Map ( columnDescriptors . map ( ( { name } ) => [ name , { kind : 'unfetched' } ] ) ) ,
4346 } )
4447 groupStart = groupEnd
4548 }
@@ -84,22 +87,21 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData,
8487
8588 const numRows = Number ( metadata . num_rows )
8689
87- const unsortableDataFrame : UnsortableDataFrame = {
88- header ,
90+ const unsortableDataFrame : DataFrame < { parquet : FileMetaData } > = {
91+ columnDescriptors ,
8992 numRows,
90- metadata,
93+ metadata : { parquet : metadata } ,
9194 eventTarget,
92- getRowNumber ( { row } ) {
93- validateRow ( { row, data : { numRows } } )
95+ getRowNumber ( { row, orderBy } ) {
96+ validateGetRowNumberParams ( { row, orderBy , data : { numRows, columnDescriptors } } )
9497 return { value : row }
9598 } ,
96- getCell ( { row, column } ) {
97- validateRow ( { row, data : { numRows } } )
98- validateColumn ( { column, data : { header } } )
99+ getCell ( { row, column, orderBy } ) {
100+ validateGetCellParams ( { row, column, orderBy, data : { numRows, columnDescriptors } } )
99101 return cellCache . get ( column ) ?. [ row ]
100102 } ,
101103 fetch : async ( { rowStart, rowEnd, columns, signal } ) => {
102- validateFetchParams ( { rowStart, rowEnd, columns, data : { numRows, header } } )
104+ validateFetchParams ( { rowStart, rowEnd, columns, data : { numRows, columnDescriptors } } )
103105 checkSignal ( signal )
104106
105107 if ( ! columns || columns . length === 0 ) {
@@ -126,29 +128,5 @@ export function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData,
126128 } ,
127129 }
128130
129- return sortableDataFrame ( unsortableDataFrame )
130- }
131-
132- function validateFetchParams ( { rowStart, rowEnd, columns, data : { numRows, header } } : { rowStart : number , rowEnd : number , columns ?: string [ ] , data : Pick < DataFrame , 'numRows' | 'header' > } ) : void {
133- if ( rowStart < 0 || rowEnd > numRows || ! Number . isInteger ( rowStart ) || ! Number . isInteger ( rowEnd ) || rowStart > rowEnd ) {
134- throw new Error ( `Invalid row range: ${ rowStart } - ${ rowEnd } , numRows: ${ numRows } ` )
135- }
136- if ( columns ?. some ( column => ! header . includes ( column ) ) ) {
137- throw new Error ( `Invalid columns: ${ columns . join ( ', ' ) } . Available columns: ${ header . join ( ', ' ) } ` )
138- }
139- }
140- function validateRow ( { row, data : { numRows } } : { row : number , data : Pick < DataFrame , 'numRows' > } ) : void {
141- if ( row < 0 || row >= numRows || ! Number . isInteger ( row ) ) {
142- throw new Error ( `Invalid row index: ${ row } , numRows: ${ numRows } ` )
143- }
144- }
145- function validateColumn ( { column, data : { header } } : { column : string , data : Pick < DataFrame , 'header' > } ) : void {
146- if ( ! header . includes ( column ) ) {
147- throw new Error ( `Invalid column: ${ column } . Available columns: ${ header . join ( ', ' ) } ` )
148- }
149- }
150- function checkSignal ( signal ?: AbortSignal ) : void {
151- if ( signal ?. aborted ) {
152- throw new DOMException ( 'The operation was aborted.' , 'AbortError' )
153- }
131+ return unsortableDataFrame
154132}
0 commit comments