Skip to content

Commit 6f836f6

Browse files
committed
Update squirreling
1 parent 967890e commit 6f836f6

File tree

5 files changed

+48
-36
lines changed

5 files changed

+48
-36
lines changed

bin/tools/parquetDataSource.js

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,56 @@
1+
import { parquetSchema } from 'hyparquet'
12
import { parquetPlan } from 'hyparquet/src/plan.js'
23
import { asyncGroupToRows, readRowGroup } from 'hyparquet/src/rowgroup.js'
34
import { whereToParquetFilter } from './parquetFilter.js'
45

56
/**
6-
* @import { AsyncBuffer, FileMetaData } from 'hyparquet'
7-
* @import { AsyncDataSource, AsyncRow } from 'squirreling'
7+
* @import { AsyncBuffer, Compressors, FileMetaData } from 'hyparquet'
8+
* @import { AsyncDataSource } from 'squirreling'
9+
* @import { AsyncCells } from 'squirreling/src/types.js'
810
*/
911

1012
/**
1113
* Creates a parquet data source for use with squirreling SQL engine.
1214
*
1315
* @param {AsyncBuffer} file
1416
* @param {FileMetaData} metadata
15-
* @param {import('hyparquet-compressors').Compressors} compressors
17+
* @param {Compressors} compressors
1618
* @returns {AsyncDataSource}
1719
*/
1820
export function parquetDataSource(file, metadata, compressors) {
1921
return {
20-
async *getRows(hints) {
22+
async *scan(hints) {
2123
const options = {
2224
file,
2325
metadata,
2426
compressors,
2527
columns: hints?.columns,
28+
// convert WHERE clause to parquet pushdown filter
2629
filter: whereToParquetFilter(hints?.where),
30+
filterStrict: false,
2731
}
32+
33+
// TODO: check that columns exist in parquet file
34+
let { columns } = options
35+
if (!columns?.length) {
36+
const schema = parquetSchema(metadata)
37+
columns = schema.children.map(col => col.element.name)
38+
}
39+
2840
const plan = parquetPlan(options)
29-
let count = 0
3041
for (const subplan of plan.groups) {
42+
// Read row group
3143
const rg = readRowGroup(options, plan, subplan)
44+
// Transpose to materialized rows
3245
const rows = await asyncGroupToRows(rg, 0, rg.groupRows, undefined, 'object')
33-
for (const asyncRow of rows) {
34-
/** @type {AsyncRow} */
35-
const row = {}
36-
for (const [key, value] of Object.entries(asyncRow)) {
37-
row[key] = () => Promise.resolve(value)
46+
// Convert to AsyncRow generator
47+
for (const row of rows) {
48+
/** @type {AsyncCells} */
49+
const cells = {}
50+
for (const [key, value] of Object.entries(row)) {
51+
cells[key] = () => Promise.resolve(value)
3852
}
39-
yield row
40-
count++
41-
// Check limit after each row
42-
if (hints?.limit !== undefined && count >= hints.limit) return
53+
yield { columns, cells }
4354
}
4455
}
4556
},

package.json

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -55,37 +55,36 @@
5555
"watch:url": "NODE_ENV=development nodemon bin/cli.js https://hyperparam.blob.core.windows.net/hyperparam/starcoderdata-js-00000-of-00065.parquet"
5656
},
5757
"dependencies": {
58-
"hightable": "0.24.1",
59-
"hyparquet": "1.22.1",
58+
"hightable": "0.25.0",
59+
"hyparquet": "1.23.2",
6060
"hyparquet-compressors": "1.1.1",
6161
"icebird": "0.3.1",
62-
"squirreling": "0.4.8"
62+
"squirreling": "0.6.0"
6363
},
6464
"devDependencies": {
65-
"@eslint/js": "9.39.1",
66-
"@storybook/react-vite": "10.1.4",
67-
"@testing-library/react": "16.3.0",
68-
"@types/node": "24.10.2",
65+
"@storybook/react-vite": "10.1.9",
66+
"@testing-library/react": "16.3.1",
67+
"@types/node": "25.0.3",
6968
"@types/react": "19.2.7",
7069
"@types/react-dom": "19.2.3",
7170
"@vitejs/plugin-react": "5.1.2",
72-
"@vitest/coverage-v8": "4.0.15",
73-
"eslint": "9.39.1",
71+
"@vitest/coverage-v8": "4.0.16",
72+
"eslint": "9.39.2",
7473
"eslint-plugin-react": "7.37.5",
7574
"eslint-plugin-react-hooks": "7.0.1",
76-
"eslint-plugin-react-refresh": "0.4.24",
77-
"eslint-plugin-storybook": "10.1.4",
75+
"eslint-plugin-react-refresh": "0.4.26",
76+
"eslint-plugin-storybook": "10.1.9",
7877
"globals": "16.5.0",
7978
"jsdom": "27.3.0",
8079
"nodemon": "3.1.11",
8180
"npm-run-all": "4.1.5",
82-
"react": "19.2.1",
83-
"react-dom": "19.2.1",
84-
"storybook": "10.1.4",
81+
"react": "19.2.3",
82+
"react-dom": "19.2.3",
83+
"storybook": "10.1.9",
8584
"typescript": "5.9.3",
86-
"typescript-eslint": "8.49.0",
87-
"vite": "7.2.7",
88-
"vitest": "4.0.15"
85+
"typescript-eslint": "8.50.0",
86+
"vite": "7.3.0",
87+
"vitest": "4.0.16"
8988
},
9089
"peerDependencies": {
9190
"react": "^18.3.1 || ^19",

src/lib/workers/parquetWorker.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import type { ColumnData } from 'hyparquet'
2-
import { AsyncBuffer, parquetQuery, parquetRead, parquetReadObjects } from 'hyparquet'
1+
import { AsyncBuffer, ColumnData, parquetQuery, parquetRead, parquetReadObjects } from 'hyparquet'
2+
import type { SubColumnData } from 'hyparquet/src/types.js'
33
import { compressors } from 'hyparquet-compressors'
44
import type { ChunkMessage, ClientMessage, CompleteMessage, PageMessage, ParquetQueryResolveMessage, ParquetReadObjectsResolveMessage, ParquetReadResolveMessage, RejectMessage, Rows } from './types.js'
55
import { fromToAsyncBuffer } from './utils.js'
@@ -52,7 +52,7 @@ self.onmessage = async ({ data }: { data: ClientMessage }) => {
5252
function onChunk(chunk: ColumnData) {
5353
postChunkMessage({ chunk, queryId })
5454
}
55-
function onPage(page: ColumnData) {
55+
function onPage(page: SubColumnData) {
5656
postPageMessage({ page, queryId })
5757
}
5858
}

src/lib/workers/parquetWorkerClient.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import type { ColumnData } from 'hyparquet'
2+
import type { SubColumnData } from 'hyparquet/src/types.js'
23
import ParquetWorker from './parquetWorker?worker&inline'
34
import type { ClientMessage, ParquetQueryWorkerOptions, ParquetReadObjectsWorkerOptions, ParquetReadWorkerOptions, Rows, WorkerMessage } from './types.js'
45
/// ^ the worker is bundled with the main thread code (inline) which is easier for users to import
@@ -9,7 +10,7 @@ let nextQueryId = 0
910
interface Agent {
1011
onComplete?: ((rows: Rows) => void)
1112
onChunk?: (chunk: ColumnData) => void
12-
onPage?: (page: ColumnData) => void
13+
onPage?: (page: SubColumnData) => void
1314
reject: (error: Error) => void
1415
parquetReadResolve?: () => void
1516
parquetReadObjectsResolve?: (rows: Rows) => void

src/lib/workers/types.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import type { ColumnData, ParquetReadOptions } from 'hyparquet'
22
import { parquetQuery } from 'hyparquet'
3+
import { SubColumnData } from 'hyparquet/src/types.js'
34

45
// https://github.com/hyparam/hyparquet/pull/105
56
type ParquetQueryFilter = Exclude<Parameters<typeof parquetQuery>[0]['filter'], undefined>
@@ -88,7 +89,7 @@ export interface ChunkMessage extends QueryId {
8889
}
8990
export interface PageMessage extends QueryId {
9091
kind: 'onPage'
91-
page: ColumnData
92+
page: SubColumnData
9293
}
9394
export interface RejectMessage extends QueryId {
9495
kind: 'onReject'

0 commit comments

Comments
 (0)