diff --git a/.github/workflows/ci_apps_hyparquet_demo.yml b/.github/workflows/ci_apps_hyparquet_demo.yml new file mode 100644 index 00000000..0d7c0894 --- /dev/null +++ b/.github/workflows/ci_apps_hyparquet_demo.yml @@ -0,0 +1,32 @@ +name: apps/hyparquet-demo +on: + push: + paths: + - 'apps/hyparquet-demo/**' + - '.github/workflows/ci_apps_hyparquet_demo.yml' + +defaults: + run: + working-directory: ./apps/hyparquet-demo + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: npm i + - run: npm run lint + + typecheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: npm i + - run: tsc + + buildcheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - run: npm i + - run: npm run build \ No newline at end of file diff --git a/README.md b/README.md index da01403f..2424480b 100644 --- a/README.md +++ b/README.md @@ -8,3 +8,4 @@ It contains the following package: It also contains the following applications: - [`hyperparam`](./apps/cli): a cli tool for viewing arbitrarily large datasets in the browser. - [`hightable-demo`](./apps/hightable-demo): an example project showing how to use [hightable](https://github.com/hyparam/hightable). +- [`hyparquet-demo`](./apps/hyparquet-demo): an example project showing how to use [hyparquet](https://github.com/hyparam/hyparquet). diff --git a/apps/hyparquet-demo/.gitignore b/apps/hyparquet-demo/.gitignore new file mode 100644 index 00000000..a547bf36 --- /dev/null +++ b/apps/hyparquet-demo/.gitignore @@ -0,0 +1,24 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +node_modules +dist +dist-ssr +*.local + +# Editor directories and files +.vscode/* +!.vscode/extensions.json +.idea +.DS_Store +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? 
diff --git a/apps/hyparquet-demo/README.md b/apps/hyparquet-demo/README.md new file mode 100644 index 00000000..4367d2da --- /dev/null +++ b/apps/hyparquet-demo/README.md @@ -0,0 +1,18 @@ +# Hyparquet demo + +This is an example project showing how to use [hyparquet](https://github.com/hyparam/hyparquet). + +## Build + +```bash +cd apps/hyparquet-demo +npm i +npm run build +``` + +The build artifacts will be stored in the `dist/` directory and can be served using any static server, eg. `http-server`: + +```bash +npm i -g http-server +http-server dist/ +``` diff --git a/apps/hyparquet-demo/eslint.config.js b/apps/hyparquet-demo/eslint.config.js new file mode 100644 index 00000000..3c7601c4 --- /dev/null +++ b/apps/hyparquet-demo/eslint.config.js @@ -0,0 +1,43 @@ +import js from '@eslint/js' +import react from 'eslint-plugin-react' +import reactHooks from 'eslint-plugin-react-hooks' +import reactRefresh from 'eslint-plugin-react-refresh' +import globals from 'globals' +import tseslint from 'typescript-eslint' +import { sharedJsRules, sharedTsRules } from '../../shared.eslint.config.js' + +export default tseslint.config( + { ignores: ['dist'] }, + { + extends: [js.configs.recommended, ...tseslint.configs.strictTypeChecked, ...tseslint.configs.stylisticTypeChecked], + // Set the react version + settings: { react: { version: '18.3' } }, + files: ['src/**/*.{ts,tsx}'], + languageOptions: { + ecmaVersion: 2020, + globals: globals.browser, + parserOptions: { + project: './tsconfig.json', + tsconfigRootDir: import.meta.dirname, + }, + }, + plugins: { + react, + 'react-hooks': reactHooks, + 'react-refresh': reactRefresh, + }, + rules: { + ...react.configs.recommended.rules, + ...react.configs['jsx-runtime'].rules, + ...reactHooks.configs.recommended.rules, + 'react-refresh/only-export-components': [ + 'warn', + { allowConstantExport: true }, + ], + ...js.configs.recommended.rules, + ...tseslint.configs.recommended.rules, + ...sharedJsRules, + ...sharedTsRules, + }, + }, +) 
diff --git a/apps/hyparquet-demo/index.html b/apps/hyparquet-demo/index.html new file mode 100644 index 00000000..38c8843c --- /dev/null +++ b/apps/hyparquet-demo/index.html @@ -0,0 +1,27 @@ + + + + + hyparquet parquet file parser demo + + + + + + + + + + +
+
+
+ + + + + diff --git a/apps/hyparquet-demo/package.json b/apps/hyparquet-demo/package.json new file mode 100644 index 00000000..57612f37 --- /dev/null +++ b/apps/hyparquet-demo/package.json @@ -0,0 +1,33 @@ +{ + "name": "hyparquet-demo", + "private": true, + "version": "0.0.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "tsc -b && vite build", + "lint": "eslint .", + "preview": "vite preview" + }, + "dependencies": { + "hyparquet": "1.5.0", + "hyparquet-compressors": "0.1.4", + "hightable": "0.7.0", + "react": "^18.3.1", + "react-dom": "^18.3.1" + }, + "devDependencies": { + "@eslint/js": "^9.13.0", + "@types/react": "^18.3.12", + "@types/react-dom": "^18.3.1", + "@vitejs/plugin-react": "^4.3.3", + "eslint": "^9.13.0", + "eslint-plugin-react": "^7.37.2", + "eslint-plugin-react-hooks": "^5.0.0", + "eslint-plugin-react-refresh": "^0.4.14", + "globals": "^15.11.0", + "typescript": "~5.6.2", + "typescript-eslint": "^8.11.0", + "vite": "^5.4.10" + } +} diff --git a/apps/hyparquet-demo/public/favicon.png b/apps/hyparquet-demo/public/favicon.png new file mode 100644 index 00000000..cd162fdd Binary files /dev/null and b/apps/hyparquet-demo/public/favicon.png differ diff --git a/apps/hyparquet-demo/src/App.tsx b/apps/hyparquet-demo/src/App.tsx new file mode 100644 index 00000000..109a4a9f --- /dev/null +++ b/apps/hyparquet-demo/src/App.tsx @@ -0,0 +1,85 @@ +import { ReactNode } from 'react' +import Page, { PageProps } from './Page.js' +import Welcome from './Welcome.js' + +import { DataFrame, rowCache } from 'hightable' +import { FileMetaData, byteLengthFromUrl, parquetMetadataAsync, parquetSchema } from 'hyparquet' +import { useCallback, useEffect, useState } from 'react' +import Dropzone from './Dropzone.js' +import Layout from './Layout.js' +import { asyncBufferFrom } from './utils.js' +import { parquetQueryWorker } from './workers/parquetWorkerClient.js' +import { AsyncBufferFrom, Row } from './workers/types.js' + +export default function App(): 
ReactNode { + const params = new URLSearchParams(location.search) + const url = params.get('key') ?? undefined + + const [error, setError] = useState() + const [pageProps, setPageProps] = useState() + + const setUnknownError = useCallback((e: unknown) => { + setError(e instanceof Error ? e : new Error(String(e))) + }, []) + + const onUrlDrop = useCallback( + (url: string) => { + // Add key=url to query string + const params = new URLSearchParams(location.search) + params.set('key', url) + history.pushState({}, '', `${location.pathname}?${params}`) + byteLengthFromUrl(url).then(byteLength => setAsyncBuffer(url, { url, byteLength })).catch(setUnknownError) + }, + [setUnknownError], + ) + + useEffect(() => { + if (!pageProps && url) { + onUrlDrop(url) + } + }, [ url, pageProps, onUrlDrop]) + + function onFileDrop(file: File) { + // Clear query string + history.pushState({}, '', location.pathname) + setAsyncBuffer(file.name, { file, byteLength: file.size }).catch(setUnknownError) + } + + async function setAsyncBuffer(name: string, from: AsyncBufferFrom) { + const asyncBuffer = await asyncBufferFrom(from) + const metadata = await parquetMetadataAsync(asyncBuffer) + const df = rowCache(parquetDataFrame(from, metadata)) + setPageProps({ metadata, df, name, byteLength: from.byteLength, setError }) + } + + return + { setError(e) }} + onFileDrop={onFileDrop} + onUrlDrop={onUrlDrop}> + {pageProps ? : } + + +} + +/** + * Convert a parquet file into a dataframe. 
+ */ +function parquetDataFrame(from: AsyncBufferFrom, metadata: FileMetaData): DataFrame { + const { children } = parquetSchema(metadata) + return { + header: children.map(child => child.element.name), + numRows: Number(metadata.num_rows), + /** + * @param {number} rowStart - start of the requested row range + * @param {number} rowEnd - end of the requested row range + * @param {string} orderBy - orderBy value, forwarded unchanged to parquetQueryWorker + * @returns {Promise} resolves with whatever parquetQueryWorker returns for this range + */ + rows(rowStart: number, rowEnd: number, orderBy: string): Promise { + console.log(`reading rows ${rowStart}-${rowEnd}`, orderBy) + return parquetQueryWorker({ from, metadata, rowStart, rowEnd, orderBy }) + }, + sortable: true, + } +} diff --git a/apps/hyparquet-demo/src/Dropdown.tsx b/apps/hyparquet-demo/src/Dropdown.tsx new file mode 100644 index 00000000..6acf72cd --- /dev/null +++ b/apps/hyparquet-demo/src/Dropdown.tsx @@ -0,0 +1,72 @@ +import { ReactNode, useEffect, useRef, useState } from 'react' +import { cn } from './utils.js' + +interface DropdownProps { + label?: string + className?: string + children: ReactNode +} + +/** + * Dropdown menu component. 
+ * + * @param {Object} props + * @param {string} props.label - button label + * @param {string} props.className - custom class name for the dropdown container + * @param {ReactNode} props.children - dropdown menu items + * @returns {ReactNode} + * @example + * + * + * + * + */ +export default function Dropdown({ label, className, children }: DropdownProps): ReactNode { + const [isOpen, setIsOpen] = useState(false) + const dropdownRef = useRef(null) + const menuRef = useRef(null) + + function toggleDropdown() { + setIsOpen(!isOpen) + } + + useEffect(() => { + function handleClickInside(event: MouseEvent) { + const target = event.target as Element + if (menuRef.current && menuRef.current.contains(target) && target.tagName !== 'INPUT') { + setIsOpen(false) + } + } + function handleClickOutside(event: MouseEvent) { + if (dropdownRef.current && !dropdownRef.current.contains(event.target as Node)) { + setIsOpen(false) + } + } + function handleEscape(event: KeyboardEvent) { + if (event.key === 'Escape') { + setIsOpen(false) + } + } + document.addEventListener('click', handleClickInside) + document.addEventListener('keydown', handleEscape) + document.addEventListener('mousedown', handleClickOutside) + return () => { + document.removeEventListener('click', handleClickInside) + document.removeEventListener('keydown', handleEscape) + document.removeEventListener('mousedown', handleClickOutside) + } + }, []) + + return ( +
+ +
+ {children} +
+
+ ) +} diff --git a/apps/hyparquet-demo/src/Dropzone.tsx b/apps/hyparquet-demo/src/Dropzone.tsx new file mode 100644 index 00000000..016209aa --- /dev/null +++ b/apps/hyparquet-demo/src/Dropzone.tsx @@ -0,0 +1,129 @@ +import React, { ReactNode, useEffect, useRef, useState } from 'react' + +interface DropzoneProps { + children: ReactNode + onFileDrop: (file: File) => void + onUrlDrop: (url: string) => void + onError: (error: Error) => void +} + +/** + * A dropzone component for uploading files. + * + * Shows a fullscreen overlay when files are dragged over the dropzone. + * + * A click whose target element has the class 'dropzone' triggers the file input + * dialog; no 'dropzone-select' class is checked by this component. + * + * @param {Object} props + * @param {ReactNode} props.children - content rendered inside the dropzone. + * @param {Function} props.onFileDrop - called with the file when one is dropped or selected through the file input. + * @param {Function} props.onUrlDrop - called with a dropped string item when it starts with 'http'. + * @param {Function} props.onError - declared in DropzoneProps but not destructured or called by this component. + * @returns {ReactNode} + */ +export default function Dropzone({ children, onFileDrop, onUrlDrop }: DropzoneProps): ReactNode { + const dropzoneRef = useRef(null) + const fileInputRef = useRef(null) + // number of dragenter events minus dragleave events + const [enters, setEnters] = useState(0) + + /** + * Trigger file input dialog. + * @param {MouseEvent} e - click + */ + function triggerFileSelect(e: React.MouseEvent) { + // If click inside '.dropzone', activate file input dialog + if ((e.target as Element).classList.contains('dropzone')) { + fileInputRef.current?.click() + } + } + + /** + * Handle file selection event. + * Forwards the file to onFileDrop when exactly one file was selected; otherwise does nothing. 
+ * @param {ChangeEvent} e + * @returns {void} + */ + function handleFileSelect(e: React.ChangeEvent): void { + const { files } = e.target + if (!files || files.length !== 1) return + onFileDrop(files[0]) + } + + useEffect(() => { + const dropzone = dropzoneRef.current + if (!dropzone) return + + // Attach drag-and-drop event listeners + function onDragEnter(e: DragEvent) { + // check if any of the items are files (not strings) + const items = e.dataTransfer?.items + if (!items) return + if (!Array.from(items).some(item => item.kind === 'file')) return + setEnters(enters => enters + 1) + } + function onDragOver(e: DragEvent) { + e.preventDefault() + } + function onDragLeave(e: DragEvent) { + const items = e.dataTransfer?.items + if (!items) return + if (!Array.from(items).some(item => item.kind === 'file')) return + setEnters(enters => enters - 1) + } + function handleFileDrop(e: DragEvent) { + e.preventDefault() + setEnters(0) + + if (!e.dataTransfer) throw new Error('Missing dataTransfer') + const { files, items } = e.dataTransfer + if (files.length > 0) { + const file = files[0] + onFileDrop(file) + } + if (items.length > 0) { + const item = items[0] + if (item.kind === 'string') { + item.getAsString(url => { + if (url.startsWith('http')) { + onUrlDrop(url) + } + }) + } + } + } + + window.addEventListener('dragenter', onDragEnter) + window.addEventListener('dragover', onDragOver) + window.addEventListener('dragleave', onDragLeave) + dropzone.addEventListener('drop', handleFileDrop) + + // Cleanup event listeners when component is unmounted + return () => { + window.removeEventListener('dragenter', onDragEnter) + window.removeEventListener('dragover', onDragOver) + window.removeEventListener('dragleave', onDragLeave) + dropzone.removeEventListener('drop', handleFileDrop) + } + }) + + return ( +
0 ? 'dropzone hover' : 'dropzone'} + onClick={triggerFileSelect} + ref={dropzoneRef}> + {children} +
+
+
Drop files to view. 👀
+
+
+ +
+ ) +} diff --git a/apps/hyparquet-demo/src/Layout.tsx b/apps/hyparquet-demo/src/Layout.tsx new file mode 100644 index 00000000..f3e59c88 --- /dev/null +++ b/apps/hyparquet-demo/src/Layout.tsx @@ -0,0 +1,48 @@ +import { ReactNode, useEffect } from 'react' +import { cn } from './utils.js' + +interface LayoutProps { + children: ReactNode + className?: string + progress?: number + error?: Error +} + +/** + * Layout for shared UI. + * Content div style can be overridden by className prop. + * + * @param {Object} props + * @param {ReactNode} props.children - content to display inside the layout + * @param {string | undefined} props.className - additional class names to apply to the content container + * @param {number | undefined} props.progress - progress bar value + * @param {Error} props.error - error message to display + * @returns {ReactNode} + */ +export default function Layout({ children, className, progress, error }: LayoutProps): ReactNode { + const errorMessage = error?.toString() + if (error) console.error(error) + + useEffect(() => { + document.title = 'hyparquet demo - apache parquet file viewer online' + }, []) + + return <> +
+
+ {children} +
+
{errorMessage}
+
+ {progress !== undefined && progress < 1 && +
+
+
+ } + +} + + +export function Spinner({ className }: { className: string }) { + return
+} diff --git a/apps/hyparquet-demo/src/Page.tsx b/apps/hyparquet-demo/src/Page.tsx new file mode 100644 index 00000000..1d3735e7 --- /dev/null +++ b/apps/hyparquet-demo/src/Page.tsx @@ -0,0 +1,56 @@ +import HighTable, { DataFrame } from 'hightable' +import { FileMetaData } from 'hyparquet' +import { ReactNode, useState } from 'react' +import Dropdown from './Dropdown.js' +import ParquetLayout from './ParquetLayout.js' +import ParquetMetadata from './ParquetMetadata.js' + +type Lens = 'table' | 'metadata' | 'layout' + +export interface PageProps { + metadata: FileMetaData + df: DataFrame + name: string + byteLength?: number + setError: (e: Error) => void +} + +/** + * Hyparquet demo viewer page + * @param {Object} props + * @returns {ReactNode} + */ +export default function Page({ metadata, df, name, byteLength, setError }: PageProps): ReactNode { + const [lens, setLens] = useState('table') + + return <> +
{name}
+
+ {byteLength !== undefined && {formatFileSize(byteLength)}} + {df.numRows.toLocaleString()} rows + + + + {byteLength && } + +
+ {lens === 'table' && } + {lens === 'metadata' && } + {lens === 'layout' && byteLength && } + +} + +/** + * Returns the file size in human readable format, using 1024-based units from b to tb. + * Sizes of 1024 tb or more are reported in tb instead of producing an undefined unit. + * @param {number} bytes file size in bytes + * @returns {string} formatted file size string + */ +function formatFileSize(bytes: number): string { + const sizes = ['b', 'kb', 'mb', 'gb', 'tb'] + if (bytes === 0) return '0 b' + const i = Math.min(Math.floor(Math.log2(bytes) / 10), sizes.length - 1) + if (i === 0) return `${bytes} b` + const base = bytes / Math.pow(1024, i) + return `${base < 10 ? base.toFixed(1) : Math.round(base)} ${sizes[i]}` +} diff --git a/apps/hyparquet-demo/src/ParquetLayout.tsx b/apps/hyparquet-demo/src/ParquetLayout.tsx new file mode 100644 index 00000000..1c331eed --- /dev/null +++ b/apps/hyparquet-demo/src/ParquetLayout.tsx @@ -0,0 +1,139 @@ +import { FileMetaData } from 'hyparquet' +import { ReactNode } from 'react' + +interface LayoutProps { + byteLength: number + metadata: FileMetaData +} + +/** + * Renders the file layout of a parquet file as nested rowgroups and columns. + * @param {Object} props + * @param {number} props.byteLength + * @param {FileMetaData} props.metadata + * @returns {ReactNode} + */ +export default function ParquetLayout({ byteLength, metadata }: LayoutProps): ReactNode { + const metadataStart = byteLength - metadata.metadata_length - 4 + const metadataEnd = byteLength - 4 + + return 
+
+ + + + + +
+
+} + + +function Cell({ name, start, end }: { name: string, start: N, end: N }) { + const bytes = end - start + return
+ +
    +
  • start {start.toLocaleString()}
  • +
  • bytes {bytes.toLocaleString()}
  • +
  • end {end.toLocaleString()}
  • +
+
+} + +function Group({ children, name, bytes }: { children: ReactNode, name?: string, bytes?: bigint }) { + return
+
+ + {bytes === undefined ? '' : `bytes ${bytes.toLocaleString()}`} +
+ {children} +
+} + +function RowGroups({ metadata }: { metadata: FileMetaData }) { + return <> + {metadata.row_groups.map((rowGroup, i) => + + {rowGroup.columns.map((column, j) => + , + )} + , + )} + +} + +type ColumnChunk = FileMetaData['row_groups'][number]['columns'][number] +type ColumnMetadata = NonNullable + +function Column({ key, column }: { key: number, column: ColumnChunk }) { + + if (!column.meta_data) return null + const { meta_data } = column + const { dictionary_page_offset, data_page_offset, index_page_offset } = meta_data + const end = getColumnRange(column.meta_data)[1] + const pages = [ + { name: 'Dictionary', offset: dictionary_page_offset }, + { name: 'Data', offset: data_page_offset }, + { name: 'Index', offset: index_page_offset }, + { name: 'End', offset: end }, + ] + .filter((page): page is {name: string, offset: bigint} => page.offset !== undefined) + .sort((a, b) => Number(a.offset) - Number(b.offset)) + + const children = pages.slice(0, -1).map(({ name, offset }, index) => + , + ) + + return + {children} + +} + +function ColumnIndexes({ metadata }: { metadata: FileMetaData }) { + const indexPages = [] + for (const rowGroup of metadata.row_groups) { + for (const column of rowGroup.columns) { + const columnName = column.meta_data?.path_in_schema.join('.') + if (column.column_index_offset) { + indexPages.push({ + name: `ColumnIndex ${columnName}`, + start: column.column_index_offset, + end: column.column_index_offset + BigInt(column.column_index_length ?? 0), + }) + } + if (column.offset_index_offset) { + indexPages.push({ + name: `OffsetIndex ${columnName}`, + start: column.offset_index_offset, + end: column.offset_index_offset + BigInt(column.offset_index_length ?? 0), + }) + } + } + } + + return + {indexPages.map(({ name, start, end }, index) => + , + )} + +} + + +/** + * Find the start byte offset for a column chunk. 
+ * + * @param {ColumnMetaData} columnMetadata + * @returns {[bigint, bigint]} byte offset range + */ +function getColumnRange({ dictionary_page_offset, data_page_offset, total_compressed_size }: ColumnMetadata): [bigint, bigint] { + /// Copied from hyparquet because it's not exported + let columnOffset = dictionary_page_offset + if (!columnOffset || data_page_offset < columnOffset) { + columnOffset = data_page_offset + } + return [columnOffset, columnOffset + total_compressed_size] +} diff --git a/apps/hyparquet-demo/src/ParquetMetadata.tsx b/apps/hyparquet-demo/src/ParquetMetadata.tsx new file mode 100644 index 00000000..4333e25e --- /dev/null +++ b/apps/hyparquet-demo/src/ParquetMetadata.tsx @@ -0,0 +1,18 @@ +import { FileMetaData, toJson } from 'hyparquet' +import { ReactNode } from 'react' + +interface MetadataProps { + metadata: FileMetaData +} + +/** + * Renders the metadata of a parquet file as JSON. + * @param {Object} props + * @param {FileMetaData} props.metadata + * @returns {ReactNode} + */ +export default function ParquetMetadata({ metadata }: MetadataProps): ReactNode { + return + {JSON.stringify(toJson(metadata), null, ' ')} + +} diff --git a/apps/hyparquet-demo/src/Welcome.tsx b/apps/hyparquet-demo/src/Welcome.tsx new file mode 100644 index 00000000..22e97528 --- /dev/null +++ b/apps/hyparquet-demo/src/Welcome.tsx @@ -0,0 +1,65 @@ +import { ReactNode, useRef } from 'react' +import audioSvg from './assets/audio.svg' +import hyparquetMp3 from './assets/hyparquet.mp3' + + +export default function Welcome(): ReactNode { + const audio = useRef(null) + + function playAudio() { + audio.current?.play().catch(() => { + console.warn('Failed to play audio') + }) + } + + return
+

hyparquet

+ /haɪ pɑːrˈkeɪ/play hyparquet pronunciation + +

in-browser parquet file reader

+

+ npm hyparquet + star hyparquet +

+

+ Online demo of hyparquet: a parser for apache parquet files. + Uses hightable for high performance windowed table viewing. +

+

+ Drag and drop a parquet file (or url) to see your parquet data. 👀 +

+

+ Example files: +

+ +
+} diff --git a/apps/hyparquet-demo/src/assets/audio.svg b/apps/hyparquet-demo/src/assets/audio.svg new file mode 100644 index 00000000..c9a4e138 --- /dev/null +++ b/apps/hyparquet-demo/src/assets/audio.svg @@ -0,0 +1,3 @@ + + + diff --git a/apps/hyparquet-demo/src/assets/azure.svg b/apps/hyparquet-demo/src/assets/azure.svg new file mode 100644 index 00000000..7ac89d0f --- /dev/null +++ b/apps/hyparquet-demo/src/assets/azure.svg @@ -0,0 +1,3 @@ + + + diff --git a/apps/hyparquet-demo/src/assets/demo.png b/apps/hyparquet-demo/src/assets/demo.png new file mode 100644 index 00000000..b109e8a7 Binary files /dev/null and b/apps/hyparquet-demo/src/assets/demo.png differ diff --git a/apps/hyparquet-demo/src/assets/git.svg b/apps/hyparquet-demo/src/assets/git.svg new file mode 100644 index 00000000..99de7a4c --- /dev/null +++ b/apps/hyparquet-demo/src/assets/git.svg @@ -0,0 +1,3 @@ + + + diff --git a/apps/hyparquet-demo/src/assets/huggingface.svg b/apps/hyparquet-demo/src/assets/huggingface.svg new file mode 100644 index 00000000..9b31c76c --- /dev/null +++ b/apps/hyparquet-demo/src/assets/huggingface.svg @@ -0,0 +1,4 @@ + + + + diff --git a/apps/hyparquet-demo/src/assets/hyparquet.mp3 b/apps/hyparquet-demo/src/assets/hyparquet.mp3 new file mode 100644 index 00000000..17716e97 Binary files /dev/null and b/apps/hyparquet-demo/src/assets/hyparquet.mp3 differ diff --git a/apps/hyparquet-demo/src/assets/s3.svg b/apps/hyparquet-demo/src/assets/s3.svg new file mode 100644 index 00000000..400f3c9f --- /dev/null +++ b/apps/hyparquet-demo/src/assets/s3.svg @@ -0,0 +1,3 @@ + + + diff --git a/apps/hyparquet-demo/src/index.css b/apps/hyparquet-demo/src/index.css new file mode 100644 index 00000000..481ff69a --- /dev/null +++ b/apps/hyparquet-demo/src/index.css @@ -0,0 +1,601 @@ +* { + box-sizing: border-box; + font-family: 'Mulish', 'Helvetica Neue', Helvetica, Arial, sans-serif; + margin: 0; + padding: 0; +} +body { + display: flex; + font-family: sans-serif; + height: 100vh; + width: 
100vw; +} + +h1 { + font-size: 22pt; +} +h2 { + margin-top: 10px; + font-size: 12pt; +} +p { + margin: 15px 0; +} +code { + font-family: monospace; + padding: 10px; + white-space: pre-wrap; + word-break: break-all; +} +sub { + align-items: center; + display: flex; + gap: 5px; +} +sub img { + cursor: pointer; +} + +.error { + color: #c11; + font-family: monospace; + white-space: pre-wrap; +} + +/* dropzone */ +.dropzone { + display: flex; + flex-direction: column; + height: 100%; +} +.dropzone.hover .overlay { + display: flex; +} +.overlay { + font-size: 125%; + position: fixed; + top: 0; + bottom: 0; + right: 0; + left: 0; + background-color: rgba(240, 240, 240, 0.6); + backdrop-filter: blur(4px); + display: none; + padding: 12px; + z-index: 40; +} +.target { + border: 6px dashed #444; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + gap: 16px; + height: 100%; + width: 100%; +} + +/* sidebar */ +nav { + height: 100vh; + min-width: 48px; + background-image: linear-gradient(to bottom, #667, #585669); + box-shadow: 0 0 4px rgba(10, 10, 10, 0.5); + height: 100vh; +} + +/* brand logo */ +.brand { + color: #fff; + display: flex; + align-items: center; + filter: drop-shadow(0 0 2px #444); + font-family: 'Century Gothic', 'Helvetica Neue', Helvetica, Arial, sans-serif; + font-size: 1.1em; + font-weight: bold; + text-orientation: mixed; + opacity: 0.85; + padding: 10px 12px; + user-select: none; + writing-mode: vertical-rl; + text-decoration: none; +} +.brand:hover { + color: #fff; + filter: drop-shadow(0 0 2px #333); + opacity: 0.9; + text-decoration: none; +} +.brand::before { + content: ''; + background: url(logo.svg) no-repeat 0 center; + background-size: 26px; + height: 26px; + width: 26px; + margin-bottom: 10px; +} + +/* content area */ +main, +#content { + display: flex; + flex-direction: column; + flex: 1; + min-width: 0; +} + +#content { + position: relative; +} + +#app { + flex: 1; +} + +/* content area */ 
+.content-container { + min-width: 0; + height: 100vh; + display: flex; + flex-direction: column; + flex: 1; +} +.content { + display: flex; + flex-direction: column; + flex: 1; + height: 100vh; + padding: 0; + /* no outer scrollbars */ + overflow: hidden; +} + +/* error bar */ +.error-bar { + max-height: 0; + padding: 0; + background-color: #dd111199; + font-family: monospace; + overflow-y: auto; + transition: max-height 0.3s; + white-space: pre-wrap; +} +.show-error { + max-height: 30%; + padding: 10px; +} + +.top-header { + align-items: center; + background: linear-gradient(to right, #353540, #24202b); + color: #dde4ea; + display: flex; + height: 32px; + justify-content: space-between; + min-height: 32px; + padding-left: 8px; +} +.top-header { + color: #f0f8ff; + font-family: 'Courier New', Courier, monospace; + font-size: 18px; + text-overflow: ellipsis; + white-space: nowrap; + text-decoration-thickness: 1px; +} + +.view-header { + align-items: center; + background-color: #ccc; + color: #444; + display: flex; + gap: 16px; + height: 24px; + padding: 0 8px; + /* all one line */ + text-overflow: ellipsis; + white-space: nowrap; +} +.viewer { + display: flex; + flex: 1; + flex-direction: column; + white-space: pre-wrap; + overflow-y: auto; +} + +/* dropdown */ +.dropdown { + display: inline-block; + position: relative; + text-overflow: ellipsis; + user-select: none; + white-space: nowrap; +} + +.dropdown-button, +.dropdown-button:active, +.dropdown-button:focus, +.dropdown-button:hover { + background: transparent; + border: none; + color: inherit; + cursor: pointer; + font-size: inherit; + height: 24px; + max-width: 300px; + overflow: hidden; +} +/* dropdown caret */ +.dropdown-button::before { + content: "\25bc"; + display: inline-block; + font-size: 10px; + margin-right: 4px; + transform: rotate(-90deg); + transition: transform 0.1s; +} +.open .dropdown-button::before { + transform: rotate(0deg); +} +/* dropdown menu options */ +.dropdown-content { + position: 
absolute; + left: 0; + background-color: #ccc; + border-bottom-left-radius: 6px; + border-bottom-right-radius: 6px; + box-shadow: 0px 8px 8px 0px rgba(0, 0, 0, 0.2); + max-height: 0; + max-width: 200px; + min-width: 120px; + transition: max-height 0.1s ease-out; + overflow-y: hidden; + z-index: 20; +} +.dropdown-content > button { + background: none; + border: none; + padding: 8px 16px; + text-align: left; +} +/* dropdown menu options hover */ +.dropdown-content > button:active, +.dropdown-content > button:focus, +.dropdown-content > button:hover { + background-color: rgba(95, 75, 133, 0.4); +} +/* roll out dropdown menu */ +.open .dropdown-content { + display: flex; + flex-direction: column; + max-height: 170px; +} + +/* welcome */ +#welcome { + position: absolute; + bottom: 0; + top: 0; + right: 0; + left: 0; + border: 2px #777; + border-radius: 10px; + color: #444; + margin: 10px; + padding: 10px; + display: flex; + flex-direction: column; + flex: 1; + font-size: 20px; + justify-content: center; + max-width: 640px; + margin: 0 auto; +} +/* quick link buttons */ +.quick-links { + display: flex; + flex-wrap: wrap; + gap: 10px; + list-style: none; +} +.quick-links li { + display: flex; + flex: 1 1 calc(50% - 10px); + min-width: 0; +} +.quick-links a { + background-position: 10px center; + background-size: 18px; + border: 1px solid #444; + border-radius: 4px; + font-size: 8pt; + overflow: hidden; + padding: 12px; + padding-left: 36px; + text-overflow: ellipsis; + white-space: nowrap; + width: 100%; +} +.quick-links a:hover { + background-color: #cec; +} +.huggingface { + background: url('assets/huggingface.svg') no-repeat 8px center; +} +.github { + background: url('assets/git.svg') no-repeat 8px center; +} +.aws { + background: url('assets/s3.svg') no-repeat 8px center; +} +.azure { + background: url('assets/azure.svg') no-repeat 8px center; +} + +/* file upload */ +input[type="file"] { + display: none; +} +.overlay { + font-size: 125%; + justify-content: center; + 
position: absolute; + top: 0; + bottom: 0; + right: 0; + left: 0; + background-color: rgba(240, 240, 240, 0.6); + backdrop-filter: blur(4px); + display: none; + padding: 12px; + z-index: 40; +} +.over .overlay { + display: flex; +} + +/* table */ +.table-container { + display: flex; + flex-direction: column; + min-height: 0; + flex: 1; + position: relative; +} +.table-scroll { + flex: 1; + overflow: auto; +} +.table-scroll > div { + position: relative; +} +.table-scroll .table { + position: absolute; +} + +table { + border-collapse: separate; + border-spacing: 0; +} +table:focus-visible { + outline: none; +} + +/* header */ +.table thead th { + background-color: #eaeaeb; + border: none; + border-bottom: 2px solid #c9c9c9; + box-sizing: content-box; + color: #444; + height: 20px; + padding-top: 8px; + position: sticky; + top: -1px; /* fix 1px gap above thead */ + user-select: none; + z-index: 10; +} +.table thead th:first-child { + border: none; +} +.table thead th:first-child span { + cursor: default; + width: 0; +} +.table tbody tr:first-child td { + border-top: 1px solid transparent; +} + +/* sortable */ +.table.sortable thead th { + cursor: pointer; +} +.table thead th.orderby ::after { + position: absolute; + right: 8px; + top: 8px; + padding-left: 2px; + background-color: #eaeaeb; + content: "▾"; +} + +/* cells */ +.table th, +.table td { + border-bottom: 1px solid #ddd; + border-right: 1px solid #ddd; + height: 32px; + max-width: 1000px; /* prevent columns expanding */ + padding: 4px 12px; + text-align: left; + text-overflow: ellipsis; + overflow: hidden; + white-space: nowrap; +} + +/* pending cell state */ +.table td.pending { + position: relative; +} +.table td.pending::after { + content: ''; + position: absolute; + top: 8px; + left: 8px; + right: 8px; + bottom: 8px; + border-radius: 4px; + background: linear-gradient( + 60deg, + rgba(0, 0, 0, 0.05) 25%, + rgba(0, 0, 0, 0.08) 50%, + rgba(0, 0, 0, 0.05) 75% + ); + background-size: 120px 100%; + animation: 
textshimmer 3s infinite linear; +} +/* stagger row shimmering */ +.table tr:nth-child(2n) td.pending::after { animation-delay: -1s; } +.table tr:nth-child(2n+1) td.pending::after { animation-delay: -3s; } +.table tr:nth-child(3n) td.pending::after { animation-delay: -2s; } +.table tr:nth-child(5n) td.pending::after { animation-delay: -4s; } +.table tr:nth-child(7n) td.pending::after { animation-delay: -1.5s; } +@keyframes textshimmer { + 0% { + background-position: -120px 0; + } + 100% { + background-position: 120px 0; + } +} + +/* pending table state */ +.table th::before { + content: ''; + position: absolute; + top: 0; + left: 0; + width: 100%; + height: 4px; + background-color: #706fb1; + z-index: 100; +} +.pending .table th::before { + animation: shimmer 2s infinite linear; +} +@keyframes shimmer { + 0%, 100% { background-color: #6fb176; } + 50% { background-color: #adc6b0; } +} + +/* column resize */ +.table thead span { + position: absolute; + border-right: 1px solid #ddd; + top: 0; + right: 0; + bottom: 0; + width: 8px; + cursor: col-resize; + transition: background-color 0.2s ease; +} +.table thead span:hover { + background-color: #aab; +} + +/* row numbers */ +td:first-child { + background-color: #eaeaeb; + border-right: 1px solid #ddd; + color: #888; + font-size: 10px; + padding: 0 2px; + position: sticky; + left: 0; + text-align: center; + user-select: none; + min-width: 32px; + max-width: none; + width: 32px; +} + +/* table corner */ +.table-corner { + background-color: #e4e4e6; + border-right: 1px solid #ccc; + position: absolute; + height: 34px; + width: 32px; + top: 0; + left: 0; + z-index: 15; + box-shadow: inset 0 0 4px rgba(0, 0, 0, 0.2); +} + +/* mock row numbers */ +.mock-row-label { + content: ""; + position: absolute; + top: 0; + left: 0; + bottom: 0; + background: #eaeaeb; + z-index: -10; +} + +#filename { + font-size: 10pt; + margin-top: 20px; +} +.sidebar { + word-break: break-all; +} +.sidebar a { + color: #445; + text-decoration: none; +} 
// Entry point: mount the demo into the element with id "app".
const app = document.getElementById('app')
// getElementById returns null when the element is absent; fail with an explicit
// message instead of handing null to createRoot.
if (!app) throw new Error('missing app element')

// render() needs the element tree to mount: the App component wrapped in
// StrictMode (both imported at the top of this file).
ReactDOM.createRoot(app).render(
  <StrictMode>
    <App />
  </StrictMode>,
)
// In-flight and resolved buffers for URL sources, keyed by the JSON form of the descriptor.
const cache = new Map<string, Promise<AsyncBuffer>>()

/**
 * Turn a serializable AsyncBufferFrom descriptor into an AsyncBuffer.
 *
 * - URL descriptors are opened with asyncBufferFromUrl and wrapped with
 *   cachedAsyncBuffer. The resulting promise is memoized, so repeated calls with
 *   the same descriptor share one buffer.
 * - File descriptors are read with File.arrayBuffer() and are not cached.
 *
 * @param {AsyncBufferFrom} from - descriptor holding either `url` or `file`, plus `byteLength`
 * @returns {Promise<AsyncBuffer>} the buffer for the described source
 */
export function asyncBufferFrom(from: AsyncBufferFrom): Promise<AsyncBuffer> {
  if ('url' in from) {
    // Cached asyncBuffer for urls only
    const key = JSON.stringify(from)
    const cached = cache.get(key)
    if (cached) return cached
    const asyncBuffer = asyncBufferFromUrl(from.url, from.byteLength).then(cachedAsyncBuffer)
    cache.set(key, asyncBuffer)
    // Do not keep a failed attempt: without this, one rejected request would be
    // returned from the cache on every later call for the same descriptor.
    // The caller still receives the rejection through the returned promise.
    asyncBuffer.catch(() => {
      if (cache.get(key) === asyncBuffer) cache.delete(key)
    })
    return asyncBuffer
  } else {
    return from.file.arrayBuffer()
  }
}
/**
 * Return the shared parquet worker, creating it and installing its message
 * router on first use.
 *
 * Every message from the worker carries the queryId of the request it answers.
 * The matching entry in `pending` provides the callbacks to invoke:
 * - `error` rejects the query's promise,
 * - `result` resolves it,
 * - `chunk` is passed to the optional onChunk callback and leaves the query pending.
 *
 * @returns {Worker} the worker instance, reused across calls
 */
function getWorker(): Worker {
  if (worker) return worker

  const created = new ParquetWorker()
  created.onmessage = ({ data }: { data: ParquetMessage }) => {
    const { queryId } = data
    const pendingQueryAgent = pending.get(queryId)
    if (!pendingQueryAgent) {
      console.warn(
        `Unexpected: no pending promise found for queryId: ${queryId.toString()}`,
      )
      return
    }
    const { resolve, reject, onChunk } = pendingQueryAgent
    if ('error' in data) {
      // Terminal message: drop the entry so settled queries (and the callbacks
      // they hold) do not pile up in the map.
      pending.delete(queryId)
      reject(data.error)
    } else if ('result' in data) {
      pending.delete(queryId)
      resolve(data.result)
    } else if ('chunk' in data) {
      // Intermediate message: keep the entry for later messages of this query.
      onChunk?.(data.chunk)
    } else {
      pending.delete(queryId)
      reject(new Error('Unexpected message from worker'))
    }
  }
  worker = created
  return created
}
/**
 * Run a parquet query in the shared worker instead of on the main thread.
 *
 * The options mirror those the worker passes on to hyparquet's parquetQuery,
 * except that the file is described by an AsyncBufferFrom rather than an
 * AsyncBuffer, because the options have to be posted to the worker.
 *
 * @param {ParquetReadWorkerOptions} options - query options; `onChunk` stays on
 *   this side and is invoked for every chunk message the worker sends back
 * @returns {Promise<Row[]>} resolves with the result posted by the worker and
 *   rejects with the error it reports
 */
export function parquetQueryWorker(
  { metadata, from, rowStart, rowEnd, orderBy, columns, onChunk }: ParquetReadWorkerOptions,
): Promise<Row[]> {
  return new Promise((resolve, reject) => {
    const queryId = nextQueryId++
    pending.set(queryId, { resolve, reject, onChunk })
    const worker = getWorker()

    // The callback itself is not posted; only a flag is sent, telling the
    // worker whether to post chunks as they are parsed.
    const chunks = onChunk !== undefined
    // `columns` must be part of the message: the worker reads it from the
    // message data and would otherwise always see it as undefined.
    worker.postMessage({
      queryId, metadata, from, rowStart, rowEnd, orderBy, columns, chunks,
    })
  })
}
// Vite configuration for this app. Option reference: https://vite.dev/config/
export default defineConfig({
  // `react` is the default export of @vitejs/plugin-react, imported at the top of this file.
  plugins: [react()],
})