Skip to content

Commit 80dce43

Browse files
committed
Replace parquet.js with hyparquet
1 parent df4bf75 commit 80dce43

File tree

3 files changed, with 20 additions and 119 deletions.

package.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,14 @@
3232
"@swc/jest": "^0.2.29",
3333
"@types/jest": "^29.4.0",
3434
"@types/node": "^20.17.6",
35-
"@types/parquetjs": "^0.10.6",
3635
"@typescript-eslint/eslint-plugin": "8.31.1",
3736
"@typescript-eslint/parser": "8.31.1",
3837
"eslint": "^9.39.1",
3938
"eslint-plugin-prettier": "^5.4.1",
4039
"eslint-plugin-unused-imports": "^4.1.4",
40+
"hyparquet": "1.14.0",
4141
"iconv-lite": "^0.6.3",
4242
"jest": "^29.4.0",
43-
"parquetjs": "^0.11.2",
4443
"prettier": "^3.0.0",
4544
"publint": "^0.2.12",
4645
"ts-jest": "^29.1.0",

src/lib/check-file.ts

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -642,17 +642,6 @@ async function _check_parquet(
642642
file: string,
643643
purpose: FilePurpose | string,
644644
): Promise<Partial<CheckFileReport>> {
645-
let ParquetReader: any;
646-
try {
647-
// ParquetJS is optional as it's large and isn't compatible with older systems.
648-
const pkg = await import('parquetjs');
649-
ParquetReader = pkg.ParquetReader;
650-
} catch {
651-
throw new Error(
652-
'parquetjs is not installed and is required to use parquet files. Please install it via `npm install parquetjs`',
653-
);
654-
}
655-
656645
const report_dict: Partial<CheckFileReport> = {};
657646

658647
if (purpose === 'eval') {
@@ -662,14 +651,24 @@ async function _check_parquet(
662651
}
663652

664653
try {
665-
const reader = await ParquetReader.openFile(file);
666-
const schema = reader.schema;
667-
const column_names = Object.keys(schema.fields);
654+
let column_names: string[] = [];
655+
let num_samples: number = 0;
656+
try {
657+
const { asyncBufferFromFile, parquetMetadataAsync, parquetSchema } = await import('hyparquet');
658+
const asyncBuffer = await asyncBufferFromFile(file);
659+
const metadata = await parquetMetadataAsync(asyncBuffer);
660+
const { children } = parquetSchema(metadata);
661+
column_names = children.map((child: any) => child.element.name);
662+
num_samples = Number(metadata.num_rows);
663+
} catch {
664+
throw new Error(
665+
'hyparquet is not installed and is required to use parquet files. Please install it via `npm install hyparquet`',
666+
);
667+
}
668668

669669
if (!column_names.includes('input_ids')) {
670670
report_dict.load_parquet = `Parquet file ${file} does not contain the \`input_ids\` column.`;
671671
report_dict.is_check_passed = false;
672-
await reader.close();
673672
return report_dict;
674673
}
675674

@@ -679,24 +678,19 @@ async function _check_parquet(
679678
', ',
680679
)} are supported.`;
681680
report_dict.is_check_passed = false;
682-
await reader.close();
683681
return report_dict;
684682
}
685683
}
686684

687-
const num_samples = reader.getRowCount() as number;
688-
689685
if (num_samples < MIN_SAMPLES) {
690686
report_dict.has_min_samples = false;
691687
report_dict.message = `Processing ${file} resulted in only ${num_samples} samples. Our minimum is ${MIN_SAMPLES} samples. `;
692688
report_dict.is_check_passed = false;
693-
await reader.close();
694689
return report_dict;
695690
} else {
696691
report_dict.num_samples = num_samples;
697692
}
698693

699-
await reader.close();
700694
report_dict.is_check_passed = true;
701695
} catch (e) {
702696
const errorMessage = e instanceof Error ? e.message : String(e);

yarn.lock

Lines changed: 5 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -937,13 +937,6 @@
937937
resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841"
938938
integrity sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==
939939

940-
"@types/node-int64@*":
941-
version "0.4.32"
942-
resolved "https://registry.yarnpkg.com/@types/node-int64/-/node-int64-0.4.32.tgz#a540bcb9e48816ca1b5329d1ab907d6ad134b856"
943-
integrity sha512-xf/JsSlnXQ+mzvc0IpXemcrO4BrCfpgNpMco+GLcXkFk01k/gW9lGJu+Vof0ZSvHK6DsHJDPSbjFPs36QkWXqw==
944-
dependencies:
945-
"@types/node" "*"
946-
947940
"@types/node@*":
948941
version "20.10.5"
949942
resolved "https://registry.yarnpkg.com/@types/node/-/node-20.10.5.tgz#47ad460b514096b7ed63a1dae26fad0914ed3ab2"
@@ -958,13 +951,6 @@
958951
dependencies:
959952
undici-types "~6.21.0"
960953

961-
"@types/parquetjs@^0.10.6":
962-
version "0.10.6"
963-
resolved "https://registry.yarnpkg.com/@types/parquetjs/-/parquetjs-0.10.6.tgz#7e4b54d9d336a8dda9c7a9091ec7f60db98744af"
964-
integrity sha512-ZCsD6j97YD0mGU8/VnVs3NjORXa7zeHvqlpJpCqy4jU8a1O21dalL+MFn9QNbdEfy8rszR1N7NHeT7/LdtHf+A==
965-
dependencies:
966-
"@types/node-int64" "*"
967-
968954
"@types/stack-utils@^2.0.0":
969955
version "2.0.3"
970956
resolved "https://registry.yarnpkg.com/@types/stack-utils/-/stack-utils-2.0.3.tgz#6209321eb2c1712a7e7466422b8cb1fc0d9dd5d8"
@@ -1244,16 +1230,6 @@ balanced-match@^1.0.0:
12441230
resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.2.tgz#e83e3a7e3f300b34cb9d87f615fa0cbf357690ee"
12451231
integrity sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==
12461232

1247-
base64-js@^1.1.2:
1248-
version "1.5.1"
1249-
resolved "https://registry.yarnpkg.com/base64-js/-/base64-js-1.5.1.tgz#1b1b440160a5bf7ad40b650f095963481903930a"
1250-
integrity sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==
1251-
1252-
bindings@~1.2.1:
1253-
version "1.2.1"
1254-
resolved "https://registry.yarnpkg.com/bindings/-/bindings-1.2.1.tgz#14ad6113812d2d37d72e67b4cacb4bb726505f11"
1255-
integrity sha512-u4cBQNepWxYA55FunZSM7wMi55yQaN0otnhhilNoWHq0MfOfJeQx0v0mRRpolGOExPjZcl6FtB0BB8Xkb88F0g==
1256-
12571233
brace-expansion@^1.1.7:
12581234
version "1.1.11"
12591235
resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd"
@@ -1276,13 +1252,6 @@ braces@^3.0.3:
12761252
dependencies:
12771253
fill-range "^7.1.1"
12781254

1279-
brotli@^1.3.0:
1280-
version "1.3.3"
1281-
resolved "https://registry.yarnpkg.com/brotli/-/brotli-1.3.3.tgz#7365d8cc00f12cf765d2b2c898716bcf4b604d48"
1282-
integrity sha512-oTKjJdShmDuGW94SyyaoQvAjf30dZaHnjJ8uAF+u2/vGJkJbJPJAT1gDiOJP5v1Zb6f9KEyW/1HpuaWIXtGHPg==
1283-
dependencies:
1284-
base64-js "^1.1.2"
1285-
12861255
browserslist@^4.22.2:
12871256
version "4.22.2"
12881257
resolved "https://registry.yarnpkg.com/browserslist/-/browserslist-4.22.2.tgz#704c4943072bd81ea18997f3bd2180e89c77874b"
@@ -1307,11 +1276,6 @@ bser@2.1.1:
13071276
dependencies:
13081277
node-int64 "^0.4.0"
13091278

1310-
bson@^1.0.4:
1311-
version "1.1.6"
1312-
resolved "https://registry.yarnpkg.com/bson/-/bson-1.1.6.tgz#fb819be9a60cd677e0853aee4ca712a785d6618a"
1313-
integrity sha512-EvVNVeGo4tHxwi8L6bPj3y3itEvStdwvvlojVxxbyYfoaxJ6keLgrTuKdyfEAszFK+H3olzBuafE0yoh0D1gdg==
1314-
13151279
buffer-from@^1.0.0:
13161280
version "1.1.2"
13171281
resolved "https://registry.yarnpkg.com/buffer-from/-/buffer-from-1.1.2.tgz#2b146a6fd72e80b4f55d255f35ed59a3a9a41bd5"
@@ -1971,6 +1935,11 @@ human-signals@^2.1.0:
19711935
resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0"
19721936
integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==
19731937

1938+
hyparquet@1.14.0:
1939+
version "1.14.0"
1940+
resolved "https://registry.yarnpkg.com/hyparquet/-/hyparquet-1.14.0.tgz#9339d06dc52ee9edc606e74ce6d65c32ff2ed50f"
1941+
integrity sha512-qhDmkQwDrpd+7UESp0gkDoCgJ3m2uyy754Xm49xzZnn49FEvNC2Sm2/oKhbSkmfs0rNepcMh5E2KUiRKE64N0w==
1942+
19741943
iconv-lite@^0.6.3:
19751944
version "0.6.3"
19761945
resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.6.3.tgz#a52f80bf38da1952eb5c681790719871a1a72501"
@@ -2029,11 +1998,6 @@ inherits@2, inherits@^2.0.3:
20291998
resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c"
20301999
integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
20312000

2032-
int53@^0.2.4:
2033-
version "0.2.4"
2034-
resolved "https://registry.yarnpkg.com/int53/-/int53-0.2.4.tgz#5ed8d7aad6c5c6567cae69aa7ffc4a109ee80f86"
2035-
integrity sha512-a5jlKftS7HUOhkUyYD7j2sJ/ZnvWiNlZS1ldR+g1ifQ+/UuZXIE+YTc/lK1qGj/GwAU5F8Z0e1eVq2t1J5Ob2g==
2036-
20372001
is-arrayish@^0.2.1:
20382002
version "0.2.1"
20392003
resolved "https://registry.yarnpkg.com/is-arrayish/-/is-arrayish-0.2.1.tgz#77c99840527aa8ecb1a8ba697b80645a7a926a9d"
@@ -2622,13 +2586,6 @@ lru-cache@^6.0.0:
26222586
dependencies:
26232587
yallist "^4.0.0"
26242588

2625-
lzo@^0.4.0:
2626-
version "0.4.11"
2627-
resolved "https://registry.yarnpkg.com/lzo/-/lzo-0.4.11.tgz#0e76d582567b29e285cb84a6aa392cb94c6283f8"
2628-
integrity sha512-apQHNoW2Alg72FMqaC/7pn03I7umdgSVFt2KRkCXXils4Z9u3QBh1uOtl2O5WmZIDLd9g6Lu4lIdOLmiSTFVCQ==
2629-
dependencies:
2630-
bindings "~1.2.1"
2631-
26322589
make-dir@^4.0.0:
26332590
version "4.0.0"
26342591
resolved "https://registry.yarnpkg.com/make-dir/-/make-dir-4.0.0.tgz#c3c2307a771277cd9638305f915c29ae741b614e"
@@ -2803,11 +2760,6 @@ object-assign@^4.0.1:
28032760
resolved "https://registry.yarnpkg.com/object-assign/-/object-assign-4.1.1.tgz#2109adc7965887cfc05cbbd442cac8bfbb360863"
28042761
integrity sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==
28052762

2806-
object-stream@0.0.1:
2807-
version "0.0.1"
2808-
resolved "https://registry.yarnpkg.com/object-stream/-/object-stream-0.0.1.tgz#3a03a26e94fd112c9abffeb4651e07a5e23cf840"
2809-
integrity sha512-+NPJnRvX9RDMRY9mOWOo/NDppBjbZhXirNNSu2IBnuNboClC9h1ZGHXgHBLDbJMHsxeJDq922aVmG5xs24a/cA==
2810-
28112763
once@^1.3.0:
28122764
version "1.4.0"
28132765
resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1"
@@ -2888,21 +2840,6 @@ parent-module@^1.0.0:
28882840
dependencies:
28892841
callsites "^3.0.0"
28902842

2891-
parquetjs@^0.11.2:
2892-
version "0.11.2"
2893-
resolved "https://registry.yarnpkg.com/parquetjs/-/parquetjs-0.11.2.tgz#ea13221b3583cb1277f8b4b879776420f8863660"
2894-
integrity sha512-Y6FOc3Oi2AxY4TzJPz7fhICCR8tQNL3p+2xGQoUAMbmlJBR7+JJmMrwuyMjIpDiM7G8Wj/8oqOH4UDUmu4I5ZA==
2895-
dependencies:
2896-
brotli "^1.3.0"
2897-
bson "^1.0.4"
2898-
int53 "^0.2.4"
2899-
object-stream "0.0.1"
2900-
snappyjs "^0.6.0"
2901-
thrift "^0.11.0"
2902-
varint "^5.0.0"
2903-
optionalDependencies:
2904-
lzo "^0.4.0"
2905-
29062843
parse-json@^5.2.0:
29072844
version "5.2.0"
29082845
resolved "https://registry.yarnpkg.com/parse-json/-/parse-json-5.2.0.tgz#c76fc66dee54231c962b22bcc8a72cf2f99753cd"
@@ -3030,11 +2967,6 @@ pure-rand@^6.0.0:
30302967
resolved "https://registry.yarnpkg.com/pure-rand/-/pure-rand-6.0.4.tgz#50b737f6a925468679bff00ad20eade53f37d5c7"
30312968
integrity sha512-LA0Y9kxMYv47GIPJy6MI84fqTd2HmYZI83W/kM/SkKfDlajnZYfmXFTxkbY+xSBPkLJxltMa9hIkmdc29eguMA==
30322969

3033-
q@^1.5.0:
3034-
version "1.5.1"
3035-
resolved "https://registry.yarnpkg.com/q/-/q-1.5.1.tgz#7e32f75b41381291d04611f1bf14109ac00651d7"
3036-
integrity sha512-kV/CThkXo6xyFEZUugw/+pIOywXcDbFYgSct5cT3gqlbkBE1SJdwy6UQoZvodiWF/ckQLZyDE/Bu1M6gVu5lVw==
3037-
30382970
queue-microtask@^1.2.2:
30392971
version "1.2.3"
30402972
resolved "https://registry.yarnpkg.com/queue-microtask/-/queue-microtask-1.2.3.tgz#4929228bbc724dfac43e0efb058caf7b6cfb6243"
@@ -3175,11 +3107,6 @@ slash@^3.0.0:
31753107
resolved "https://registry.yarnpkg.com/slash/-/slash-3.0.0.tgz#6539be870c165adbd5240220dbe361f1bc4d4634"
31763108
integrity sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==
31773109

3178-
snappyjs@^0.6.0:
3179-
version "0.6.1"
3180-
resolved "https://registry.yarnpkg.com/snappyjs/-/snappyjs-0.6.1.tgz#9bca9ff8c54b133a9cc84a71d22779e97fc51878"
3181-
integrity sha512-YIK6I2lsH072UE0aOFxxY1dPDCS43I5ktqHpeAsuLNYWkE5pGxRGWfDM4/vSUfNzXjC1Ivzt3qx31PCLmc9yqg==
3182-
31833110
source-map-support@0.5.13:
31843111
version "0.5.13"
31853112
resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.13.tgz#31b24a9c2e73c2de85066c0feb7d44767ed52932"
@@ -3332,15 +3259,6 @@ thenify-all@^1.0.0:
33323259
dependencies:
33333260
any-promise "^1.0.0"
33343261

3335-
thrift@^0.11.0:
3336-
version "0.11.0"
3337-
resolved "https://registry.yarnpkg.com/thrift/-/thrift-0.11.0.tgz#256115e4ff87871e12537f4b510bd2b425e13990"
3338-
integrity sha512-UpsBhOC45a45TpeHOXE4wwYwL8uD2apbHTbtBvkwtUU4dNwCjC7DpQTjw2Q6eIdfNtw+dKthdwq94uLXTJPfFw==
3339-
dependencies:
3340-
node-int64 "^0.4.0"
3341-
q "^1.5.0"
3342-
ws ">= 2.2.3"
3343-
33443262
tmpl@1.0.5:
33453263
version "1.0.5"
33463264
resolved "https://registry.yarnpkg.com/tmpl/-/tmpl-1.0.5.tgz#8683e0b902bb9c20c4f726e3c0b69f36518c07cc"
@@ -3515,11 +3433,6 @@ validate-npm-package-name@^5.0.0:
35153433
resolved "https://registry.yarnpkg.com/validate-npm-package-name/-/validate-npm-package-name-5.0.1.tgz#a316573e9b49f3ccd90dbb6eb52b3f06c6d604e8"
35163434
integrity sha512-OljLrQ9SQdOUqTaQxqL5dEfZWrXExyyWsozYlAWFawPVNuD83igl7uJD2RTkNMbniIYgt8l81eCJGIdQF7avLQ==
35173435

3518-
varint@^5.0.0:
3519-
version "5.0.2"
3520-
resolved "https://registry.yarnpkg.com/varint/-/varint-5.0.2.tgz#5b47f8a947eb668b848e034dcfa87d0ff8a7f7a4"
3521-
integrity sha512-lKxKYG6H03yCZUpAGOPOsMcGxd1RHCu1iKvEHYDPmTyq2HueGhD73ssNBqqQWfvYs04G9iUFRvmAVLW20Jw6ow==
3522-
35233436
walker@^1.0.8:
35243437
version "1.0.8"
35253438
resolved "https://registry.yarnpkg.com/walker/-/walker-1.0.8.tgz#bd498db477afe573dc04185f011d3ab8a8d7653f"
@@ -3556,11 +3469,6 @@ write-file-atomic@^4.0.2:
35563469
imurmurhash "^0.1.4"
35573470
signal-exit "^3.0.7"
35583471

3559-
"ws@>= 2.2.3":
3560-
version "8.18.0"
3561-
resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.0.tgz#0d7505a6eafe2b0e712d232b42279f53bc289bbc"
3562-
integrity sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==
3563-
35643472
y18n@^5.0.5:
35653473
version "5.0.8"
35663474
resolved "https://registry.yarnpkg.com/y18n/-/y18n-5.0.8.tgz#7f4934d0f7ca8c56f95314939ddcd2dd91ce1d55"

0 commit comments

Comments
 (0)