From 0ace93e63641b03158d6bbb992436f1aa87f5e76 Mon Sep 17 00:00:00 2001 From: Lovisa Berggren Date: Tue, 28 Jan 2025 16:10:07 +0000 Subject: [PATCH] CLOUDP-297242: Fix parquet file creation for IPA metrics --- .gitignore | 2 +- package-lock.json | 9 ++++++++- package.json | 3 ++- tools/spectral/ipa/metrics/config.js | 5 ++++- tools/spectral/ipa/metrics/metricS3Upload.js | 14 ++++---------- .../ipa/metrics/scripts/runMetricCollection.js | 10 +++++++++- 6 files changed, 28 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 301315a9e6..032ea459e3 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,6 @@ *.out **/*ipa-collector-results-combined.log -**/*metric-collection-results.json +**/*metric-collection-results.parquet **/*spectral-output.txt **/*spectral-report.xml diff --git a/package-lock.json b/package-lock.json index d037494caa..eea55bc081 100644 --- a/package-lock.json +++ b/package-lock.json @@ -15,7 +15,8 @@ "apache-arrow": "^19.0.0", "dotenv": "^16.4.7", "eslint-plugin-jest": "^28.10.0", - "openapi-to-postmanv2": "4.24.0" + "openapi-to-postmanv2": "4.24.0", + "parquet-wasm": "^0.6.1" }, "devDependencies": { "@babel/preset-env": "^7.26.0", @@ -9406,6 +9407,12 @@ "node": ">=6" } }, + "node_modules/parquet-wasm": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/parquet-wasm/-/parquet-wasm-0.6.1.tgz", + "integrity": "sha512-wTM/9Y4EHny8i0qgcOlL9UHsTXftowwCqDsAD8axaZbHp0Opp3ue8oxexbzTVNhqBjFhyhLiU3MT0rnEYnYU0Q==", + "license": "MIT OR Apache-2.0" + }, "node_modules/parse-json": { "version": "5.2.0", "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", diff --git a/package.json b/package.json index 944d10bd3f..93795afba4 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,8 @@ "apache-arrow": "^19.0.0", "dotenv": "^16.4.7", "eslint-plugin-jest": "^28.10.0", - "openapi-to-postmanv2": "4.24.0" + "openapi-to-postmanv2": "4.24.0", + "parquet-wasm": "^0.6.1" }, "devDependencies": { "@babel/preset-env": "^7.26.0", diff --git a/tools/spectral/ipa/metrics/config.js b/tools/spectral/ipa/metrics/config.js index 972129237f..bb9ac4894e 100644 --- a/tools/spectral/ipa/metrics/config.js +++ b/tools/spectral/ipa/metrics/config.js @@ -11,6 +11,9 @@ const config = { defaultOutputsDir: path.join(dirname, 'outputs'), }; -config.defaultMetricCollectionResultsFilePath = path.join(config.defaultOutputsDir, 'metric-collection-results.json'); +config.defaultMetricCollectionResultsFilePath = path.join( + config.defaultOutputsDir, + 'metric-collection-results.parquet' +); export default config; diff --git a/tools/spectral/ipa/metrics/metricS3Upload.js b/tools/spectral/ipa/metrics/metricS3Upload.js index e4645aefa3..b3787322c1 100644 --- a/tools/spectral/ipa/metrics/metricS3Upload.js +++ b/tools/spectral/ipa/metrics/metricS3Upload.js @@ -2,7 +2,6 @@ import { PutObjectCommand, S3ServiceException } from '@aws-sdk/client-s3'; import config from './config.js'; import path from 'path'; import fs from 'node:fs'; -import { tableFromJSON, tableToIPC } from 'apache-arrow'; import { getS3Client, getS3FilePath } from './utils/dataDumpUtils.js'; /** @@ -11,14 +10,9 @@ import { getS3Client, getS3FilePath } from './utils/dataDumpUtils.js'; */ export async function uploadMetricCollectionDataToS3(filePath = config.defaultMetricCollectionResultsFilePath) { console.log('Loading metrics collection data from', filePath); - const metricsCollectionData = JSON.parse(fs.readFileSync(filePath, 'utf8')); - if (metricsCollectionData === undefined || metricsCollectionData.length === 0) { - throw new Error('Loaded metrics collection data is empty'); - } - - const table = tableFromJSON(metricsCollectionData); - if (table === undefined) { - throw new Error('Unable to transform metrics collection data to table'); + const metricsData = await fs.readFileSync(filePath); + if (metricsData === undefined) { + throw new Error('Loaded metrics collection data is undefined'); } try { @@ -32,7 +26,7 @@ export async function uploadMetricCollectionDataToS3(filePath = config.defaultMe const command = new PutObjectCommand({ Bucket: s3fileProps.bucketName, Key: path.join(s3fileProps.key, formattedDate, 'metric-collection-results.parquet'), - Body: tableToIPC(table, 'stream'), + Body: metricsData, }); console.log('Dumping data to S3...'); diff --git a/tools/spectral/ipa/metrics/scripts/runMetricCollection.js b/tools/spectral/ipa/metrics/scripts/runMetricCollection.js index b0bc971f86..cd9a2af601 100644 --- a/tools/spectral/ipa/metrics/scripts/runMetricCollection.js +++ b/tools/spectral/ipa/metrics/scripts/runMetricCollection.js @@ -1,6 +1,8 @@ import fs from 'node:fs'; import { spawnSync } from 'child_process'; import spectral from '@stoplight/spectral-core'; +import { Compression, Table, writeParquet, WriterPropertiesBuilder } from 'parquet-wasm'; +import { tableFromJSON, tableToIPC } from 'apache-arrow'; import config from '../config.js'; import { runMetricCollectionJob } from '../metricCollection.js'; @@ -51,6 +53,12 @@ runMetricCollectionJob( ) .then((results) => { console.log('Writing results'); - fs.writeFileSync(config.defaultMetricCollectionResultsFilePath, JSON.stringify(results)); + const table = tableFromJSON(results); + const wasmTable = Table.fromIPCStream(tableToIPC(table, 'stream')); + const parquetUint8Array = writeParquet( + wasmTable, + new WriterPropertiesBuilder().setCompression(Compression.GZIP).build() + ); + fs.writeFileSync(config.defaultMetricCollectionResultsFilePath, parquetUint8Array); }) .catch((error) => console.error(error.message));