Skip to content

Commit 2986f19

Browse files
authored
refactor(base-driver): Lazy loading for SDKs (improve memory usage + speedup start) (#10190)
So, I've figured out that we load all SDKs at start of the program, more interesting, these SDKs can be unused for a lot of deployments. The biggest problem, that it slow down the startup time (especially in the cloud, because it has overlay-fs as protection to allow only readonly operations). On another size, $1 is $1, 10mb is 10mb. So, first I've make Azure lazy: ``` 39mb compiled code -> 35mb (azure lazy) 27mb strings -> 22,8 mb (azure lazy) ``` AWS lazy: ``` 35mb compiled code -> 32,6 (aws sdk) 22,8mb strings -> 21,6 (aws sdk) ``` And finally Google CS: ``` 32,6mb compiled -> 30,1 mb (gcs) 21,6mb strings -> 19,58mb (gcs) ```
1 parent cd2e341 commit 2986f19

File tree

4 files changed

+235
-193
lines changed

4 files changed

+235
-193
lines changed

packages/cubejs-base-driver/src/BaseDriver.ts

Lines changed: 11 additions & 193 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,6 @@ import {
1515
isSslCert,
1616
} from '@cubejs-backend/shared';
1717
import fs from 'fs';
18-
import { getSignedUrl } from '@aws-sdk/s3-request-presigner';
19-
import { S3, GetObjectCommand, S3ClientConfig } from '@aws-sdk/client-s3';
20-
import { Storage } from '@google-cloud/storage';
21-
import {
22-
BlobServiceClient,
23-
StorageSharedKeyCredential,
24-
ContainerSASPermissions,
25-
SASProtocol,
26-
generateBlobSASQueryParameters,
27-
} from '@azure/storage-blob';
28-
import {
29-
DefaultAzureCredential,
30-
ClientSecretCredential,
31-
} from '@azure/identity';
3218

3319
import { cancelCombinator } from './utils';
3420
import {
@@ -57,44 +43,10 @@ import {
5743
InformationSchemaColumn,
5844
} from './driver.interface';
5945

60-
/**
61-
* @see {@link DefaultAzureCredential} constructor options
62-
*/
63-
export type AzureStorageClientConfig = {
64-
azureKey?: string,
65-
sasToken?: string,
66-
/**
67-
* The client ID of a Microsoft Entra app registration.
68-
* In case of DefaultAzureCredential flow if it is omitted
69-
* the Azure library will try to use the AZURE_CLIENT_ID env
70-
*/
71-
clientId?: string,
72-
/**
73-
* ID of the application's Microsoft Entra tenant. Also called its directory ID.
74-
* In case of DefaultAzureCredential flow if it is omitted
75-
* the Azure library will try to use the AZURE_TENANT_ID env
76-
*/
77-
tenantId?: string,
78-
/**
79-
* Azure service principal client secret.
80-
* Enables authentication to Microsoft Entra ID using a client secret that was generated
81-
* for an App Registration. More information on how to configure a client secret can be found here:
82-
* https://learn.microsoft.com/entra/identity-platform/quickstart-configure-app-access-web-apis#add-credentials-to-your-web-application
83-
* In case of DefaultAzureCredential flow if it is omitted
84-
* the Azure library will try to use the AZURE_CLIENT_SECRET env
85-
*/
86-
clientSecret?: string,
87-
/**
88-
* The path to a file containing a Kubernetes service account token that authenticates the identity.
89-
* In case of DefaultAzureCredential flow if it is omitted
90-
* the Azure library will try to use the AZURE_FEDERATED_TOKEN_FILE env
91-
*/
92-
tokenFilePath?: string,
93-
};
94-
95-
export type GoogleStorageClientConfig = {
96-
credentials: any,
97-
};
46+
// Import only types, because these SDKs are quite large and should be loaded lazily
47+
import type { AzureStorageClientConfig } from './storage-fs/azure.fs';
48+
import type { S3StorageClientConfig } from './storage-fs/aws.fs';
49+
import type { GoogleStorageClientConfig } from './storage-fs/gcs.fs';
9850

9951
export type ParsedBucketUrl = {
10052
/**
@@ -800,38 +752,12 @@ export abstract class BaseDriver implements DriverInterface {
800752
* Returns an array of signed AWS S3 URLs of the unloaded csv files.
801753
*/
802754
protected async extractUnloadedFilesFromS3(
  clientOptions: S3StorageClientConfig,
  bucketName: string,
  prefix: string
): Promise<string[]> {
  // Lazy loading, because it's using the AWS SDK, which is quite heavy.
  // (NOTE: previous comment said "azure SDK" — copy-paste mistake; this path loads the AWS SDK.)
  return (await import('./storage-fs/aws.fs')).extractUnloadedFilesFromS3(clientOptions, bucketName, prefix);
}
836762

837763
/**
@@ -842,124 +768,16 @@ export abstract class BaseDriver implements DriverInterface {
842768
bucketName: string,
843769
tableName: string
844770
): Promise<string[]> {
845-
const storage = new Storage(
846-
gcsConfig.credentials
847-
? { credentials: gcsConfig.credentials, projectId: gcsConfig.credentials.project_id }
848-
: undefined
849-
);
850-
const bucket = storage.bucket(bucketName);
851-
const [files] = await bucket.getFiles({ prefix: `${tableName}/` });
852-
if (files.length) {
853-
const csvFiles = await Promise.all(files.map(async (file) => {
854-
const [url] = await file.getSignedUrl({
855-
action: 'read',
856-
expires: new Date(new Date().getTime() + 60 * 60 * 1000)
857-
});
858-
return url;
859-
}));
860-
861-
return csvFiles;
862-
} else {
863-
throw new Error('No CSV files were obtained from the bucket');
864-
}
771+
// Lazy loading, because it's using the Google Cloud Storage SDK, which is quite heavy.
772+
return (await import('./storage-fs/gcs.fs')).extractFilesFromGCS(gcsConfig, bucketName, tableName);
865773
}
866774

867775
protected async extractFilesFromAzure(
868776
azureConfig: AzureStorageClientConfig,
869777
bucketName: string,
870778
tableName: string
871779
): Promise<string[]> {
872-
const splitter = bucketName.includes('blob.core') ? '.blob.core.windows.net/' : '.dfs.core.windows.net/';
873-
const parts = bucketName.split(splitter);
874-
const account = parts[0];
875-
const container = parts[1].split('/')[0];
876-
let credential: StorageSharedKeyCredential | ClientSecretCredential | DefaultAzureCredential;
877-
let blobServiceClient: BlobServiceClient;
878-
let getSas;
879-
880-
if (azureConfig.azureKey) {
881-
credential = new StorageSharedKeyCredential(account, azureConfig.azureKey);
882-
getSas = async (name: string, startsOn: Date, expiresOn: Date) => generateBlobSASQueryParameters(
883-
{
884-
containerName: container,
885-
blobName: name,
886-
permissions: ContainerSASPermissions.parse('r'),
887-
startsOn,
888-
expiresOn,
889-
protocol: SASProtocol.Https,
890-
version: '2020-08-04',
891-
},
892-
credential as StorageSharedKeyCredential
893-
).toString();
894-
} else if (azureConfig.clientSecret && azureConfig.tenantId && azureConfig.clientId) {
895-
credential = new ClientSecretCredential(
896-
azureConfig.tenantId,
897-
azureConfig.clientId,
898-
azureConfig.clientSecret,
899-
);
900-
getSas = async (name: string, startsOn: Date, expiresOn: Date) => {
901-
const userDelegationKey = await blobServiceClient.getUserDelegationKey(startsOn, expiresOn);
902-
return generateBlobSASQueryParameters(
903-
{
904-
containerName: container,
905-
blobName: name,
906-
permissions: ContainerSASPermissions.parse('r'),
907-
startsOn,
908-
expiresOn,
909-
protocol: SASProtocol.Https,
910-
version: '2020-08-04',
911-
},
912-
userDelegationKey,
913-
account
914-
).toString();
915-
};
916-
} else {
917-
const opts = {
918-
tenantId: azureConfig.tenantId,
919-
clientId: azureConfig.clientId,
920-
tokenFilePath: azureConfig.tokenFilePath,
921-
};
922-
credential = new DefaultAzureCredential(opts);
923-
getSas = async (name: string, startsOn: Date, expiresOn: Date) => {
924-
// getUserDelegationKey works only for authorization with Microsoft Entra ID
925-
const userDelegationKey = await blobServiceClient.getUserDelegationKey(startsOn, expiresOn);
926-
return generateBlobSASQueryParameters(
927-
{
928-
containerName: container,
929-
blobName: name,
930-
permissions: ContainerSASPermissions.parse('r'),
931-
startsOn,
932-
expiresOn,
933-
protocol: SASProtocol.Https,
934-
version: '2020-08-04',
935-
},
936-
userDelegationKey,
937-
account,
938-
).toString();
939-
};
940-
}
941-
942-
const url = `https://${account}.blob.core.windows.net`;
943-
blobServiceClient = azureConfig.sasToken ?
944-
new BlobServiceClient(`${url}?${azureConfig.sasToken}`) :
945-
new BlobServiceClient(url, credential);
946-
947-
const csvFiles: string[] = [];
948-
const containerClient = blobServiceClient.getContainerClient(container);
949-
const blobsList = containerClient.listBlobsFlat({ prefix: `${tableName}` });
950-
for await (const blob of blobsList) {
951-
if (blob.name && (blob.name.endsWith('.csv.gz') || blob.name.endsWith('.csv'))) {
952-
const starts = new Date();
953-
const expires = new Date(starts.valueOf() + 1000 * 60 * 60);
954-
const sas = await getSas(blob.name, starts, expires);
955-
csvFiles.push(`${url}/${container}/${blob.name}?${sas}`);
956-
}
957-
}
958-
959-
if (csvFiles.length === 0) {
960-
throw new Error('No CSV files were obtained from the bucket');
961-
}
962-
963-
return csvFiles;
780+
// Lazy loading, because it's using azure SDK, which is quite (extremely) heavy.
781+
return (await import('./storage-fs/azure.fs')).extractFilesFromAzure(azureConfig, bucketName, tableName);
964782
}
965783
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import { getSignedUrl } from '@aws-sdk/s3-request-presigner';
2+
import { S3, GetObjectCommand, S3ClientConfig } from '@aws-sdk/client-s3';
3+
4+
export type S3StorageClientConfig = S3ClientConfig;
5+
6+
/**
7+
* Returns an array of signed AWS S3 URLs of the unloaded csv files.
8+
*/
9+
export async function extractUnloadedFilesFromS3(
10+
clientOptions: S3StorageClientConfig,
11+
bucketName: string,
12+
prefix: string
13+
): Promise<string[]> {
14+
const storage = new S3(clientOptions);
15+
// It looks that different driver configurations use different formats
16+
// for the bucket - some expect only names, some - full url-like names.
17+
// So we unify this.
18+
bucketName = bucketName.replace(/^[a-zA-Z]+:\/\//, '');
19+
20+
const list = await storage.listObjectsV2({
21+
Bucket: bucketName,
22+
Prefix: prefix,
23+
});
24+
if (list) {
25+
if (!list.Contents) {
26+
return [];
27+
} else {
28+
const csvFiles = await Promise.all(
29+
list.Contents.map(async (file) => {
30+
const command = new GetObjectCommand({
31+
Bucket: bucketName,
32+
Key: file.Key,
33+
});
34+
return getSignedUrl(storage, command, { expiresIn: 3600 });
35+
})
36+
);
37+
return csvFiles;
38+
}
39+
}
40+
41+
throw new Error('Unable to retrieve list of files from S3 storage after unloading.');
42+
}

0 commit comments

Comments
 (0)