Skip to content

Commit 243176e

Browse files
committed
feat: add maxConcurrentDownloads configuration for improved download management
- Introduced a maxConcurrentDownloads option in the configuration to control the number of concurrent downloads (1-10).
- Added validation for maxConcurrentDownloads to ensure it falls within the specified range.
- Updated the scraper context and download logic to use the maxConcurrentDownloads setting, improving download efficiency.
1 parent e2f957b commit 243176e

File tree

1 file changed

+28
-4
lines changed

1 file changed

+28
-4
lines changed

index.ts

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ interface Config {
9191
host?: HostType;
9292
outputDir?: string;
9393
maxPosts?: number;
94+
maxConcurrentDownloads?: number;
9495
proxies?: ProxyConfig[];
9596
proxyRotation?: ProxyRotationMode;
9697
// List of creators to scrape
@@ -113,6 +114,15 @@ async function loadConfig(configPath: string): Promise<Config> {
113114
// Normalize and validate proxy configuration
114115
config.proxies = Array.isArray(config.proxies) ? config.proxies : [];
115116
config.proxyRotation = config.proxyRotation || 'round_robin';
117+
118+
// Validate maxConcurrentDownloads if provided
119+
if (config.maxConcurrentDownloads !== undefined) {
120+
if (typeof config.maxConcurrentDownloads !== 'number' ||
121+
config.maxConcurrentDownloads < 1 ||
122+
config.maxConcurrentDownloads > 10) {
123+
throw new Error('maxConcurrentDownloads must be a number between 1 and 10');
124+
}
125+
}
116126

117127
for (const proxy of config.proxies) {
118128
if (!proxy || typeof proxy !== 'object') {
@@ -200,11 +210,21 @@ const argv = yargs(hideBin(process.argv))
200210
description: 'Maximum number of posts to fetch (0 = unlimited, default: 5000)',
201211
default: 5000,
202212
})
213+
.option('maxConcurrentDownloads', {
214+
alias: 'd',
215+
type: 'number',
216+
description: 'Maximum concurrent downloads (1-10, default: 2)',
217+
default: 2,
218+
})
203219
.check((argv) => {
204220
// Either config file or service+userId must be provided
205221
if (!argv.config && (!argv.service || !argv.userId)) {
206222
throw new Error('Either --config or both --service and --userId must be provided');
207223
}
224+
// Validate maxConcurrentDownloads range
225+
if (argv.maxConcurrentDownloads < 1 || argv.maxConcurrentDownloads > 10) {
226+
throw new Error('maxConcurrentDownloads must be between 1 and 10');
227+
}
208228
return true;
209229
})
210230
.help()
@@ -215,7 +235,6 @@ const argv = yargs(hideBin(process.argv))
215235

216236
// Constants
217237
const PAGE_SIZE = 50;
218-
const MAX_CONCURRENT_DOWNLOADS = 2;
219238
const MAX_DOWNLOAD_RETRIES = 3;
220239
const DOWNLOAD_RETRY_WAIT_SECONDS = 10000;
221240
const REQUEST_TIMEOUT_MS = 120000; // Abort the HTTP request itself if it never responds
@@ -247,6 +266,7 @@ interface ScraperContext {
247266
host: HostType;
248267
outputDir: string;
249268
maxPosts: number;
269+
maxConcurrentDownloads: number;
250270
baseDomain: string;
251271
subdomains: string[];
252272
blacklistFile: string;
@@ -280,6 +300,7 @@ function createScraperContext(
280300
host: HostType,
281301
outputDir: string,
282302
maxPosts: number,
303+
maxConcurrentDownloads: number,
283304
downloadBars: MultiProgressBars,
284305
proxyManager: ProxyManager | null
285306
): ScraperContext {
@@ -292,6 +313,7 @@ function createScraperContext(
292313
host,
293314
outputDir: resolvedOutputDir,
294315
maxPosts,
316+
maxConcurrentDownloads,
295317
baseDomain,
296318
subdomains,
297319
blacklistFile: path.join(resolvedOutputDir, 'blacklist.json'),
@@ -1025,7 +1047,7 @@ interface DownloadResult {
10251047
}
10261048

10271049
async function downloadFiles(ctx: ScraperContext, downloadQueue: DownloadQueueEntry[], totalFiles: number, completedSoFar: number): Promise<DownloadResult> {
1028-
const queue = new AsyncQueue({ limit: MAX_CONCURRENT_DOWNLOADS });
1050+
const queue = new AsyncQueue({ limit: ctx.maxConcurrentDownloads });
10291051
const failedDownloads: DownloadQueueEntry[] = [];
10301052

10311053
return new Promise<DownloadResult>((resolve, reject) => {
@@ -1376,12 +1398,13 @@ async function scrapeCreator(ctx: ScraperContext): Promise<void> {
13761398
const host = creator.host || config.host || (argv.host as HostType);
13771399
const outputDir = creator.outputDir || config.outputDir || argv.outputDir;
13781400
const maxPosts = creator.maxPosts ?? config.maxPosts ?? argv.maxPosts;
1401+
const maxConcurrentDownloads = config.maxConcurrentDownloads ?? argv.maxConcurrentDownloads ?? 2;
13791402

13801403
console.log(chalk.magenta(`\n${'='.repeat(60)}`));
13811404
console.log(chalk.magenta(`[${creatorNum}/${config.creators.length}] Scraping ${service}/${userId}`));
13821405
console.log(chalk.magenta(`${'='.repeat(60)}\n`));
13831406

1384-
const ctx = createScraperContext(service, userId, host, outputDir, maxPosts, downloadBars, proxyManager);
1407+
const ctx = createScraperContext(service, userId, host, outputDir, maxPosts, maxConcurrentDownloads, downloadBars, proxyManager);
13851408

13861409
try {
13871410
await scrapeCreator(ctx);
@@ -1407,9 +1430,10 @@ async function scrapeCreator(ctx: ScraperContext): Promise<void> {
14071430
const host = argv.host as HostType;
14081431
const outputDir = argv.outputDir;
14091432
const maxPosts = argv.maxPosts;
1433+
const maxConcurrentDownloads = argv.maxConcurrentDownloads ?? 2;
14101434
const proxyManager = null;
14111435

1412-
const ctx = createScraperContext(service, userId, host, outputDir, maxPosts, downloadBars, proxyManager);
1436+
const ctx = createScraperContext(service, userId, host, outputDir, maxPosts, maxConcurrentDownloads, downloadBars, proxyManager);
14131437
await scrapeCreator(ctx);
14141438
}
14151439

0 commit comments

Comments (0)