Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1244,7 +1244,9 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
if (items.length === 0) {
value = '';
} else {
const keys = Object.keys(items[0]);
const keys = options?.collectAllKeys
? Array.from(new Set(items.flatMap(Object.keys)))
: Object.keys(items[0]);

value = stringify([
keys,
Expand Down
13 changes: 7 additions & 6 deletions packages/core/src/storages/dataset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,13 @@ export interface DatasetDataOptions {
skipEmpty?: boolean;
}

export interface DatasetExportOptions extends Omit<DatasetDataOptions, 'offset' | 'limit'> {}
/**
* Options for exporting dataset items. Inherits every `DatasetDataOptions`
* field except `offset` and `limit`, and adds control over the CSV header.
*/
export interface DatasetExportOptions extends Omit<DatasetDataOptions, 'offset' | 'limit'> {
/**
* Selects which keys make up the CSV export header.
*
* If true, the header contains every unique key found across all dataset items.
* If omitted or false, only keys from the first item are used, so a key that
* first appears in a later item gets no header column.
*/
collectAllKeys?: boolean;
}

export interface DatasetIteratorOptions
extends Omit<DatasetDataOptions, 'offset' | 'limit' | 'clean' | 'skipHidden' | 'skipEmpty'> {
Expand Down Expand Up @@ -170,11 +176,6 @@ export interface DatasetIteratorOptions
/**
* Export options extended with an optional source (`fromDataset`) and
* destination (`toKVS`) name.
*
* NOTE(review): presumably `fromDataset` names the dataset to read and `toKVS`
* names the key-value store that receives the export — confirm against the
* export implementation, which is not visible here.
*/
export interface DatasetExportToOptions extends DatasetExportOptions {
// NOTE(review): looks like the name of the dataset to export from — confirm.
fromDataset?: string;
// NOTE(review): looks like the name of the target key-value store — confirm.
toKVS?: string;
/**
* Selects which keys are included in the CSV export.
*
* If true, includes all unique keys from all dataset items.
* If omitted or false, only keys from the first item are used.
*/
collectAllKeys?: boolean;
}

/**
Expand Down
13 changes: 13 additions & 0 deletions test/core/crawlers/basic_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1745,6 +1745,19 @@ describe('BasicCrawler', () => {
expect((await crawler.getData()).items).toEqual(payload);
});

test('crawler.exportData works with `collectAllKeys`', async () => {
    // Two items sharing only `foo`: the header has to merge both key sets,
    // and each row leaves the column it lacks empty.
    const crawler = new BasicCrawler();
    await crawler.pushData([{ foo: 'bar', baz: 123 }]);
    await crawler.pushData([{ foo: 'baz', qux: 456 }]);

    const exportPath = `${tmpDir}/result.csv`;
    await crawler.exportData(exportPath, 'csv', { collectAllKeys: true });

    const exported = await readFile(exportPath);
    const expectedCsv = 'foo,baz,qux\nbar,123,\nbaz,,456\n';
    expect(exported.toString()).toBe(expectedCsv);

    // Remove the exported file so later tests start from a clean directory.
    await rm(exportPath);
});

test("Crawlers with different Configurations don't share Datasets", async () => {
const crawlerA = new BasicCrawler({}, new Configuration({ persistStorage: false }));
const crawlerB = new BasicCrawler({}, new Configuration({ persistStorage: false }));
Expand Down