Skip to content

Commit de81ee6

Browse files
committed
feat: add excludedColumns functionality
Signed-off-by: Tomáš Dvořák <toomas2d@gmail.com>
1 parent 1b2e0f6 commit de81ee6

File tree

7 files changed

+131
-27
lines changed

7 files changed

+131
-27
lines changed

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ interface ParserSettings {
6868
colFilter?: (elText: string[], index: number) => string; // (default: (txt: string[]) => txt.join(' '))
6969
colParser?: (value: string, formattedIndex: number, getColumnIndex: GetColumnIndexType) => string; // (default: (txt: string) => txt.trim())
7070
optionalColNames?: string[]; // (default: [])
71+
excludedColumns?: (rows: string[][], getColumnIndex: GetColumnIndexType) => string[]; // (default: undefined)
7172
};
7273
```
7374

@@ -82,8 +83,9 @@ interface ParserSettings {
8283
7. Run `rowTransform` function for each row.
8384
8. Group results into buckets (by the `groupBy.cols` property) and pick the aggregated rows.
8485
9. Add the processed row to a temporary result array.
85-
10. Add `header` column if `withHeader` property is `true`.
86-
11. Merge partial results and return them.
86+
10. Run `excludedColumns` and remove the columns it returns from the rows and the `header`.
87+
11. Add `header` row (if `withHeader` property is `true`).
88+
12. Merge partial results and return them.
8789

8890
## Examples
8991

src/helpers.ts

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,10 @@ export async function getColumnsInfo(
7777
const excludedKeyIndexes: number[] = [];
7878
const colKeyToIndexWithExcluded = new Map<string, number>();
7979
const colIndexToKeyWithExcluded = new Map<number, string>();
80-
extraColsMapper(allowedColNamesKeys, 'colName').forEach((key, index) => {
80+
81+
const allColumnKeys = extraColsMapper(allowedColNamesKeys, 'colName');
82+
83+
allColumnKeys.forEach((key, index) => {
8184
colKeyToIndexWithExcluded.set(key, index);
8285
colIndexToKeyWithExcluded.set(index, key);
8386

@@ -99,13 +102,19 @@ export async function getColumnsInfo(
99102
return index;
100103
};
101104

102-
const getColumnName = (colIndex: number) => {
103-
const value = colIndexToKeyWithExcluded.get(colIndex);
104-
if (value === undefined) {
105-
throw new InvalidColumnError(`Column with index '${colIndex}' does not exist!`);
106-
}
105+
const updateExcludedColumns = (colNames: string[]) => {
106+
colNames.forEach((colName) => {
107+
const index = getColumnIndex(colName);
108+
if (!excludedKeyIndexes.includes(index)) {
109+
excludedKeyIndexes.push(index);
110+
}
111+
});
112+
};
107113

108-
return value;
114+
const getOutputColumnKeys = () => {
115+
return allColumnKeys
116+
.filter((_, index) => !excludedKeyIndexes.includes(index))
117+
.map((colName) => settings.allowedColNames[colName] || colName);
109118
};
110119

111120
return {
@@ -115,7 +124,8 @@ export async function getColumnsInfo(
115124
},
116125
missingColNames: Object.values(missingColNames),
117126
getColumnIndex,
118-
getColumnName,
127+
updateExcludedColumns,
128+
getOutputColumnKeys,
119129
};
120130
}
121131

src/parseTable.ts

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,8 @@ export function parseTableFactory(settings: FullParserSettings) {
104104
return [];
105105
}
106106

107-
const { indexes, getColumnIndex, getColumnName, missingColNames } = await getColumnsInfo(
108-
settings,
109-
header.el,
110-
extraColsMapper,
111-
);
107+
const { indexes, getColumnIndex, missingColNames, updateExcludedColumns, getOutputColumnKeys } =
108+
await getColumnsInfo(settings, header.el, extraColsMapper);
112109

113110
let parsedRows = await new PipelineExecutor<string[][], string[][]>(
114111
await getRowsData(table, header.bodyRowsOffset, indexes.allowed),
@@ -124,16 +121,26 @@ export function parseTableFactory(settings: FullParserSettings) {
124121
parsedRows = groupBy(parsedRows, settings.groupBy, getColumnIndex);
125122
}
126123

124+
if (settings.excludedColumns) {
125+
const excludedColNames = settings.excludedColumns(parsedRows, getColumnIndex) ?? [];
126+
updateExcludedColumns(excludedColNames);
127+
}
128+
127129
if (addHeader) {
128130
const headerRow = getOutputHeaderRow(missingColNames);
129131
parsedRows.unshift(headerRow);
130132
}
131133

132134
const rowOutputMapper = settings.rowValuesAsObject
133-
? mappers.asObject(getColumnName)
135+
? mappers.asObject(
136+
(() => {
137+
const outputColumnKeys = getOutputColumnKeys();
138+
return (index) => outputColumnKeys[index];
139+
})(),
140+
)
134141
: settings.rowValuesAsArray
135-
? mappers.asArray()
136-
: mappers.asCsv(settings.csvSeparator);
142+
? mappers.asArray()
143+
: mappers.asCsv(settings.csvSeparator);
137144

138145
return new PipelineExecutor<string[][], ReturnType<typeof rowOutputMapper>[]>(parsedRows)
139146
.addMap((row) => row.filter((_, index) => !indexes.excluded.includes(index)))

src/settings.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ export const defaultSettings: ParserSettingsOptional = {
2828
headerRowsCellSelector: 'td,th',
2929
bodyRowsSelector: 'tbody tr',
3030
bodyRowsCellSelector: 'td',
31+
excludedColumns: undefined,
3132
};
3233

3334
export function preprocessSettings(options: ParserSettings): FullParserSettings {

src/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ export type ParserSettingsOptional = {
5656
headerRowsCellSelector: string;
5757
bodyRowsSelector: string;
5858
bodyRowsCellSelector: string;
59+
excludedColumns: (rows: string[][], getColumnIndex: GetColumnIndexType) => string[];
5960
};
6061

6162
export interface ParserSettings extends Partial<ParserSettingsOptional> {

test/assets/3.html

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<title>Example #3</title>
5+
</head>
6+
<body>
7+
8+
<table id="table-overview">
9+
<thead>
10+
<tr>
11+
<th>A</th>
12+
<th><input type="checkbox" checked></th>
13+
<th>C</th>
14+
</tr>
15+
</thead>
16+
<tbody>
17+
<tr>
18+
<td>A1</td>
19+
<td>B1</td>
20+
<td><img src='#' alt='image'>C1</td>
21+
</tr>
22+
<tr>
23+
<td><a href='#'>A1</a></td>
24+
<td><input type="checkbox" checked></td>
25+
<td>C1</td>
26+
</tr>
27+
</tbody>
28+
</table>
29+
30+
</body>
31+
</html>

test/index.test.ts

Lines changed: 61 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@ import { createServer, getBaseUrl } from './createServer';
44
import { Browser, launch, Page } from 'puppeteer';
55
import { tableParser, RowValidationPolicy } from '../src';
66

7-
jest.setTimeout(60 * 1000 * 1000);
8-
97
describe('Basic parsing', () => {
108
let server: Server;
119
let browser: Browser;
@@ -14,7 +12,7 @@ describe('Basic parsing', () => {
1412
beforeAll(async () => {
1513
server = await createServer();
1614
browser = await launch({
17-
headless: 'new',
15+
headless: true,
1816
});
1917
page = await browser.newPage();
2018
});
@@ -137,8 +135,8 @@ describe('Basic parsing', () => {
137135
`);
138136
});
139137

140-
it('Throw error with invalid options', () => {
141-
expect(
138+
it('Throw error with invalid options', async () => {
139+
await expect(
142140
tableParser(page, {
143141
selector: 'table',
144142
asArray: false,
@@ -150,7 +148,7 @@ describe('Basic parsing', () => {
150148
}),
151149
).rejects.toThrowError();
152150

153-
expect(
151+
await expect(
154152
tableParser(page, {
155153
selector: 'table',
156154
// @ts-expect-error intended
@@ -163,7 +161,7 @@ describe('Basic parsing', () => {
163161
}),
164162
).rejects.toThrowError();
165163

166-
expect(
164+
await expect(
167165
tableParser(page, {
168166
selector: 'table',
169167
rowValuesAsObject: false,
@@ -176,8 +174,8 @@ describe('Basic parsing', () => {
176174
).rejects.toThrowError();
177175
});
178176

179-
it('Throw error when specified optional column which does not exists', () => {
180-
expect(
177+
it('Throw error when specified optional column which does not exists', async () => {
178+
await expect(
181179
tableParser(page, {
182180
selector: 'table',
183181
allowedColNames: {
@@ -553,4 +551,58 @@ describe('Basic parsing', () => {
553551
expect(data.length).toBe(1);
554552
expect(data[0]).toBe('name,age');
555553
});
554+
555+
it('Handles non-text elements', async () => {
556+
await page.goto(`${getBaseUrl()}/3.html`);
557+
558+
const data = await tableParser(page, {
559+
selector: '#table-overview',
560+
asArray: false,
561+
allowedColNames: {
562+
'A': 'A',
563+
'': 'B',
564+
'C': 'C',
565+
},
566+
colParser: (value, x) => {
567+
console.info({ value, x });
568+
return value.trim();
569+
},
570+
});
571+
572+
expect(data).toMatchInlineSnapshot(`
573+
"A;B;C
574+
A1;B1;C1
575+
A1;;C1"
576+
`);
577+
});
578+
579+
it('Exclude columns', async () => {
580+
await page.goto(`${getBaseUrl()}/1.html`);
581+
582+
const data = await tableParser(page, {
583+
selector: 'table',
584+
allowedColNames: {
585+
'Car Name': 'car',
586+
'Horse Powers': 'hp',
587+
'Manufacture Year': 'year',
588+
},
589+
optionalColNames: ['year'],
590+
rowValidationPolicy: RowValidationPolicy.EXACT_MATCH,
591+
colParser: (value) => value.trim(),
592+
excludedColumns: (rows, getColumnIndex) => {
593+
expect(rows.length).toBe(4);
594+
expect(getColumnIndex('car')).toBe(0);
595+
expect(getColumnIndex('hp')).toBe(1);
596+
expect(getColumnIndex('year')).toBe(2);
597+
return ['year'];
598+
},
599+
});
600+
601+
expect(data).toMatchInlineSnapshot(`
602+
"car;hp
603+
Audi S5;332
604+
Alfa Romeo Giulia;500
605+
BMW X3;215
606+
Skoda Octavia;120"`);
607+
});
556608
});

0 commit comments

Comments
 (0)