Skip to content

Commit 39004b6

Browse files
committed
feat: Enhance nested foreach loops by supporting custom index placeholders and preserving parent context in results.
1 parent b75ee13 commit 39004b6

File tree

6 files changed

+254
-26
lines changed

6 files changed

+254
-26
lines changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "stepwright",
3-
"version": "1.0.8",
3+
"version": "1.0.9",
44
"description": "A powerful web scraping library built with Playwright",
55
"main": "dist/index.js",
66
"types": "dist/index.d.ts",

src/step-executor.ts

Lines changed: 72 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,17 @@ export async function executeStep(
141141
case 'click': {
142142
try {
143143
// Check if element exists first
144-
const locator = locatorFor(page, step.object_type as SelectorType | undefined, step.object ?? '');
144+
const locator = scopeLocator
145+
? locatorFor(
146+
scopeLocator as any,
147+
step.object_type as SelectorType | undefined,
148+
step.object ?? ''
149+
)
150+
: locatorFor(
151+
page,
152+
step.object_type as SelectorType | undefined,
153+
step.object ?? ''
154+
);
145155
const count = await locator.count();
146156

147157
if (count === 0) {
@@ -267,7 +277,17 @@ export async function executeStep(
267277
if (!step.subSteps || step.subSteps.length === 0) {
268278
throw new Error('foreach step requires subSteps');
269279
}
270-
const locatorAll = locatorFor(page, step.object_type as SelectorType | undefined, step.object);
280+
const locatorAll = scopeLocator
281+
? locatorFor(
282+
scopeLocator as any,
283+
step.object_type as SelectorType | undefined,
284+
step.object
285+
)
286+
: locatorFor(
287+
page,
288+
step.object_type as SelectorType | undefined,
289+
step.object
290+
);
271291
try {
272292
await locatorAll.first().waitFor({ state: 'attached', timeout: step.wait ?? 5000 });
273293
} catch {}
@@ -283,11 +303,20 @@ export async function executeStep(
283303
}
284304

285305
// Create a separate collector for each iteration
306+
// Initialize with parent context (non-item_* keys) to preserve metadata
286307
const itemCollector: Record<string, any> = {};
308+
Object.keys(collector).forEach(k => {
309+
if (!k.startsWith('item_')) {
310+
itemCollector[k] = collector[k];
311+
}
312+
});
313+
314+
// Use the specified index key or default to 'i'
315+
const indexKey = step.index_key || 'i';
287316

288317
// For each subStep clone and replace placeholders
289318
for (const s of step.subSteps) {
290-
const cloned = cloneStepWithIndex(s, idx);
319+
const cloned = cloneStepWithIndex(s, idx, indexKey);
291320
try {
292321
await executeStep(page, cloned, itemCollector, onResult, current);
293322
} catch (err: any) {
@@ -307,8 +336,6 @@ export async function executeStep(
307336
);
308337

309338
// Emit the result immediately for streaming
310-
// We need to access the onResult callback from the parent context
311-
// This is a bit of a hack, but it works for immediate streaming
312339
if ((global as any).onResultCallback) {
313340
try {
314341
const flattenedResult = flattenNestedForeachResults(itemCollector);
@@ -329,7 +356,17 @@ export async function executeStep(
329356

330357
try {
331358
// locate link and check if it exists
332-
const linkLoc = locatorFor(page, step.object_type as SelectorType | undefined, step.object);
359+
const linkLoc = scopeLocator
360+
? locatorFor(
361+
scopeLocator as any,
362+
step.object_type as SelectorType | undefined,
363+
step.object
364+
)
365+
: locatorFor(
366+
page,
367+
step.object_type as SelectorType | undefined,
368+
step.object
369+
);
333370
const count = await linkLoc.count();
334371

335372
if (count === 0) {
@@ -365,8 +402,13 @@ export async function executeStep(
365402
await newPage.waitForLoadState('networkidle');
366403
}
367404

368-
// Pass the parent collector data to subSteps so they can access meeting_title, meeting_date, etc.
369-
const innerCollected: Record<string, any> = { ...collector };
405+
// Pass only the parent context (non-item keys) to subSteps
406+
const innerCollected: Record<string, any> = {};
407+
Object.keys(collector).forEach(k => {
408+
if (!k.startsWith('item_')) {
409+
innerCollected[k] = collector[k];
410+
}
411+
});
370412
for (const s of step.subSteps) {
371413
const cloned: BaseStep = { ...s };
372414
try {
@@ -673,7 +715,17 @@ export async function executeStep(
673715
// If object is provided, click it first (for backward compatibility)
674716
if (step.object) {
675717
try {
676-
const locator = locatorFor(page, step.object_type as SelectorType | undefined, step.object);
718+
const locator = scopeLocator
719+
? locatorFor(
720+
scopeLocator as any,
721+
step.object_type as SelectorType | undefined,
722+
step.object
723+
)
724+
: locatorFor(
725+
page,
726+
step.object_type as SelectorType | undefined,
727+
step.object
728+
);
677729
const count = await locator.count();
678730

679731
if (count > 0) {
@@ -743,11 +795,17 @@ export async function executeStep(
743795

744796
try {
745797
// 1) Find the link and get its href
746-
const link = locatorFor(
747-
page,
748-
step.object_type as SelectorType | undefined,
749-
step.object
750-
);
798+
const link = scopeLocator
799+
? locatorFor(
800+
scopeLocator as any,
801+
step.object_type as SelectorType | undefined,
802+
step.object
803+
)
804+
: locatorFor(
805+
page,
806+
step.object_type as SelectorType | undefined,
807+
step.object
808+
);
751809
const present = (await link.count()) > 0;
752810
if (!present) {
753811
console.log(` ⚠️ PDF link not found: ${step.object}`);

src/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ export interface BaseStep {
1919
terminateonerror?: boolean;
2020
subSteps?: BaseStep[]; // for foreach/open
2121
autoScroll?: boolean; // for foreach action - controls automatic scrolling (default: true)
22+
index_key?: string; // custom index placeholder for loops (e.g., 'j', 'k')
2223
}
2324

2425
export interface PaginationConfig {

src/utils.ts

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@ import { SelectorType } from './types';
1818
* @since v1.0.0
1919
* @company Framework Island
2020
*/
21-
/**
 * Substitutes a loop's index placeholders in a selector/value template.
 *
 * `{{<char>}}` becomes the zero-based index and `{{<char>_plus1}}` becomes the
 * one-based index; whitespace inside the braces is tolerated. Placeholders that
 * use a different index name are left untouched so another loop level can
 * resolve them with its own index.
 *
 * @param text - Template to process; `undefined` and `''` are returned as-is.
 * @param i - Zero-based iteration index.
 * @param char - Placeholder name to replace (defaults to `'i'`). It is matched
 *   literally, never interpreted as a regular-expression fragment.
 * @returns The template with this loop's placeholders replaced.
 */
export function replaceIndexPlaceholders(text: string | undefined, i: number, char: string = 'i'): string | undefined {
  if (!text) return text;
  // The placeholder name originates from a step definition (`index_key`), so
  // escape regex metacharacters before embedding it in a pattern. Without this
  // a name such as `$idx` never matches and `a.b` matches unrelated text.
  const name = char.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const regex = new RegExp(`\\{\\{\\s*${name}\\s*\\}\\}`, 'g');
  const plus1Regex = new RegExp(`\\{\\{\\s*${name}_plus1\\s*\\}\\}`, 'g');
  return text.replace(regex, i.toString()).replace(plus1Regex, (i + 1).toString());
}
2527

2628
/**
@@ -97,13 +99,13 @@ export function locatorFor(page: Page, type: SelectorType | undefined, selector:
9799
* @since v1.0.0
98100
* @company Framework Island
99101
*/
100-
/**
 * Returns a copy of `step` with this loop's index placeholders resolved.
 *
 * The `object`, `value` and `key` fields are run through
 * `replaceIndexPlaceholders`; every other field is copied shallowly. When the
 * step has sub-steps they are cloned recursively with the same index and
 * placeholder name. The input step is not modified.
 *
 * @param step - Step template to copy.
 * @param idx - Zero-based iteration index to substitute.
 * @param char - Placeholder name to replace (defaults to `'i'`).
 * @returns The copied step.
 */
export function cloneStepWithIndex(step: import('./types').BaseStep, idx: number, char: string = 'i'): import('./types').BaseStep {
  const fill = (template: string | undefined) => replaceIndexPlaceholders(template, idx, char);

  const copy: import('./types').BaseStep = { ...step };
  copy.object = fill(copy.object);
  copy.value = fill(copy.value);
  copy.key = fill(copy.key);

  const children = copy.subSteps;
  if (children && children.length > 0) {
    // Nested steps see the same index under the same placeholder name.
    copy.subSteps = children.map((child) => cloneStepWithIndex(child, idx, char));
  }
  return copy;
}
@@ -121,23 +123,34 @@ export function cloneStepWithIndex(step: import('./types').BaseStep, idx: number
121123
* @company Framework Island
122124
*/
123125
/**
 * Flattens the per-iteration results of a nested foreach into an array.
 *
 * Keys of the form `item_<n>` are treated as iteration results; every other
 * key is treated as parent context. Each non-empty iteration result is
 * returned as `{ ...context, ...iterationResult }` (iteration values win on
 * key clashes), ordered by `<n>` ascending. When `item` holds no iteration
 * keys it is returned unchanged (same reference).
 *
 * @param item - Collector produced for one foreach iteration.
 * @returns An array of merged records, or `item` itself when there is nothing
 *   to flatten.
 */
export function flattenNestedForeachResults(item: Record<string, any>): any {
  // Only `item_<digits>` marks an iteration slot. A looser `item_` prefix test
  // would misclassify ordinary scraped fields such as `item_name`.
  const iterationKey = /^item_(\d+)$/;

  const context: Record<string, any> = {};
  const iterations: Array<{ index: number; value: any }> = [];
  for (const [key, value] of Object.entries(item)) {
    const match = iterationKey.exec(key);
    if (match) {
      iterations.push({ index: Number(match[1]), value });
    } else {
      context[key] = value;
    }
  }

  // No nested items: the item already carries its own context.
  if (iterations.length === 0) {
    return item;
  }

  // Numeric order, so that item_10 sorts after item_2.
  iterations.sort((a, b) => a.index - b.index);

  const flattenedItems: any[] = [];
  for (const { value } of iterations) {
    // Skip empty or non-object slots; spreading a primitive would produce junk.
    if (value !== null && typeof value === 'object' && Object.keys(value).length > 0) {
      flattenedItems.push({ ...context, ...value });
    }
  }
  return flattenedItems;
}

test/nested-loop.html

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<!DOCTYPE html>
<!--
  Fixture for the nested foreach tests: two ".parent" containers, each with an
  <h1> title and one or more ".child" entries (two in the first, one in the second).
-->
<html lang="en">
<head>
<meta charset="utf-8">
<title>Nested Loop Test</title>
</head>
<body>
<div class="parent">
<h1>Meeting 1</h1>
<div class="child">Attachment 1.1</div>
<div class="child">Attachment 1.2</div>
</div>
<div class="parent">
<h1>Meeting 2</h1>
<div class="child">Attachment 2.1</div>
</div>
</body>
</html>

test/nested-loop.test.ts

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { getBrowser, navigate } from '../src/scraper';
import { executeStep } from '../src/step-executor';
import { cloneStepWithIndex } from '../src/utils';
import { Browser, Page } from 'playwright';
import path from 'path';
import { pathToFileURL } from 'url';
import { BaseStep } from '../src/types';

/**
 * Tests for nested `foreach` steps: custom index placeholders (`index_key`)
 * and propagation of parent context into inner iteration results.
 */
describe('Nested Foreach Loops', () => {
  let browser: Browser | undefined;
  let page: Page;
  const testPagePath = path.join(__dirname, 'nested-loop.html');
  // pathToFileURL handles Windows drive letters and percent-encoding, which a
  // hand-built `file://${path}` string does not.
  const testPageUrl = pathToFileURL(testPagePath).href;

  beforeAll(async () => {
    browser = await getBrowser({ headless: true });
    page = await browser.newPage();
  });

  afterAll(async () => {
    // Guard against a failed launch so teardown does not mask the real error.
    await browser?.close();
  });

  it('should support nested loops and preserve parent context', async () => {
    await navigate(page, testPageUrl);

    const collector: Record<string, any> = {};
    const nestedStep: BaseStep = {
      id: 'loopOuter',
      action: 'foreach',
      object_type: 'xpath',
      object: "//div[@class='parent']",
      index_key: 'i',
      subSteps: [
        {
          id: 'meeting_title',
          action: 'data',
          object_type: 'xpath',
          object: ".//h1",
          key: 'meeting_title',
          data_type: 'text'
        },
        {
          id: 'loopInner',
          action: 'foreach',
          object_type: 'xpath',
          object: ".//div[@class='child']",
          index_key: 'j',
          subSteps: [
            {
              id: 'attachment_name',
              action: 'data',
              object_type: 'xpath',
              object: ".",
              key: 'attachment_name',
              data_type: 'text'
            }
          ]
        }
      ]
    };

    await executeStep(page, nestedStep, collector);

    // The root collector holds one entry per outer iteration.
    expect(collector.item_0).toBeDefined();
    expect(collector.item_1).toBeDefined();

    // Each outer entry holds its own data plus one entry per inner iteration.
    expect(collector.item_0.meeting_title).toBe('Meeting 1');
    expect(collector.item_0.item_0.attachment_name).toBe('Attachment 1.1');
    expect(collector.item_0.item_1.attachment_name).toBe('Attachment 1.2');

    // Parent context (meeting_title) must be copied into the inner items.
    expect(collector.item_0.item_0.meeting_title).toBe('Meeting 1');
    expect(collector.item_0.item_1.meeting_title).toBe('Meeting 1');

    // executeStep does not flatten the root collector, so the second meeting
    // is checked through the same nested structure.
    expect(collector.item_1.meeting_title).toBe('Meeting 2');
    expect(collector.item_1.item_0.attachment_name).toBe('Attachment 2.1');
    expect(collector.item_1.item_0.meeting_title).toBe('Meeting 2');
  });

  it('should correctly handle unique index placeholders {{i}} and {{j}}', () => {
    // The target elements do not exist in the fixture, so this test checks
    // placeholder substitution directly instead of running the steps; no
    // browser navigation is needed.
    const innerLoopStep: BaseStep = {
      id: 'loopInner',
      action: 'foreach',
      object_type: 'xpath',
      object: ".//div[@class='child']",
      index_key: 'j',
      subSteps: [
        {
          id: 'combined',
          action: 'data',
          object_type: 'xpath',
          // Uses both indices in one locator (hypothetical).
          object: "//div[@id='p{{i}}c{{j}}']",
          key: 'val',
          data_type: 'text'
        }
      ]
    };

    // Outer loop, idx 0: only {{i}} is resolved; {{j}} must survive for the inner loop.
    const innerLoop = cloneStepWithIndex(innerLoopStep, 0, 'i');
    const dataStep = innerLoop.subSteps![0];
    expect(dataStep.object).toBe("//div[@id='p0c{{j}}']");

    // Inner loop, idx 5: the remaining {{j}} is resolved.
    const innerCloned = cloneStepWithIndex(dataStep, 5, 'j');
    expect(innerCloned.object).toBe("//div[@id='p0c5']");
  });
});

0 commit comments

Comments
 (0)