Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ Sources/ContentScopeScripts/dist/
test-results
!Sources/ContentScopeScripts/dist/pages/.gitignore

# Test output files (generated during tests)
injected/unit-test/fixtures/page-context/output/

# Local Netlify folder
.netlify
# VS Code user config
Expand Down
5 changes: 3 additions & 2 deletions injected/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@
},
"type": "module",
"dependencies": {
"@duckduckgo/privacy-configuration": "github:duckduckgo/privacy-configuration#1752154773643",
"esbuild": "^0.25.10",
"minimist": "^1.2.8",
"parse-address": "^1.1.2",
"seedrandom": "^3.0.5",
"sjcl": "^1.0.8",
"@duckduckgo/privacy-configuration": "github:duckduckgo/privacy-configuration#1752154773643",
"esbuild": "^0.25.10",
"urlpattern-polyfill": "^10.1.0"
},
"devDependencies": {
Expand All @@ -43,6 +43,7 @@
"@typescript-eslint/eslint-plugin": "^8.46.0",
"fast-check": "^4.2.0",
"jasmine": "^5.12.0",
"jsdom": "^27.0.0",
"web-ext": "^9.0.0"
}
}
87 changes: 72 additions & 15 deletions injected/src/features/page-context.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { getFaviconList } from './favicon.js';
import { isDuckAi, isBeingFramed, getTabUrl } from '../utils.js';
const MSG_PAGE_CONTEXT_RESPONSE = 'collectionResult';

function checkNodeIsVisible(node) {
export function checkNodeIsVisible(node) {
try {
const style = window.getComputedStyle(node);

Expand Down Expand Up @@ -36,6 +36,29 @@ function isHtmlElement(node) {
* @returns {Document | null}
*/
function getSameOriginIframeDocument(iframe) {
// Pre-check conditions that would prevent access without triggering security errors
const src = iframe.src;

// Skip sandboxed iframes unless they explicitly allow scripts
// Avoids: Blocked script execution in 'about:blank' because the document's frame is sandboxed and the 'allow-scripts' permission is not set.
// Note: iframe.sandbox always returns a DOMTokenList, so check hasAttribute instead
if (iframe.hasAttribute('sandbox') && !iframe.sandbox.contains('allow-scripts')) {
return null;
}

// Check for cross-origin URLs (but allow about:blank and empty src as they inherit parent origin)
if (src && src !== 'about:blank' && src !== '') {
try {
const iframeUrl = new URL(src, window.location.href);
if (iframeUrl.origin !== window.location.origin) {
return null;
}
} catch (e) {
// Invalid URL, skip
return null;
}
}

try {
// Try to access the contentDocument - this will throw if cross-origin
const doc = iframe.contentDocument;
Expand Down Expand Up @@ -76,8 +99,9 @@ function domToMarkdownChildren(childNodes, settings, depth = 0) {
* @typedef {Object} DomToMarkdownSettings
* @property {number} maxLength - Maximum length of content
* @property {number} maxDepth - Maximum depth to traverse
* @property {string} excludeSelectors - CSS selectors to exclude from processing
* @property {string | null} excludeSelectors - CSS selectors to exclude from processing
* @property {boolean} includeIframes - Whether to include iframe content
* @property {boolean} trimBlankLinks - Whether to trim blank links
*/

/**
Expand All @@ -87,7 +111,7 @@ function domToMarkdownChildren(childNodes, settings, depth = 0) {
* @param {number} depth
* @returns {string}
*/
function domToMarkdown(node, settings, depth = 0) {
export function domToMarkdown(node, settings, depth = 0) {
if (depth > settings.maxDepth) {
return '';
}
Expand All @@ -97,7 +121,7 @@ function domToMarkdown(node, settings, depth = 0) {
if (!isHtmlElement(node)) {
return '';
}
if (!checkNodeIsVisible(node) || node.matches(settings.excludeSelectors)) {
if (!checkNodeIsVisible(node) || (settings.excludeSelectors && node.matches(settings.excludeSelectors))) {
return '';
}

Expand Down Expand Up @@ -127,12 +151,15 @@ function domToMarkdown(node, settings, depth = 0) {
return `${children}\n`;
case 'br':
return `\n`;
case 'img':
return `\n![${getAttributeOrBlank(node, 'alt')}](${getAttributeOrBlank(node, 'src')})\n`;
case 'ul':
case 'ol':
return `\n${children}\n`;
case 'li':
return `\n- ${children.trim()}\n`;
return `\n- ${collapseAndTrim(children)}\n`;
case 'a':
return getLinkText(node);
return getLinkText(node, children, settings);
case 'iframe': {
if (!settings.includeIframes) {
return children;
Expand All @@ -151,13 +178,30 @@ function domToMarkdown(node, settings, depth = 0) {
}
}

/**
 * Read an attribute from an element, returning its value with leading and
 * trailing whitespace removed. An absent attribute yields an empty string.
 *
 * @param {Element} node - Element whose attribute is read
 * @param {string} attr - Name of the attribute to read
 * @returns {string} The trimmed attribute value, or '' when it is not set
 */
function getAttributeOrBlank(node, attr) {
    const raw = node.getAttribute(attr);
    // getAttribute reports a missing attribute as null; treat any nullish value as blank.
    if (raw === null || raw === undefined) {
        return '';
    }
    return raw.trim();
}

/**
 * Pass a string through collapseWhitespace, then strip leading and trailing
 * whitespace from the result.
 *
 * @param {string} str - Text to normalise
 * @returns {string} The collapsed text without surrounding whitespace
 */
function collapseAndTrim(str) {
    const collapsed = collapseWhitespace(str);
    return collapsed.trim();
}

function getLinkText(node) {
function getLinkText(node, children, settings) {
const href = node.getAttribute('href');
return href ? `[${collapseAndTrim(node.textContent)}](${href})` : collapseWhitespace(node.textContent);
const trimmedContent = collapseAndTrim(children);
if (settings.trimBlankLinks && trimmedContent.length === 0) {
return '';
}
// The difference in whitespace handling is intentional here.
// Where we don't wrap in a link:
// we should retain at least one preceding and following space.
return href ? `[${trimmedContent}](${href})` : collapseWhitespace(children);
}

export default class PageContext extends ContentFeature {
Expand Down Expand Up @@ -420,6 +464,7 @@ export default class PageContext extends ContentFeature {
const maxDepth = this.getFeatureSetting('maxDepth') || 5000;
let excludeSelectors = this.getFeatureSetting('excludeSelectors') || ['.ad', '.sidebar', '.footer', '.nav', '.header'];
const excludedInertElements = this.getFeatureSetting('excludedInertElements') || [
'img', // Note we're currently disabling images which we're handling in domToMarkdown (this can be per-site enabled in the config if needed).
'script',
'style',
'link',
Expand All @@ -436,22 +481,34 @@ export default class PageContext extends ContentFeature {
const mainContentSelector = this.getFeatureSetting('mainContentSelector') || 'main, article, .content, .main, #content, #main';
let mainContent = document.querySelector(mainContentSelector);
const mainContentLength = this.getFeatureSetting('mainContentLength') || 100;
// Fast path to avoid processing main content if it's too short
if (mainContent && mainContent.innerHTML.trim().length <= mainContentLength) {
mainContent = null;
}
const contentRoot = mainContent || document.body;
let contentRoot = mainContent || document.body;

if (contentRoot) {
this.log.info('Getting main content', contentRoot);
content += domToMarkdown(contentRoot, {
// Use a closure to reuse the domToMarkdown parameters
const extractContent = (root) => {
this.log.info('Getting content', root);
const result = domToMarkdown(root, {
maxLength: upperLimit,
maxDepth,
includeIframes: this.getFeatureSettingEnabled('includeIframes', 'enabled'),
excludeSelectors: excludeSelectorsString,
});
this.log.info('Content markdown', content, contentRoot);
trimBlankLinks: this.getFeatureSettingEnabled('trimBlankLinks', 'enabled'),
}).trim();
this.log.info('Content markdown', result, root);
return result;
};

if (contentRoot) {
content += extractContent(contentRoot);
}
// If the main content is empty, use the body
if (content.length === 0 && contentRoot !== document.body && this.getFeatureSettingEnabled('bodyFallback', 'enabled')) {
contentRoot = document.body;
content += extractContent(contentRoot);
}
content = content.trim();

// Store the full content length before truncation
this.fullContentLength = content.length;
Expand Down
53 changes: 53 additions & 0 deletions injected/unit-test/fixtures/page-context/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Page Context DOM-to-Markdown Tests

This directory contains test fixtures for testing the `domToMarkdown` function from `page-context.js`.

## Directory Structure

- `output/` - Generated markdown files from test runs (temporary, git-ignored, regenerated on each run)
- `expected/` - Expected markdown output files (committed to git)

## How It Works

The test suite (`page-context-dom.spec.js`) does the following:

1. **Creates test cases** with HTML snippets and settings for `domToMarkdown`
2. **Converts HTML to Markdown** using jsdom to simulate a browser environment
3. **Writes output** to the `output/` directory for inspection
4. **Compares output** with the expected files in the `expected/` directory
5. **Fails if different** - Any difference between output and expected causes test failure

## Test Cases

The suite includes 20 test cases covering:

- Basic HTML elements (paragraphs, headings, lists, links, images)
- Formatting (bold, italic, mixed formatting)
- Complex structures (nested lists, articles, blog posts)
- Edge cases (hidden content, empty links, whitespace handling)
- Configuration options (max length truncation, excluded selectors, trim blank links)

## Updating Expected Output

When the `domToMarkdown` function behavior changes:

1. Review the changes in the `output/` directory
2. If changes are correct, copy them to `expected/`:
```bash
cp unit-test/fixtures/page-context/output/*.md unit-test/fixtures/page-context/expected/
```
3. Commit the updated expected files

## Running Tests

```bash
npm run test-unit -- unit-test/page-context-dom.spec.js
```

## Why This Approach?

- **Visibility**: Output files make it easy to review markdown generation
- **Regression detection**: Tests fail on any unintended changes
- **Documentation**: Expected files serve as examples of the function's behavior
- **Easy updates**: Simple to update baselines when behavior intentionally changes

Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Article Title
By **Author Name**
This is the introduction paragraph with some *emphasis*.

## First Section
Content of the first section.


- Point one

- Point two


## Second Section
Content with a [link](https://example.com).
16 changes: 16 additions & 0 deletions injected/unit-test/fixtures/page-context/expected/blog-post.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Blog Post Title
Published on January 1, 2024

![Header image](header.jpg)
Lorem ipsum dolor sit amet, consectetur adipiscing elit.

## Key Takeaways


- First takeaway

- Second takeaway

- Third takeaway

Read more on [our blog](https://blog.example.com).
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is **bold** and this is *italic*.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Article Title
Introduction paragraph.

## Section 1
Section content with **bold** text.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[](https://example.com)
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Keep this
Keep this too
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Main Heading

## Subheading

### Sub-subheading
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Visible text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
![A beautiful landscape](photo.jpg)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
First line
Second line
Third line
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Visit [our website](https://example.com) for more info.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a very long paragraph ...
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This has ***bold and italic*** together.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
First paragraph.
Second paragraph.
Third paragraph.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- Item 1 - Subitem 1.1 - Subitem 1.2

- Item 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- First step

- Second step

- Third step
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a simple paragraph.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- First item

- Second item

- Third item
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Text with multiple spaces
Loading
Loading