Skip to content

Commit 8380c04

Browse files
authored
Merge pull request #259 from nickmorri/nickmorri/handle-invalid-json-parsing
fix: Add `OpenGraphScraperOptions.jsonLDOptions.throwOnJSONParseError` and change default behavior to not throw on JSON-LD string parse errors
2 parents 686fcc5 + a55a6e1 commit 8380c04

File tree

4 files changed

+119
-11
lines changed

4 files changed

+119
-11
lines changed

README.md

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -54,16 +54,17 @@ Check the return for a ```success``` flag. If success is set to true, then the u
5454

5555
## Options
5656

57-
| Name | Info | Default Value | Required |
58-
|----------------------|----------------------------------------------------------------------------|---------------|----------|
59-
| url | URL of the site. | | x |
60-
| html | You can pass in an HTML string to run ogs on it. (use without options.url) | | |
61-
| fetchOptions | Options that are used by the Fetch API | {} | |
62-
| timeout | Request timeout for Fetch (Default is 10 seconds) | 10 | |
63-
| blacklist | Pass in an array of sites you don't want ogs to run on. | [] | |
64-
| onlyGetOpenGraphInfo | Only fetch open graph info and don't fall back on anything else. Also accepts an array of properties for which no fallback should be used | false | |
65-
| customMetaTags | Here you can define custom meta tags you want to scrape. | [] | |
66-
| urlValidatorSettings | Sets the options used by validator.js for testing the URL | [Here](https://github.com/jshemas/openGraphScraper/blob/master/lib/utils.ts#L4-L17) | |
57+
| Name | Info | Default Value | Required |
58+
|----------------------|-------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------|----------|
59+
| url | URL of the site. | | x |
60+
| html | You can pass in an HTML string to run ogs on it. (use without options.url) | | |
61+
| fetchOptions | Options that are used by the Fetch API | {} | |
62+
| timeout | Request timeout for Fetch (Default is 10 seconds) | 10 | |
63+
| blacklist | Pass in an array of sites you don't want ogs to run on. | [] | |
64+
| onlyGetOpenGraphInfo | Only fetch open graph info and don't fall back on anything else. Also accepts an array of properties for which no fallback should be used | false | |
65+
| customMetaTags | Here you can define custom meta tags you want to scrape. | [] | |
66+
| urlValidatorSettings | Sets the options used by validator.js for testing the URL | [Here](https://github.com/jshemas/openGraphScraper/blob/master/lib/utils.ts#L4-L17) | |
67+
| jsonLDOptions | Sets the options used when parsing JSON-LD data | | |
6768

6869
Note: `open-graph-scraper` uses the [Fetch API](https://nodejs.org/dist/latest-v18.x/docs/api/globals.html#fetch) for requests and most of [Fetch's options](https://developer.mozilla.org/en-US/docs/Web/API/fetch#options) should work as `open-graph-scraper`'s `fetchOptions` options.
6970

@@ -159,6 +160,26 @@ ogs({ url: 'https://www.wikipedia.org/', fetchOptions: { headers: { 'user-agent'
159160
})
160161
```
161162

163+
## JSON-LD Parsing Options Example
164+
165+
The `throwOnJSONParseError` and `logOnJSONParseError` properties control what happens if `JSON.parse`
166+
throws an error when parsing JSON-LD data.
167+
If `throwOnJSONParseError` is set to `true`, then the error will be rethrown; otherwise the invalid JSON-LD block is skipped.
168+
If `logOnJSONParseError` is set to `true`, then the error will be logged to the console.
169+
170+
```javascript
171+
const ogs = require("open-graph-scraper");
172+
const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36';
173+
ogs({ url: 'https://www.wikipedia.org/', jsonLDOptions: { throwOnJSONParseError: true } })
174+
.then((data) => {
175+
const { error, html, result, response } = data;
176+
console.log('error:', error); // This returns true or false. True if there was an error. The error itself is inside the result object.
177+
console.log('html:', html); // This contains the HTML of page
178+
console.log('result:', result); // This contains all of the Open Graph results
179+
console.log('response:', response); // This contains response from the Fetch API
180+
})
181+
```
182+
162183
## Running the example app
163184

164185
The `example` folder contains a simple Express app that you can spin up by running `npm ci && npm run start`. Once the app is running, open a web browser and go to `http://localhost:3000/scraper?url=http://ogp.me/` to test it out. There is also a `Dockerfile` if you want to run this example app in a Docker container.

lib/extract.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,16 @@ export default function extractMetaTags(body: string, options: OpenGraphScraperO
9999
if (scriptText) {
100100
scriptText = scriptText.replace(/(\r\n|\n|\r)/gm, ''); // remove newlines
101101
scriptText = unescapeScriptText(scriptText);
102-
ogObject.jsonLD.push(JSON.parse(scriptText));
102+
try {
103+
ogObject.jsonLD.push(JSON.parse(scriptText));
104+
} catch (error: unknown) {
105+
if (options.jsonLDOptions?.logOnJSONParseError) {
106+
console.error('Error parsing JSON-LD script tag:', error);
107+
}
108+
if (options.jsonLDOptions?.throwOnJSONParseError) {
109+
throw error;
110+
}
111+
}
103112
}
104113
}
105114
});

lib/types.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ export interface OpenGraphScraperOptions {
3838
timeout?: number;
3939
url?: string;
4040
urlValidatorSettings?: ValidatorSettings;
41+
jsonLDOptions?: JSONLDOptions;
4142
}
4243

4344
/**
@@ -67,6 +68,14 @@ export interface ValidatorSettings {
6768
validate_length: boolean;
6869
}
6970

71+
/**
72+
* Options for the JSON-LD parser
73+
*/
74+
export interface JSONLDOptions {
75+
throwOnJSONParseError?: boolean;
76+
logOnJSONParseError?: boolean;
77+
}
78+
7079
/**
7180
* The type for user defined custom meta tags you want to scrape.
7281
*

tests/unit/static.spec.ts

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,75 @@ describe('static check meta tags', function () {
279279
});
280280
});
281281

282+
it('jsonLD - invalid JSON string that cannot be parsed does not throw error', function () {
283+
const metaHTML = `<html><head>
284+
<script type="application/ld+json">
285+
{
286+
"@context": "http://schema.org",
287+
"@type": "Organization",
288+
"name": "Blah ",
289+
"sameAs": [
290+
"https:\\\\/\\\\/twitter.com\\\\/blah?lang=en"
291+
"https:\\\\/\\\\/www.facebook.com\\\\/blah\\\\/"
292+
""
293+
"https:\\\\/\\\\/www.instagram.com\\\\/blah\\\\/"
294+
""
295+
""
296+
"https:\\\\/\\\\/www.youtube.com\\\\/@blah"
297+
""
298+
],
299+
"url": "https:\\\\/\\\\/blah.com"
300+
}
301+
302+
</script>
303+
</head></html>`;
304+
305+
mockAgent.get('http://www.test.com')
306+
.intercept({ path: '/' })
307+
.reply(200, metaHTML);
308+
309+
return ogs({ url: 'www.test.com' })
310+
.then(function (data) {
311+
expect(data.result.success).to.be.eql(true);
312+
expect(data.result.requestUrl).to.be.eql('http://www.test.com');
313+
expect(data.result.jsonLD).to.be.eql([]);
314+
expect(data.html).to.be.eql(metaHTML);
315+
expect(data.response).to.be.a('response');
316+
});
317+
});
318+
319+
it('jsonLD - invalid JSON string that cannot be parsed throws error when options.jsonLDOptions.throwOnJSONParseError = true', function () {
320+
const metaHTML = `<html><head>
321+
<script type="application/ld+json">
322+
{
323+
"@context": "http://schema.org",
324+
"@type": "Organization",
325+
"name": "Blah ",
326+
"sameAs": [
327+
"https:\\\\/\\\\/twitter.com\\\\/blah?lang=en"
328+
"https:\\\\/\\\\/www.facebook.com\\\\/blah\\\\/"
329+
""
330+
"https:\\\\/\\\\/www.instagram.com\\\\/blah\\\\/"
331+
""
332+
""
333+
"https:\\\\/\\\\/www.youtube.com\\\\/@blah"
334+
""
335+
],
336+
"url": "https:\\\\/\\\\/blah.com"
337+
}
338+
339+
</script>
340+
</head></html>`;
341+
342+
mockAgent.get('http://www.test.com')
343+
.intercept({ path: '/' })
344+
.reply(200, metaHTML);
345+
346+
return ogs({ url: 'www.test.com', jsonLDOptions: { throwOnJSONParseError: true } }).catch((data) => {
347+
expect(data.result.success).to.be.eql(false);
348+
});
349+
});
350+
282351
it('encoding - utf-8', function () {
283352
/* eslint-disable max-len */
284353
const metaHTML = `<html><head>

0 commit comments

Comments
 (0)