diff --git a/README.md b/README.md index 483509d..d1ac103 100644 --- a/README.md +++ b/README.md @@ -54,16 +54,17 @@ Check the return for a ```success``` flag. If success is set to true, then the u ## Options -| Name | Info | Default Value | Required | -|----------------------|----------------------------------------------------------------------------|---------------|----------| -| url | URL of the site. | | x | -| html | You can pass in an HTML string to run ogs on it. (use without options.url) | | | -| fetchOptions | Options that are used by the Fetch API | {} | | -| timeout | Request timeout for Fetch (Default is 10 seconds) | 10 | | -| blacklist | Pass in an array of sites you don't want ogs to run on. | [] | | -| onlyGetOpenGraphInfo | Only fetch open graph info and don't fall back on anything else. Also accepts an array of properties for which no fallback should be used | false | | -| customMetaTags | Here you can define custom meta tags you want to scrape. | [] | | -| urlValidatorSettings | Sets the options used by validator.js for testing the URL | [Here](https://github.com/jshemas/openGraphScraper/blob/master/lib/utils.ts#L4-L17) | | +| Name | Info | Default Value | Required | +|----------------------|-------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------|----------| +| url | URL of the site. | | x | +| html | You can pass in an HTML string to run ogs on it. (use without options.url) | | | +| fetchOptions | Options that are used by the Fetch API | {} | | +| timeout | Request timeout for Fetch (Default is 10 seconds) | 10 | | +| blacklist | Pass in an array of sites you don't want ogs to run on. | [] | | +| onlyGetOpenGraphInfo | Only fetch open graph info and don't fall back on anything else. 
Also accepts an array of properties for which no fallback should be used | false | | +| customMetaTags | Here you can define custom meta tags you want to scrape. | [] | | +| urlValidatorSettings | Sets the options used by validator.js for testing the URL | [Here](https://github.com/jshemas/openGraphScraper/blob/master/lib/utils.ts#L4-L17) | | +| jsonLDOptions | Sets the options used when parsing JSON-LD data | | | Note: `open-graph-scraper` uses the [Fetch API](https://nodejs.org/dist/latest-v18.x/docs/api/globals.html#fetch) for requests and most of [Fetch's options](https://developer.mozilla.org/en-US/docs/Web/API/fetch#options) should work as `open-graph-scraper`'s `fetchOptions` options. @@ -159,6 +160,26 @@ ogs({ url: 'https://www.wikipedia.org/', fetchOptions: { headers: { 'user-agent' }) ``` +## JSON-LD Parsing Options Example + +The `throwOnJSONParseError` and `logOnJSONParseError` properties of `jsonLDOptions` control what happens if `JSON.parse` +throws an error when parsing JSON-LD data. +If `throwOnJSONParseError` is set to `true`, then the error will be thrown. +If `logOnJSONParseError` is set to `true`, then the error will be logged to the console. If neither is set, the invalid JSON-LD block is silently skipped. + +```javascript +const ogs = require("open-graph-scraper"); +const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'; +ogs({ url: 'https://www.wikipedia.org/', jsonLDOptions: { throwOnJSONParseError: true } }) + .then((data) => { + const { error, html, result, response } = data; + console.log('error:', error); // This returns true or false. True if there was an error. The error itself is inside the result object. 
+ console.log('html:', html); // This contains the HTML of page + console.log('result:', result); // This contains all of the Open Graph results + console.log('response:', response); // This contains response from the Fetch API + }) +``` + ## Running the example app Inside the `example` folder contains a simple express app where you can run `npm ci && npm run start` to spin up. Once the app is running, open a web browser and go to `http://localhost:3000/scraper?url=http://ogp.me/` to test it out. There is also a `Dockerfile` if you want to run this example app in a docker container. diff --git a/lib/extract.ts b/lib/extract.ts index f50fb74..a3a6836 100644 --- a/lib/extract.ts +++ b/lib/extract.ts @@ -99,7 +99,16 @@ export default function extractMetaTags(body: string, options: OpenGraphScraperO if (scriptText) { scriptText = scriptText.replace(/(\r\n|\n|\r)/gm, ''); // remove newlines scriptText = unescapeScriptText(scriptText); - ogObject.jsonLD.push(JSON.parse(scriptText)); + try { + ogObject.jsonLD.push(JSON.parse(scriptText)); + } catch (error: unknown) { + if (options.jsonLDOptions?.logOnJSONParseError) { + console.error('Error parsing JSON-LD script tag:', error); + } + if (options.jsonLDOptions?.throwOnJSONParseError) { + throw error; + } + } } } }); diff --git a/lib/types.ts b/lib/types.ts index acd352d..99fcc6b 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -38,6 +38,7 @@ export interface OpenGraphScraperOptions { timeout?: number; url?: string; urlValidatorSettings?: ValidatorSettings; + jsonLDOptions?: JSONLDOptions; } /** @@ -67,6 +68,14 @@ export interface ValidatorSettings { validate_length: boolean; } +/** + * Options for the JSON-LD parser + */ +export interface JSONLDOptions { + throwOnJSONParseError?: boolean; + logOnJSONParseError?: boolean; +} + /** * The type for user defined custom meta tags you want to scrape. 
* diff --git a/tests/unit/static.spec.ts b/tests/unit/static.spec.ts index 6a29541..25af81a 100644 --- a/tests/unit/static.spec.ts +++ b/tests/unit/static.spec.ts @@ -279,6 +279,75 @@ describe('static check meta tags', function () { }); }); + it('jsonLD - invalid JSON string that cannot be parsed does not throw error', function () { + const metaHTML = `
+ + `; + + mockAgent.get('http://www.test.com') + .intercept({ path: '/' }) + .reply(200, metaHTML); + + return ogs({ url: 'www.test.com' }) + .then(function (data) { + expect(data.result.success).to.be.eql(true); + expect(data.result.requestUrl).to.be.eql('http://www.test.com'); + expect(data.result.jsonLD).to.be.eql([]); + expect(data.html).to.be.eql(metaHTML); + expect(data.response).to.be.a('response'); + }); + }); + + it('jsonLD - invalid JSON string that cannot be parsed throws error when options.jsonLDOptions.throwOnJSONParseError = true', function () { + const metaHTML = ` + + `; + + mockAgent.get('http://www.test.com') + .intercept({ path: '/' }) + .reply(200, metaHTML); + + return ogs({ url: 'www.test.com', jsonLDOptions: { throwOnJSONParseError: true } }).catch((data) => { + expect(data.result.success).to.be.eql(false); + }); + }); + it('encoding - utf-8', function () { /* eslint-disable max-len */ const metaHTML = `