Skip to content

Commit 3777efe

Browse files
authored
chore(helpers): handle break line on jsonld (microlinkhq#793)
* chore(helpers): handle break line on jsonld * test: update snapshot * ci(fix): run package tests * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot * test: update snapshot
1 parent 8cab1eb commit 3777efe

File tree

52 files changed

+615
-646
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

52 files changed

+615
-646
lines changed

.github/workflows/get-matrix.mjs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@ for (const packagePath of await readdir(packagesPath)) {
1212
const pkg = join(packagesPath, packagePath, 'package.json')
1313
const { name, scripts } = JSON.parse(await readFile(pkg))
1414
if (scripts && scripts.test && scripts.test !== 'exit 0') {
15-
packages.push(name)
15+
packages.push({
16+
name,
17+
filter: `./packages/${packagePath}`
18+
})
1619
}
1720
}
1821

19-
console.log(`{"package":${JSON.stringify(packages)}}`)
22+
console.log(JSON.stringify({ package: packages }))

.github/workflows/main.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ jobs:
4242
outputs:
4343
matrix: ${{ steps.matrix.outputs.matrix }}
4444
test:
45+
name: Test ${{ matrix.package.name }}
4546
if: |
4647
!startsWith(github.event.head_commit.message, 'chore(release):') &&
4748
!startsWith(github.event.head_commit.message, 'docs:') &&
@@ -64,15 +65,15 @@ jobs:
6465
version: latest
6566
run_install: true
6667
- name: Test
67-
run: pnpm --filter "${{ matrix.package }}" exec c8 pnpm test
68+
run: pnpm --filter "${{ matrix.package.filter }}" exec c8 pnpm test
6869
- name: Coverage
69-
run: pnpm --filter "${{ matrix.package }}" exec c8 report --reporter=lcov --report-dir=coverage
70+
run: pnpm --filter "${{ matrix.package.filter }}" exec c8 report --reporter=lcov --report-dir=coverage
7071
- name: Upload
7172
uses: coverallsapp/github-action@main
7273
with:
73-
flag-name: ${{ matrix.package }}
74+
flag-name: ${{ matrix.package.name }}
7475
parallel: true
75-
file: $(pnpm --filter "${{ matrix.package }}" exec pwd)/coverage/lcov.info
76+
file: $(pnpm --filter "${{ matrix.package.filter }}" exec pwd)/coverage/lcov.info
7677

7778
finish:
7879
needs: test

.github/workflows/pull_request.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ jobs:
2121
outputs:
2222
matrix: ${{ steps.matrix.outputs.matrix }}
2323
test:
24+
name: Test ${{ matrix.package.name }}
2425
needs: matrix
2526
runs-on: ubuntu-latest
2627
strategy:
@@ -39,15 +40,15 @@ jobs:
3940
version: latest
4041
run_install: true
4142
- name: Test
42-
run: pnpm --filter "${{ matrix.package }}" exec c8 pnpm test
43+
run: pnpm --filter "${{ matrix.package.filter }}" exec c8 pnpm test
4344
- name: Coverage
44-
run: pnpm --filter "${{ matrix.package }}" exec c8 report --reporter=lcov --report-dir=coverage
45+
run: pnpm --filter "${{ matrix.package.filter }}" exec c8 report --reporter=lcov --report-dir=coverage
4546
- name: Upload
4647
uses: coverallsapp/github-action@main
4748
with:
48-
flag-name: ${{ matrix.package }}
49+
flag-name: ${{ matrix.package.name }}
4950
parallel: true
50-
file: $(pnpm --filter "${{ matrix.package }}" exec pwd)/coverage/lcov.info
51+
file: $(pnpm --filter "${{ matrix.package.filter }}" exec pwd)/coverage/lcov.info
5152

5253
finish:
5354
needs: test

packages/metascraper-helpers/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
"iso-639-3": "~2.2.0",
3737
"isostring": "0.0.1",
3838
"jsdom": "~27.4.0",
39+
"jsonrepair": "~3.13.2",
3940
"lodash": "~4.17.23",
4041
"memoize-one": "~6.0.0",
4142
"microsoft-capitalize": "~1.0.6",

packages/metascraper-helpers/src/index.js

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ const capitalize = require('microsoft-capitalize')
88
const isRelativeUrl = require('is-relative-url')
99
const fileExtension = require('file-extension')
1010
const _normalizeUrl = require('normalize-url')
11+
const { jsonrepair } = require('jsonrepair')
1112
const smartquotes = require('smartquotes')
1213
const { decodeHTML } = require('entities')
1314
const iso6393 = require('iso-639-3/to-1')
@@ -354,19 +355,29 @@ memoizeOne.EqualityUrlAndHtmlDom = (newArgs, oldArgs) =>
354355
memoizeOne.EqualityFirstArgument = (newArgs, oldArgs) =>
355356
newArgs[0] === oldArgs[0]
356357

358+
const parseJSON = text => {
359+
try {
360+
return JSON.parse(text)
361+
} catch {
362+
try {
363+
return JSON.parse(jsonrepair(text))
364+
} catch {
365+
return undefined
366+
}
367+
}
368+
}
369+
357370
const jsonld = memoizeOne(
358371
$ =>
359372
$('script[type="application/ld+json"]')
360373
.map((_, element) => {
361-
try {
362-
const el = $(element)
363-
const json = JSON.parse($(el).contents().text())
364-
const { '@graph': graph, ...props } = json
365-
if (!graph) return json
366-
return graph.map(item => ({ ...props, ...item }))
367-
} catch (_) {
368-
return undefined
369-
}
374+
const el = $(element)
375+
const text = $(el).contents().text()
376+
const json = parseJSON(text)
377+
if (!json) return undefined
378+
const { '@graph': graph, ...props } = json
379+
if (!graph) return json
380+
return graph.map(item => ({ ...props, ...item }))
370381
})
371382
.get()
372383
.filter(Boolean),

packages/metascraper-helpers/test/index.js

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -381,12 +381,32 @@ test('.description', t => {
381381
})
382382

383383
test('.$jsonld', t => {
384-
const $ = cheerio.load(`
384+
{
385+
const $ = cheerio.load(`
385386
<script type="application/ld+json">{ "offers": { "price": 119.99 }}</script>
386387
<script type="application/ld+json">{ "offers": { "price": "" }}</script>
387388
`)
388-
const value = $jsonld('offers.price')($)
389-
t.is(value, 119.99)
389+
const value = $jsonld('offers.price')($)
390+
t.is(value, 119.99)
391+
}
392+
{
393+
const $ = cheerio.load('<script type="application/ld+json">{{</script>')
394+
const value = $jsonld('offers.price')($)
395+
t.is(value, undefined)
396+
}
397+
{
398+
const $ =
399+
cheerio.load(`<script type="application/ld+json">{"@context":"https://schema.org","mainEntity":{"description":"This is an example
400+
401+
🌐 of a multiline description
402+
📬 to see how it is parsed
403+
📧 and how it is decoded"}}</script>`)
404+
const value = $jsonld('mainEntity.description')($)
405+
t.is(
406+
value,
407+
'This is an example\n\n🌐 of a multiline description\n📬 to see how it is parsed\n📧 and how it is decoded'
408+
)
409+
}
390410
})
391411

392412
test('.lang', t => {

packages/metascraper-helpers/test/jsonld.js

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ test('ensure to return first value', t => {
5151
})
5252

5353
test('reads multiple JSON-LD blocks', t => {
54-
const $ = cheerio.load(`<script type="application/ld+json"> { "@context": "http://schema.org", "@type": "Organization", "url": "https://bykvu.com/ru", "logo": "https://bykvu.com/wp-content/themes/bykvu/img/logo.svg" } </script>
54+
const $ =
55+
cheerio.load(`<script type="application/ld+json"> { "@context": "http://schema.org", "@type": "Organization", "url": "https://bykvu.com/ru", "logo": "https://bykvu.com/wp-content/themes/bykvu/img/logo.svg" } </script>
5556
<script type="application/ld+json"> { "@context": "http://schema.org", "@type": "NewsArticle", "mainEntityOfPage": { "@type": "WebPage", "@id": "https://bykvu.com/ru/bukvy/uchenye-nazvali-depressiju-prichinoj-22-opasnyh-zabolevanij/" }, "headline": "Ученые назвали депрессию причиной 22 опасных заболеваний", "image": [ "https://bykvu.com/wp-content/themes/bykvu/includes/images/noimage_large.jpg" ], "datePublished": "2019-09-09T00:29:09+02:00", "dateModified": "2019-09-09T00:29:09+02:00", "author": { "@type": "Person", "name": "Буквы" }, "publisher": { "@type": "Organization", "name": "Буквы", "logo": { "@type": "ImageObject", "url": "https://bykvu.com/wp-content/themes/bykvu/img/apple-icon-180x180.png" } }, "description": "Ученые австралийского центра точного здравоохранения при Университете Южной Австралии выяснили, что депрессия является причиной 22 различных заболеваний." } </script>
5657
<script type="application/ld+json"> { "@context": "https://schema.org", "@type": "BreadcrumbList", "itemListElement": [ { "@type": "ListItem", "position": 1, "item": { "@id": "https://bykvu.com/ru", "name": "Буквы" } }, { "@type": "ListItem", "position": 2, "item": { "@id": "https://bykvu.com/ru/category/bukvy/", "name": "Новости" } }, { "@type": "ListItem", "position": 3, "item": { "@id": "https://bykvu.com/ru/bukvy/uchenye-nazvali-depressiju-prichinoj-22-opasnyh-zabolevanij/", "name": "Ученые назвали депрессию причиной 22 опасных заболеваний" } } ] } </script>`)
5758

@@ -63,7 +64,6 @@ test('only caches the last invocation', t => {
6364
const html =
6465
'<script type="application/ld+json">[{"@context":"http://schema.org","@type":"NewsArticle","mainEntityOfPage":"https://www.theverge.com/2017/11/16/16667366/tesla-semi-truck-announced-price-release-date-electric-self-driving","headline":"This is the Tesla Semi truck","description":"500 miles of range and more aerodynamic than a supercar","speakable":{"@type":"SpeakableSpecification","xpath":["/html/head/title","/html/head/meta[@name=\'description\']/@content"]},"datePublished":"2017-11-16T23:47:07-05:00","dateModified":"2017-11-16T23:47:07-05:00","author":{"@type":"Person","name":"Zac Estrada"},"publisher":{"@type":"Organization","name":"The Verge","logo":{"@type":"ImageObject","url":"https://cdn.vox-cdn.com/uploads/chorus_asset/file/13668586/google_amp.0.png","width":600,"height":60}},"about":{"@type":"Event","name":"Tesla Semi Truck Event 2017","startDate":"2017-11-17T04:00:00+00:00","location":{"@type":"Place","name":"Tesla Motors factory","address":"Hawthorne, California, USA"}},"image":[{"@type":"ImageObject","url":"https://cdn.vox-cdn.com/thumbor/k8ssXKPAuRwxa1pKew982ZMgv0o=/1400x1400/filters:format(jpeg)/cdn.vox-cdn.com/uploads/chorus_asset/file/9699573/Semi_Front_Profile.jpg","width":1400,"height":1400},{"@type":"ImageObject","url":"https://cdn.vox-cdn.com/thumbor/l6nkV8CkJIdUrJIzHFWUFc1zLRM=/1400x1050/filters:format(jpeg)/cdn.vox-cdn.com/uploads/chorus_asset/file/9699573/Semi_Front_Profile.jpg","width":1400,"height":1050},{"@type":"ImageObject","url":"https://cdn.vox-cdn.com/thumbor/5Sqo6J73lBi1hwzEiKCQy6FLx3I=/1400x788/filters:format(jpeg)/cdn.vox-cdn.com/uploads/chorus_asset/file/9699573/Semi_Front_Profile.jpg","width":1400,"height":788}]}]</script>'
6566

66-
// Load it and process it with jsonld.
6767
const $ = cheerio.load(html)
6868
const $mutate = cheerio.load($.html())
6969
const json = jsonld($)
@@ -83,3 +83,17 @@ test('only caches the last invocation', t => {
8383
// another.
8484
t.deepEqual(json, jsonld(cheerio.load($.html())))
8585
})
86+
87+
test('parse json with break lines', t => {
88+
const $ =
89+
cheerio.load(`<script type="application/ld+json">{"@context":"https://schema.org","mainEntity":{"description":"This is an example
90+
🌐 of a multiline description
91+
📬 to see how it is parsed
92+
📧 and how it is decoded"}}</script>`)
93+
t.snapshot(jsonld($))
94+
})
95+
96+
test('returns empty array if JSON-LD is invalid', t => {
97+
const $ = cheerio.load('<script type="application/ld+json">{{</script>')
98+
t.deepEqual(jsonld($), [])
99+
})

packages/metascraper-helpers/test/snapshots/jsonld.js.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,19 @@ Generated by [AVA](https://avajs.dev).
9191
},
9292
},
9393
]
94+
95+
## parse json with break lines
96+
97+
> Snapshot 1
98+
99+
[
100+
{
101+
'@context': 'https://schema.org',
102+
mainEntity: {
103+
description: `This is an example␊
104+
🌐 of a multiline description␊
105+
📬 to see how it is parsed␊
106+
📧 and how it is decoded`,
107+
},
108+
},
109+
]
127 Bytes
Binary file not shown.

packages/metascraper-uol/test/snapshots/index.js.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ Generated by [AVA](https://avajs.dev).
99
> Snapshot 1
1010
1111
{
12-
description: null,
12+
description: 'A modelo que acusa Neymar de agressão e estupro, Najila Mendes de Souza afirmou que continuou a conversa com o jogador para ter provas do estupro.',
1313
title: 'Modelo diz que Neymar foi “estúpido” e dá detalhes sobre suposto estupro',
1414
}
1515

0 commit comments

Comments
 (0)