Skip to content

Commit 1a1ab44

Browse files
committed
added many new scrapers
1 parent f09f571 commit 1a1ab44

File tree

21 files changed

+31551
-0
lines changed

21 files changed

+31551
-0
lines changed

scrapers/brill.json

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
{
2+
"url": "brillonline\\.com",
3+
"headless": true,
4+
"elements": {
5+
"publisher": {
6+
"selector": "//meta[@name='citation_publisher']",
7+
"attribute": "content"
8+
},
9+
"title": {
10+
"selector": "//meta[@name='citation_title']",
11+
"attribute": "content"
12+
},
13+
"authors": {
14+
"selector": "//meta[@name='citation_authors']",
15+
"attribute": "content"
16+
},
17+
"date": {
18+
"selector": "//meta[@name='citation_date']",
19+
"attribute": "content"
20+
},
21+
"doi": {
22+
"selector": "//meta[@name='citation_doi']",
23+
"attribute": "content"
24+
},
25+
"issn": {
26+
"selector": "//meta[@name='citation_issn']",
27+
"attribute": "content"
28+
},
33+
"fulltext_pdf": {
34+
"selector": "//meta[@name='citation_pdf_url']",
35+
"attribute": "content",
36+
"download": {
37+
"rename": "fulltext.pdf"
38+
}
39+
},
40+
"fulltext_html": {
41+
"selector": "//meta[@name='citation_fulltext_html_url']",
42+
"attribute": "content",
43+
"download": {
44+
"rename": "fulltext.html"
45+
}
46+
}
47+
}
48+
}

scrapers/dois.txt

Lines changed: 315 additions & 0 deletions
Large diffs are not rendered by default.

scrapers/emerald.json

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
{
2+
"url": "emeraldinsight\\.com",
3+
"headless": true,
4+
"elements": {
5+
"publisher": {
6+
"selector": "//meta[@name='dc.Publisher']",
7+
"attribute": "content"
8+
},
9+
"title": {
10+
"selector": "//meta[@name='dc.Title']",
11+
"attribute": "content"
12+
},
13+
"authors": {
14+
"selector": "//meta[@name='dc.Creator']",
15+
"attribute": "content"
16+
},
17+
"date": {
18+
"selector": "//div[@id='pubDate']",
19+
"attribute": "text"
20+
},
21+
"volume": {
22+
"selector": "//span[@class='citation_volume']",
23+
"attribute": "text"
24+
},
25+
"doi": {
26+
"selector": "//meta[@scheme='doi']",
27+
"attribute": "content"
28+
},
29+
"description": {
30+
"selector": "//meta[@name='dc.Description']",
31+
"attribute": "content"
32+
},
33+
"journal": {
34+
"selector": "//meta[@name='citation_journal_title']",
35+
"attribute": "content"
36+
},
37+
"abstract": {
38+
"selector": "//p[@class='articleBody_abstractText']",
39+
"attribute": "text"
40+
},
41+
"abstract2": {
42+
"selector": "//a[@title='View the Abstract']",
43+
"attribute": "html",
44+
"download": true
45+
},
46+
"language": {
47+
"selector": "//meta[@name='dc.Language']",
48+
"attribute": "content"
49+
},
50+
"fulltext_html": {
51+
"selector": "//a[@title='View the Full Text HTML']",
52+
"attribute": "href",
53+
"download": {
54+
"rename": "fulltext.html"
55+
}
56+
},
57+
"fulltext_pdf": {
58+
"selector": "//a[@title='Download the PDF Full Text']",
59+
"attribute": "href",
60+
"download": {
61+
"rename": "fulltext.pdf"
62+
}
63+
},
64+
"supplementary_material": {
65+
"selector": "//a[@title='View Supporting Information']",
66+
"attribute": "href",
67+
"download": true
68+
},
69+
"figure": {
70+
"selector": "//img[@alt='Abstract Image']",
71+
"attribute": "src",
72+
"download": true
73+
},
74+
"copyright": {
75+
"selector": "//div[contains(@id, 'artCopyright')]",
76+
"attribute": "text"
77+
}
78+
}
79+
}

scrapers/fpsych.json

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
{
2+
"url": "frontiersin\\.org",
3+
"headless": true,
4+
"elements": {
5+
"publisher": {
6+
"selector": "//meta[@name='citation_publisher']",
7+
"attribute": "content"
8+
},
9+
"title": {
10+
"selector": "//meta[@name='citation_title']",
11+
"attribute": "content"
12+
},
13+
"authors": {
14+
"selector": "//meta[@name='citation_authors']",
15+
"attribute": "content"
16+
},
17+
"date": {
18+
"selector": "//meta[@name='citation_date']",
19+
"attribute": "content"
20+
},
21+
"doi": {
22+
"selector": "//meta[@name='citation_doi']",
23+
"attribute": "content"
24+
},
25+
"issn": {
26+
"selector": "//meta[@name='citation_issn']",
27+
"attribute": "content"
28+
},
33+
"fulltext_pdf": {
34+
"selector": "//meta[@name='citation_pdf_url']",
35+
"attribute": "content",
36+
"download": {
37+
"rename": "fulltext.pdf"
38+
}
39+
},
40+
"fulltext_html": {
41+
"selector": "//meta[@name='citation_fulltext_html_url']",
42+
"attribute": "content",
43+
"download": {
44+
"rename": "fulltext.html"
45+
}
46+
}
47+
}
48+
}

scrapers/humkin.json

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
{
2+
"url": "humankinetics\\.com",
3+
"headless": true,
4+
"elements": {
5+
"publisher": {
6+
"selector": "//meta[@name='DC.Publisher']",
7+
"attribute": "content"
8+
},
9+
"title": {
10+
"selector": "//meta[@name='DC.Title']",
11+
"attribute": "content"
12+
},
13+
"authors": {
14+
"selector": "//meta[@name='citation_author']",
15+
"attribute": "content"
16+
},
17+
"date": {
18+
"selector": "//meta[@name='DC.Date']",
19+
"attribute": "content"
20+
},
21+
"doi": {
22+
"selector": "//meta[@name='DC.Identifier']",
23+
"attribute": "content"
24+
},
25+
"issn": {
26+
"selector": "//meta[@name='citation_issn']",
27+
"attribute": "content"
28+
},
29+
"fulltext_html": {
30+
"selector": "//meta[@name='citation_fulltext_html_url']",
31+
"attribute": "content",
32+
"download": {
33+
"rename": "fulltext.html"
34+
}
35+
},
36+
"fulltext_pdf": {
37+
"selector": "//meta[@name='citation_pdf_url']",
38+
"attribute": "content",
39+
"download": {
40+
"rename": "fulltext.pdf"
41+
}
42+
}
43+
}
44+
}

0 commit comments

Comments
 (0)