-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathlibrofm.js
More file actions
170 lines (141 loc) · 5.19 KB
/
librofm.js
File metadata and controls
170 lines (141 loc) · 5.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import { Extractor } from "./AbstractExtractor.js";
import { addContributor, getCoverData, logMarian, cleanText, collectObject } from "../shared/utils.js";
/**
 * Extractor for Libro.fm audiobook detail pages.
 *
 * Reads the current `document` and assembles the book's metadata by delegating
 * to the module-level `getLibro*` / `extract*` helpers, each of which writes its
 * findings onto a shared details object.
 */
class librofmScraper extends Extractor {
  get _name() {
    return "Libro.fm Extractor";
  }

  // http(s)://[www.]libro.fm/audiobooks/<numeric id>, optionally followed by "-<slug>".
  _sitePatterns = [
    /^https?:\/\/(www\.)?libro\.fm\/audiobooks\/\d+(-[a-zA-Z0-9-]+)?/,
  ];

  /**
   * Scrapes the page currently loaded in `document`.
   *
   * @returns {Promise<*>} Whatever `collectObject` produces from the cover data
   *   and the scraped details (presumably a single merged object — confirm
   *   against ../shared/utils.js).
   */
  async getDetails() {
    const coverImage = document.querySelector('.audiobook-cover .book-cover-wrap img.book-cover');
    const coverData = getCoverData(coverImage?.src);

    // Every helper below mutates this object in place.
    const details = {};

    getLibroBookTitle(details); // title
    getLibroSeries(details); // series name and position
    extractLibroContributors(details); // authors and narrators
    // NOTE: getLibroFormatInfo declares only one parameter in this file, so the
    // URL passed here is currently unused by it.
    getLibroFormatInfo(details, window.location.href); // format and length
    extraLibroInfo(details); // publisher, date, language, ISBN, edition
    extractLibroDescription(details); // summary text

    logMarian("Libro extraction complete:", details);

    return collectObject([coverData, details]);
  }
}
/**
 * Collects the authors and narrators listed in the "Audiobook Details" section
 * and stores them on `bookDetails["Contributors"]`.
 *
 * Nothing is written when the section is missing or lists no contributors.
 *
 * @param {object} bookDetails - Mutable accumulator for the scraped book fields.
 * @returns {void}
 */
function extractLibroContributors(bookDetails) {
  const section = extractSection('audiobook details');
  // extractSection yields undefined when no heading matches; without this guard
  // the lookup below would throw and abort the whole extraction.
  if (!section) {
    return;
  }

  // [CSS selector, role] pairs, processed in order so authors precede narrators.
  const roleSelectors = [
    ['span[itemprop="author"] a', "Author"],
    // Narrator links are recognised by an href ending in "searchby=narrators".
    ['a[href$="searchby=narrators"]', "Narrator"],
  ];

  const contributors = [];
  for (const [selector, role] of roleSelectors) {
    section.querySelectorAll(selector).forEach((link) => {
      addContributor(contributors, link.textContent.trim(), role);
    });
  }

  if (contributors.length) {
    bookDetails["Contributors"] = contributors;
  }
}
/**
 * Records the series name and, when present, the book's position in it.
 *
 * The name comes from the link inside `.audiobook-title__series`; the position
 * is the first run of digits found in that element's own (non-link) text.
 *
 * @param {object} bookDetails - Mutable accumulator; receives `Series` and
 *   optionally `Series Place` (kept as a string).
 * @returns {void}
 */
function getLibroSeries(bookDetails) {
  const seriesLink = document.querySelector('.audiobook-title__series a');
  if (!seriesLink) {
    // Not part of a series: leave bookDetails untouched.
    return;
  }

  bookDetails['Series'] = seriesLink.textContent.trim();

  const looseText = extractTextNode(document.querySelector('.audiobook-title__series'));
  const digits = looseText.match(/\d+/);
  if (digits !== null) {
    bookDetails['Series Place'] = digits[0];
  }
}
/**
 * Stores the book title taken from the first child node of `h1.audiobook-title`.
 *
 * The text is passed through the shared `cleanText` helper; `Title` is only set
 * when the cleaned result is truthy.
 *
 * @param {object} bookDetails - Mutable accumulator; may receive `Title`.
 * @returns {void}
 */
function getLibroBookTitle(bookDetails) {
  const heading = document.querySelector('h1.audiobook-title');
  // Only the first child node is read, so any later children of the heading are ignored.
  const leadingNode = heading?.childNodes[0];
  const title = cleanText(leadingNode?.textContent);

  if (title) {
    bookDetails["Title"] = title;
  }
}
/**
 * Flattens the rendered text of several elements into one newline-separated string.
 *
 * @param {ArrayLike<{innerText: string}>} elements - Elements to read (e.g. a NodeList).
 * @returns {string} All non-empty lines, in document order, joined with "\n".
 */
function joinContent(elements) {
  const lines = [];

  for (const element of Array.from(elements)) {
    // libro.fm separates paragraphs with <br> rather than <p>, so innerText is
    // used to keep those breaks as newlines.
    const renderedLines = element.innerText.trim().split('\n');

    for (const line of renderedLines) {
      // Skip blank lines (some <p> tags on the page are empty).
      if (line.length > 0) {
        lines.push(line);
      }
    }
  }

  return lines.join("\n");
}
/**
 * Returns the text held directly by an element, ignoring text inside child elements.
 *
 * @param {{childNodes: ArrayLike<Node>}|null|undefined} element - Element to read;
 *   a missing element yields an empty string.
 * @returns {string} The trimmed contents of each direct text node, joined with
 *   "\n" and trimmed once more as a whole.
 */
function extractTextNode(element) {
  const pieces = [];

  for (const child of Array.from(element?.childNodes || [])) {
    // Keep direct text nodes only; element children (links, <strong>, ...) are skipped.
    if (child.nodeType === Node.TEXT_NODE) {
      pieces.push(child.textContent.trim());
    }
  }

  return pieces.join("\n").trim();
}
/**
 * Finds the first `<section>` whose `<h2>` heading matches the given title.
 *
 * Both the heading text and the requested title are trimmed and lowercased
 * before comparison, so callers need not pass an already-lowercased title.
 *
 * @param {string} title - Heading text to look for (case-insensitive).
 * @returns {Element|undefined} The matching section, or undefined when none matches.
 */
function extractSection(title) {
  // Normalise the argument the same way as the heading; otherwise a title such
  // as "Summary" could never match the lowercased heading text.
  const wanted = String(title ?? '').trim().toLowerCase();

  return Array.from(document.querySelectorAll('section')).find((section) => {
    const heading = section.querySelector('h2')?.textContent.trim().toLowerCase();
    return heading === wanted;
  });
}
/**
 * Records the reading format and, when the page shows one, the listening length.
 *
 * `Reading Format` is always set to "Audiobook". `Listening Length` is set to the
 * length text split into parts at each space that precedes a number
 * (e.g. "10 hours 42 minutes" -> ["10 hours", "42 minutes"]); it is left unset
 * when the page has no "Length" entry or that entry has no text.
 *
 * @param {object} bookDetails - Mutable accumulator for the scraped book fields.
 * @returns {void}
 */
function getLibroFormatInfo(bookDetails) {
  bookDetails['Reading Format'] = 'Audiobook';

  const infoSections = document.querySelectorAll(".audiobook-information .audiobook-information__section");
  // The entry is identified by its <strong> label reading "Length".
  const lengthSection = Array.from(infoSections)
    .find((section) => section.querySelector("strong")?.textContent.trim().toLowerCase() === 'length');

  const audioLength = extractTextNode(lengthSection);
  if (!audioLength) {
    // Splitting an empty string would store [''] as the length.
    return;
  }

  // Split the length at each space that is followed by a number.
  bookDetails['Listening Length'] = audioLength.split(/ (?=\d+)/);
}
/**
 * Stores the book summary as `bookDetails["Description"]`.
 *
 * The summary is read from the section headed "Summary" when there is one,
 * otherwise from the `#panel_summary` element. Nothing is written when neither exists.
 *
 * @param {object} bookDetails - Mutable accumulator; may receive `Description`.
 * @returns {void}
 */
function extractLibroDescription(bookDetails) {
  const summarySection = extractSection('summary');
  // Pages that also have an "about the authors" tab keep the summary in a
  // separate tab panel instead of a headed section.
  const summaryPanel = document.querySelector('#panel_summary');

  const container = summarySection || summaryPanel;
  if (!container) {
    return;
  }

  bookDetails["Description"] = joinContent(container.querySelectorAll('p'));
}
/**
 * Reads publisher, publication date, language, ISBN and edition from the
 * "Audiobook Details" section and stores whichever are present on `bookDetails`.
 *
 * Keys written: `Publisher`, `Publication date`, `Language`, `ISBN-13` or
 * `ISBN-10` (chosen by the length of the ISBN text), and `Edition Information`.
 * Nothing is written when the section is missing.
 *
 * @param {object} bookDetails - Mutable accumulator for the scraped book fields.
 * @returns {void}
 */
function extraLibroInfo(bookDetails) {
  const section = extractSection('audiobook details');
  // extractSection yields undefined when no heading matches; bail out instead of throwing.
  if (!section) {
    return;
  }

  // [itemprop value, bookDetails key] for the fields exposed via itemprop spans.
  const itempropFields = [
    ['publisher', 'Publisher'],
    ['datePublished', 'Publication date'],
    ['inLanguage', 'Language'],
  ];
  for (const [itemprop, key] of itempropFields) {
    const element = section.querySelector(`span[itemprop="${itemprop}"]`);
    if (element) {
      bookDetails[key] = element.textContent.trim();
    }
  }

  const isbnText = section.querySelector('span[itemprop="isbn"]')?.textContent.trim();
  // Measure the text, not the element: an Element has no `length`, so checking
  // it would never detect a 10-character ISBN.
  if (isbnText?.length === 13) {
    bookDetails['ISBN-13'] = isbnText;
  } else if (isbnText?.length === 10) {
    bookDetails['ISBN-10'] = isbnText;
  }

  // No itemprop attribute exists for the edition type, so find the cell whose
  // <strong> header reads "Edition" and take its <span> text.
  const editionCell = Array.from(section.querySelectorAll('.cell'))
    .find((cell) => cell.querySelector('strong')?.textContent.trim().toLowerCase() === 'edition');
  const editionFormat = editionCell?.querySelector('span')?.textContent.trim();
  // Skip a missing or empty span rather than storing undefined or ''.
  if (editionFormat) {
    bookDetails['Edition Information'] = editionFormat;
  }
}
export { librofmScraper };