marian-extension/src/extractors/librofm.js at main · jacobtender/marian-extension · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import { Extractor } from "./AbstractExtractor.js";
import { addContributor, getCoverData, logMarian, cleanText, collectObject } from "../shared/utils.js";

/**
 * Extractor for Libro.fm audiobook product pages.
 *
 * Exposes a display name and the URL patterns it applies to, and implements
 * `getDetails()` by running a series of DOM-scraping helpers against the
 * current `document`.
 */
class librofmScraper extends Extractor {
	get _name() { return "Libro.fm Extractor"; }

	// Matches http(s)://[www.]libro.fm/audiobooks/<digits>[-<slug>]
	_sitePatterns = [
		/^https?:\/\/(www\.)?libro\.fm\/audiobooks\/\d+(-[a-zA-Z0-9-]+)?/,
	];

	/**
	 * Scrapes the current page and returns the combined cover data and
	 * book details.
	 *
	 * @returns {Promise<*>} Whatever `collectObject` produces for
	 *   `[coverData, bookDetails]`.
	 */
	async getDetails() {
		const coverImage = document.querySelector('.audiobook-cover .book-cover-wrap img.book-cover');
		const coverData = getCoverData(coverImage?.src);

		const bookDetails = {};

		// Each step mutates bookDetails in place; the order determines the
		// order in which keys are added to the result object.
		const steps = [
			// Title
			getLibroBookTitle,
			// Series name and number
			getLibroSeries,
			// Contributors
			extractLibroContributors,
			// Format and length
			(details) => getLibroFormatInfo(details, window.location.href),
			// Extra block of info - ISBN, language, etc.
			extraLibroInfo,
			// Description
			extractLibroDescription,
		];
		for (const step of steps) {
			step(bookDetails);
		}

		logMarian("Libro extraction complete:", bookDetails);
		return collectObject([
			coverData,
			bookDetails,
		]);
	}
}

/**
 * Collects the authors and narrators linked inside the "Audiobook Details"
 * section and stores them under `bookDetails["Contributors"]`.
 *
 * Nothing is written when the section is missing or when no contributor
 * links are found.
 *
 * @param {Object} bookDetails - Result object; mutated in place.
 */
function extractLibroContributors(bookDetails) {
	const section = extractSection('audiobook details');
	// extractSection yields undefined when no <section> carries that heading;
	// bail out rather than throwing and aborting the whole extraction.
	if (!section) {
		return;
	}

	const contributors = [];
	const roleSelectors = [
		['span[itemprop="author"] a', "Author"],
		['a[href$="searchby=narrators"]', "Narrator"],
	];

	for (const [selector, role] of roleSelectors) {
		for (const link of section.querySelectorAll(selector)) {
			// addContributor is expected to record the entry in `contributors`.
			addContributor(contributors, link.textContent.trim(), role);
		}
	}

	if (contributors.length) {
		bookDetails["Contributors"] = contributors;
	}
}

/**
 * Reads the series link from the title area and records the series name
 * and, when a number is present, the book's position in that series.
 *
 * Writes `bookDetails['Series']` and optionally `bookDetails['Series Place']`;
 * does nothing when the page has no series link.
 *
 * @param {Object} bookDetails - Result object; mutated in place.
 */
function getLibroSeries(bookDetails) {
	const seriesLink = document.querySelector('.audiobook-title__series a');
	if (!seriesLink) {
		return;
	}

	bookDetails['Series'] = seriesLink.textContent.trim();

	// The position is looked up in the series container's own text (via
	// extractTextNode), taking the first run of digits found there.
	const containerText = extractTextNode(document.querySelector('.audiobook-title__series'));
	const [place] = containerText.match(/\d+/) ?? [];
	if (place !== undefined) {
		bookDetails['Series Place'] = place;
	}
}

/**
 * Stores the book title under `bookDetails["Title"]`.
 *
 * Only the first child node of the `h1.audiobook-title` heading is read,
 * and its text is passed through `cleanText`. The key is left unset when
 * the cleaned result is falsy.
 *
 * @param {Object} bookDetails - Result object; mutated in place.
 */
function getLibroBookTitle(bookDetails) {
	const heading = document.querySelector('h1.audiobook-title');
	const firstChildText = heading?.childNodes[0]?.textContent;
	const title = cleanText(firstChildText);

	if (title) {
		bookDetails["Title"] = title;
	}
}

/**
 * Flattens the rendered text of several elements into one newline-separated
 * string with empty lines removed.
 *
 * @param {Iterable<{innerText: string}>|ArrayLike<{innerText: string}>} elements
 *   Elements whose `innerText` should be combined.
 * @returns {string} All non-empty lines joined with "\n".
 */
function joinContent(elements) {
	const lines = [];

	for (const element of Array.from(elements)) {
		// libro.fm separates paragraphs with <br> rather than <p>, so
		// innerText is needed to get those breaks as newlines.
		const text = element.innerText.trim();

		// One entry per line, so the result is not a single long line.
		for (const line of text.split('\n')) {
			// Drop empty lines (some <p> tags on the page are empty).
			if (line.length > 0) {
				lines.push(line);
			}
		}
	}

	return lines.join("\n");
}

/**
 * Returns the text held directly in an element's own text nodes, ignoring
 * any child elements.
 *
 * Each text node is trimmed, the pieces are joined with "\n", and the final
 * string is trimmed again. A null/undefined element yields "".
 *
 * @param {Node|null|undefined} element - Element to read direct text from.
 * @returns {string} The combined direct text.
 */
function extractTextNode(element) {
	const pieces = [];

	for (const child of Array.from(element?.childNodes || [])) {
		if (child.nodeType === Node.TEXT_NODE) {
			pieces.push(child.textContent.trim());
		}
	}

	return pieces.join("\n").trim();
}

/**
 * Finds the first `<section>` in the document whose `<h2>` text, trimmed
 * and lower-cased, equals `title`.
 *
 * @param {string} title - Heading to look for; compared as-is, so callers
 *   must pass it already lower-cased.
 * @returns {Element|undefined} The matching section, or undefined if none.
 */
function extractSection(title) {
	for (const section of document.querySelectorAll('section')) {
		const heading = section.querySelector('h2')?.textContent.trim().toLowerCase();
		if (heading === title) {
			return section;
		}
	}
	return undefined;
}

/**
 * Records the reading format and, when the page shows one, the listening
 * length.
 *
 * Always sets `bookDetails['Reading Format']` to 'Audiobook'. The length is
 * read from the information section whose `<strong>` label is "length" and
 * stored under `bookDetails['Listening Length']` as an array of parts,
 * split before each number (e.g. "11 hours 30 minutes" becomes
 * ["11 hours", "30 minutes"]).
 *
 * @param {Object} bookDetails - Result object; mutated in place.
 */
function getLibroFormatInfo(bookDetails) {
	bookDetails['Reading Format'] = 'Audiobook';

	const infoSections = document.querySelectorAll(".audiobook-information .audiobook-information__section");
	const lengthSection = Array.from(infoSections)
		.find((section) => section.querySelector("strong")?.textContent.trim().toLowerCase() === 'length');

	const audioLength = extractTextNode(lengthSection);
	// Without a length section the text is empty, and "".split(...) would
	// produce [""] - leave the key unset instead of storing a blank entry.
	if (!audioLength) {
		return;
	}

	// Split the length at each space that is followed by a number.
	bookDetails['Listening Length'] = audioLength.split(/ (?=\d+)/);
}

/**
 * Stores the book summary under `bookDetails["Description"]`.
 *
 * The summary is taken from the `<p>` elements of the "Summary" section or,
 * if that section is absent, of the `#panel_summary` element. The key is
 * left unset when neither container exists or when no text is found.
 *
 * @param {Object} bookDetails - Result object; mutated in place.
 */
function extractLibroDescription(bookDetails) {
	// When the page has a tab with more information about the authors, the
	// summary sits in a different element (#panel_summary).
	const container = extractSection('summary') || document.querySelector('#panel_summary');
	if (!container) {
		return;
	}

	const summary = joinContent(container.querySelectorAll('p'));
	// Skip an empty result so the output never carries a blank description.
	if (summary) {
		bookDetails["Description"] = summary;
	}
}


/**
 * Reads publisher, publication date, language, ISBN and edition from the
 * "Audiobook Details" section and stores them on `bookDetails`.
 *
 * Keys written (each only when the matching element is present):
 * 'Publisher', 'Publication date', 'Language', 'ISBN-13' or 'ISBN-10'
 * (chosen by the length of the ISBN text), and 'Edition Information'.
 * Does nothing when the section is missing.
 *
 * @param {Object} bookDetails - Result object; mutated in place.
 */
function extraLibroInfo(bookDetails) {
	const section = extractSection('audiobook details');
	// extractSection yields undefined when the heading is not on the page.
	if (!section) {
		return;
	}

	// Fields exposed through a schema.org itemprop attribute, copied verbatim.
	const simpleFields = [
		['publisher', 'Publisher'],
		['datePublished', 'Publication date'],
		['inLanguage', 'Language'],
	];
	for (const [itemprop, key] of simpleFields) {
		const element = section.querySelector(`span[itemprop="${itemprop}"]`);
		if (element) {
			bookDetails[key] = element.textContent.trim();
		}
	}

	const isbn = section.querySelector('span[itemprop="isbn"]');
	if (isbn) {
		const isbnText = isbn.textContent.trim();
		// Compare the text length, not a property of the element itself.
		if (isbnText.length === 13) {
			bookDetails['ISBN-13'] = isbnText;
		} else if (isbnText.length === 10) {
			bookDetails['ISBN-10'] = isbnText;
		}
	}

	// There is no itemprop attribute for the edition type, so look for the
	// cell whose <strong> header reads "Edition".
	const editionCell = Array.from(section.querySelectorAll('.cell'))
		.find((cell) => cell.querySelector('strong')?.textContent.trim().toLowerCase() === 'edition');
	const editionFormat = editionCell?.querySelector('span')?.textContent.trim();
	// Only record the edition when the cell actually holds a value.
	if (editionFormat) {
		bookDetails['Edition Information'] = editionFormat;
	}
}

export { librofmScraper };