Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/background.js
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ runtime.onMessage.addListener((request, sender, sendResponse) => {
return true;
}

if (request.action === 'fetchDepositData') {
if (request.action === 'fetchUrl') {
handleFetchRequest(request.url).then(sendResponse);
return true;
}
Expand Down
2 changes: 1 addition & 1 deletion src/content.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => {
sendResponse(details);
} catch (e) {
logMarian("Error getting info", e);
sendResponse(null);
sendResponse({ __marian_error: e.message || String(e) });
}
};

Expand Down
20 changes: 20 additions & 0 deletions src/extractors/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ Takes in a list of objects, promises of objects or null/undefined.
It awaits all promises at the same time and then merges the results into a single object.
If a key is present in more than one object, its value is overwritten following the order of the list.

It will merge `Collections` and `Mappings`

### getCoverData

Takes in a URL, or a list of URLs and returns an
Expand Down Expand Up @@ -267,6 +269,24 @@ const title = queryDeep('h1', ['product-header']);
Gets a DOM from an HTML request.
If this is used in a scraper script, then it will only work for domains that the current page has CORS access to.

### fetchBackground

Performs an HTTP request via the background script to bypass CORS and Content-Security-Policy (CSP) restrictions.

Takes in a `url` string to fetch.
Returns a promise that resolves with the response text on success, or rejects with an error message on failure.

Example:
```javascript
try {
const htmlString = await fetchBackground('https://example.com/data');
const tempDiv = document.createElement('div');
tempDiv.innerHTML = htmlString;
} catch (err) {
console.error("Failed to fetch data:", err);
}
```

### runtime

Exports the `browser.runtime` (Firefox) or `chrome.runtime` (Chrome) API,
Expand Down
88 changes: 70 additions & 18 deletions src/extractors/amazon.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import { addContributor, cleanText, collectObject, fetchBackground, getCoverData, getFormattedText, logMarian, normalizeReadingFormat } from '../shared/utils.js';
import { Extractor } from "./AbstractExtractor.js"
import { logMarian, getFormattedText, getCoverData, addContributor, cleanText, normalizeReadingFormat, collectObject } from '../shared/utils.js';
import { getRegion, fetchAudnexusApiDetails, fetchAudibleApiDetails } from './audible.js';

const bookSeriesRegex = /^Book (\d+) of \d+$/i;

const includedLabels = new Set([
Expand Down Expand Up @@ -34,7 +36,7 @@ class amazonScraper extends Extractor {
const contributors = extractAmazonContributors();

bookDetails["Edition Format"] = getSelectedFormat() || '';
bookDetails["Title"] = document.querySelector('#productTitle')?.innerText.trim();
bookDetails["Title"] = cleanText(document.querySelector('#productTitle')?.innerText);
bookDetails["Description"] = getBookDescription() || '';
bookDetails["Contributors"] = contributors;

Expand All @@ -48,11 +50,11 @@ class amazonScraper extends Extractor {

// combined publisher date
const pubDate = bookDetails["Publisher"]?.match(/^(?<pub>[^(;]+?)(?:; (?<edition>[\w ]+))? \((?<date>\d{1,2} \w+ \d{4})\)$/);
if (pubDate != undefined) {
bookDetails["Publisher"] = pubDate.groups["pub"].trim();
if (pubDate != null) {
bookDetails["Publisher"] = cleanText(pubDate.groups["pub"]);
bookDetails["Publication date"] = pubDate.groups["date"];
if (pubDate.groups["edition"]) {
bookDetails["Edition Information"] = pubDate.groups["edition"].trim();
bookDetails["Edition Information"] = cleanText(pubDate.groups["edition"]);
}
}

Expand All @@ -68,7 +70,7 @@ class amazonScraper extends Extractor {
// If the isbn10 is the isbn13 and is in the ASIN
const isbn10 = bookDetails["ISBN-10"]?.replace("-", "");
const isbn13 = bookDetails["ISBN-13"]?.replace("-", "");
const asin = bookDetails["ASIN"];
const asin = bookDetails["ASIN"] ?? audibleDetails["ASIN"];
if (
isbn10 != null &&
isbn13 != null &&
Expand All @@ -81,21 +83,48 @@ class amazonScraper extends Extractor {
bookDetails["ISBN-10"] = asin;
}

const audibleAsin = getAudibleAsin();
let apiPromise = {};
if (audibleAsin &&
(bookDetails["Reading Format"] === "Audiobook" || audibleDetails["Reading Format"] === "Audiobook")
) {
delete bookDetails["ASIN"];
bookDetails["Amazon ASIN"] = asin;
audibleDetails["ASIN"] = audibleAsin;
apiPromise = fetchApiDetails(audibleAsin, audibleDetails);
}

const mergedDetails = await collectObject([
bookDetails,
audibleDetails,
apiPromise,
coverData,
]);

delete mergedDetails.Edition;
delete mergedDetails.Version;
delete mergedDetails._detectedRegion;

// logMarian("details", mergedDetails);

return mergedDetails;
}
}

/**
 * Looks up extra audiobook metadata for an Audible ASIN.
 *
 * Resolves to an empty object when no ASIN is given or when
 * `audibleDetails['Reading Format']` is not exactly `'Audiobook'`.
 * Otherwise the results of `fetchAudibleApiDetails` and
 * `fetchAudnexusApiDetails` are handed to `collectObject`, and whatever
 * that resolves to is returned.
 *
 * @param {string} asin - Audible ASIN to look up.
 * @param {Object} audibleDetails - Scraped details; `Reading Format` and
 *   `_detectedRegion` are read from it.
 * @returns {Promise<Object>} The collected API details, or `{}`.
 */
async function fetchApiDetails(asin, audibleDetails) {
  if (!asin) {
    return {};
  }
  if (audibleDetails['Reading Format'] !== 'Audiobook') {
    return {};
  }

  // Prefer the suffix stored under `_detectedRegion`; otherwise use whatever
  // follows "amazon" in the current host name.
  const detectedTld = audibleDetails['_detectedRegion'];
  const tld = detectedTld || document.location.host.split("amazon").pop();
  const region = getRegion(tld);

  const lookups = [
    fetchAudibleApiDetails(asin, tld),
    fetchAudnexusApiDetails(asin, region),
  ];
  const merged = await collectObject(lookups);
  return merged;
}

async function getCover() {
const imgEl = document.querySelector("#landingImage, #imgTagWrapperId img"); // same element
const imgEl2 = document.querySelector("#imgBlkFront");
Expand Down Expand Up @@ -126,11 +155,13 @@ async function getCover() {
});

// get original image
covers.forEach((value) => value && covers.add(getHighResImageUrl(value)));
[...covers]
.filter(i => i)
.forEach((url) => { covers.add(getHighResImageUrl(url)); });

const coverList = Array.from(covers)
.filter((x) => !x.includes("01RmK+J4pJL.gif")); // filter out no image image
console.log(coverList)
// console.log(coverList)

const coverRes = await getCoverData(coverList);
if (coverRes.imgScore === 0) return {}
Expand Down Expand Up @@ -197,7 +228,7 @@ function getDetailBullets() {
// Double check book series
const series = document.querySelector("div[data-feature-name='seriesBulletWidget'] a")
if (!details["Series"] && series != undefined) {
const match = series.textContent.trim().match(/Book (\d+) of \d+: (.+)/i);
const match = cleanText(series.textContent).match(/Book (\d+) of \d+: (.+)/i);
if (match) {
details['Series'] = match[2];
details['Series Place'] = match[1];
Expand All @@ -215,8 +246,8 @@ function getAudibleDetails() {
const rows = table.querySelectorAll('tr');

rows.forEach(row => {
const label = row.querySelector('th span')?.textContent?.trim();
const value = row.querySelector('td')?.innerText?.trim();
const label = cleanText(row.querySelector('th span')?.textContent);
const value = cleanText(row.querySelector('td')?.innerText);
const match = bookSeriesRegex.exec(label) || bookSeriesRegex.exec(value);

// Handle book series special case
Expand All @@ -235,10 +266,10 @@ function getAudibleDetails() {
}

// Match any Audible.<TLD> Release Date
if (/^Audible\.[^\s]+ Release Date$/i.test(label)) {
details['Publication date'] = value;
} else if (label === 'Audible.com Release Date') {
const regionMatch = label?.match(/^Audible(\.[a-z.]+) Release Date$/i);
if (regionMatch) {
details['Publication date'] = value;
details['_detectedRegion'] = regionMatch[1].toLowerCase();
} else if (label === 'Program Type') {
details['Reading Format'] = value;
details['Edition Format'] = "Audible";
Expand Down Expand Up @@ -267,10 +298,31 @@ function getBookDescription() {
return getFormattedText(container);
}

/**
 * Finds the Audible ASIN on the current page.
 *
 * Sources are tried in order and the first non-empty value wins:
 *   1. the hidden `audibleASIN` form input,
 *   2. the `asin` query parameter of the sample player's `cloudPlayerUrl`,
 *   3. the `data-asin` / `data-defaultasin` attribute of the first format
 *      swatch whose text mentions "audiobook" or "audible".
 *
 * @returns {string|null} The ASIN, or `null` when none of the sources has one.
 */
function getAudibleAsin() {
  // 1. Check hidden input
  const hiddenInput = document.querySelector('input[name="audibleASIN"]');
  if (hiddenInput?.value) return hiddenInput.value;

  // 2. Check Sample Player JSON
  const samplePlayer = document.querySelector('[data-play-audiosample-cloud-player]');
  if (samplePlayer) {
    try {
      const config = JSON.parse(samplePlayer.dataset.playAudiosampleCloudPlayer);
      // Parse as a real URL rather than splitting on '?': a trailing
      // "#fragment" or a second '?' inside the query would otherwise corrupt
      // or hide the asin parameter. The base lets relative URLs resolve.
      const { searchParams } = new URL(config.cloudPlayerUrl, document.baseURI);
      const asin = searchParams.get('asin');
      if (asin) return asin;
    } catch {
      // Best effort: malformed JSON or an unparsable URL just means this
      // source has no ASIN, so fall through to the swatches.
    }
  }

  // 3. Check Swatches
  const audioSwatch = Array.from(document.querySelectorAll('#tmmSwatches .swatchElement'))
    .find((el) => {
      const label = el.textContent.toLowerCase();
      return label.includes('audiobook') || label.includes('audible');
    });

  return audioSwatch?.dataset.asin || audioSwatch?.dataset.defaultasin || null;
}

/**
 * Reads the label of the currently selected format swatch.
 *
 * The swatch's `aria-label` has the literal text " Format:" removed and is
 * then normalised with `cleanText`, matching how other scraped strings in
 * this file are cleaned.
 *
 * @returns {string|null} The cleaned label (whatever `cleanText` returns), or
 *   `null` when no selected swatch is found.
 */
function getSelectedFormat() {
  const selected = document.querySelector('#tmmSwatches .swatchElement.selected .slot-title span[aria-label]');
  if (!selected) {
    return null;
  }
  // Single return: a second, unreachable return used to follow the first.
  return cleanText(selected.getAttribute('aria-label')?.replace(' Format:', ''));
}
Expand All @@ -280,16 +332,16 @@ function extractAmazonContributors() {

const authorSpans = document.querySelectorAll('#bylineInfo .author');
authorSpans.forEach(span => {
const name = span.querySelector('a')?.innerText.trim();
const roleText = span.querySelector('.contribution span')?.innerText.trim();
const name = cleanText(span.querySelector('a')?.innerText);
const roleText = cleanText(span.querySelector('.contribution span')?.innerText);
let roles = [];

if (roleText) {
// e.g., "(Author)", "(Illustrator)", "(Author, Narrator)"
const roleMatch = roleText.match(/\(([^)]+)\)/);
if (roleMatch) {
// Split by comma and trim each role
roles = roleMatch[1].split(',').map(r => r.trim());
roles = roleMatch[1].split(',').map(cleanText);
}
} else {
roles.push("Contributor"); // fallback if role is missing
Expand Down
Loading