Skip to content
Open
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions packages/utils/src/internals/extract-microdata.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import type { Dictionary } from '@crawlee/types';
import type { CheerioAPI } from 'cheerio';
import { load } from 'cheerio';

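/**
* Extract schema.org microdata from an HTML document using Cheerio.
*
* @param $ A `CheerioAPI` instance OR a raw HTML string (a string is parsed with Cheerio's `load` first).
* @returns The top-level microdata items found in the document, as a list with one object per item
* (the declared return type is `Dictionary<any>`, but the value is an array).
*/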
export function extractMicrodata(raw: string): Dictionary<any>;
export function extractMicrodata($: CheerioAPI): Dictionary<any>;
export function extractMicrodata(_item: CheerioAPI | string): Dictionary<any> {
const $ = typeof _item === 'string' ? load(_item) : _item;

const extractValue = (elem: any) => {
return $(elem).attr('content') || $(elem).text()?.trim() || $(elem).attr('src') || $(elem).attr('href') || null;
};
Comment on lines +16 to +18
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not entirely correct as per the microdata specification. See point 5.2.4 Values - the type of the value extracted is based on the element type.


const addProperty = (obj: any, propName: string, value: any) => {
if (typeof value === 'string') value = value.trim();

if (Array.isArray(obj[propName])) {
obj[propName].push(value);
} else if (obj[propName] !== undefined) {
obj[propName] = [obj[propName], value];
} else {
obj[propName] = value;
}
};

const extractItem = (elem: any): any => {
const item: any = { _type: $(elem).attr('itemtype') };
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As per the spec, itemtype is a unordered set of unique space-separated tokens, we should split this by ASCII whitespace.

let count = 0;

$(elem)
.find('[itemprop]')
.filter(function () {
return $(this).parentsUntil(elem, '[itemscope]').length === 0;
})
.each(function () {
const propName = $(this).attr('itemprop');
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

itemprop is also an unordered set of unique space-separated tokens, this needs to be split too - see example (link):

Image


const value = $(this).is('[itemscope]') ? extractItem(this) : extractValue(this);

addProperty(item, propName as string, value);
count++;
});

if (count === 0) {
addProperty(item, '_value', extractValue(elem));
}

return item;
};

const extractAllItems = () => {
const items: any[] = [];

$('[itemscope]')
.filter(function () {
return $(this).parentsUntil('body', '[itemscope]').length === 0;
})
.each(function () {
items.push(extractItem(this));
});

return items;
};

return extractAllItems();
}
Loading