Skip to content

Commit 1dcf3e1

Browse files
added first metascraper iteration
1 parent c561c48 commit 1dcf3e1

File tree

4 files changed

+206
-25
lines changed

4 files changed

+206
-25
lines changed

package.json

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@
5656
"compression": "~1.7.1",
5757
"cors": "~2.8.4",
5858
"crypto": "~1.0.1",
59-
"crypto-js": "~3.1.9-1",
59+
"crypto-js": "^3.1.9-1",
6060
"dauria": "~2.0.0",
6161
"email-templates": "2.6.0",
6262
"feathers": "~2.2.4",
@@ -81,12 +81,23 @@
8181
"feathers-socketio": "~2.0.1",
8282
"fs-blob-store": "~5.2.1",
8383
"fs-extra": "~4.0.2",
84-
"got": "~8.3.0",
84+
"got": "^8.3.0",
8585
"handlebars": "~4.0.11",
8686
"handlebars-layouts": "~3.1.4",
8787
"helmet": "~3.10.0",
8888
"html-excerpt": "~0.1.0",
89-
"metascraper": "~3.9.2",
89+
"metascraper": "^3.9.2",
90+
"metascraper-author": "^3.9.2",
91+
"metascraper-clearbit-logo": "^3.7.0",
92+
"metascraper-date": "^3.3.0",
93+
"metascraper-description": "^3.9.2",
94+
"metascraper-image": "^3.9.2",
95+
"metascraper-logo": "^3.9.2",
96+
"metascraper-logo-favicon": "^3.7.0",
97+
"metascraper-publisher": "^3.4.0",
98+
"metascraper-title": "^3.9.2",
99+
"metascraper-url": "^3.9.2",
100+
"metascraper-youtube": "^3.9.2",
90101
"mime": "^2.2.0",
91102
"mongoose": "~4.13.2",
92103
"multer": "~1.3.0",

server/models/contributions.model.js

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,17 @@ module.exports = function (app) {
1414
reason: { type: String }
1515
});
1616

17+
// Schema describing the `meta` field of a contribution (it is assigned to
// `meta` in the contributions schema below).
//  - hasVideo: boolean flag, defaults to false.
//  - embedds:  free-form object, defaults to {}.
// NOTE(review): presumably `embedds` maps a link URL to the metadata scraped
// for that link (that is what the link-metadata hook in this commit writes
// into `meta.embedds`) - confirm against the hook.
// NOTE(review): Schema is called without `new` here, while the contributions
// schema uses `new mongooseClient.Schema(...)` - confirm this is intentional.
const metaSchema = mongooseClient.Schema({
18+
hasVideo: {
19+
type: Boolean,
20+
default: false
21+
},
22+
embedds: {
23+
type: Object,
24+
default: {}
25+
}
26+
});
27+
1728
const contributions = new mongooseClient.Schema({
1829
userId: { type: String, required: true },
1930
categoryIds: { type: Array },
@@ -28,12 +39,7 @@ module.exports = function (app) {
2839
teaserImg: { type: String },
2940
language: { type: String, required: true },
3041
shoutCount: { type: Number, default: 0 },
31-
meta: {
32-
type: Object,
33-
default: {
34-
hasVideo: false
35-
}
36-
},
42+
meta: metaSchema,
3743
visibility: {
3844
type: String,
3945
enum: ['public', 'friends', 'private'],
Lines changed: 62 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,32 @@
11
// get link metadata
22
// TODO: add more services and use the metascraper to fill some metadata on the article
33

4-
// const metascraper = require('metascraper');
5-
// const got = require('got');
4+
const metascraper = require('metascraper').load([
5+
require('metascraper-author')(),
6+
require('metascraper-date')(),
7+
require('metascraper-description')(),
8+
require('metascraper-image')(),
9+
require('metascraper-logo')(),
10+
require('metascraper-clearbit-logo')(),
11+
require('metascraper-logo-favicon')(),
12+
require('metascraper-publisher')(),
13+
require('metascraper-title')(),
14+
require('metascraper-url')(),
15+
require('metascraper-youtube')(),
16+
]);
17+
const got = require('got');
18+
const _ = require('lodash');
19+
20+
// Fetch a page and extract its metadata with the metascraper instance
// configured above.
//
// targetUrl - address of the page to download (passed straight to `got`).
// app       - object providing a `debug(message)` logger; only `app.debug`
//             is used here.
//
// Resolves with whatever `metascraper({ html, url })` resolves with.
// There is no try/catch in this function, so a rejection from `got` or from
// `metascraper` propagates to the caller.
// NOTE(review): no timeout or URL restriction is passed to `got`; if
// targetUrl can come from user-supplied content, confirm that is acceptable.
const getMetadata = async (targetUrl, app) => {
21+
// `url` is taken from got's response object rather than from targetUrl
// (presumably the final URL after redirects - verify against got docs).
const { body: html, url } = await got(targetUrl);
22+
app.debug(`getMetadata - getting metadata for ${url}`);
23+
// app.debug(html);
24+
// app.debug(html);
25+
const metadata = await metascraper({ html, url });
26+
// app.debug(metadata);
27+
app.debug(`getMetadata - got metadata for ${url}`);
28+
return metadata;
29+
};
630

731
module.exports = function () {
832
return function (hook) {
@@ -17,28 +41,55 @@ module.exports = function () {
1741
return resolve(hook);
1842
}
1943

44+
let promises = [];
45+
let embedds = {};
46+
2047
try {
2148
// find links
2249
const youtubeRegex = new RegExp(/(?:(?:https?:)?\/\/)?(?:www\.)?youtu(?:be\.com\/(?:watch\?(?:.*?&(?:amp;)?)*v=|v\/|embed\/)|\.be\/)([\w\-]+)(?:(?:&(?:amp;)?|\?)[\w\?=]*)*/, 'ig'); // eslint-disable-line
2350
const youtubeLinks = youtubeRegex.exec(hook.data.content);
2451

25-
// html link
26-
// const htmlLink = new RegExp(/<a\s[^>]*href=\"([^\"]*)\"[^>]*>(.*)<\/a>/, 'ig'); // eslint-disable-line
52+
// html links
53+
const linkRegex = new RegExp(/<a\s[^>]*href=\"([^\"]*)\"[^>]*>([^<]*)<\/a>/, 'ig') // eslint-disable-line
54+
let match;
55+
while (match = linkRegex.exec(hook.data.content)) {
56+
const url = match[1];
57+
hook.app.debug(url);
2758

28-
// here you could scrape the url for metadata
29-
// hook.app.debug('#6');
30-
// const { body: html, url } = await got(youtubeLinks[0]);
31-
// const metadata = await metascraper({html, url});
32-
// hook.app.debug('metadata');
33-
// hook.app.debug(metadata);
59+
// skip if url already exists
60+
if (!_.isEmpty(embedds[url])) {
61+
continue;
62+
}
63+
// here you could scrape the url for metadata
64+
// hook.app.debug(match);
65+
promises.push(new Promise(async (resolve) => {
66+
try {
67+
const metadata = await getMetadata(url, hook.app);
68+
embedds[url] = metadata;
69+
return resolve(metadata);
70+
} catch (err) {
71+
hook.app.error('FAILED TO GRAB THE LINK');
72+
return resolve();
73+
}
74+
}));
75+
}
3476

3577
if (youtubeLinks.length >= 2) {
3678
hook.data.teaserImg = `https://img.youtube.com/vi/${youtubeLinks[1]}/hqdefault.jpg`;
3779
hook.data.meta = Object.assign(hook.data.meta || {}, { hasVideo: true });
3880
}
3981
} catch (err) {} // eslint-disable-line
4082

41-
return resolve(hook);
83+
return Promise.all(promises)
84+
.then(() => {
85+
hook.app.debug('embedds:');
86+
hook.app.debug(embedds);
87+
88+
hook.data.meta.embedds = embedds;
89+
90+
hook.app.debug('FINISHED!');
91+
resolve(hook);
92+
});
4293
});
4394
};
4495
};

0 commit comments

Comments
 (0)