-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathrun.js
More file actions
219 lines (195 loc) · 7.79 KB
/
run.js
File metadata and controls
219 lines (195 loc) · 7.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
"use strict";
var FeedParser = require('feedparser');
var request = require('request')
var async = require('async');
var req = require('request');
var mongoose = require('mongoose');
var blogger = require('./models/blogger').Blogger;
var blog = require('./models/blog').Blog;
var URI = require('URIjs');
var cheerio = require('cheerio');
var colors = require('colors/safe');
// Open the MongoDB connection. The connection string comes from the
// CUSTOMCONNSTR_MONGODB_URI environment variable, falling back to a local instance.
mongoose.connect(process.env.CUSTOMCONNSTR_MONGODB_URI || 'mongodb://localhost');
var database = mongoose.connection;
database.on('error', console.error.bind(console, 'MongoDB Connection Error:'));
database.once('open', function() {
    console.log('[INFO] Database connection established successfully.');
    // Load every blogger, then download and process their feeds.
    blogger.find({}, function(error, allBloggers) {
        if (error) {
            console.log("[FATAL] %j", error);
            // Nothing else will run; exit instead of idling on the open connection.
            process.exit(1);
            return;
        }
        // A query with no matches yields an empty array, which is truthy, so the
        // length has to be checked explicitly for the warning to ever be reported.
        if (!allBloggers || allBloggers.length === 0) {
            console.log("[WARN] No bloggers in DB");
            process.exit();
            return;
        }
        downloadAllFeeds(allBloggers);
    });
});
// Number of blog posts written to the database during this run.
var changesOccured = 0;

/**
 * Download and process the RSS/ATOM feed of every blogger, print a summary
 * and terminate the process.
 *
 * The feeds are handed to async.each, which calls the summary callback once
 * every downloadFeed call has reported completion (or one reported an error).
 *
 * @param {Array} allBloggers Blogger records to process.
 */
function downloadAllFeeds(allBloggers) {
    async.each(allBloggers, function(blogger, done) {
        downloadFeed(blogger, done);
    }, function(err) {
        if (err) {
            console.error(colors.red('[FATAL] %j'), err);
            // Exit with a failure status rather than leaving the process hanging.
            process.exit(1);
            return;
        }
        if (changesOccured > 0) {
            console.log('[END] %d changes were made to the database...', changesOccured);
        } else {
            console.log('[END] No changes were made to the database');
        }
        process.exit();
    });
}
/**
 * Fetch one blogger's feed, parse it and hand every item to
 * insertBlogPostToDBIfNew.
 *
 * Errors are logged and never passed on, so one broken feed does not abort the
 * other downloads. `callback` is invoked exactly once, with no arguments.
 *
 * @param {Object} blogger Blogger record; feedUrl, firstName and lastName are read.
 * @param {Function} callback Called once the feed has been processed or has failed.
 */
function downloadFeed(blogger, callback) {
    var items = [];
    var finished = false;

    // Store whatever has been parsed so far and report completion, at most once.
    function finish() {
        if (finished) {
            return;
        }
        finished = true;
        async.each(items, function(item, done) {
            insertBlogPostToDBIfNew(blogger, item, done);
        }, function() {
            callback();
        });
    }

    var feedRequest = request(blogger.feedUrl, {
        timeout: 10000,
        pool: false
    });
    feedRequest.setMaxListeners(50);
    // Some feeds do not respond without user-agent and accept headers.
    feedRequest.setHeader('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36');
    feedRequest.setHeader('accept', 'text/html,application/xhtml+xml');

    var parser = new FeedParser();

    feedRequest.on('error', function(error) {
        console.error(colors.red("[ERROR] Connection Error on %s %s's feed: (%s)"), blogger.firstName, blogger.lastName, error);
        // A failed request never ends the piped parser, so its 'end' event will
        // not arrive; complete here or the whole run waits on this feed forever.
        finish();
    });

    parser.on('error', function(error) {
        console.error(colors.red("[ERROR] Feed Parser Error on %s %s's feed: (%s)"), blogger.firstName, blogger.lastName, error);
    });
    // TODO: meta["rss:lastbuilddate"]["#"] could be used to skip feeds that have
    // not changed; that needs somewhere to persist the last build date per feed.
    parser.on('readable', function() {
        var item = parser.read();
        while (item) {
            items.push(item);
            item = parser.read();
        }
    });
    parser.on('end', finish);

    feedRequest.pipe(parser);
}
/**
 * Store a feed item unless an up-to-date copy is already in the database.
 *
 * The stored copy is looked up by author (userProvider + userId) and publication
 * date. A missing copy is inserted; an existing copy is replaced when the feed
 * reports a different update date; otherwise nothing is written.
 *
 * @param {Object} blogger Blogger record; userProvider and userId are read.
 * @param {Object} blogPost Feed item; title, pubdate, date, summary, link, image
 *     and description are read.
 * @param {Function} done Called with no arguments once the item has been handled,
 *     including after a logged failure, so one bad item cannot stall the run.
 */
function insertBlogPostToDBIfNew(blogger, blogPost, done) {
    // NOTE(review): items without a pubdate are stored with a fallback date (see
    // setDate), so this lookup presumably never finds them again on later runs.
    // Confirm, and consider matching on `link` for such items.
    blog.findOne({
        userProvider: blogger.userProvider,
        userId: blogger.userId,
        pubDate: blogPost.pubdate
    }, function(error, blogPostFromDB) {
        if (error) {
            console.error("[ERROR] Looking up blog \n %j \n\n got error: \n %j \n\n\n\n", blogPost, error);
            done();
            return;
        }
        if (!blogPostFromDB) {
            // No blog, add as new
            console.log('[INFO] Adding \'%s\' as new blog post', blogPost.title);
            insertNewBlog(blogPost, blogger, done);
            return;
        }
        if (hasBeenUpdated(blogPost, blogPostFromDB)) {
            console.log('[INFO] Updating \'%s\'', blogPost.title);
            updateBlog(blogPost, blogPostFromDB, blogger, done);
            return;
        }
        console.log('[INFO] \'%s\' is already in DB and will not be updated.', blogPost.title);
        done();
    });
}

// Decide whether the feed item carries a different update date than the stored copy.
function hasBeenUpdated(blogPost, blogPostFromDB) {
    if (!isValidDate(blogPost.date)) {
        // The feed gives no usable update date, so a change cannot be detected.
        return false;
    }
    if (!isValidDate(blogPostFromDB.updateDate)) {
        return true;
    }
    return blogPost.date.getTime() !== blogPostFromDB.updateDate.getTime();
}

// Build a blog document from the feed item, save it and call done() afterwards.
function insertNewBlog(blogPost, blogger, done) {
    grabImage(blogPost, function(image) {
        var newBlog = new blog({
            // Author Details
            userProvider: blogger.userProvider,
            userId: blogger.userId,
            // Information about blog
            title: blogPost.title,
            imageUrl: image,
            summary: blogPost.summary,
            pubDate: setDate(blogPost.pubdate),
            updateDate: setDate(blogPost.date),
            link: blogPost.link
        });
        // Wait for the write to finish: the caller exits the process once every
        // item has reported done, which would otherwise cut off pending saves.
        newBlog.save(function(error) {
            if (error) {
                console.error("[ERROR] Saving blog '%s' got error: %j", blogPost.title, error);
            } else {
                changesOccured++;
            }
            done();
        });
    });
}

// Replace the stored copy of a blog post with a freshly built one.
function updateBlog(blogPost, blogPostFromDB, blogger, done) {
    // Re-inserting rather than patching fields because the image has to be
    // grabbed again anyway; insertNewBlog already does all of that.
    blog.find(blogPostFromDB).remove(function(error) {
        if (error) {
            // Keep the old copy instead of inserting a duplicate next to it.
            console.error("[ERROR] Removing outdated blog '%s' got error: %j", blogPost.title, error);
            done();
            return;
        }
        insertNewBlog(blogPost, blogger, done);
    });
}

/**
 * Pick an image URL for a feed item and pass it to done().
 *
 * @param {Object} blogPost Feed item; image, description and link are read.
 * @param {Function} done Called with the image URL string, or null when the
 *     item has no usable image.
 */
function grabImage(blogPost, done) {
    if (blogPost.image && blogPost.image.url) {
        // If the RSS/ATOM feed is nice enough to tell us an image to use, use it.
        // Dropping the query string removes the thumbnail width that Wordpress
        // attaches (?w=150 for example).
        done(blogPost.image.url.split("?")[0]);
        return;
    }
    done(findFirstImageInDescription(blogPost));
}

// Return the absolute URL of the first <img> in the item's description, or null.
function findFirstImageInDescription(blogPost) {
    var $ = cheerio.load(blogPost.description || '');
    var firstImage = $('img').get(0);
    var firstImageSrc = firstImage && firstImage.attribs ? firstImage.attribs.src : null;
    if (!firstImageSrc) {
        return null;
    }
    var image;
    try {
        image = new URI(firstImageSrc);
        if (image.is("relative")) {
            // Resolve against the post's own URL. absoluteTo keeps the full host
            // (domain() would drop subdomains) and also handles protocol-relative
            // and path-relative sources.
            image = image.absoluteTo(blogPost.link || '');
        }
    } catch (error) {
        console.error("[ERROR] Could not parse image URL '%s' in '%s': %s", firstImageSrc, blogPost.title, error);
        return null;
    }
    if (!image.is("absolute")) {
        // Without a usable post link a relative image cannot be resolved.
        return null;
    }
    // Remove the ?w=546&h=123 style width and height tags provided by wordpress blogs
    image.removeSearch(['w', 'h']);
    var imageUrl = image.toString();
    // Skip the "Add a comment" images added to feeds by wordpress.com blogs
    if (imageUrl.indexOf("feeds.wordpress.com/1.0/comments") > -1) {
        return null;
    }
    return imageUrl;
}

/**
 * Convert a feed date into a Date, falling back to the current time.
 *
 * The fallback is "now" because it seems likely the blog was posted on the day
 * the aggregator puts it in the database.
 *
 * @param {*} dateString Date, date string or timestamp; may be null or undefined.
 * @returns {Date} The parsed date, or the current time when it is missing or invalid.
 */
function setDate(dateString) {
    // new Date(null) is the (valid) Unix epoch, so missing values need an explicit check.
    if (dateString == null) {
        return new Date();
    }
    var date = new Date(dateString);
    if (isValidDate(date)) {
        return date;
    }
    return new Date();
}

// True only for Date objects that hold a real point in time.
function isValidDate(date) {
    if (Object.prototype.toString.call(date) !== "[object Date]") {
        return false;
    }
    return !isNaN(date.getTime());
}