Skip to content

Commit 0888e9b

Browse files
committed
refactored using w3c nu checker
1 parent 288a5d9 commit 0888e9b

File tree

5 files changed

+176
-48
lines changed

5 files changed

+176
-48
lines changed

README.md

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,32 @@
11
# Node W3C Validator
22
Crawls a given site and checks for W3C validity.
33

4+
## Installation
5+
6+
```BASH
7+
$ npm install -g w3c-validator
8+
```
9+
410
## Usage
11+
```BASH
12+
$ w3c-validator [options] <url>
13+
```
514

15+
The crawler will fetch all pages with folder-like URLs as well as files with certain file extensions. You can include files with `-i ext` or ignore files with `-e ext`.
16+
17+
**Tip**: Omit the URL protocol, the crawler will detect the right one.
18+
19+
### Options
620
```BASH
7-
npm install -g
21+
$ w3c-validator --help
22+
23+
Usage: w3c-validator [options] <url>
824

9-
# w3c-validator <url>
10-
w3c-validator example.com
25+
Options:
26+
27+
-h, --help output usage information
28+
-V, --version output the version number
29+
-q, --query consider query string
1130
```
31+
32+
**Important**: Running the w3c-validator on sites that use the HTML `base` tag together with links *without* leading slashes will probably not work.

package.json

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,40 @@
11
{
22
"name": "@lgraubner/w3c-validator",
3-
"version": "1.0.2",
3+
"version": "2.0.0",
44
"description": "Crawls a given site and checks for W3C validity.",
55
"homepage": "https://github.com/lgraubner/node-w3c-validator",
66
"author": "Lars Graubner <[email protected]>",
7+
"keywords": [
8+
"w3c",
9+
"validator",
10+
"crawler",
11+
"check"
12+
],
713
"main": "w3c-validator.js",
814
"repository": {
915
"type": "git",
1016
"url": "https://github.com/lgraubner/node-w3c-validator.git"
1117
},
1218
"bugs": {
13-
"url": "https://github.com/lgraubner/node-w3c-validator/issues"
19+
"url": "https://github.com/lgraubner/node-w3c-validator/issues"
1420
},
1521
"dependencies": {
1622
"simplecrawler": "^0.5.2",
1723
"commander": "^2.8.1",
18-
"chalk": "^1.0.0",
19-
"request": "^2.55.0"
24+
"chalk": "^1.1.1",
25+
"lodash": "^3.10.1",
26+
"cli-spinner": "^0.2.1"
2027
},
2128
"preferGlobal": true,
2229
"bin": {
2330
"w3c-validator": "w3c-validator.js"
2431
},
25-
"license": "MIT"
32+
"license": "MIT",
33+
"scripts": {
34+
"test": "mocha test"
35+
},
36+
"devDependencies": {
37+
"chai": "^3.2.0",
38+
"mocha": "^2.2.5"
39+
}
2640
}

test/cli.js

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
// Integration tests for the w3c-validator CLI.
// Each suite shells out to the script via child_process.exec and inspects
// the captured error/stdout/stderr. The exec/capture harness was previously
// copy-pasted into all four suites; it is extracted into one helper here.
var should = require("chai").should();
var exec = require("child_process").exec;

// Runs "node w3c-validator.js <args>" and returns a holder object that is
// populated with { error, stdout, stderr } once the child process exits.
// `done` is the mocha completion callback of the enclosing before() hook.
function execValidator(args, done) {
	var result = {};
	exec("node w3c-validator.js " + args, function(error, stdout, stderr) {
		result.error = error;
		result.stdout = stdout;
		result.stderr = stderr;
		done();
	});
	return result;
}

describe("$ w3c-validator invalid", function() {
	var res;

	before(function(done) {
		res = execValidator("illegal", done);
	});

	it("should fail because of invalid url", function() {
		res.stderr.should.not.be.empty;
	});

	it("should exit with error code '1'", function() {
		res.error.code.should.equal(1);
	});
});

describe("$ w3c-validator abc.xyz", function() {
	// Crawling + validating hits the network; allow up to 10s.
	this.timeout(10000);

	var res;

	before(function(done) {
		res = execValidator("abc.xyz", done);
	});

	it("should not throw any errors", function() {
		res.stderr.should.be.empty;
		should.equal(res.error, null);
	});

	it("should return success message", function() {
		res.stdout.should.not.be.empty;
	});
});

describe("$ w3c-validator http://abc.xyz", function() {
	this.timeout(10000);

	var res;

	before(function(done) {
		res = execValidator("http://abc.xyz", done);
	});

	it("should remove protocol and not throw any errors", function() {
		res.stderr.should.be.empty;
		should.equal(res.error, null);
	});
});

describe("$ w3c-validator https://abc.xyz", function() {
	this.timeout(10000);

	var res;

	before(function(done) {
		res = execValidator("https://abc.xyz", done);
	});

	it("should remove protocol and not throw any errors", function() {
		res.stderr.should.be.empty;
		should.equal(res.error, null);
	});
});

vnu/vnu.jar

21.9 MB
Binary file not shown.

w3c-validator.js

Lines changed: 38 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
#!/usr/bin/env node

// CLI entry point: crawl a site and validate each fetched page with the
// Nu HTML checker (vnu.jar).

// Node core modules.
var exec = require("child_process").exec;

// Vendor modules.
var Crawler = require("simplecrawler");
var program = require("commander");
var chalk = require("chalk");
var _ = require("lodash");
var Spinner = require("cli-spinner").Spinner;

// Package metadata (provides the CLI version string).
var pkg = require("./package.json");

// Configure and parse the command line.
program.version(pkg.version);
program.usage("[options] <url>");
program.option("-q, --query", "consider query string");
program.parse(process.argv);
1315

1416
if (!program.args[0]) {
@@ -19,76 +21,72 @@ var chunk = [];
1921
var count = 0;
var valid = 0;
var invalid = 0;

// Strip the protocol if given; the crawler detects the right one itself.
var url = program.args[0].replace(/^(http:\/\/|https:\/\/)/, "");
var c = new Crawler(url);

c.initialPath = "/";
c.initialPort = 80;
c.initialProtocol = "http";
c.userAgent = "Node/W3C-Validator";

// Query strings are stripped unless the user opted in with -q/--query.
if (!program.query) {
	c.stripQuerystring = true;
}

// File extensions that never contain validatable HTML.
var exclude = ["swf", "pdf", "ps", "dwf", "kml", "kmz", "gpx", "hwp", "ppt", "pptx", "doc", "docx", "odp", "ods", "odt", "rtf", "wri", "svg", "tex", "txt", "text", "wml", "wap", "xml", "gif", "jpg", "jpeg", "png", "ico", "bmp", "ogg", "webp", "mp4", "webm", "mp3", "ttf", "woff", "json", "rss", "atom", "gz", "zip", "rar", "7z", "css", "js", "gzip", "exe"];

var exts = exclude.join("|");
// BUG FIX: "\." in a string literal collapses to "." (any character), which
// made the filter match e.g. "/news" against "ws". The backslash must be
// escaped so the RegExp sees a literal dot.
var regex = new RegExp("\\.(" + exts + ")", "i");

c.addFetchCondition(function(parsedURL) {
	return !parsedURL.path.match(regex);
});

var spinner = new Spinner("Fetching links... %s");

c.on("crawlstart", function() {
	spinner.start();
});

c.on("fetchcomplete", function(item) {
	chunk.push(item.url);
});

c.on("complete", function() {
	spinner.stop(true);
	count = chunk.length;

	if (!_.isEmpty(chunk)) {
		console.log(chalk.white("Validating..."));
		checkURL(chunk);
	} else {
		console.error(chalk.red.bold("Error: Site '" + program.args[0] + "' could not be found."));
		process.exit(1);
	}
});

// Pops one URL off `chunk`, validates it with vnu.jar and recurses until the
// list is empty. Counts valid/invalid pages and prints a summary at the end.
var checkURL = function(chunk) {
	var pageUrl = chunk.pop();

	// vnu.jar writes its JSON report to stderr when --format json is used.
	// Quote the URL: with -q it may contain '&', which the shell would
	// otherwise interpret.
	exec("java -jar ./vnu/vnu.jar --format json \"" + pageUrl + "\"", function(error, stdout, stderr) {
		var result;
		// BUG FIX: a missing java binary or a vnu.jar crash produces
		// non-JSON stderr; JSON.parse would throw an unhandled exception
		// inside this callback. Fail with a readable message instead.
		try {
			result = JSON.parse(stderr);
		} catch (e) {
			console.error(chalk.red.bold("Error: could not run vnu.jar (is java installed?)"));
			process.exit(1);
		}

		if (_.isEmpty(result.messages)) {
			valid++;
			console.log(chalk.bold.green("✓"), chalk.gray(pageUrl));
		} else {
			invalid++;
			console.log(chalk.red.bold("×", pageUrl));
		}

		if (!_.isEmpty(chunk)) {
			checkURL(chunk);
		} else {
			console.log(chalk.white("Checked %s sites. %s valid, %s invalid."), count, valid, invalid);
			process.exit();
		}
	});
};

c.start();

0 commit comments

Comments
 (0)