Skip to content

Commit 1ad9fd9

Browse files
committed
Refactored/Restructured the whole project
1 parent 23250de commit 1ad9fd9

File tree

14 files changed

+2608
-16
lines changed

14 files changed

+2608
-16
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
umd/
2+
lib/

bin/cli.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env node
22

3-
const languageEncoding = require("../index.js");
3+
const languageEncoding = require("../src/index.js");
44

55
const path = process.argv[2];
66

package-lock.json

Lines changed: 2383 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,20 @@
22
"name": "detect-file-encoding-and-language",
33
"version": "1.6.1",
44
"description": "An NPM package to detect the encoding and language of a file",
5-
"main": "index.js",
5+
"main": "src/server/index-node.js",
66
"scripts": {
7-
"regextest": "node regexTester.test.js",
8-
"test": "node index.test.js",
7+
"regextest": "node ./src/testing/regexTester.test.js",
8+
"test": "node ./src/testing/index.test.js",
99
"prepublishOnly": "npm test"
1010
},
11-
"browser": {
12-
"fs": false
13-
},
11+
"browser": "umd/browser-bundle.js",
1412
"bin": {
15-
"dfeal": "./bin/cli.js"
13+
"dfeal": "bin/cli.js"
1614
},
15+
"files": [
16+
"umd"
17+
],
18+
"unpkg": "umd/browser-bundle.js",
1719
"repository": {
1820
"type": "git",
1921
"url": "git+https://github.com/gignupg/Detect-File-Encoding-and-Language.git"
@@ -33,6 +35,5 @@
3335
"bugs": {
3436
"url": "https://github.com/gignupg/Detect-File-Encoding-and-Language/issues"
3537
},
36-
"homepage": "https://github.com/gignupg/Detect-File-Encoding-and-Language#readme",
37-
"dependencies": {}
38+
"homepage": "https://github.com/gignupg/Detect-File-Encoding-and-Language#readme"
3839
}

src/components/checkUTF.js

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
module.exports = (content) => {
2+
for (let b = 0; b < content.length; b++) {
3+
// If ? is encountered it's definitely not utf8!
4+
if (content[b] === "�") {
5+
return false;
6+
}
7+
}
8+
return true;
9+
}

src/components/processContent.js

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
const countAllMatches = require('./processing-content/countAllMatches.js');
2+
const calculateConfidenceScore = require('./processing-content/calculateConfidenceScore.js');
3+
4+
module.exports = (data) => {
5+
const fileInfo = {};
6+
const languageArr = require('../language-config/languageObject.js');
7+
8+
data.languageArr = countAllMatches(data, languageArr);
9+
10+
fileInfo.language = data.languageArr.reduce((acc, val) => acc.count > val.count ? acc : val).name;
11+
12+
// "pos" gives us the position in the language array that has the most matches
13+
data.pos = data.languageArr.findIndex(elem => elem.name === fileInfo.language);
14+
15+
// Determine the encoding
16+
fileInfo.encoding = data.utf8 ? "UTF-8" : data.languageArr[data.pos].encoding;
17+
18+
const calculations = calculateConfidenceScore(data, fileInfo);
19+
20+
if (data.testFilePath) {
21+
return calculations;
22+
}
23+
24+
fileInfo.confidence = calculations;
25+
26+
// Edge case, when no matches were found
27+
if (!data.languageArr[data.pos].count) {
28+
fileInfo.language = null;
29+
fileInfo.encoding = data.utf8 ? "UTF-8" : null;
30+
fileInfo.confidence = data.utf8 ? 1 : null;
31+
}
32+
33+
return fileInfo;
34+
};
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
module.exports = (data, fileInfo) => {
2+
const charRegex = new RegExp(/\d|\n|\s|\-|\.|\,|\:|\;|\?|\!|\<|\>|\[|\]|\{|\}|\&|\=|\|/, "g");
3+
const totalCharacters = data.content.replace(charRegex, "").length;
4+
const langArr = data.languageArr;
5+
const pos = data.pos;
6+
const testFilePath = data.testFilePath;
7+
8+
const secondLanguage = langArr.reduce((acc, val) => {
9+
if (acc.name === fileInfo.language) return val;
10+
if (val.name === fileInfo.language) return acc;
11+
12+
return acc.count >= val.count ? acc : val;
13+
});
14+
15+
const languageRatio = langArr[pos].count / (secondLanguage.count + langArr[pos].count);
16+
const characterWordRatio = langArr[pos].count / totalCharacters;
17+
18+
let lowerLimit = null;
19+
let upperLimit = null;
20+
const multiplier = 0.8;
21+
22+
if (data.utf8) {
23+
lowerLimit = langArr[pos].utfFrequency ? langArr[pos].utfFrequency.low * multiplier : null;
24+
upperLimit = langArr[pos].utfFrequency ? (langArr[pos].utfFrequency.low + langArr[pos].utfFrequency.high) / 2 : null;
25+
26+
} else {
27+
lowerLimit = langArr[pos].isoFrequency ? langArr[pos].isoFrequency.low * multiplier : null;
28+
upperLimit = langArr[pos].isoFrequency ? (langArr[pos].isoFrequency.low + langArr[pos].isoFrequency.high) / 2 : null;
29+
}
30+
31+
let confidenceScore;
32+
33+
if (!lowerLimit || !upperLimit) {
34+
confidenceScore = null;
35+
36+
} else if (characterWordRatio >= upperLimit) {
37+
confidenceScore = 1;
38+
39+
} else if (characterWordRatio > lowerLimit) {
40+
const range = upperLimit - lowerLimit;
41+
const surplus = characterWordRatio - lowerLimit;
42+
const confidenceRaisePercentage = surplus / range;
43+
const confidenceRaise = (1 - languageRatio) * confidenceRaisePercentage;
44+
confidenceScore = Number((languageRatio + confidenceRaise).toFixed(2));
45+
46+
} else {
47+
confidenceScore = Number((languageRatio * (characterWordRatio / lowerLimit)).toFixed(2));
48+
}
49+
50+
// If the test script is running
51+
if (testFilePath) {
52+
return {
53+
name: testFilePath.substr(testFilePath.lastIndexOf('/') + 1),
54+
path: testFilePath,
55+
language: fileInfo.language,
56+
utf8: data.utf8,
57+
confidence: confidenceScore,
58+
ratio: Number(languageRatio.toFixed(2)),
59+
count: langArr[pos].count,
60+
totalCharacters: totalCharacters,
61+
characterWordRatio: characterWordRatio.toFixed(6),
62+
secondLanguage: {
63+
name: secondLanguage.name,
64+
count: secondLanguage.count
65+
}
66+
};
67+
}
68+
69+
return confidenceScore;
70+
};
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
module.exports = (data, languageArr) => {
2+
const newLanguageArr = [];
3+
4+
// Cloning the language array and making sure that "count" has no reference to "languageArr"!
5+
languageArr.forEach((obj) => {
6+
const updatedLangObj = {};
7+
Object.keys(obj).forEach(key => {
8+
if (key !== "count") {
9+
updatedLangObj[key] = obj[key];
10+
} else {
11+
updatedLangObj.count = 0;
12+
}
13+
});
14+
newLanguageArr.push(updatedLangObj);
15+
});
16+
17+
const regex = data.utf8 ? "utfRegex" : "isoRegex";
18+
19+
// Populate the count property of our language array!
20+
newLanguageArr.forEach(lang => {
21+
if (lang[regex]) {
22+
const matches = data.content.match(lang[regex]);
23+
24+
if (matches) lang.count = matches.length;
25+
}
26+
});
27+
28+
return newLanguageArr;
29+
}

src/index-browser.js

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
const checkUTF = require('./components/checkUTF.js');
2+
const processContent = require('./components/processContent.js');
3+
4+
module.exports = (file, test) => {
5+
return new Promise((resolve, reject) => {
6+
const input = {};
7+
const utfReader = new FileReader();
8+
9+
utfReader.onerror = (err) => {
10+
reject(err);
11+
};
12+
13+
utfReader.onload = () => {
14+
const utfContent = utfReader.result;
15+
16+
input.utf8 = checkUTF(utfContent);
17+
18+
if (utf8) {
19+
input.content = utfContent;
20+
resolve(processContent(input));
21+
22+
} else {
23+
const isoReader = new FileReader();
24+
25+
isoReader.onload = () => {
26+
input.content = isoReader.result;
27+
resolve(processContent(input));
28+
};
29+
30+
isoReader.readAsText(file, "ISO-8859-1");
31+
}
32+
};
33+
utfReader.readAsText(file, "UTF-8");
34+
});
35+
};

src/index-node.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
const checkUTF = require('./components/checkUTF.js');
2+
const processContent = require('./components/processContent.js');
3+
4+
module.exports = (file, test) => {
5+
return new Promise((resolve) => {
6+
const data = {};
7+
const fs = require('fs');
8+
data.testFilePath = test ? file : null;
9+
10+
fs.readFile(file, "UTF-8", (err, utfContent) => {
11+
if (err) reject(err);
12+
13+
data.utf8 = checkUTF(utfContent);
14+
15+
if (data.utf8) {
16+
data.content = utfContent;
17+
resolve(processContent(data));
18+
19+
} else {
20+
fs.readFile(file, "latin1", (err, isoContent) => {
21+
if (err) reject(err);
22+
23+
data.content = isoContent;
24+
resolve(processContent(data));
25+
});
26+
}
27+
});
28+
});
29+
};

0 commit comments

Comments
 (0)