-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathindex.js
More file actions
138 lines (115 loc) · 4.92 KB
/
index.js
File metadata and controls
138 lines (115 loc) · 4.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
// Export the entry point up front; `config` is a function declaration below,
// so it is hoisted and already defined at this point.
module.exports = config;
const fs = require('fs');
const path = require('path');
// Disjoint-set structure used by simplify() to merge token groups that share a token.
const UnionFind = require('union-find');
/**
 * config() Return a language's tokens or, if no language is specified, all tokens available
 *
 * Token definitions are looked up in the `tokens/` directory next to this file.
 * A language may be provided either as `<lang>.json` (data) or as `<lang>.js`
 * (a module exporting a function that returns the data); the JSON file wins
 * when both exist for a single-language lookup.
 *
 * @param {String} lang [optional] ISO 639-1 Code - If not specified return object of all codes
 * @param {Boolean} singletons [optional] whether to include single-entry abbreviation list items. These are not used for substitution but can be useful for string comparison. Defaults to false.
 * @param {Boolean} advanced [optional] whether to opt into a more complex representation of the tokens than a list of lists, which can represent per-replacement configuration, etc.
 *
 * @return {Array|Object} Return an array for a single lang tokens (empty if the language has no token file) or an object map of all tokens by ISO code
 * @throws {Error} if lang is truthy but is not a 2-character string
 */
function config(lang, singletons, advanced) {
    const keepSingletons = Boolean(singletons);
    const useAdvanced = Boolean(advanced);

    if (lang && (typeof lang !== 'string' || lang.length !== 2)) {
        throw new Error('optional lang param must be string containing 2 letter ISO 639-1 Code');
    }

    const tokensDir = path.resolve(__dirname, './tokens/');

    if (lang) {
        // existsSync rather than statSync: statSync throws on a missing path
        // instead of returning a falsy value, which would make the fallbacks
        // below unreachable.
        if (fs.existsSync(path.join(tokensDir, `${lang}.json`))) {
            const tokenjson = require(`./tokens/${lang}.json`);
            return prepare(tokenjson, keepSingletons, useAdvanced);
        }
        if (fs.existsSync(path.join(tokensDir, `${lang}.js`))) {
            // .js token files export a function that produces the token data
            const tokenjs = require(`./tokens/${lang}.js`);
            return prepare(tokenjs(), keepSingletons, useAdvanced);
        }
        // unknown language: nothing to substitute
        return [];
    }

    const tokens = {};
    for (const file of fs.readdirSync(tokensDir)) {
        // strip only the trailing extension so the key is the bare language code
        if (file.endsWith('.json')) {
            const json = require(`./tokens/${file}`);
            tokens[file.slice(0, -'.json'.length)] = prepare(json, keepSingletons, useAdvanced);
        } else if (file.endsWith('.js')) {
            const js = require(`./tokens/${file}`);
            tokens[file.slice(0, -'.js'.length)] = prepare(js(), keepSingletons, useAdvanced);
        }
        // anything else in the directory is ignored
    }
    return tokens;
}
/**
 * Post-process raw token data according to the caller's options.
 *
 * @param {Array|Object} data raw token data as loaded from a token file
 * @param {Boolean} singletons when falsy, the data is passed through removeSingletons()
 * @param {Boolean} advanced when falsy, the data is passed through simplify()
 *
 * @return {Array|Object} the processed data (the input itself when both flags are truthy)
 */
function prepare(data, singletons, advanced) {
    // singleton removal always runs before simplification
    const filtered = singletons ? data : removeSingletons(data);
    return advanced ? filtered : simplify(filtered);
}
/**
 * Convert the advanced token representation into the old-school list of lists.
 *
 * The advanced representation differs from the list of lists in a couple of ways:
 *   * the same token may occur in more than one group
 *   * groups contain additional configuration options
 * To get the old-style form, just the tokens are extracted and groups that
 * share a token are merged (transitively) into one list.
 *
 * @param {Array} data token data; only processed when non-empty and the first
 *                     entry has a `tokens` property, otherwise returned as-is
 *
 * @return {Array} list of token lists. Each list is ordered by token length;
 *                 its first entry is always a plain string, later entries are
 *                 either plain strings or `{ text, ...props }` objects carrying
 *                 the skipBoundaries / skipDiacriticStripping / regex options
 *                 of a group the token belonged to
 */
function simplify(data) {
    // only bother if the data is shaped the way we expect:
    if (!data.length || !data[0].tokens) return data;

    const seen = new Set();
    const props = new Map();
    const positions = new Map();

    for (const group of data) {
        const groupProps = {};
        for (const toKeep of ['skipBoundaries', 'skipDiacriticStripping', 'regex']) {
            if (group[toKeep]) groupProps[toKeep] = group[toKeep];
        }
        const hasProps = Object.keys(groupProps).length > 0;

        for (const token of group.tokens) {
            seen.add(token);
            // a later group with options overrides the options recorded earlier
            if (hasProps) props.set(token, groupProps);
            // NOTE: a token that shows up again is re-stamped with the current
            // map size, so this is not strictly a first-appearance index
            positions.set(token, positions.size);
        }
    }

    const tokens = Array.from(seen).sort();
    const invTokens = new Map();
    tokens.forEach((v, i) => { invTokens.set(v, i); });

    // link every token of a group to the group's first token
    const uf = new UnionFind(tokens.length);
    for (const group of data) {
        const idx1 = invTokens.get(group.tokens[0]);
        for (const token of group.tokens.slice(1)) {
            uf.link(idx1, invTokens.get(token));
        }
    }

    // Resolve each token's representative with find(): `uf.roots` holds parent
    // pointers that are only compressed during find(), so reading it directly
    // can split sets that were merged through an intermediate root.
    const rootOf = tokens.map((_, i) => uf.find(i));

    const out = [];
    const groups = Array.from(new Set(rootOf)).sort((a, b) => a - b);
    const invGroups = new Map();
    groups.forEach((v, i) => { invGroups.set(v, i); });
    for (let g = 0; g < groups.length; g++) out[g] = [];
    for (let i = 0; i < tokens.length; i++) {
        out[invGroups.get(rootOf[i])].push(tokens[i]);
    }

    out.forEach((arr) => {
        arr.sort((a, b) => a.length - b.length);

        // the output format we expect for the ones with special characteristics
        // is for the first thing to be a plain string and subsequent ones to
        // have properties, so skip the first one
        for (let i = 1; i < arr.length; i++) {
            const tokenProps = props.get(arr[i]);
            if (tokenProps) {
                // clone so tokens sharing a group's options don't share one object
                arr[i] = Object.assign({}, tokenProps, { text: arr[i] });
            }
        }
    });

    // order the merged lists by the recorded position of their leading token
    out.sort((a, b) => (positions.get(a[0]) || 0) - (positions.get(b[0]) || 0));
    return out;
}
/**
 * Drop token entries that contain fewer than two alternatives.
 *
 * An entry is kept when it is itself an array with more than one element, or
 * when its `tokens` property is an array with more than one element.
 *
 * @param {Array|*} tokens token list; anything that is not an array is returned unchanged
 *
 * @return {Array|*} a new filtered array, or the input itself when it is not an array
 */
function removeSingletons(tokens) {
    const hasSeveral = (list) => list instanceof Array && list.length > 1;

    return tokens instanceof Array
        ? tokens.filter((entry) => hasSeveral(entry) || hasSeveral(entry.tokens))
        : tokens;
}