Skip to content

Commit 10cf549

Browse files
authored
Merge pull request #22 from alexankitty/roman-numeral
Search Keywords, Roman Numeral Processing, etc.
2 parents 1ddd349 + d1b6c87 commit 10cf549

File tree

17 files changed

+561
-223
lines changed

17 files changed

+561
-223
lines changed

docker-compose.dev.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ services:
99
deploy:
1010
resources:
1111
limits:
12-
memory: 1GB
12+
memory: 2GB
1313
ports:
1414
- "9200:9200"
1515
volumes:
@@ -28,4 +28,4 @@ services:
2828

2929
volumes:
3030
elasticsearch_data:
31-
postgres_data:
31+
postgres_data:

lib/dbkwworker.js

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import { ToWords } from "to-words";
2+
import { getSample } from "./services/elasticsearch.js";
3+
4+
// Number-to-words converter shared by getNumberNames; configured for the
// en-US locale.
const toWordsOptions = {
  localeCode: "en-US",
  converterOptions: {
    ignoreDecimal: false,
    doNotAddOnly: true,
  },
};
const toWords = new ToWords(toWordsOptions);
11+
12+
/**
 * Splits a string into its words.
 *
 * Hyphens, the punctuation listed in the separator class below, and any
 * whitespace (spaces, tabs, newlines, ...) are all treated as word
 * separators. Empty fragments are dropped, so consecutive separators never
 * produce empty entries.
 *
 * @param {string} string - Text to tokenise. `null`/`undefined` yield `[]`.
 * @returns {string[]} The words in their original order and casing.
 */
function stringToWordArray(string) {
  // One character class instead of a long alternation; `\s` also covers tabs
  // and newlines so words joined by them are not glued together.
  const separatorRegex = /[-_+=()[\]{};:"'<>.,\/?|\\!@#$%^&*\s]+/;
  return String(string ?? "")
    .split(separatorRegex)
    .filter((entry) => entry !== "");
}
20+
21+
/**
 * Expands search terms into their related keywords.
 *
 * Every term is lowercased, then each keyword group that contains the term
 * (exact match) contributes all of its members to the result.
 *
 * NOTE: `terms` is lowercased IN PLACE. optimizeKws passes the same array on
 * to getNumerals afterwards, so that side effect is kept deliberately.
 *
 * @param {string[]} terms - Words to look up; mutated to lowercase.
 * @param {string[][]} kwArr - Keyword groups; each group lists equivalent terms.
 * @returns {string[]} Unique keywords from every matching group, in the order
 *   they were first encountered. Empty when nothing matches.
 */
function kwProcessor(terms, kwArr) {
  for (let i = 0; i < terms.length; i++) {
    terms[i] = terms[i].toLowerCase();
  }

  // A Set de-duplicates while keeping first-insertion order.
  const foundKws = new Set();
  for (const term of terms) {
    for (const group of kwArr) {
      if (!group.includes(term)) continue;
      for (const keyword of group) {
        foundKws.add(keyword);
      }
    }
  }
  return [...foundKws];
}
40+
41+
/**
 * Extracts Roman-numeral values from a tokenised filename and decides whether
 * they should be indexed as keywords.
 *
 * - No word passes validateRomanNumeral: resolves to `undefined`.
 * - The value 2 was parsed: resolves to the unique parsed values right away
 *   (no lookup is performed).
 * - Otherwise the words before the last numeral-looking word are sent to
 *   getSample. When more than one sample comes back and at least one sample
 *   contains a numeral that differs from one of ours, the title is treated as
 *   part of a series and the values plus their number names are returned.
 * - In every other case the function resolves to `undefined`.
 *
 * @param {string[]} stringArr - Words of the filename.
 * @returns {Promise<Array<number|string>|undefined>} Unique numeral values
 *   (and, for a series, their number names), or `undefined`.
 */
async function getNumerals(stringArr) {
  const numerals = [];
  let lastNumeralIndex = -1;
  stringArr.forEach((word, index) => {
    if (!validateRomanNumeral(word)) return;
    lastNumeralIndex = index;
    const value = parseNumeral(word);
    // parseNumeral yields 0 for letter runs it cannot fully consume.
    if (value) numerals.push(value);
  });

  //Guard clause, exits when we didn't find a valid numeral
  if (lastNumeralIndex === -1) return;

  //always return ii if it's available (checked before the lookup, whose
  //result would be unused in this case)
  if (numerals.includes(2)) return [...new Set(numerals)];

  //Check if this is a series
  const searchQuery = stringArr.slice(0, lastNumeralIndex).join(" ").trim();
  const results = await getSample(searchQuery);
  if (!(results?.length > 1)) return;

  const isSeries = Array.from(results).some((hit) => {
    const sampleNumerals = stringToWordArray(hit.sample)
      .map((word) => parseNumeral(word))
      .filter(Boolean);
    return sampleNumerals.some((sampleValue) =>
      numerals.some((value) => value !== sampleValue)
    );
  });
  if (!isSeries) return;

  // Spread so the names are added as individual entries, not a nested array.
  numerals.push(...getNumberNames(numerals));
  return [...new Set(numerals)];
}
85+
86+
// Symbol values in the order they must be tried (largest first, subtractive
// pairs before their single letters). Symbols above X (XL, L, XC, C, CD, D,
// CM, M) are intentionally not listed, so any word containing them parses to 0.
const ROMAN_NUMERAL_VALUES = Object.freeze({
  X: 10,
  IX: 9,
  V: 5,
  IV: 4,
  I: 1,
});

/**
 * Converts a Roman numeral (any letter case) to its numeric value.
 *
 * Parsing is greedy: each symbol is consumed for as long as the remaining text
 * starts with it, so lenient spellings such as "IIII" are accepted.
 *
 * @param {string} string - Candidate numeral.
 * @returns {number|undefined} The value; `0` when letters remain that could
 *   not be consumed; `undefined` when the input is not a string or fails
 *   validateRomanNumeral.
 */
function parseNumeral(string) {
  if (typeof string !== "string" || !validateRomanNumeral(string)) {
    return undefined;
  }

  let remaining = string.toUpperCase();
  let numeralSum = 0;
  for (const [symbol, value] of Object.entries(ROMAN_NUMERAL_VALUES)) {
    while (remaining.startsWith(symbol)) {
      numeralSum += value;
      remaining = remaining.substring(symbol.length);
    }
  }
  return remaining.length > 0 ? 0 : numeralSum;
}
116+
117+
/**
 * Spells out every all-digit entry using the shared `toWords` converter.
 *
 * Entries that are not made up solely of decimal digits are ignored, as are
 * digit runs too large to be represented exactly as a JavaScript number
 * (their spelled-out form would not match the digits).
 *
 * @param {Array<string|number>} stringArr - Words or numbers to inspect.
 * @returns {string[]} Unique, trimmed number names in first-seen order.
 */
function getNumberNames(stringArr) {
  const names = new Set();
  for (const entry of stringArr) {
    if (!/^\d+$/.test(entry)) continue;
    const value = Number.parseInt(entry, 10);
    if (!Number.isSafeInteger(value)) continue;
    const numberName = toWords.convert(value);
    if (numberName) names.add(numberName.trim());
  }
  return [...names];
}
128+
129+
/**
 * Checks whether a word consists solely of Roman-numeral letters.
 *
 * Only the alphabet is checked (i, v, x, l, c, d, m in any case); the letters
 * do not have to form a well-formed numeral. The word "vim" is excluded in any
 * casing.
 *
 * @param {string} string - Word to test.
 * @returns {boolean} `true` when every character is a Roman-numeral letter;
 *   `false` for empty, missing or non-string input.
 */
function validateRomanNumeral(string) {
  if (typeof string !== "string" || string === "") return false;
  // The letter match is case-insensitive, so the exclusion must be as well.
  if (string.toLowerCase() === "vim") return false;
  return /^[ivxlcdm]+$/i.test(string);
}
135+
136+
/**
 * Worker entry point: derives the "<column>kws" keyword string for every
 * column that has keyword groups.
 *
 * Columns whose value is empty/missing are skipped. For the "filename" column
 * the numerals found by getNumerals and the number names found by
 * getNumberNames are appended to the matched keywords.
 *
 * @param {{data: object, keywords: object}} object - `data` maps column names
 *   to their text, `keywords` maps column names to keyword groups.
 * @returns {Promise<object>} The same `data` object, with the generated
 *   "<column>kws" properties added.
 */
export async function optimizeKws(object) {
  const { data, keywords } = object;
  for (const column in keywords) {
    const columnText = data[column];
    if (columnText) {
      const words = stringToWordArray(columnText);
      const collected = kwProcessor(words, keywords[column]);
      //special case for filenames
      if (column === "filename") {
        const numerals = await getNumerals(words);
        if (numerals) {
          collected.push(...numerals);
        }
        collected.push(...getNumberNames(words));
      }
      data[`${column}kws`] = collected.join(" ").trim();
    }
  }
  return data;
}

lib/dboptimize.js

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import debugPrint from "./debugprint.js";
2+
import { bulkIndexFiles } from "./services/elasticsearch.js";
3+
import { File } from "./models/index.js";
4+
import { readFileSync } from "fs";
5+
import { fileURLToPath } from "url";
6+
import { dirname, resolve } from "path";
7+
import { Piscina, FixedQueue } from "piscina";
8+
import { timer } from "./time.js";
9+
10+
const BATCH_SIZE = 1000;
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Worker pool that runs the keyword optimisation jobs. The worker file is
// resolved relative to this module (not the process working directory) so the
// pool also starts when the app is launched from another directory.
const piscina = new Piscina({
  filename: resolve(__dirname, "dbkwworker.js"),
  taskQueue: new FixedQueue(),
});

const relatedKwRoot = "../lib/json/relatedkeywords/";
const catKwPath = resolve(__dirname, relatedKwRoot + "categories.json");
const nameKwpath = resolve(__dirname, relatedKwRoot + "names.json");
const regionKwpath = resolve(__dirname, relatedKwRoot + "regions.json");

/**
 * Reads and parses one related-keywords JSON file.
 * @param {string} path - Absolute path of the JSON file.
 * @returns {*} The parsed JSON content.
 */
function loadKeywordGroups(path) {
  return JSON.parse(readFileSync(path, "utf8"));
}

//make sure the child object matches the column in the file db model
const keywords = {
  filename: loadKeywordGroups(nameKwpath),
  category: loadKeywordGroups(catKwPath),
  subcategories: loadKeywordGroups(catKwPath),
  region: loadKeywordGroups(regionKwpath),
};
29+
30+
/**
 * Recomputes the "<column>kws" keyword columns for every File row.
 *
 * Rows are read in pages of BATCH_SIZE. For each row the columns named in
 * `keywords` are sent to the "optimizeKws" worker task; rows whose keyword
 * columns changed are saved, and every page is then passed to bulkIndexFiles.
 * A row whose worker task fails is logged and left untouched.
 *
 * @returns {Promise<void>} Resolves once all pages have been processed.
 *   Rejects if a database read, a row save or the bulk indexing fails.
 */
export async function optimizeDatabaseKws() {
  const proctime = new timer();
  const columns = Object.keys(keywords);
  let changes = 0;
  console.log("Optimizing DB Keywords...");
  const dbLength = await File.count();

  let processed = 0;
  while (processed < dbLength) {
    singleLineStatus(`Optimizing Keywords: ${processed} / ${dbLength}`);
    // NOTE(review): the page query has no explicit ordering, so paging with
    // limit/offset relies on the database returning rows in a stable order
    // while rows are being updated — consider ordering by the primary key.
    const { rows } = await File.findAndCountAll({
      limit: BATCH_SIZE,
      offset: processed,
    });
    // Rows may have been removed since the count; stop instead of spinning.
    if (rows.length === 0) break;

    const results = await Promise.all(
      rows.map((row) => {
        debugPrint(`Submitting job for: ${row["filename"]}`);
        // Plain object (not an array) keyed by column name.
        const data = {};
        for (const column of columns) {
          data[column] = row[column];
        }
        return piscina
          .run({ data, keywords }, { name: "optimizeKws" })
          .catch((err) => {
            console.error(err);
            return null; // marks the row as "skip" below
          });
      })
    );

    debugPrint(`Resolving ${results.length} optimization tasks.`);
    const pendingSaves = [];
    rows.forEach((row, index) => {
      const optimized = results[index];
      if (!optimized) return; // worker task failed; already logged
      let changed = false;
      for (const column of columns) {
        const kwColumn = `${column}kws`;
        // Loose equality on purpose: a null column and a keyword string the
        // worker never produced (undefined) must count as unchanged.
        if (row[kwColumn] == optimized[kwColumn]) continue;
        row[kwColumn] = optimized[kwColumn];
        changed = true;
      }
      if (changed) {
        pendingSaves.push(row.save());
        changes++;
      }
    });
    // Wait for the writes so failures surface here instead of as unhandled
    // rejections.
    await Promise.all(pendingSaves);
    await bulkIndexFiles(rows);
    processed += rows.length;
  }

  console.log(
    `\nCompleted Keyword Optimization for ${changes} row${
      changes === 1 ? "" : "s"
    } in ${proctime.elapsed()}.`
  );
}
90+
91+
/**
 * Prints a progress message.
 *
 * When stdout is a TTY and the DEBUG environment variable is not "1", the
 * current terminal line is cleared and overwritten in place; otherwise the
 * message is logged as a regular line.
 *
 * @param {string} str - Message to display.
 */
function singleLineStatus(str) {
  const rewriteInPlace = process.stdout.isTTY && process.env.DEBUG !== "1";
  if (!rewriteInPlace) {
    console.log(str);
    return;
  }
  process.stdout.clearLine(0);
  process.stdout.cursorTo(0);
  process.stdout.write(str);
}

lib/dircrawl.js

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ import { resolve } from "path";
44
import debugPrint from "./debugprint.js";
55
import { File } from './models/index.js';
66
import { bulkIndexFiles } from './services/elasticsearch.js';
7+
import { optimizeDatabaseKws } from "./dboptimize.js";
8+
import { timer } from "./time.js";
79

810
let piscina = new Piscina({
911
filename: resolve("./lib", "fileworker.js"),
@@ -13,7 +15,7 @@ let piscina = new Piscina({
1315
const BATCH_SIZE = 1000; // Process files in batches for better performance
1416

1517
export default async function getAllFiles(catList) {
16-
var startTime = process.hrtime();
18+
var proctime = new timer()
1719
const url = "https://myrient.erista.me/files/";
1820
let parentRows = await getTableRows({ url: url, base: "" });
1921
let parents = [];
@@ -28,7 +30,7 @@ export default async function getAllFiles(catList) {
2830
);
2931
}
3032
let dirWork = splitFilesAndFolders(parents);
31-
let files = dirWork.files;
33+
// First run should only have directories. Is there a reason this could change in the future?
3234
let dirs = dirWork.directories;
3335
let fetchTasks = [];
3436
let resolvedFetchTasks = [];
@@ -141,8 +143,9 @@ export default async function getAllFiles(catList) {
141143
var elapsed = parseHrtimeToSeconds(process.hrtime(startTime));
142144
var m = Math.floor(elapsed / 60);
143145
var s = Math.floor(elapsed % 60);
144-
console.log(`\nFinished crawling Myrient in ${m}m${s}s.`);
146+
console.log(`\nFinished crawling Myrient in ${proctime.elapsed()}.`);
145147
await piscina.close();
148+
await optimizeDatabaseKws();
146149
return fileCount;
147150
}
148151

@@ -204,9 +207,4 @@ function singleLineStatus(str) {
204207
} else {
205208
console.log(str);
206209
}
207-
}
208-
209-
function parseHrtimeToSeconds(hrtime) {
210-
var seconds = (hrtime[0] + hrtime[1] / 1e9).toFixed(3);
211-
return seconds;
212-
}
210+
}

lib/fileworker.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ export async function parseOutFile(data) {
5151
path: data.url + path,
5252
size: size,
5353
category: category,
54-
hidden: `${category.replaceAll(' ', '')} ${cats.subCat.replaceAll(' ', '')}`,
54+
subcategories: `${cats.subCat.replaceAll(' ', '')}`,
5555
type: findType(fullName, data.catList),
5656
date: innertext(file.querySelector(".date").innerHTML).trim(),
5757
region: findRegion(fullName, data.catList),
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
[
2+
[
3+
"supernintendoentertainmentsystem",
4+
"snes",
5+
"super famicom",
6+
"family computer"
7+
],
8+
["nintendoentertainmentsystem", "nes", "famicom", "family computer"],
9+
["playstation", "ps", "play station"],
10+
["playstation1", "ps1", "playstation 1", "psone", "one"],
11+
["playstation2", "ps2", "playstation 2"],
12+
["playstation3", "ps3", "playstation 3"],
13+
["playstationmobile", "psm", "mobile"],
14+
["xbox360", "x360", "xbox"],
15+
["xbox", "xb"],
16+
["famicomdisksystem", "fds", "famicom disk system"],
17+
["gameboyadvance", "gba", "gameboy advance", "game boy"],
18+
["gameboycolor", "gbc", "gameboy color", "game boy"],
19+
["gameboy", "gb", "gameboy", "game boy"],
20+
["gamecube", "gc", "game cube", "dolphin"],
21+
["megadrive", "md", "megadrive", "mega drive", "genesis"],
22+
["dreamcast", "dc", "dream cast"],
23+
["playstationvita", "psv", "playstation vita"],
24+
["playstationnetwork", "psn", "playstation network"],
25+
["nintendo switch", "switch", "nx"],
26+
["nintendo 3ds", "3ds", "three ds", "3d dual screen"],
27+
["nintendo ds", "ds", "dual screen"],
28+
["nintendo 64", "n64", "ultra 64"],
29+
["wiiu", "wii u"],
30+
["atari 2600", "vcs", "video computer system"],
31+
["playstationvr", "psvr", "playstation vr"],
32+
["pc engine", "pcengine", "turbografx", "turbografx-16"],
33+
["neogeo", "neo geo", "mvs"],
34+
["xboxone", "xone", "xbox one"],
35+
["xboxseriesx", "xsx", "xbox series x"],
36+
["xboxseriess", "xss", "xbox series s"],
37+
["amiibo", "nfc figure"],
38+
["mastersystem", "ms", "master system"],
39+
["wii", "revolution"],
40+
["apple II", "apple 2"]
41+
]
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
[
2+
["vmu", "visual memory unit"],
3+
["cd", "compact disc"],
4+
["bd", "blu-ray", "blu ray"],
5+
["hd", "high definition"],
6+
["pdf", "portable document format"],
7+
["dlc", "downloadable content"],
8+
["byteswapped", "byte swapped"],
9+
["bigendian", "big endian"],
10+
["littleendian", "little endian"],
11+
["pc88", "pc-88", "pc 88"],
12+
["dvd", "digital video disc", "digital versatile disc"],
13+
["bros", "brothers", "bros."]
14+
]
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[
2+
["uk", "united kingdom"],
3+
["usa", "united states of america"]
4+
]

0 commit comments

Comments
 (0)