Skip to content

Commit 6a87abc

Browse files
authored
Alternate queries (#142) (#143)
* Bump the production-dependencies group with 3 updates (#120) Bumps the production-dependencies group with 3 updates: [@natlibfi/marc-record](https://github.com/natlibfi/marc-record-js), [@natlibfi/melinda-commons](https://github.com/natlibfi/melinda-commons-js) and [natural](https://github.com/NaturalNode/natural). * Bump the development-dependencies group with 7 updates (#119) | Package | From | To | | --- | --- | --- | | [@babel/cli](https://github.com/babel/babel/tree/HEAD/packages/babel-cli) | `7.23.0` | `7.23.4` | | [@babel/core](https://github.com/babel/babel/tree/HEAD/packages/babel-core) | `7.23.3` | `7.23.6` | | [@babel/preset-env](https://github.com/babel/babel/tree/HEAD/packages/babel-preset-env) | `7.23.3` | `7.23.6` | | [@natlibfi/fixugen](https://github.com/natlibfi/fixugen-js) | `2.0.2` | `2.0.3` | | [@natlibfi/fixugen-http-client](https://github.com/natlibfi/fixugen-http-client-js) | `3.0.1` | `3.0.2` | | [eslint](https://github.com/eslint/eslint) | `8.53.0` | `8.55.0` | | [nodemon](https://github.com/remy/nodemon) | `3.0.1` | `3.0.2` | * Simpler scripts (#122) * Add feature alternateQueries alternates -type candidateSearch creates several search queries, gets search result amounts for all of those queries and actually uses only the queries that get less than maxCandidates search results * Add new alternates-type candidateSearchType for bib: titleAuthorYearAlternates * Fix using word search for queries starting with a boolean [MRA-762] Use word search for queries only when the first word is a boolean, not when the first word starts with a boolean Add new type of candidateSearch queries: 'alternates' * Update copyright year * Update deps * 4.3.2-alpha.7
1 parent e813be9 commit 6a87abc

File tree

31 files changed

+1145
-213
lines changed

31 files changed

+1145
-213
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22

33
## License and copyright
44

5-
Copyright (c) 2020-2023 **University Of Helsinki (The National Library Of Finland)**
5+
Copyright (c) 2020-2024 **University Of Helsinki (The National Library Of Finland)**
66

77
This project's source code is licensed under the terms of **GNU Lesser General Public License Version 3** or any later version.

package-lock.json

Lines changed: 226 additions & 172 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"url": "git@github.com:natlibfi/melinda-record-matching-js.git"
1515
},
1616
"license": "LGPL-3.0+",
17-
"version": "4.3.1",
17+
"version": "4.3.2-alpha.7",
1818
"main": "./dist/index.js",
1919
"engines": {
2020
"node": ">=18"
@@ -43,7 +43,7 @@
4343
"@natlibfi/melinda-commons": "^13.0.12",
4444
"@natlibfi/sru-client": "^6.0.8",
4545
"debug": "^4.3.4",
46-
"isbn3": "^1.1.44",
46+
"isbn3": "^1.1.45",
4747
"moment": "^2.30.1",
4848
"natural": "^6.10.4",
4949
"uuid": "^9.0.1",
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
/**
2+
*
3+
* @licstart The following is the entire license notice for the JavaScript code in this file.
4+
*
5+
* Melinda record matching modules for Javascript
6+
*
7+
* Copyright (C) 2023 University Of Helsinki (The National Library Of Finland)
8+
*
9+
* This file is part of melinda-record-matching-js
10+
*
11+
* melinda-record-matching-js program is free software: you can redistribute it and/or modify
12+
* it under the terms of the GNU Lesser General Public License as
13+
* published by the Free Software Foundation, either version 3 of the
14+
* License, or (at your option) any later version.
15+
*
16+
* melinda-record-matching-js is distributed in the hope that it will be useful,
17+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
18+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19+
* GNU Lesser General Public License for more details.
20+
*
21+
* You should have received a copy of the GNU Lesser General Public License
22+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
23+
*
24+
* @licend The above is the entire license notice
25+
* for the JavaScript code in this file.
26+
*
27+
*/
28+
29+
import createDebugLogger from 'debug';
30+
import createClient, {SruSearchError} from '@natlibfi/sru-client';
31+
32+
export class CandidateSearchError extends Error {}
33+
34+
/**
 * Probe each query in queryList for its total hit count and keep only the usable ones.
 *
 * An SRU client is created with `maxRecordsPerRequest: 0` and `retrieveAll: false`, and the
 * queries are run one at a time in list order. Probing stops at the first falsy entry of the list.
 * A query is kept when its total is non-zero and strictly less than maxCandidates.
 *
 * @param {Object} options
 * @param {string} options.url - passed as `url` to the SRU client
 * @param {Array} options.queryList - queries to probe (presumably SRU query strings - confirm against query-list)
 * @param {string} [options.queryListType] - only used in debug output here
 * @param {number} [options.maxCandidates=50] - queries with this many hits or more are dropped
 * @returns {Promise<Array<{query: *, total: number}>>} kept queries with their totals, in original order
 * @throws {CandidateSearchError} when the SRU client emits an error or probing fails otherwise
 */
export default async function chooseQueries({url, queryList, queryListType, maxCandidates = 50}) {

  const debug = createDebugLogger('@natlibfi/melinda-record-matching:candidate-search:choose-queries');
  const debugData = debug.extend('data');
  const debugDev = debug.extend('dev');

  debugData(`Url: ${url}`);
  debugData(`QueryList: ${queryList}`);
  debugData(`queryListType: ${queryListType}`);

  const client = createClient({
    url,
    maxRecordsPerRequest: 0,
    version: '2.0',
    retrieveAll: false
  });

  debugDev(`QueryList (type: ${queryListType}) ${JSON.stringify(queryList)}`);
  try {
    const {queriesWithTotals} = await getQueryTotals({queryList, queryOffset: 0, queriesWithTotals: []});
    debugDev(`QueryResult: ${JSON.stringify(queriesWithTotals)}`);
    const filteredQueryResult = filterQueryResult({queriesWithTotals, maxCandidates});
    debugDev(`filteredQueryResult: ${JSON.stringify(filteredQueryResult)}`);
    return filteredQueryResult;
  } catch (err) {
    // Errors rejected by retrieveTotal are already CandidateSearchErrors: pass them on untouched
    // instead of wrapping them a second time (which would mangle the message and lose the stack)
    if (err instanceof CandidateSearchError) {
      throw err;
    }
    throw new CandidateSearchError(String(err), {cause: err});
  }

  // Recursively collects {query, total} for each query, starting from queryOffset
  async function getQueryTotals({queryList, queryOffset = 0, queriesWithTotals = []}) {

    const query = queryList[queryOffset];
    debug(`Running query ${JSON.stringify(query)} (${queryOffset}) for total`);

    if (query) {
      const {total} = await retrieveTotal();

      const newQueriesWithTotals = [...queriesWithTotals, {query, total}];
      debug(`Query ${queryOffset} ${query} done.`);
      debug(`There are (${queryList.length - (queryOffset + 1)} queries left)`);
      return getQueryTotals({queryList, queryOffset: queryOffset + 1, queriesWithTotals: newQueriesWithTotals});
    }

    debug(`All ${queryList.length} queries done, there's no query for ${queryOffset}`);
    return {queriesWithTotals};

    // Adapts the event based SRU client to a promise that resolves to {total} for the current query
    function retrieveTotal() {
      return new Promise((resolve, reject) => {
        // eslint-disable-next-line functional/no-let
        let totalRecords = 0;

        debug(`Searching total amount of candidates for query: ${query}`);

        client.searchRetrieve(query)
          .on('error', err => {
            // Exactly one message is logged and one rejection made per error
            const errorType = err instanceof SruSearchError ? 'SRU SruSearchError' : 'SRU error';
            const message = `${errorType} for query: ${query}: ${err}`;
            debug(message);
            reject(new CandidateSearchError(message, {cause: err}));
          })
          .on('total', total => {
            debug(`Got total: ${total}`);
            totalRecords += total;
          })
          .on('end', () => {
            resolve({total: totalRecords});
          })
          .on('record', () => {
            debugDev(`RECORD: We should no get records here`);
          });
      });
    }
  }

  function filterQueryResult({queriesWithTotals, maxCandidates}) {
    debug(`Filtering queries (${queriesWithTotals.length}), maxCandidates: ${maxCandidates}`);
    debugData(`${JSON.stringify(queriesWithTotals)}`);
    // Drop queries where total result is 0 or is equal to or greater than given maxCandidates
    const filteredQueryResult = queriesWithTotals.filter(({total}) => total !== 0 && total < maxCandidates);
    debugData(`${JSON.stringify(filteredQueryResult)}`);
    return filteredQueryResult;
  }

}

src/candidate-search/index.js

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
*
55
* Melinda record matching modules for Javascript
66
*
7-
* Copyright (C) 2020-2022 University Of Helsinki (The National Library Of Finland)
7+
* Copyright (C) 2020-2023 University Of Helsinki (The National Library Of Finland)
88
*
99
* This file is part of melinda-record-matching-js
1010
*
@@ -32,14 +32,15 @@ import {MarcRecord} from '@natlibfi/marc-record';
3232
import {MARCXML} from '@natlibfi/marc-record-serializers';
3333
import generateQueryList from './query-list';
3434
import {Error as MatchingError} from '@natlibfi/melinda-commons';
35+
import chooseQueries from './choose-queries';
3536

3637
export {searchTypes} from './query-list';
3738

3839
export class CandidateSearchError extends Error {}
3940

4041
// serverMaxResult : maximum size of total search result available from the server, defaults to Aleph's 20000
4142

42-
export default ({record, searchSpec, url, maxCandidates, maxRecordsPerRequest = 50, serverMaxResult = 20000}) => {
43+
export default async ({record, searchSpec, url, maxCandidates, maxRecordsPerRequest = 50, serverMaxResult = 20000}) => {
4344
MarcRecord.setValidationOptions({subfieldValues: false});
4445

4546
const debug = createDebugLogger('@natlibfi/melinda-record-matching:candidate-search');
@@ -55,7 +56,20 @@ export default ({record, searchSpec, url, maxCandidates, maxRecordsPerRequest =
5556
const adjustedMaxRecordsPerRequest = maxRecordsPerRequest >= maxCandidates ? maxCandidates : maxRecordsPerRequest;
5657

5758
const inputRecordId = getRecordId(record);
58-
const queryList = generateQueryList(record, searchSpec);
59+
const queryListResult = generateQueryList(record, searchSpec);
60+
const queryList = queryListResult[0]?.queryList ? queryListResult[0].queryList : queryListResult;
61+
const queryListType = queryListResult[0]?.queryListType ? queryListResult[0].queryListType : undefined;
62+
63+
// if generateQueryList errored we should throw 422
64+
if (queryList.length === 0) {
65+
debug(`Empty list`);
66+
throw new CandidateSearchError(`Generated query list contains no queries`);
67+
}
68+
if (queryListType && queryListType !== 'alternates') {
69+
debug(`Unknown queryListType`);
70+
throw new CandidateSearchError(`Generated query list has invalid type`);
71+
}
72+
5973
const client = createClient({
6074
url,
6175
maxRecordsPerRequest: adjustedMaxRecordsPerRequest,
@@ -64,14 +78,19 @@ export default ({record, searchSpec, url, maxCandidates, maxRecordsPerRequest =
6478
});
6579

6680
debug(`Searching matches for ${inputRecordId}`);
67-
debug(`Generated queryList ${JSON.stringify(queryList)}`);
81+
const chosenQueryList = await filterQueryList({queryList, queryListType});
82+
debug(`Chosen queries: ${JSON.stringify(chosenQueryList)}`);
6883

69-
// if generateQueryList errored we should throw 422
84+
/**
 * Pick the queries that will actually be run as candidate searches.
 *
 * For a queryList of type 'alternates' with more than one query, chooseQueries is asked which
 * queries to keep, and only their query values are returned. Any other list is returned as is.
 *
 * @param {Object} options
 * @param {Array} options.queryList - generated queries
 * @param {string} [options.queryListType] - 'alternates' turns on the filtering
 * @param {number} [options.maxCandidates] - optional override; when omitted, the enclosing
 *   factory's maxCandidates is used (previously it was shadowed and never reached chooseQueries)
 * @returns {Promise<Array>} the chosen queries
 */
async function filterQueryList({queryList, queryListType, maxCandidates: maxCandidatesOverride}) {
  debug(`Generated queryList (type: ${queryListType}) ${JSON.stringify(queryList)}`);

  if (queryListType === 'alternates' && queryList.length > 1) {
    // Fall back to the maxCandidates of the enclosing scope when the caller did not pass one
    const effectiveMaxCandidates = maxCandidatesOverride ?? maxCandidates;
    const queryListResult = await chooseQueries({url, queryList, queryListType, maxCandidates: effectiveMaxCandidates});
    debug(`queryListResult: ${JSON.stringify(queryListResult)}`);
    return queryListResult.map(elem => elem.query);
  }
  return queryList;
}
74-
7594
// state.totalRecords : amount of candidate records available to the current query (undefined, if there was no queries left)
7695
// state.query : current query (undefined if there was no queries left)
7796
// state.searchCounter : sequence for current search for current query (undefined, if there were no queries left)
@@ -80,9 +99,13 @@ export default ({record, searchSpec, url, maxCandidates, maxRecordsPerRequest =
8099
// state.queryCounter : sequence for current query
81100
// state.maxedQueries : queries that resulted in serverMaxResult or more hits
82101

102+
return {search};
83103

84-
return async ({queryOffset = 0, resultSetOffset = 1, totalRecords = 0, searchCounter = 0, queryCandidateCounter = 0, queryCounter = 0, maxedQueries = []}) => {
85-
const query = queryList[queryOffset];
104+
// eslint-disable-next-line max-statements
105+
async function search({queryOffset = 0, resultSetOffset = 1, totalRecords = 0, searchCounter = 0, queryCandidateCounter = 0, queryCounter = 0, maxedQueries = []}) {
106+
107+
const query = chosenQueryList[queryOffset];
108+
debug(`Running query ${JSON.stringify(query)} (${queryOffset})`);
86109

87110
if (query) {
88111
const {records, failures, nextOffset, total} = await retrieveRecords();
@@ -171,7 +194,7 @@ export default ({record, searchSpec, url, maxCandidates, maxRecordsPerRequest =
171194
});
172195
});
173196
}
174-
};
197+
}
175198

176199
function checkMaxedQuery(query, total, serverMaxResult) {
177200
if (total >= serverMaxResult) {
@@ -180,7 +203,6 @@ export default ({record, searchSpec, url, maxCandidates, maxRecordsPerRequest =
180203
}
181204
}
182205

183-
184206
function getRecordId(record) {
185207
const [field] = record.get(/^001$/u);
186208
return field ? field.value : '';
@@ -191,5 +213,4 @@ export default ({record, searchSpec, url, maxCandidates, maxRecordsPerRequest =
191213
debug(`Cannot yet find possible database record id from recordXML (length ${recordXML.length})`);
192214
return undefined;
193215
}
194-
195216
};

src/candidate-search/index.spec.js

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ describe('candidate-search', () => {
4646
}
4747
});
4848

49+
// eslint-disable-next-line max-statements
4950
async function callback({getFixture, factoryOptions, searchOptions, expectedFactoryError = false, expectedSearchError = false, enabled = true}) {
5051
const url = 'http://foo.bar';
5152

@@ -54,16 +55,29 @@ describe('candidate-search', () => {
5455
}
5556

5657
if (expectedFactoryError) {
58+
debug(`We're expecting an error`);
5759
if (expectedFactoryError.isCandidateSearchError) {
58-
expect(() => createSearchInterface({...formatFactoryOptions(), url})).to.throw(CandidateSearchError, new RegExp(expectedFactoryError, 'u'));
60+
try {
61+
const result = createSearchInterface({...formatFactoryOptions(), url});
62+
debug(result);
63+
} catch (err) {
64+
expect(err).to.equal(new CandidateSearchError(expectedFactoryError));
65+
}
5966
return;
6067
}
6168

62-
expect(() => createSearchInterface({...formatFactoryOptions(), url})).to.throw(new RegExp(expectedFactoryError, 'u'));
69+
try {
70+
const result = createSearchInterface({...formatFactoryOptions(), url});
71+
debug(result);
72+
} catch (err) {
73+
expect(err).to.equal(new Error(expectedFactoryError));
74+
}
6375
return;
6476
}
6577

66-
const search = createSearchInterface({...formatFactoryOptions(), url});
78+
const {search} = await createSearchInterface({...formatFactoryOptions(), url});
79+
// eslint-disable-next-line no-console
80+
console.log(search);
6781
await iterate({searchOptions, expectedSearchError});
6882

6983
function formatFactoryOptions() {

0 commit comments

Comments
 (0)