Skip to content

Commit e046813

Browse files
authored
Merge pull request #643 from NYPL/scc-5168
Scc 5168
2 parents 48066b4 + b0f7585 commit e046813

File tree

11 files changed

+2145
-3
lines changed

11 files changed

+2145
-3
lines changed

lib/elasticsearch/config.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ const SEARCH_SCOPES = {
7373
},
7474
standard_number: {
7575
// We do custom field matching for this search-scope
76-
}
76+
},
77+
cql: {} // see cql/index_mapping for this search scope
7778
}
7879

7980
const FILTER_CONFIG = {
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
// Index fields to consult for each search scope, keyed by scope name.
//
// Each scope may define any of these lists:
//   fields       - field names (several are `.folded` / `.foldedStemmed` variants)
//   exact_fields - field names (mostly `.raw` / keyword variants)
//   term         - field names listed for term matching
//   prefix       - field names listed for prefix matching
// NOTE(review): how each list is turned into a query clause is decided by the
// consumer of this mapping, which is not visible here - confirm there.
//
// A list entry is either a plain field name or `{ field, on }`, where `on` is a
// predicate that receives the query string and returns true when that field
// should be included for that query.
const indexMapping = {
  keyword: {
    fields: [
      'title',
      'title.folded',
      'description.foldedStemmed',
      'subjectLiteral',
      'subjectLiteral.folded',
      'creatorLiteral',
      'creatorLiteral.folded',
      'contributorLiteral.folded',
      'note.label.foldedStemmed',
      'publisherLiteral.folded',
      'seriesStatement.folded',
      'titleAlt.folded',
      'titleDisplay.folded',
      'contentsTitle.folded',
      'tableOfContents.folded',
      'genreForm',
      'donor.folded',
      'parallelTitle.folded',
      'parallelTitleDisplay.folded',
      'parallelTitleAlt.folded',
      'parallelSeriesStatement.folded',
      'parallelCreatorLiteral.folded',
      'parallelPublisher',
      'parallelPublisherLiteral',
      'uniformTitle.folded',
      'parallelUniformTitle',
      'formerTitle',
      'addedAuthorTitle',
      'placeOfPublication.folded',
      // Try to detect shelfmark searches (e.g. JFD 16-5143)
      { field: 'items.shelfMark', on: (q) => /^[A-Z]{1,3} \d{2,}/.test(q) }
    ],
    // The "missing" comments below mark entries of `fields` above that have no
    // counterpart in this list.
    exact_fields: [
      'title.keywordLowercasedStripped',
      // missing description
      'subjectLiteral.raw',
      'creatorLiteral.keywordLowercased',
      'contributorLiteral.keywordLowercased',
      // note.label is missing
      'publisherLiteral.raw',
      'seriesStatement.raw',
      'titleAlt.raw',
      // titleDisplay missing
      // contentsTitle missing
      // tableOfContents missing
      'genreForm.raw',
      'donor.raw',
      // parallelTitle missing
      // parallelTitleDisplay missing
      'parallelTitleAlt.raw',
      'parallelSeriesStatement.raw',
      'parallelCreatorLiteral.raw',
      // parallelPublisher/parallelPublisherLiteral missing
      'uniformTitle.raw',
      'parallelUniformTitle.raw',
      // formerTitle missing
      'addedAuthorTitle.raw',
      'placeOfPublication',
      { field: 'items.shelfMark.raw', on: (q) => /^[A-Z]{1,3} \d{2,}/.test(q) }
    ],
    term: [
      // Only included when the query contains a run of six or more digits.
      { field: 'items.idBarcode', on: (q) => /\d{6,}/.test(q) }
    ]
  },
  title: {
    fields: [
      'title',
      'title.folded',
      'titleAlt.folded',
      'uniformTitle.folded',
      'titleDisplay.folded',
      'seriesStatement.folded',
      'contentsTitle.folded',
      'donor.folded',
      'parallelTitle.folded',
      'parallelTitleDisplay.folded',
      'parallelSeriesStatement.folded',
      'parallelTitleAlt.folded',
      'parallelCreatorLiteral.folded',
      'parallelUniformTitle',
      'formerTitle',
      'addedAuthorTitle'
    ],
    exact_fields: [
      'title.keywordLowercasedStripped',
      'seriesStatement.raw',
      'titleAlt.raw',
      // titleDisplay missing
      // contentsTitle missing
      // tableOfContents missing
      'donor.raw',
      // parallelTitle missing
      // parallelTitleDisplay missing
      'parallelTitleAlt.raw',
      'parallelSeriesStatement.raw',
      'parallelCreatorLiteral.raw',
      'uniformTitle.raw',
      'parallelUniformTitle.raw',
      // formerTitle missing
      'addedAuthorTitle.raw',
      // NOTE(review): no placeOfPublication entry exists in title.fields above;
      // confirm this one is intentional.
      'placeOfPublication'
    ]
  },
  author: {
    fields: [
      'creatorLiteral',
      'creatorLiteral.folded',
      'contributorLiteral.folded',
      'parallelCreatorLiteral.folded',
      'parallelContributorLiteral.folded'
    ],
    exact_fields: [
      'creatorLiteral.keywordLowercased',
      'contributorLiteral.keywordLowercased',
      'parallelCreatorLiteral.raw',
      'parallelContributorLiteral.raw'
    ]
  },
  callnumber: {
    term: ['shelfMark.keywordLowercased', 'items.shelfMark.keywordLowercased']
  },
  identifier: {
    prefix: ['identifierV2.value', 'items.shelfMark.keywordLowercased'],
    term: ['uri', 'items.idBarcode', 'idIsbn.clean', 'idIssn.clean']
  },
  subject: {
    fields: ['subjectLiteral', 'subjectLiteral.folded', 'parallelSubjectLiteral.folded'],
    exact_fields: ['subjectLiteral.raw']
  },
  language: { term: ['language.id', 'language.label'] },
  date: { fields: ['dates.range'] },
  series: {
    term: ['series', 'parallelSeries']
  },
  genre: { fields: ['genreForm'], exact_fields: ['genreForm.raw'] },
  center: { term: ['buildingLocationIds'] },
  division: { term: ['collectionIds'] },
  format: { term: ['formatId'] }
}
135+
136+
module.exports = {
137+
indexMapping
138+
}

lib/elasticsearch/cql_grammar.js

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
const { Grammars } = require('ebnf')
2+
3+
/**
 * Build the mirror image of an EBNF grammar, so that a grammar written with
 * left recursion becomes one written with right recursion (and vice versa).
 *
 * The grammar text is processed purely textually, one line at a time:
 *  - each line is split on `::=`, each side on `|`, each alternative on a
 *    single space;
 *  - the order of the space-separated tokens inside every alternative is
 *    reversed;
 *  - any token containing a double quote has its characters reversed too, so
 *    string literals match reversed input (a literal containing a space, such
 *    as "AND NOT", is handled as two tokens and still comes out mirrored).
 *
 * Sides are re-joined with `'::= '` and alternatives with `'|'`, so spacing in
 * the result differs slightly from the input.
 *
 * @param {string} grammar - grammar source, one rule per line
 * @returns {string} the mirrored grammar source
 */
function reverseGrammar (grammar) {
  // Mirror the characters of quoted literals only; rule names and character
  // classes are left as they are.
  const mirrorToken = (token) =>
    token.includes('"') ? token.split('').reverse().join('') : token

  // Mirror one alternative: flip each literal, then flip the token order.
  const mirrorAlternative = (alternative) =>
    alternative.split(' ').map(mirrorToken).reverse().join(' ')

  // Alternatives keep their relative order; only their contents are mirrored.
  const mirrorSide = (side) =>
    side.split('|').map(mirrorAlternative).join('|')

  const mirrorRule = (rule) =>
    rule.split('::=').map(mirrorSide).join('::= ')

  return grammar.split('\n').map(mirrorRule).join('\n')
}
16+
17+
// CQL grammar in W3C EBNF notation, written with left recursion (`query` and
// `phrase` refer to themselves as their first symbol). It is not handed to a
// parser directly: it is mirrored with `reverseGrammar` first.
//
// The text is consumed line by line, so every rule must stay on one line.
// "AND NOT" is listed before "AND" in `connective`, and the two-character
// relations before their one-character prefixes (`<=` before `<`, `==` before
// `=`) - presumably so the longer alternative is tried first; keep that order.
// Character classes use hex escapes: #x22 is `"`, #x5c is `\`, and
// #x20 #x09 #x0A #x0D are space, tab, LF and CR.
const leftCql = `
query ::= query whitespace connective whitespace sub_query | sub_query
connective ::= "AND NOT" | "AND" | "OR" | "NOT"
sub_query ::= atomic_query | "(" query ")"
atomic_query ::= scope relation quoted_term
scope ::= scope_term whitespace | scope_term
relation ::= relation_term whitespace | relation_term
scope_term ::= "title" | "author" | "keyword" | "callnumber" | "identifier" | "subject" | "language" | "date" | "series"| "genre" | "center" | "division" | "format"
relation_term ::= "any" | "adj" | "all" | "<=" | ">=" | "<" | ">" | "==" | "=" | "within" | "encloses"
quoted_term ::= quote phrase quote
phrase ::= phrase whitespace word | word
whitespace ::= [#x20#x09#x0A#x0D]+
word ::= word escaped_char | word regular_char | escaped_char | regular_char
regular_char ::= [^#x22#x5c#x20#x09#x0A#x0D]
escaped_char ::= slash char
slash ::= [#x5c]
char ::= [a-z]|[^a-z]
quote ::= [#x22]
`
36+
37+
// Mirrored form of `leftCql` (see `reverseGrammar`); `rightCqlParser` is built from it.
const rightCql = reverseGrammar(leftCql)
38+
39+
/**
 * Collapse a parse tree into nested arrays of plain strings.
 *
 * Result by node type:
 *  - `query`: its non-whitespace children simplified; a single child is
 *    returned unwrapped rather than as a one-element array
 *  - `sub_query`, `scope`, `relation`, `quoted_term`: the simplified form of
 *    the first child whose type contains `query`, `scope_term`,
 *    `relation_term` or `phrase` respectively (parentheses, quotes and
 *    whitespace are therefore dropped)
 *  - `atomic_query`: every child simplified, in order
 *  - `phrase`: a flat array of words - this node's own `word` child first,
 *    followed by the words of its nested `phrase` child, if any
 *  - `connective`, `scope_term`, `relation_term`, `word`: the node's text
 *  - anything else: `undefined`
 *
 * NOTE(review): for a left-recursive phrase (`phrase whitespace word`) the
 * word-first rule yields the words last-to-first; confirm callers expect that.
 *
 * @param {{type: string, text: string, children: Array}} ast - parse tree node
 * @returns {string|Array|undefined} simplified representation of the node
 */
function simplify (ast) {
  // Recurse into the first child whose type contains `fragment`.
  const descendInto = (fragment) =>
    simplify(ast.children.find(child => child.type.includes(fragment)))

  switch (ast.type) {
    case 'query': {
      const parts = ast.children
        .filter(child => child.type !== 'whitespace')
        .map(child => simplify(child))
      return parts.length > 1 ? parts : parts[0]
    }
    case 'sub_query':
      return descendInto('query')
    case 'atomic_query':
      return ast.children.map(child => simplify(child))
    case 'scope':
      return descendInto('scope_term')
    case 'relation':
      return descendInto('relation_term')
    case 'quoted_term':
      return descendInto('phrase')
    case 'phrase': {
      const word = ast.children.find(child => child.type === 'word')
      const phrase = ast.children.find(child => child.type === 'phrase')
      return [simplify(word)].concat(phrase ? simplify(phrase) : [])
    }
    // Leaf-like nodes are represented by their matched text.
    case 'connective':
    case 'scope_term':
    case 'relation_term':
    case 'word':
      return ast.text
    default:
      return undefined
  }
}
72+
73+
/**
 * Reverse a string.
 *
 * The string is reversed by Unicode code point rather than by UTF-16 code
 * unit, so characters outside the Basic Multilingual Plane keep their
 * surrogate pairs intact instead of being turned into invalid sequences.
 * Combining marks are still separate code points and are reordered like any
 * other; reversing twice always restores the original string.
 *
 * @param {string} string - text to reverse
 * @returns {string} the reversed text
 */
function reverseString (string) {
  // Spreading a string iterates code points, unlike `split('')`.
  return [...string].reverse().join('')
}
76+
77+
/**
 * Mirror a parse tree produced from reversed input so that it reads in the
 * original direction: every node's `text` is reversed and every node's
 * `children` list is put in the opposite order, recursively.
 *
 * The tree is modified in place and the same object is returned. Only `text`
 * and `children` are touched; any other properties on the nodes are left as
 * they were.
 *
 * @param {{text: string, children: Array}|null|undefined} tree - root node
 * @returns {object|null} the same (now mirrored) node, or null for a falsy input
 */
function reverseAST (tree) {
  if (!tree) return null
  tree.text = reverseString(tree.text)
  tree.children = tree.children.map(child => reverseAST(child)).reverse()
  return tree
}
83+
84+
// Parser for the mirrored grammar; it is meant to be fed reversed input (see `parseRight`).
const rightCqlParser = new Grammars.W3C.Parser(rightCql)
85+
86+
// We want to associate operators to the left, but we have a right parser.
// So: reverse the grammar and the input string, then reverse the output.
/**
 * Parse `string` with a parser built from a mirrored grammar.
 *
 * The input is reversed before it is handed to `parser.getAST`, and the
 * resulting tree is mirrored back with `reverseAST`.
 *
 * @param {string} string - query text in its normal reading direction
 * @param {{getAST: Function}} parser - parser for the mirrored grammar
 * @returns {object|null} the mirrored tree, or null when the parser yields
 *   no tree
 */
function parseRight (string, parser) {
  return reverseAST(parser.getAST(reverseString(string)))
}
91+
/**
 * Parse a CQL query string using the module's own mirrored-grammar parser
 * (`rightCqlParser`).
 *
 * @param {string} string - CQL query text
 * @returns {object|null} whatever `parseRight` returns for this input
 */
function parseWithRightCql (string) {
  return parseRight(string, rightCqlParser)
}
94+
95+
/**
 * Convert a parse tree into nested arrays for display.
 *
 * Nodes whose type contains `query` (`query`, `sub_query`, `atomic_query`)
 * become arrays of their converted children; only children of the listed
 * structural types are kept, so whitespace, parentheses and the like are
 * dropped. A node that ends up with exactly one kept child is replaced by
 * that child. Every other node is represented by its reversed text.
 *
 * NOTE(review): `displayParsed` passes in a tree that has already been through
 * `reverseAST`, which restores each node's text; reversing it again here looks
 * like it would emit mirrored strings - confirm against the parser's output.
 *
 * @param {{type: string, text: string, children: Array}} ast - parse tree node
 * @returns {string|Array} nested-array form of the node
 */
function parsedASTtoNestedArray (ast) {
  if (!ast.type.includes('query')) {
    return reverseString(ast.text)
  }

  const childTypes = [
    'atomic_query', 'sub_query', 'query', 'connective',
    'scope', 'relation', 'quoted_term'
  ]

  const children = ast.children
    .filter(child => childTypes.includes(child.type))
    .map(child => parsedASTtoNestedArray(child))

  // Avoid wrapping a lone child in an extra array level.
  if (children.length === 1) {
    return children[0]
  }

  return children
}
115+
116+
// We need to reverse the error message ourselves since `parseWithRightCql`
// doesn't: it only mirrors the tree's text and children.
/**
 * Parse a CQL query string and summarise the outcome.
 *
 * @param {string} string - CQL query text
 * @returns {{}|{error: string}|{parsed: (string|Array)}}
 *   - `{}` when parsing yields no tree at all;
 *   - `{ error }` when the tree carries parse errors: one line per error,
 *     quoting the un-reversed `rest` of the error's token;
 *   - `{ parsed }` otherwise, holding the nested-array form of the tree.
 */
function displayParsed (string) {
  const parsed = parseWithRightCql(string)
  if (!parsed) return {}
  if (parsed.errors.length) {
    return {
      error: parsed.errors.map(error =>
        `Parsing error likely near end of "${reverseString(error.token.rest)}"`
      ).join('\n')
    }
  }
  return { parsed: parsedASTtoNestedArray(parsed) }
}
129+
130+
// Public API of this module; `leftCql`, `rightCql` and `parsedASTtoNestedArray` stay private.
module.exports = { simplify, reverseAST, reverseGrammar, parseRight, parseWithRightCql, rightCqlParser, reverseString, displayParsed }

0 commit comments

Comments
 (0)