Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 44 additions & 11 deletions src/model/sparql/JenaTextSparql.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,32 +20,65 @@ class JenaTextSparql extends GenericSparql

/*
* Characters that need to be quoted for the Lucene query parser.
* See https://lucene.apache.org/core/9_12_1/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#Escaping_Special_Characters
* Note: space is NOT included here because it must act as a word separator,
* not be escaped into a literal character. Escaping space breaks multi-word
* queries when using word-level analyzers such as StandardAnalyzer.
* Note: * is not included because we want wildcard expansion.
*/
// Characters escaped with a backslash before a word is handed to the Lucene
// query parser. Deliberately omits the space (it must keep acting as a word
// separator) and '*' (it must keep working as a wildcard).
public const LUCENE_ESCAPE_CHARS = '+-&|!(){}[]^"~?:\\/';

/**
* Backslash-escape every Lucene query-parser special character in one word.
*
* The set of characters to escape comes from self::LUCENE_ESCAPE_CHARS.
*
* @param string $word a single search word (no spaces)
* @return string the word with each listed character prefixed by a backslash
*/
private function escapeLuceneWord($word)
{
$specials = str_split(self::LUCENE_ESCAPE_CHARS);
$escaped = array_map(
static function ($special) {
return '\\' . $special;
},
$specials
);
// strtr() with a map never re-scans its own output, so the backslashes
// inserted here are not escaped a second time.
return strtr($word, array_combine($specials, $escaped));
}

/**
* Make a jena-text query condition that narrows the number of search
* results in term searches.
*
* Multi-word terms are split into individual required Lucene terms
* using the '+' operator, e.g. "Siamese cat*" becomes "+Siamese +cat*".
* This works correctly with word-level analyzers (StandardAnalyzer).
*
* @param string $term search term
* @param string $property property to search (e.g. 'skos:prefLabel'), or '' for default
* @param string $langClause jena-text clause to limit search by language code
* @return string SPARQL text search clause
*/

private function createTextQueryCondition($term, $property = '', $langClause = '')
{
// construct the lucene search term for jena-text
// Split on whitespace into individual words
$words = preg_split('/\s+/', trim($term), -1, PREG_SPLIT_NO_EMPTY);
if (empty($words)) {
$words = array('*');
}

// 1. Ensure characters with special meaning in Lucene are escaped
$lucenemap = array();
foreach (str_split(self::LUCENE_ESCAPE_CHARS) as $char) {
$lucenemap[$char] = '\\' . $char; // escape with a backslash
// Build Lucene query: each word is required (+), wildcards are preserved
$parts = array();
foreach ($words as $word) {
// Separate trailing wildcard(s) from the word before escaping
$suffix = '';
if (preg_match('/^(.*?)(\*+)$/', $word, $m)) {
$word = $m[1];
$suffix = $m[2];
}
$escaped = $this->escapeLuceneWord($word);
$parts[] = '+' . $escaped . $suffix;
}
$term = strtr($term, $lucenemap);
$term = implode(' ', $parts);

// 2. Ensure proper SPARQL quoting
// Ensure proper SPARQL quoting
$term = str_replace('\\', '\\\\', $term); // escape backslashes
$term = str_replace("'", "\\'", $term); // escape single quotes

Expand Down