Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ In the example below the documents index contains a property `topics` that maps
}
```


##### File content property
To make the content of a file searchable, it needs to be indexed as a property in a search index. Basic indexing of PDF, Word etc. files is provided using a local [Apache Tika](https://tika.apache.org/) instance. A default ingest pipeline named `attachment` is created on startup of the mu-search service. Note that this is under development and liable to change.

Expand Down Expand Up @@ -643,6 +644,30 @@ The example below contains 2 simple indexes for documents and creative works, an
]
}
```
#### Using Prefixes
To make the configuration more concise and maintainable, you can define prefixes for commonly used URI namespaces. Prefixes are defined at the root level of the configuration using the `prefixes` property:

```json
{
"prefixes": {
"foaf": "http://xmlns.com/foaf/0.1/",
"dct": "http://purl.org/dc/terms/",
"skos": "http://www.w3.org/2004/02/skos/core#"
},
"types": [
{
"type": "document",
"on_path": "documents",
"rdf_type": "foaf:Document",
"properties": {
"title": "dct:title",
"label": "skos:prefLabel",
"creator": "^foaf:made"
}
}
]
}
```

#### Elasticsearch settings
Elasticsearch provides a lot of [index configuration settings](https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html) for analysis, logging, etc. Mu-search allows this configuration to be provided for the whole domain and/or overridden on a per-type basis (the two are currently not merged!).
Expand Down
33 changes: 31 additions & 2 deletions lib/mu_search/config_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ def self.parse(path)
eager_indexing_groups: [],
update_wait_interval_minutes: 1,
number_of_threads: 1,
enable_raw_dsl_endpoint: false
enable_raw_dsl_endpoint: false,
prefixes: {}
}

json_config = JSON.parse(File.read(path))
Expand Down Expand Up @@ -48,7 +49,9 @@ def self.parse(path)
config[:eager_indexing_groups] = json_config["eager_indexing_groups"]
end
config[:ignored_allowed_groups] = json_config["ignored_allowed_groups"] || []
config[:type_definitions] = Hash[MuSearch::IndexDefinition.from_json_config(json_config["types"])]
config[:prefixes] = json_config["prefixes"] || {}
config[:type_definitions] = Hash[MuSearch::IndexDefinition.from_json_config(json_config["types"], config[:prefixes])]

config
end

Expand Down Expand Up @@ -120,6 +123,9 @@ def self.validate_config(json_config)
if json_config.has_key?("ignored_allowed_groups")
errors = errors.concat(self.validate_ignored_allowed_groups(json_config["ignored_allowed_groups"]))
end
if json_config.has_key?("prefixes")
errors = errors.concat(self.validate_prefixes(json_config["prefixes"]))
end
if errors.length > 0
Mu::log.error("CONFIG_PARSER") { errors.join("\n") }
raise "invalid config"
Expand Down Expand Up @@ -242,5 +248,28 @@ def self.validate_eager_indexing_groups(groups)

errors
end

# Validates the optional top-level "prefixes" entry of the configuration.
#
# The value must be an object (Hash) whose keys are prefix names and whose
# values are URI strings that URI.parse accepts.
#
# @param prefixes [Object] the parsed JSON value of the "prefixes" key
# @return [Array<String>] one message per problem found; empty when valid
def self.validate_prefixes(prefixes)
  unless prefixes.kind_of?(Hash)
    return ["prefixes should be an object mapping prefix names to URIs"]
  end

  errors = []
  prefixes.each do |prefix, uri|
    unless prefix.kind_of?(String)
      errors << "prefix name should be a string: #{prefix.inspect}"
    end

    unless uri.kind_of?(String)
      errors << "prefix URI should be a string: #{uri.inspect}"
      next
    end

    begin
      # Parsed only to detect malformed URIs; the parse result is not needed.
      URI.parse(uri)
    rescue URI::InvalidURIError => e
      errors << "prefix URI '#{uri}' is not a valid URI: #{e.message}"
    end
  end
  errors
end
end
end
33 changes: 25 additions & 8 deletions lib/mu_search/index_definition.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
require_relative './prefix_utils'

module MuSearch
# This class represents index definitions as defined in the configuration file of mu-search
# in the config file you will find these definitions under the keyword "types"
Expand Down Expand Up @@ -58,23 +60,34 @@ def self.create_composite_sub_definitions(composite_definition, definitions)

# builds tuples mapping the index name to the full definition for each provided type
# expects all types as param
def self.from_json_config(all_definitions)
def self.from_json_config(all_definitions, prefixes = {})
all_definitions.collect do |definition|
name = definition["type"]
composite_types = []
if definition["composite_types"]
composite_types = create_composite_sub_definitions(definition, all_definitions)
composite_types.each do |definition|
ensure_uuid_in_properties definition.properties
build_property_definitions(definition.properties, prefixes)
end
else
# ensure uuid is included because it may be used for folding
ensure_uuid_in_properties definition["properties"]
build_property_definitions(definition["properties"], prefixes)
end

# Expand prefixes in rdf_type if present
rdf_type = definition["rdf_type"]
if rdf_type
if rdf_type.is_a?(Array)
rdf_type = rdf_type.map { |t| PrefixUtils.expand_prefix(t, prefixes) }
else
rdf_type = PrefixUtils.expand_prefix(rdf_type, prefixes)
end
end

index_definition = IndexDefinition.new(
name: name,
on_path: definition["on_path"],
rdf_type: definition["rdf_type"],
rdf_type: rdf_type,
composite_types: composite_types,
properties: definition["properties"],
mappings: definition["mappings"],
Expand All @@ -84,16 +97,20 @@ def self.from_json_config(all_definitions)
end
end

def self.ensure_uuid_in_properties properties
properties["uuid"] = ["http://mu.semte.ch/vocabularies/core/uuid"] unless properties.key?("uuid")
def self.build_property_definitions(properties, prefixes)
ensure_uuid_property(properties)
properties.each do |(key, value)|
property_definition = PropertyDefinition.from_json_config(key, value)
property_definition = PropertyDefinition.from_json_config(key, value, prefixes)
if property_definition.type == "nested"
ensure_uuid_in_properties value["properties"]
build_property_definitions(value["properties"], prefixes)
end
end
end

# Adds the default "uuid" property path to the given properties hash when
# the hash does not define a "uuid" key yet. The hash is modified in place;
# an existing "uuid" entry is left untouched.
def self.ensure_uuid_property(properties)
  return if properties.key?("uuid")

  properties["uuid"] = ["http://mu.semte.ch/vocabularies/core/uuid"]
end

# Returns the value stored in @name.
# NOTE(review): presumably the index type name taken from the "type" key of
# the configuration — confirm against the initializer, which is not visible here.
def type
@name
end
Expand Down
14 changes: 14 additions & 0 deletions lib/mu_search/prefix_utils.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
module MuSearch
  # Helpers for expanding prefixed names (e.g. "dct:title") into full URIs
  # using a prefix => base URI mapping.
  module PrefixUtils
    # Expands a prefixed name into a full URI.
    #
    # A leading "^" (inverse path marker, e.g. "^foaf:made") is preserved and
    # the remainder is expanded, so inverse paths can use prefixes as well.
    # Values that are not strings, or that do not start with a known
    # "<prefix>:", are returned unchanged.
    #
    # @param uri [Object] the value to expand; only strings are processed
    # @param prefixes [Hash{String => String}] prefix name => base URI
    # @return [Object] the expanded string, or the input when nothing matches
    def self.expand_prefix(uri, prefixes)
      return uri unless uri.is_a?(String)

      # Keep the inverse marker and expand what follows it.
      if uri.start_with?("^")
        return "^" + expand_prefix(uri[1..-1], prefixes)
      end

      prefixes.each do |prefix, base_uri|
        marker = "#{prefix}:"
        # Slice instead of String#sub so a backslash in base_uri is never
        # interpreted as a back-reference.
        return base_uri + uri[marker.length..-1] if uri.start_with?(marker)
      end
      uri
    end
  end
end
11 changes: 8 additions & 3 deletions lib/mu_search/property_definition.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
require_relative './prefix_utils'

module MuSearch
class PropertyDefinition
PROPERTY_TYPES = ["simple", "nested", "attachment", "language-string"]
Expand All @@ -16,7 +18,7 @@ def initialize(name: , path:, type: "auto", rdf_type: nil, sub_properties:)
end
end

def self.from_json_config(name, config)
def self.from_json_config(name, config, prefixes = {})
type = "simple"
rdf_type = sub_properties = pipeline = nil
if config.is_a?(Hash)
Expand All @@ -26,7 +28,7 @@ def self.from_json_config(name, config)
elsif config.key?("properties")
type = "nested"
sub_properties = config["properties"].map do |subname, subconfig|
from_json_config(subname, subconfig)
from_json_config(subname, subconfig, prefixes)
end
rdf_type = config["rdf_type"]
elsif config.key?("type") && config["type"] == "language-string"
Expand All @@ -38,6 +40,10 @@ def self.from_json_config(name, config)
path = [config]
end

path = path.map do |p|
PrefixUtils.expand_prefix(p, prefixes)
end

PropertyDefinition.new(
name: name,
type: type,
Expand All @@ -46,6 +52,5 @@ def self.from_json_config(name, config)
sub_properties: sub_properties,
)
end

end
end