From 24f7b4e007bb2b95b4ae4bd8a4b7fd8815b9fa0d Mon Sep 17 00:00:00 2001 From: Niels V Date: Tue, 3 Jun 2025 21:34:57 +0200 Subject: [PATCH] initial support for prefixes --- README.md | 25 +++++++++++++++++++++ lib/mu_search/config_parser.rb | 33 ++++++++++++++++++++++++++-- lib/mu_search/index_definition.rb | 33 +++++++++++++++++++++------- lib/mu_search/prefix_utils.rb | 14 ++++++++++++ lib/mu_search/property_definition.rb | 11 +++++++--- 5 files changed, 103 insertions(+), 13 deletions(-) create mode 100644 lib/mu_search/prefix_utils.rb diff --git a/README.md b/README.md index 8a72364..9fff527 100644 --- a/README.md +++ b/README.md @@ -406,6 +406,7 @@ In the example below the documents index contains a property `topics` that maps } ``` + ##### File content property To make the content of a file searchable, it needs to be indexed as a property in a search index. Basic indexing of PDF, Word etc. files is provided using a local [Apache Tika](https://tika.apache.org/) instance. A default ingest pipeline named `attachment` is created on startup of the mu-search service. Note that this is under development and liable to change. @@ -643,6 +644,30 @@ The example below contains 2 simple indexes for documents and creative works, an ] } ``` +#### Using Prefixes +To make the configuration more concise and maintainable, you can define prefixes for commonly used URI namespaces. 
Prefixes are defined at the root level of the configuration using the `prefixes` property: + +```json +{ + "prefixes": { + "foaf": "http://xmlns.com/foaf/0.1/", + "dct": "http://purl.org/dc/terms/", + "skos": "http://www.w3.org/2004/02/skos/core#" + }, + "types": [ + { + "type": "document", + "on_path": "documents", + "rdf_type": "foaf:Document", + "properties": { + "title": "dct:title", + "label": "skos:prefLabel", + "creator": "^foaf:made" + } + } + ] +} +``` #### Elasticsearch settings Elasticsearch provides a lot of [index configuration settings](https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html) for analysis, logging, etc. Mu-search allows to provide this configuration for the whole domain and/or to be overridden (currently not merged!) on a per-type basis. diff --git a/lib/mu_search/config_parser.rb b/lib/mu_search/config_parser.rb index 5490b15..cc0d76f 100644 --- a/lib/mu_search/config_parser.rb +++ b/lib/mu_search/config_parser.rb @@ -16,7 +16,8 @@ def self.parse(path) eager_indexing_groups: [], update_wait_interval_minutes: 1, number_of_threads: 1, - enable_raw_dsl_endpoint: false + enable_raw_dsl_endpoint: false, + prefixes: {} } json_config = JSON.parse(File.read(path)) @@ -48,7 +49,9 @@ def self.parse(path) config[:eager_indexing_groups] = json_config["eager_indexing_groups"] end config[:ignored_allowed_groups] = json_config["ignored_allowed_groups"] || [] - config[:type_definitions] = Hash[MuSearch::IndexDefinition.from_json_config(json_config["types"])] + config[:prefixes] = json_config["prefixes"] || {} + config[:type_definitions] = Hash[MuSearch::IndexDefinition.from_json_config(json_config["types"], config[:prefixes])] + config end @@ -120,6 +123,9 @@ def self.validate_config(json_config) if json_config.has_key?("ignored_allowed_groups") errors = errors.concat(self.validate_ignored_allowed_groups(json_config["ignored_allowed_groups"])) end + if json_config.has_key?("prefixes") + errors = 
# Validates the optional root-level "prefixes" section of the configuration.
#
# Every entry must map a prefix name (String) to a namespace given as an
# absolute URI (String).
#
# @param prefixes [Object] the parsed JSON value found under the "prefixes" key
# @return [Array<String>] human-readable error messages; empty when valid
def self.validate_prefixes(prefixes)
  unless prefixes.is_a?(Hash)
    return ["prefixes should be an object mapping prefix names to URIs"]
  end

  prefixes.each_with_object([]) do |(prefix, uri), errors|
    errors << "prefix name should be a string: #{prefix.inspect}" unless prefix.is_a?(String)

    unless uri.is_a?(String)
      errors << "prefix URI should be a string: #{uri.inspect}"
      next
    end

    begin
      # URI.parse on its own also accepts "" and relative references such as
      # "foo". A namespace is prepended verbatim to local names, so it has to
      # carry a scheme.
      errors << "prefix URI '#{uri}' should be an absolute URI" unless URI.parse(uri).absolute?
    rescue URI::InvalidURIError => e
      errors << "prefix URI '#{uri}' is not a valid URI: #{e.message}"
    end
  end
end
module MuSearch
  # Helpers to expand prefixed names (e.g. "foaf:name") into full URIs using
  # the prefix map from the configuration.
  module PrefixUtils
    # Marker that may precede a predicate to denote an inverse path,
    # e.g. "^foaf:made".
    INVERSE_MARKER = "^"

    # Expands a prefixed name into a full URI.
    #
    # A leading "^" is kept in front of the expanded URI. Values that are not
    # strings, that use an unknown prefix, or that already look like an
    # absolute IRI ("scheme://...") are returned unchanged.
    #
    # @param uri [Object] value to expand; only Strings are processed
    # @param prefixes [Hash{String => String}] prefix name => namespace URI
    # @return [Object] the expanded URI, or the unmodified input
    def self.expand_prefix(uri, prefixes)
      return uri unless uri.is_a?(String)

      inverse = uri.start_with?(INVERSE_MARKER)
      prefixed_name = inverse ? uri[1..-1] : uri

      prefixes.each do |prefix, base_uri|
        head = "#{prefix}:"
        next unless prefixed_name.start_with?(head)

        local_name = prefixed_name[head.length..-1]
        # "http://..." is an absolute IRI, not the prefixed name "http" + "//...",
        # even when a prefix happens to be called "http".
        next if local_name.start_with?("//")

        # Plain concatenation: String#sub would interpret backslash sequences
        # in the namespace as back-references.
        return "#{inverse ? INVERSE_MARKER : ''}#{base_uri}#{local_name}"
      end

      uri
    end
  end
end
b/lib/mu_search/property_definition.rb index 6777b28..3d09111 100644 --- a/lib/mu_search/property_definition.rb +++ b/lib/mu_search/property_definition.rb @@ -1,3 +1,5 @@ +require_relative './prefix_utils' + module MuSearch class PropertyDefinition PROPERTY_TYPES = ["simple", "nested", "attachment", "language-string"] @@ -16,7 +18,7 @@ def initialize(name: , path:, type: "auto", rdf_type: nil, sub_properties:) end end - def self.from_json_config(name, config) + def self.from_json_config(name, config, prefixes = {}) type = "simple" rdf_type = sub_properties = pipeline = nil if config.is_a?(Hash) @@ -26,7 +28,7 @@ def self.from_json_config(name, config) elsif config.key?("properties") type = "nested" sub_properties = config["properties"].map do |subname, subconfig| - from_json_config(subname, subconfig) + from_json_config(subname, subconfig, prefixes) end rdf_type = config["rdf_type"] elsif config.key?("type") && config["type"] == "language-string" @@ -38,6 +40,10 @@ def self.from_json_config(name, config) path = [config] end + path = path.map do |p| + PrefixUtils.expand_prefix(p, prefixes) + end + PropertyDefinition.new( name: name, type: type, @@ -46,6 +52,5 @@ def self.from_json_config(name, config) sub_properties: sub_properties, ) end - end end