Skip to content

Commit 03abd6a

Browse files
committed
Add Semantic Search core to OSS
1 parent 09037e2 commit 03abd6a

40 files changed

+5925
-29
lines changed

conf/openmetadata.yaml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,8 @@ elasticsearch:
431431
serviceName: ${SEARCH_AWS_SERVICE_NAME:-"es"} # Use "es" for OpenSearch, "aoss" for OpenSearch Serverless
432432
naturalLanguageSearch:
433433
enabled: ${NATURAL_LANGUAGE_SEARCH_ENABLED:-false}
434-
embeddingProvider: ${EMBEDDING_PROVIDER:-bedrock}
434+
semanticSearchEnabled: ${SEMANTIC_SEARCH_ENABLED:-false}
435+
embeddingProvider: ${EMBEDDING_PROVIDER:-bedrock} # Options: "openai", "bedrock", "djl"
435436
providerClass: ${NATURAL_LANGUAGE_SEARCH_PROVIDER_CLASS:-org.openmetadata.service.search.nlq.NoOpNLQService}
436437
bedrock:
437438
awsConfig:
@@ -443,6 +444,16 @@ elasticsearch:
443444
modelId: ${AWS_BEDROCK_MODEL_ID:-""}
444445
embeddingModelId: ${AWS_BEDROCK_EMBED_MODEL_ID:-""}
445446
embeddingDimension: ${AWS_BEDROCK_EMBEDDING_DIMENSION:-""}
447+
openai:
448+
apiKey: ${OPENAI_API_KEY:-""}
449+
# For Azure OpenAI, set endpoint and deploymentName:
450+
endpoint: ${OPENAI_API_ENDPOINT:-""} # e.g., https://your-resource.openai.azure.com
451+
deploymentName: ${OPENAI_DEPLOYMENT_NAME:-""} # Required for Azure OpenAI
452+
apiVersion: ${OPENAI_API_VERSION:-"2024-02-01"} # Azure OpenAI API version
453+
embeddingModelId: ${OPENAI_EMBEDDING_MODEL_ID:-"text-embedding-3-small"}
454+
embeddingDimension: ${OPENAI_EMBEDDING_DIMENSION:-1536}
455+
djl:
456+
embeddingModel: ${DJL_EMBEDDING_MODEL:-"ai.djl.huggingface.pytorch/sentence-transformers/all-MiniLM-L6-v2"}
446457

447458
eventMonitoringConfiguration:
448459
eventMonitor: ${EVENT_MONITOR:-prometheus} # Possible values are "prometheus", "cloudwatch"

openmetadata-service/pom.xml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,31 @@
531531
<artifactId>s3</artifactId>
532532
</dependency>
533533

534+
<!-- Dependencies for vector embedding (Bedrock) -->
535+
<dependency>
536+
<groupId>software.amazon.awssdk</groupId>
537+
<artifactId>bedrockruntime</artifactId>
538+
<version>${bedrock.runtime.version}</version>
539+
</dependency>
540+
541+
<!-- Dependencies for vector embedding (DJL) -->
542+
<dependency>
543+
<groupId>ai.djl</groupId>
544+
<artifactId>api</artifactId>
545+
<version>0.34.0</version>
546+
</dependency>
547+
<dependency>
548+
<groupId>ai.djl.pytorch</groupId>
549+
<artifactId>pytorch-engine</artifactId>
550+
<version>0.34.0</version>
551+
<scope>runtime</scope>
552+
</dependency>
553+
<dependency>
554+
<groupId>ai.djl.huggingface</groupId>
555+
<artifactId>tokenizers</artifactId>
556+
<version>0.34.0</version>
557+
</dependency>
558+
534559
<!--test dependencies-->
535560
<dependency>
536561
<groupId>org.glassfish.jersey.core</groupId>

openmetadata-service/src/main/java/org/openmetadata/service/OpenMetadataApplication.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@
130130
import org.openmetadata.service.resources.filters.ETagResponseFilter;
131131
import org.openmetadata.service.resources.settings.SettingsCache;
132132
import org.openmetadata.service.search.SearchRepository;
133+
import org.openmetadata.service.search.SearchRepositoryFactory;
133134
import org.openmetadata.service.secrets.SecretsManagerFactory;
134135
import org.openmetadata.service.secrets.masker.EntityMaskerFactory;
135136
import org.openmetadata.service.security.AuthCallbackServlet;
@@ -277,6 +278,9 @@ public void run(OpenMetadataApplicationConfig catalogConfig, Environment environ
277278
// Phase 2: Advanced search features (after settings are available)
278279
initializeAdvancedSearchFeatures();
279280

281+
// Phase 3: Vector search (embeddings + vector index)
282+
Entity.getSearchRepository().initializeVectorSearchService();
283+
280284
SecurityConfigurationManager.getInstance().initialize(this, catalogConfig, environment);
281285

282286
// Instantiate JWT Token Generator
@@ -550,7 +554,7 @@ protected void initializeCoreSearchInfrastructure(OpenMetadataApplicationConfig
550554
databaseMaxSize);
551555

552556
SearchRepository searchRepository =
553-
new SearchRepository(
557+
SearchRepositoryFactory.createSearchRepository(
554558
config.getElasticSearchConfiguration(), config.getDataSourceFactory().getMaxSize());
555559
Entity.setSearchRepository(searchRepository);
556560

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package org.openmetadata.service.exception;
2+
3+
import lombok.Getter;
4+
5+
/**
 * Checked exception carrying the embedding model and vector dimension that are currently in
 * place alongside the model and dimension that are required.
 *
 * <p>Accessors are spelled out explicitly; they expose the same {@code getXxx()} methods a
 * class-level getter annotation would generate for the four fields.
 */
public class VectorDimensionException extends Exception {
  private final String currentModel;
  private final int currentDimension;
  private final String requiredModel;
  private final int requiredDimension;

  /**
   * @param message human-readable description of the problem
   * @param currentModel model identifier currently in use; may be {@code null}
   * @param currentDimension vector dimension currently in use
   * @param requiredModel model identifier that is required; may be {@code null}
   * @param requiredDimension vector dimension that is required
   */
  public VectorDimensionException(
      String message,
      String currentModel,
      int currentDimension,
      String requiredModel,
      int requiredDimension) {
    super(message);
    this.currentModel = currentModel;
    this.currentDimension = currentDimension;
    this.requiredModel = requiredModel;
    this.requiredDimension = requiredDimension;
  }

  /** @return the model identifier currently in use, possibly {@code null} */
  public String getCurrentModel() {
    return currentModel;
  }

  /** @return the vector dimension currently in use */
  public int getCurrentDimension() {
    return currentDimension;
  }

  /** @return the model identifier that is required, possibly {@code null} */
  public String getRequiredModel() {
    return requiredModel;
  }

  /** @return the vector dimension that is required */
  public int getRequiredDimension() {
    return requiredDimension;
  }

  /**
   * @return {@code true} only when both model identifiers are non-null and differ; a missing
   *     identifier on either side is never reported as a mismatch
   */
  public boolean isModelMismatch() {
    if (currentModel == null || requiredModel == null) {
      return false;
    }
    return !currentModel.equals(requiredModel);
  }

  /** @return {@code true} when the current and required dimensions differ */
  public boolean isDimensionMismatch() {
    return requiredDimension != currentDimension;
  }
}

openmetadata-service/src/main/java/org/openmetadata/service/jdbi3/SystemRepository.java

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
import org.openmetadata.schema.security.client.OidcClientConfig;
4949
import org.openmetadata.schema.security.client.OpenMetadataJWTClientConfig;
5050
import org.openmetadata.schema.security.scim.ScimConfiguration;
51+
import org.openmetadata.schema.service.configuration.elasticsearch.NaturalLanguageSearchConfiguration;
5152
import org.openmetadata.schema.service.configuration.slackApp.SlackAppConfiguration;
5253
import org.openmetadata.schema.services.connections.metadata.AuthProvider;
5354
import org.openmetadata.schema.services.connections.metadata.OpenMetadataConnection;
@@ -74,6 +75,7 @@
7475
import org.openmetadata.service.migration.MigrationValidationClient;
7576
import org.openmetadata.service.resources.settings.SettingsCache;
7677
import org.openmetadata.service.search.SearchRepository;
78+
import org.openmetadata.service.search.vector.client.EmbeddingClient;
7779
import org.openmetadata.service.secrets.SecretsManager;
7880
import org.openmetadata.service.secrets.SecretsManagerFactory;
7981
import org.openmetadata.service.secrets.masker.PasswordEntityMasker;
@@ -544,6 +546,11 @@ public ValidationResponse validateSystem(
544546
validation.setLogStorage(logStorageValidation);
545547
}
546548

549+
if (Entity.getSearchRepository().isVectorEmbeddingEnabled()) {
550+
validation.setAdditionalProperty(
551+
"Semantic Search", getEmbeddingsValidation(applicationConfig));
552+
}
553+
547554
addExtraValidations(applicationConfig, validation);
548555

549556
return validation;
@@ -552,6 +559,133 @@ public ValidationResponse validateSystem(
552559
/**
 * Hook for contributing additional checks to the system validation response.
 *
 * <p>The body is intentionally empty here; {@code validateSystem} invokes it just before
 * returning the response. Presumably meant to be overridden by subclasses - TODO confirm.
 *
 * @param applicationConfig application configuration passed through from {@code validateSystem}
 * @param validation response object that extra validation results can be attached to
 */
public void addExtraValidations(
553560
OpenMetadataApplicationConfig applicationConfig, ValidationResponse validation) {}
554561

562+
private StepValidation getEmbeddingsValidation(OpenMetadataApplicationConfig applicationConfig) {
563+
StepValidation embeddingsValidation = new StepValidation();
564+
String description = "Embeddings are used to allow Semantic Search";
565+
SearchRepository searchRepository = Entity.getSearchRepository();
566+
567+
String configMessage = getEmbeddingConfigurationMessage(applicationConfig);
568+
569+
if (searchRepository.getVectorIndexService() == null) {
570+
return embeddingsValidation
571+
.withDescription(description)
572+
.withMessage("Embeddings are not configured properly. " + configMessage)
573+
.withPassed(false);
574+
}
575+
576+
try {
577+
searchRepository.ensureVectorIndexDimension();
578+
} catch (Exception e) {
579+
LOG.error("Vector dimension mismatch detected", e);
580+
return embeddingsValidation
581+
.withDescription(description)
582+
.withMessage("Vector dimension mismatch: " + e.getMessage())
583+
.withPassed(false);
584+
}
585+
586+
try {
587+
return validateEmbeddingGeneration(
588+
searchRepository.getEmbeddingClient(), embeddingsValidation, description, configMessage);
589+
} catch (Exception e) {
590+
LOG.error("Error during embedding generation validation", e);
591+
return embeddingsValidation
592+
.withDescription(description)
593+
.withMessage("Embedding generation failed: " + e.getMessage() + ". " + configMessage)
594+
.withPassed(false);
595+
}
596+
}
597+
598+
private StepValidation validateEmbeddingGeneration(
599+
EmbeddingClient embeddingClient,
600+
StepValidation embeddingsValidation,
601+
String description,
602+
String configMessage) {
603+
String testText = "OpenMetadata embedding validation test";
604+
float[] embedding = embeddingClient.embed(testText);
605+
606+
if (embedding == null) {
607+
return embeddingsValidation
608+
.withDescription(description)
609+
.withMessage("Embedding generation returned null. " + configMessage)
610+
.withPassed(false);
611+
}
612+
613+
int expectedDimension = embeddingClient.getDimension();
614+
if (embedding.length != expectedDimension) {
615+
return embeddingsValidation
616+
.withDescription(description)
617+
.withMessage(
618+
String.format(
619+
"Embedding dimension mismatch: expected %d, got %d. %s",
620+
expectedDimension, embedding.length, configMessage))
621+
.withPassed(false);
622+
}
623+
624+
boolean allZeros = true;
625+
for (float value : embedding) {
626+
if (value != 0.0f) {
627+
allZeros = false;
628+
break;
629+
}
630+
}
631+
if (allZeros) {
632+
return embeddingsValidation
633+
.withDescription(description)
634+
.withMessage("Embedding generation returned all zeros. " + configMessage)
635+
.withPassed(false);
636+
}
637+
638+
return embeddingsValidation
639+
.withDescription(description)
640+
.withMessage(String.format("Embeddings are working correctly. %s", configMessage))
641+
.withPassed(true);
642+
}
643+
644+
private String getEmbeddingConfigurationMessage(OpenMetadataApplicationConfig applicationConfig) {
645+
try {
646+
NaturalLanguageSearchConfiguration nlpConfig =
647+
applicationConfig.getElasticSearchConfiguration().getNaturalLanguageSearch();
648+
String provider = nlpConfig.getEmbeddingProvider();
649+
if (nullOrEmpty(provider)) {
650+
return "Required configuration: embeddingProvider";
651+
}
652+
653+
return switch (provider.toLowerCase()) {
654+
case "djl" -> String.format(
655+
"DJL configuration: embeddingModel: %s", nlpConfig.getDjl().getEmbeddingModel());
656+
case "bedrock" -> String.format(
657+
"Bedrock configuration: region: %s, embeddingModelId: %s, embeddingDimension %s",
658+
nlpConfig.getBedrock().getAwsConfig() != null
659+
? nlpConfig.getBedrock().getAwsConfig().getRegion()
660+
: "not configured",
661+
nlpConfig.getBedrock().getEmbeddingModelId(),
662+
nlpConfig.getBedrock().getEmbeddingDimension());
663+
case "openai" -> {
664+
String openaiEndpoint =
665+
nullOrEmpty(nlpConfig.getOpenai().getEndpoint())
666+
? "api.openai.com"
667+
: nlpConfig.getOpenai().getEndpoint();
668+
String deploymentInfo =
669+
nullOrEmpty(nlpConfig.getOpenai().getDeploymentName())
670+
? ""
671+
: String.format(
672+
", deploymentName: %s", nlpConfig.getOpenai().getDeploymentName());
673+
yield String.format(
674+
"OpenAI configuration: endpoint: %s, embeddingModelId: %s, embeddingDimension: %s%s",
675+
openaiEndpoint,
676+
nlpConfig.getOpenai().getEmbeddingModelId(),
677+
nlpConfig.getOpenai().getEmbeddingDimension(),
678+
deploymentInfo);
679+
}
680+
default -> String.format(
681+
"Unknown provider '%s'. Supported providers: djl, bedrock, openai", provider);
682+
};
683+
} catch (Exception e) {
684+
LOG.error("Error getting embedding configuration", e);
685+
return "Unable to determine embedding configuration";
686+
}
687+
}
688+
555689
private StepValidation getDatabaseValidation(OpenMetadataApplicationConfig applicationConfig) {
556690
try {
557691
dao.testConnection();

0 commit comments

Comments
 (0)