diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 000000000000..d792ac9587bc --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,9 @@ +{ + "permissions": { + "allow": [ + "Bash(chmod:*)" + ], + "deny": [], + "ask": [] + } +} diff --git a/LLMD_INTEGRATION_SUMMARY.md b/LLMD_INTEGRATION_SUMMARY.md new file mode 100644 index 000000000000..310ac2e288fe --- /dev/null +++ b/LLMD_INTEGRATION_SUMMARY.md @@ -0,0 +1,296 @@ +# LLM-D Integration Summary + +This document provides a summary of the llm-d controller integration into the opendatahub-operator. + +## What Was Implemented + +A complete POC integration of llm-d as a component in the OpenDataHub operator using an operator SDK + Helm-based approach. + +## Components Created + +### 1. API Types +**Location**: `api/components/v1alpha1/llmd_types.go` + +Defines the CRD for llm-d component including: +- `Llmd`: Main CRD +- `LlmdSpec`: Configuration for three Helm charts (ModelService, Infra, Gateway API) +- `LlmdStatus`: Status information +- `DSCLlmd`: Integration with DataScienceCluster + +### 2. Component Handler +**Location**: `internal/controller/components/llmd/` + +Files created: +- `llmd.go`: Component handler implementing registry interface +- `llmd_controller.go`: Reconciliation controller with resource watching +- `llmd_controller_actions.go`: Custom reconciliation actions +- `llmd_support.go`: Helper functions for manifest paths +- `monitoring/llmd-prometheusrules.tmpl.yaml`: Prometheus monitoring rules + +### 3. Helm Chart Integration +**Location**: `hack/fetch-helm-charts.sh` + +Script that: +- Downloads three Helm charts: + - llm-d-modelservice (v0.2.11) + - llm-d-infra (v1.3.3) + - Gateway API Inference Extension (v1.0.1) +- Templates them with default values +- Creates Kustomize-compatible manifests in `opt/manifests/llmd/` + +### 4. 
Integration Points + +#### DataScienceCluster Integration +**Modified**: `api/datasciencecluster/v2/datasciencecluster_types.go` +- Added `Llmd` field to `Components` struct +- Added `Llmd` field to `ComponentsStatus` struct + +#### Main Controller Registration +**Modified**: `cmd/main.go` +- Added import for llmd controller + +#### Project Configuration +**Modified**: `PROJECT` +- Added Llmd resource definition + +#### Manifest Fetching +**Modified**: `get_all_manifests.sh` +- Added call to `hack/fetch-helm-charts.sh` + +### 5. Documentation & Examples + +Created: +- `docs/llmd-integration.md`: Comprehensive integration guide +- `internal/controller/components/llmd/README.md`: Component-specific README +- `config/samples/components_v1alpha1_llmd.yaml`: Example Llmd CR +- `config/samples/datasciencecluster_v2_llmd_example.yaml`: Example DSC with llm-d + +## How It Works + +### Build-Time +1. `make get-manifests` runs `hack/fetch-helm-charts.sh` +2. Script downloads and templates Helm charts +3. Manifests are stored in `opt/manifests/llmd/` +4. Operator image includes these pre-rendered manifests + +### Runtime +1. User creates DataScienceCluster or Llmd CR +2. Component handler creates/updates Llmd CR +3. Controller reconciles Llmd CR +4. Deploys manifests from `opt/manifests/llmd/overlays/default` +5. 
Updates status in Llmd and DataScienceCluster + +## Deployment Example + +### Using DataScienceCluster + +```yaml +apiVersion: datasciencecluster.opendatahub.io/v2 +kind: DataScienceCluster +metadata: + name: default-dsc +spec: + components: + llmd: + managementState: Managed + modelService: + enabled: true + version: "0.2.11" + infra: + enabled: true + version: "1.3.3" + gatewayAPI: + enabled: true + version: "1.0.1" +``` + +### Using Llmd CR Directly + +```yaml +apiVersion: components.platform.opendatahub.io/v1alpha1 +kind: Llmd +metadata: + name: default-llm-d +spec: + modelService: + enabled: true + version: "0.2.11" + infra: + enabled: true + version: "1.3.3" + gatewayAPI: + enabled: true + version: "1.0.1" +``` + +## Testing the Integration + +### 1. Generate Manifests +```bash +make get-manifests +``` + +This will: +- Fetch all component manifests +- Download and template llm-d Helm charts + +### 2. Generate CRDs +```bash +make manifests +``` + +This generates the Kubernetes CRDs including the new Llmd CRD. + +### 3. Build Operator +```bash +make image IMG=quay.io/<username>/opendatahub-operator:<tag> +``` + +### 4. Deploy Operator +```bash +make deploy IMG=quay.io/<username>/opendatahub-operator:<tag> +``` + +### 5. Create Test Instance +```bash +kubectl apply -f config/samples/datasciencecluster_v2_llmd_example.yaml +``` + +### 6. Verify Deployment +```bash +# Check Llmd CR status +kubectl get llmd default-llm-d -o yaml + +# Check DataScienceCluster status +kubectl get datasciencecluster default-dsc -o jsonpath='{.status.components.llmd}' + +# Check deployed resources +kubectl get all -n llm-d -l app.kubernetes.io/part-of=llm-d +``` + +## Files Modified/Created + +### Created Files (12 total) +1. `api/components/v1alpha1/llmd_types.go` +2. `internal/controller/components/llmd/llmd.go` +3. `internal/controller/components/llmd/llmd_controller.go` +4. `internal/controller/components/llmd/llmd_controller_actions.go` +5. `internal/controller/components/llmd/llmd_support.go` +6. 
`internal/controller/components/llmd/monitoring/llmd-prometheusrules.tmpl.yaml` +7. `internal/controller/components/llmd/README.md` +8. `hack/fetch-helm-charts.sh` +9. `config/samples/components_v1alpha1_llmd.yaml` +10. `config/samples/datasciencecluster_v2_llmd_example.yaml` +11. `docs/llmd-integration.md` +12. `LLMD_INTEGRATION_SUMMARY.md` (this file) + +### Modified Files (4 total) +1. `api/datasciencecluster/v2/datasciencecluster_types.go` +2. `cmd/main.go` +3. `PROJECT` +4. `get_all_manifests.sh` + +## Architecture Diagram + +``` +┌─────────────────────────────────────────────────────────────┐ +│ DataScienceCluster CR │ +│ spec.components.llmd.managementState: Managed │ +└─────────────────────┬───────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Component Handler (llmd.go) │ +│ - Registered in component registry │ +│ - Creates/Updates Llmd CR from DSC spec │ +└─────────────────────┬───────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Llmd CR (CRD) │ +│ apiVersion: components.platform.opendatahub.io/v1alpha1 │ +│ kind: Llmd │ +└─────────────────────┬───────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Controller (llmd_controller.go) │ +│ - Watches Llmd CR │ +│ - Renders Kustomize manifests │ +│ - Deploys resources to cluster │ +│ - Updates status │ +└─────────────────────┬───────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Deployed Resources in llm-d namespace │ +│ │ +│ ┌──────────────────┐ ┌──────────────────┐ │ +│ │ ModelService │ │ Infra │ │ +│ │ (v0.2.11) │ │ (v1.3.3) │ │ +│ └──────────────────┘ └──────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────┐ │ +│ │ Gateway API Inference Extension │ │ +│ │ (v1.0.1) │ │ +│ └──────────────────────────────────────────┘ │ 
+└─────────────────────────────────────────────────────────────┘ +``` + +## Next Steps + +To use this integration: + +1. **Build Prerequisites**: + - Install Helm CLI on build machine + - Ensure network access to Helm chart repositories + +2. **Generate Manifests**: + ```bash + make get-manifests + ``` + +3. **Generate CRDs**: + ```bash + make manifests + ``` + +4. **Build & Deploy**: + ```bash + make image-build + make deploy + ``` + +5. **Create Instance**: + ```bash + kubectl apply -f config/samples/datasciencecluster_v2_llmd_example.yaml + ``` + +## Limitations & Future Work + +### Current Limitations +1. Helm charts are templated at build time, not runtime +2. Limited support for custom Helm values override +3. Fixed namespace (`llm-d`) +4. Chart version changes require operator rebuild + +### Recommended Enhancements +1. **Dynamic Helm Integration**: Use Helm SDK to deploy charts at runtime +2. **Full Values Support**: Allow complete Helm values.yaml override +3. **Namespace Configuration**: Make namespace configurable +4. **Chart Upgrade Logic**: Implement proper Helm upgrade/rollback +5. 
**Chart Repository Config**: Make chart URLs and versions configurable + +## References + +- **LLM-D ModelService Chart**: https://llm-d-incubation.github.io/llm-d-modelservice/ +- **LLM-D Infra Chart**: https://llm-d-incubation.github.io/llm-d-infra/ +- **Gateway API Inference Extension**: https://github.com/kubernetes-sigs/gateway-api-inference-extension +- **OpenDataHub Operator**: https://github.com/opendatahub-io/opendatahub-operator + +## Questions & Support + +For questions or issues with this integration, refer to: +- `docs/llmd-integration.md` - Full integration guide +- `internal/controller/components/llmd/README.md` - Component README +- OpenDataHub documentation - https://opendatahub.io/ diff --git a/PROJECT b/PROJECT index e56587b8dd32..f7c57c4820f6 100644 --- a/PROJECT +++ b/PROJECT @@ -144,6 +144,14 @@ resources: kind: LlamaStackOperator path: github.com/opendatahub-io/opendatahub-operator/v2/api/components/v1alpha1 version: v1alpha1 +- api: + crdVersion: v1alpha1 + controller: true + domain: platform.opendatahub.io + group: components + kind: Llmd + path: github.com/opendatahub-io/opendatahub-operator/v2/api/components/v1alpha1 + version: v1alpha1 - api: crdVersion: v1alpha1 controller: true diff --git a/api/components/v1alpha1/llmd_types.go b/api/components/v1alpha1/llmd_types.go new file mode 100644 index 000000000000..6f8fb9a2d8e8 --- /dev/null +++ b/api/components/v1alpha1/llmd_types.go @@ -0,0 +1,159 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + operatorv1 "github.com/openshift/api/operator/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/opendatahub-io/opendatahub-operator/v2/api/common" +) + +const ( + LlmdComponentName = "llm-d" + // value should match what's set in the XValidation below + LlmdInstanceName = "default-llm-d" + LlmdKind = "Llmd" +) + +// Check that the component implements common.PlatformObject. +var _ common.PlatformObject = (*Llmd)(nil) + +// LlmdCommonSpec spec defines the shared desired state of Llmd +type LlmdCommonSpec struct { + // ModelService configuration for llm-d modelservice chart + ModelService LlmdModelServiceSpec `json:"modelService,omitempty"` + + // Infra configuration for llm-d infra chart + Infra LlmdInfraSpec `json:"infra,omitempty"` + + // GatewayAPI configuration for Gateway API Inference Extension + GatewayAPI LlmdGatewayAPISpec `json:"gatewayAPI,omitempty"` +} + +// LlmdModelServiceSpec defines configuration for the llm-d modelservice Helm chart +type LlmdModelServiceSpec struct { + // Enable or disable modelservice deployment. The JSON tag has no omitempty so an explicit false is serialized instead of being dropped and re-defaulted to true. + // +kubebuilder:default=true + Enabled bool `json:"enabled"` + + // Chart version to use (default: 0.2.11) + // +kubebuilder:default="0.2.11" + Version string `json:"version,omitempty"` + + // Additional Helm values to override defaults + // +optional + Values map[string]string `json:"values,omitempty"` +} + +// LlmdInfraSpec defines configuration for the llm-d infra Helm chart +type LlmdInfraSpec struct { + // Enable or disable infra deployment. The JSON tag has no omitempty so an explicit false is serialized instead of being dropped and re-defaulted to true. + // +kubebuilder:default=true + Enabled bool `json:"enabled"` + + // Chart version to use (default: 1.3.3) + // +kubebuilder:default="1.3.3" + Version string `json:"version,omitempty"` + + // Additional Helm values to override defaults + // +optional + Values map[string]string `json:"values,omitempty"` +} + +// 
LlmdGatewayAPISpec defines configuration for the Gateway API Inference Extension Helm chart +type LlmdGatewayAPISpec struct { + // Enable or disable Gateway API Inference Extension deployment. The JSON tag has no omitempty so an explicit false is serialized instead of being dropped and re-defaulted to true. + // +kubebuilder:default=true + Enabled bool `json:"enabled"` + + // Chart version to use (default: 1.0.1) + // +kubebuilder:default="1.0.1" + Version string `json:"version,omitempty"` + + // Additional Helm values to override defaults + // +optional + Values map[string]string `json:"values,omitempty"` +} + +// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. + +// LlmdSpec defines the desired state of Llmd +type LlmdSpec struct { + // llmd spec exposed to DSC api + LlmdCommonSpec `json:",inline"` + // llmd spec exposed only to internal api +} + +// LlmdCommonStatus defines the shared observed state of Llmd +type LlmdCommonStatus struct { + common.ComponentReleaseStatus `json:",inline"` +} + +// LlmdStatus defines the observed state of Llmd +type LlmdStatus struct { + common.Status `json:",inline"` + LlmdCommonStatus `json:",inline"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster +// +kubebuilder:validation:XValidation:rule="self.metadata.name == 'default-llm-d'",message="Llmd name must be default-llm-d" +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].status`,description="Ready" +// +kubebuilder:printcolumn:name="Reason",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].reason`,description="Reason" + +// Llmd is the Schema for the llmd API +type Llmd struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec LlmdSpec `json:"spec,omitempty"` + Status LlmdStatus `json:"status,omitempty"` +} + +func (c *Llmd) GetStatus() *common.Status { + return &c.Status.Status +} + +// +kubebuilder:object:root=true + +// LlmdList contains a list of Llmd 
+type LlmdList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []Llmd `json:"items"` +} + +func init() { //nolint:gochecknoinits + SchemeBuilder.Register(&Llmd{}, &LlmdList{}) +} + +// DSCLlmd contains configuration for llm-d component in DSC +type DSCLlmd struct { + // ManagementState indicates the component's management state + // +kubebuilder:validation:Enum=Managed;Removed + ManagementState operatorv1.ManagementState `json:"managementState,omitempty"` + + LlmdCommonSpec `json:",inline"` +} + +// DSCLlmdStatus contains status information for llm-d component in DSC +type DSCLlmdStatus struct { + ManagementState operatorv1.ManagementState `json:"managementState,omitempty"` + *LlmdCommonStatus `json:",inline"` +} diff --git a/api/datasciencecluster/v2/datasciencecluster_types.go b/api/datasciencecluster/v2/datasciencecluster_types.go index beebb3b167e7..e524eed5b1f6 100644 --- a/api/datasciencecluster/v2/datasciencecluster_types.go +++ b/api/datasciencecluster/v2/datasciencecluster_types.go @@ -63,6 +63,9 @@ type Components struct { // LlamaStack Operator component configuration. LlamaStackOperator componentApi.DSCLlamaStackOperator `json:"llamastackoperator,omitempty"` + + // Llmd component configuration. + Llmd componentApi.DSCLlmd `json:"llmd,omitempty"` } // ComponentsStatus defines the custom status of DataScienceCluster components. @@ -99,6 +102,9 @@ type ComponentsStatus struct { // LlamaStack Operator component status. LlamaStackOperator componentApi.DSCLlamaStackOperatorStatus `json:"llamastackoperator,omitempty"` + + // Llmd component status. + Llmd componentApi.DSCLlmdStatus `json:"llmd,omitempty"` } // DataScienceClusterStatus defines the observed state of DataScienceCluster. 
diff --git a/cmd/main.go b/cmd/main.go index 1ac0b8443785..87e47ad57fbb 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -90,6 +90,7 @@ import ( _ "github.com/opendatahub-io/opendatahub-operator/v2/internal/controller/components/feastoperator" _ "github.com/opendatahub-io/opendatahub-operator/v2/internal/controller/components/kserve" _ "github.com/opendatahub-io/opendatahub-operator/v2/internal/controller/components/kueue" _ "github.com/opendatahub-io/opendatahub-operator/v2/internal/controller/components/llamastackoperator" + _ "github.com/opendatahub-io/opendatahub-operator/v2/internal/controller/components/llmd" _ "github.com/opendatahub-io/opendatahub-operator/v2/internal/controller/components/modelcontroller" _ "github.com/opendatahub-io/opendatahub-operator/v2/internal/controller/components/modelregistry" diff --git a/config/samples/components_v1alpha1_llmd.yaml b/config/samples/components_v1alpha1_llmd.yaml new file mode 100644 index 000000000000..880b5d4ec120 --- /dev/null +++ b/config/samples/components_v1alpha1_llmd.yaml @@ -0,0 +1,28 @@ +apiVersion: components.platform.opendatahub.io/v1alpha1 +kind: Llmd +metadata: + name: default-llm-d +spec: + # ModelService configuration + modelService: + enabled: true + version: "0.2.11" + # Optional: Override default Helm values + # values: + # key: value + + # Infra configuration + infra: + enabled: true + version: "1.3.3" + # Optional: Override default Helm values + # values: + # key: value + + # Gateway API Inference Extension configuration + gatewayAPI: + enabled: true + version: "1.0.1" + # Optional: Override default Helm values + # values: + # key: value diff --git a/config/samples/datasciencecluster_v2_llmd_example.yaml b/config/samples/datasciencecluster_v2_llmd_example.yaml new file mode 100644 index 000000000000..3edb0f455130 --- /dev/null +++ b/config/samples/datasciencecluster_v2_llmd_example.yaml @@ -0,0 +1,24 @@ +apiVersion: datasciencecluster.opendatahub.io/v2 +kind: DataScienceCluster +metadata: + name: 
default-dsc +spec: + components: + # Enable llm-d component + llmd: + managementState: Managed + modelService: + enabled: true + version: "0.2.11" + infra: + enabled: true + version: "1.3.3" + gatewayAPI: + enabled: true + version: "1.0.1" + + # Other components can be configured as needed + dashboard: + managementState: Managed + workbenches: + managementState: Managed diff --git a/docs/llmd-integration.md b/docs/llmd-integration.md new file mode 100644 index 000000000000..ba8fafc30504 --- /dev/null +++ b/docs/llmd-integration.md @@ -0,0 +1,248 @@ +# LLM-D Integration Guide + +## Overview + +This document describes the integration of llm-d (LLM Distribution) as a component in the OpenDataHub operator. This is a POC implementation using an operator SDK + Helm-based approach. + +## Architecture + +The llm-d integration consists of three main Helm charts: + +1. **ModelService** (v0.2.11) - Core model serving functionality + - Chart URL: https://llm-d-incubation.github.io/llm-d-modelservice/ + +2. **Infra** (v1.3.3) - Infrastructure components + - Chart URL: https://llm-d-incubation.github.io/llm-d-infra/ + +3. 
**Gateway API Inference Extension** (v1.0.1) - Gateway API integration + - Chart URL: https://github.com/kubernetes-sigs/gateway-api-inference-extension + +## Components + +### API Types + +The llm-d component is defined in `api/components/v1alpha1/llmd_types.go` with the following structure: + +- **Llmd**: The main CRD for llm-d component +- **LlmdSpec**: Configuration for all three Helm charts +- **LlmdStatus**: Status information for the component + +### Controller + +The controller is implemented in `internal/controller/components/llmd/` with: + +- **llmd.go**: Component handler implementing the registry interface +- **llmd_controller.go**: Reconciliation logic +- **llmd_controller_actions.go**: Custom actions during reconciliation +- **llmd_support.go**: Helper functions + +### Manifest Management + +Helm charts are fetched and templated using `hack/fetch-helm-charts.sh`, which: + +1. Downloads the specified Helm chart versions +2. Templates them with default values +3. Creates Kustomize-compatible manifests in `opt/manifests/llmd/` + +## Installation + +### Prerequisites + +- OpenShift 4.19 or higher +- Helm CLI installed (for manifest generation) +- OpenDataHub operator installed + +### Deploying llm-d Component + +#### Method 1: Using DataScienceCluster + +Create a DataScienceCluster CR with llm-d enabled: + +```yaml +apiVersion: datasciencecluster.opendatahub.io/v2 +kind: DataScienceCluster +metadata: + name: default-dsc +spec: + components: + llmd: + managementState: Managed + modelService: + enabled: true + version: "0.2.11" + infra: + enabled: true + version: "1.3.3" + gatewayAPI: + enabled: true + version: "1.0.1" +``` + +#### Method 2: Using Llmd Component Directly + +Create an Llmd CR: + +```yaml +apiVersion: components.platform.opendatahub.io/v1alpha1 +kind: Llmd +metadata: + name: default-llm-d +spec: + modelService: + enabled: true + version: "0.2.11" + infra: + enabled: true + version: "1.3.3" + gatewayAPI: + enabled: true + version: "1.0.1" +``` 
+ +### Building the Operator with llm-d Support + +1. Fetch manifests (including llm-d Helm charts): + ```bash + make get-manifests + ``` + +2. Generate CRDs: + ```bash + make manifests + ``` + +3. Build the operator image: + ```bash + make image IMG=quay.io/<username>/opendatahub-operator:<tag> + ``` + +4. Deploy the operator: + ```bash + make deploy IMG=quay.io/<username>/opendatahub-operator:<tag> + ``` + +## Configuration + +### Customizing Helm Values + +You can override default Helm chart values using the `values` field in each component: + +```yaml +spec: + modelService: + enabled: true + version: "0.2.11" + values: + replicas: "3" + resourceLimits: "high" +``` + +### Disabling Components + +Individual components can be disabled: + +```yaml +spec: + modelService: + enabled: true + infra: + enabled: false # Disable infra component + gatewayAPI: + enabled: true +``` + +## Monitoring + +The llm-d component includes Prometheus monitoring rules defined in: +`internal/controller/components/llmd/monitoring/llmd-prometheusrules.tmpl.yaml` + +These rules monitor: +- ModelService availability +- Infra component availability +- Overall component health + +## Troubleshooting + +### Check Component Status + +```bash +oc get llmd default-llm-d -o yaml +``` + +Look for the status section to see the current state. + +### Check DataScienceCluster Status + +```bash +oc get datasciencecluster default-dsc -o jsonpath='{.status.components.llmd}' +``` + +### View Deployed Resources + +```bash +oc get all -n llm-d -l app.kubernetes.io/part-of=llm-d +``` + +### Common Issues + +1. **Helm charts not found** + - Ensure `hack/fetch-helm-charts.sh` has been executed + - Check that Helm CLI is installed + +2. **CRD validation errors** + - Run `make manifests` to regenerate CRDs + - Ensure API types are properly defined + +3. 
**Deployment failures** + - Check operator logs: `oc logs -n opendatahub-operator-system deployment/opendatahub-operator-controller-manager` + - Verify namespace exists: `oc get ns llm-d` + +## Development + +### Adding New Helm Chart Versions + +1. Update versions in `hack/fetch-helm-charts.sh` +2. Update default versions in `api/components/v1alpha1/llmd_types.go` +3. Run `make get-manifests` to fetch new versions + +### Modifying Component Behavior + +Component reconciliation logic can be modified in: +- `internal/controller/components/llmd/llmd_controller.go` +- `internal/controller/components/llmd/llmd_controller_actions.go` + +### Testing Changes + +```bash +# Run unit tests +make unit-test + +# Run e2e tests +make e2e-test +``` + +## Limitations + +This is a POC implementation with the following limitations: + +1. **Helm Chart Management**: Charts are templated at build time, not dynamically +2. **Version Upgrades**: Changing chart versions requires rebuilding the operator +3. **Value Overrides**: Limited support for custom Helm values +4. **Namespace**: Currently deploys to a fixed `llm-d` namespace + +## Future Enhancements + +Potential improvements for production readiness: + +1. **Dynamic Helm Integration**: Use Helm SDK to deploy charts dynamically +2. **Advanced Configuration**: Support full Helm values override +3. **Multi-Namespace Support**: Allow deployment to custom namespaces +4. **Upgrade Strategy**: Implement proper Helm chart upgrade logic +5. 
**Rollback Support**: Add ability to rollback failed deployments + +## References + +- [LLM-D ModelService Chart](https://llm-d-incubation.github.io/llm-d-modelservice/) +- [LLM-D Infra Chart](https://llm-d-incubation.github.io/llm-d-infra/) +- [Gateway API Inference Extension](https://github.com/kubernetes-sigs/gateway-api-inference-extension) +- [OpenDataHub Operator Documentation](../README.md) diff --git a/get_all_manifests.sh b/get_all_manifests.sh index 21591ee00dd2..1430b414f2f2 100755 --- a/get_all_manifests.sh +++ b/get_all_manifests.sh @@ -189,3 +189,11 @@ for key in "${!PLATFORM_MANIFESTS[@]}"; do ln -s $(pwd)/${source_path} ${DST_MANIFESTS_DIR}/${target_path} fi done + +# Fetch and template llm-d Helm charts +echo -e "\033[32mFetching llm-d Helm charts...\033[0m" +if [[ -x "./hack/fetch-helm-charts.sh" ]]; then + ./hack/fetch-helm-charts.sh +else + echo -e "\033[33mWarning: hack/fetch-helm-charts.sh not found or not executable. Skipping llm-d Helm chart fetch.\033[0m" +fi diff --git a/hack/fetch-helm-charts.sh b/hack/fetch-helm-charts.sh new file mode 100755 index 000000000000..6be6d6a79aac --- /dev/null +++ b/hack/fetch-helm-charts.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash +set -e + +# This script fetches and templates Helm charts for llm-d integration +# It creates a kustomize-compatible structure in opt/manifests/llmd + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" +MANIFESTS_DIR="${REPO_ROOT}/opt/manifests/llmd" + +# Helm chart configurations +MODELSERVICE_CHART_URL="https://llm-d-incubation.github.io/llm-d-modelservice/" +MODELSERVICE_VERSION="0.2.11" +MODELSERVICE_NAME="llm-d-modelservice" + +INFRA_CHART_URL="https://llm-d-incubation.github.io/llm-d-infra/" +INFRA_VERSION="1.3.3" +INFRA_NAME="llm-d-infra" + +GATEWAY_API_CHART_URL="https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.0.1" +GATEWAY_API_VERSION="1.0.1" +GATEWAY_API_NAME="inferencepool" + +# Check if helm is installed +if ! command -v helm &> /dev/null; then + echo "ERROR: helm is not installed. Please install helm to proceed." >&2 + exit 1 +fi + +# Create manifests directory structure +echo "Creating manifests directory structure..." +mkdir -p "${MANIFESTS_DIR}"/{modelservice,infra,gateway-api,overlays/default} + +# template_helm_chart <chart_url> <chart_name> <version> <output_dir> <release_name>: renders the chart into <output_dir>/resources.yaml +template_helm_chart() { + local chart_url=$1 + local chart_name=$2 + local version=$3 + local output_dir=$4 + local release_name=$5 + + echo "Templating ${chart_name} version ${version}..." 
+ + # Add helm repo if it's a repo URL (not a direct chart URL); --force-update keeps re-runs idempotent without hiding real errors + if [[ $chart_url == https://*.github.io/* ]]; then + local repo_name="llmd-${chart_name}" + helm repo add --force-update "${repo_name}" "${chart_url}" + helm repo update "${repo_name}" + helm template "${release_name}" "${repo_name}/${chart_name}" \ + --version "${version}" \ + --namespace llm-d \ + > "${output_dir}/resources.yaml" + else + # For GitHub releases or direct chart URLs + local temp_dir; temp_dir=$(mktemp -d) + cd "${temp_dir}" + wget -q "${chart_url}/${chart_name}-${version}.tgz" -O chart.tgz || { + echo "ERROR: Failed to download chart from ${chart_url}" >&2 + cd - > /dev/null + rm -rf "${temp_dir}" + return 1 + } + tar -xzf chart.tgz + helm template "${release_name}" "./${chart_name}" \ + --namespace llm-d \ + > "${output_dir}/resources.yaml" + cd - > /dev/null + rm -rf "${temp_dir}" + fi +} + +# Template each Helm chart +template_helm_chart "${MODELSERVICE_CHART_URL}" "${MODELSERVICE_NAME}" "${MODELSERVICE_VERSION}" "${MANIFESTS_DIR}/modelservice" "llmd-modelservice" +template_helm_chart "${INFRA_CHART_URL}" "${INFRA_NAME}" "${INFRA_VERSION}" "${MANIFESTS_DIR}/infra" "llmd-infra" + +# The Gateway API chart is handled differently: it is templated from a git checkout of the release tag +echo "Templating Gateway API Inference Extension..." +GATEWAY_TEMP_DIR=$(mktemp -d) +cd "${GATEWAY_TEMP_DIR}" +git clone --depth 1 --branch "v${GATEWAY_API_VERSION}" https://github.com/kubernetes-sigs/gateway-api-inference-extension.git +cd "gateway-api-inference-extension/config/charts/${GATEWAY_API_NAME}" +helm template llmd-gateway-api . 
--namespace llm-d > "${MANIFESTS_DIR}/gateway-api/resources.yaml" +cd - > /dev/null +rm -rf "${GATEWAY_TEMP_DIR}" + +# Create kustomization.yaml for each component +cat > "${MANIFESTS_DIR}/modelservice/kustomization.yaml" << EOF +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: llm-d + +resources: + - resources.yaml + +commonLabels: + app.kubernetes.io/part-of: llm-d + app.kubernetes.io/component: modelservice +EOF + +cat > "${MANIFESTS_DIR}/infra/kustomization.yaml" << EOF +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: llm-d + +resources: + - resources.yaml + +commonLabels: + app.kubernetes.io/part-of: llm-d + app.kubernetes.io/component: infra +EOF + +cat > "${MANIFESTS_DIR}/gateway-api/kustomization.yaml" << EOF +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: llm-d + +resources: + - resources.yaml + +commonLabels: + app.kubernetes.io/part-of: llm-d + app.kubernetes.io/component: gateway-api +EOF + +# Create default overlay that includes all components +cat > "${MANIFESTS_DIR}/overlays/default/kustomization.yaml" << EOF +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: llm-d + +resources: + - ../../infra + - ../../modelservice + - ../../gateway-api + +commonLabels: + app.kubernetes.io/managed-by: opendatahub-operator +EOF + +echo "Successfully templated llm-d Helm charts to ${MANIFESTS_DIR}" diff --git a/internal/controller/components/llmd/README.md b/internal/controller/components/llmd/README.md new file mode 100644 index 000000000000..b5050b19fb82 --- /dev/null +++ b/internal/controller/components/llmd/README.md @@ -0,0 +1,108 @@ +# LLM-D Component + +## Overview + +The llm-d component integrates LLM Distribution capabilities into OpenDataHub using a Helm-based approach. 
+ +## Structure + +``` +llmd/ +├── llmd.go # Component handler (registry integration) +├── llmd_controller.go # Reconciliation controller +├── llmd_controller_actions.go # Custom reconciliation actions +├── llmd_support.go # Helper functions and manifest paths +├── monitoring/ +│ └── llmd-prometheusrules.tmpl.yaml # Prometheus monitoring rules +└── README.md # This file +``` + +## Helm Charts Deployed + +1. **llm-d-modelservice** (v0.2.11) + - Core model serving functionality + - Deployed from https://llm-d-incubation.github.io/llm-d-modelservice/ + +2. **llm-d-infra** (v1.3.3) + - Infrastructure components + - Deployed from https://llm-d-incubation.github.io/llm-d-infra/ + +3. **inferencepool** (v1.0.1) + - Gateway API Inference Extension + - Deployed from https://github.com/kubernetes-sigs/gateway-api-inference-extension + +## Manifest Generation + +Manifests are generated using `hack/fetch-helm-charts.sh` which: +1. Downloads the Helm charts +2. Templates them with default namespace `llm-d` +3. Creates Kustomize-compatible structure in `opt/manifests/llmd/` + +## Component Lifecycle + +The component follows the standard OpenDataHub component pattern: + +1. **Registration**: Registered in `init()` with the component registry +2. **Initialization**: `Init()` is called during operator startup +3. **CR Creation**: `NewCRObject()` creates Llmd CR from DataScienceCluster +4. **Reconciliation**: Controller watches Llmd CR and deploys manifests +5. **Status Updates**: `UpdateDSCStatus()` syncs status back to DataScienceCluster + +## Development + +### Local Testing + +```bash +# Generate manifests +make get-manifests + +# Run controller locally +make run + +# In another terminal, apply test CR +kubectl apply -f config/samples/components_v1alpha1_llmd.yaml +``` + +### Modifying Chart Versions + +1. Edit `hack/fetch-helm-charts.sh` with new versions +2. Update default versions in `api/components/v1alpha1/llmd_types.go` +3. Run `make get-manifests` +4. 
Test the changes + +## Configuration + +### Default Configuration + +```yaml +spec: + modelService: + enabled: true + version: "0.2.11" + infra: + enabled: true + version: "1.3.3" + gatewayAPI: + enabled: true + version: "1.0.1" +``` + +### Custom Values (Future Enhancement) + +```yaml +spec: + modelService: + enabled: true + version: "0.2.11" + values: + customKey: customValue +``` + +## Monitoring + +Prometheus rules are automatically deployed with the component. See `monitoring/llmd-prometheusrules.tmpl.yaml`. + +## See Also + +- [LLM-D Integration Guide](../../../../docs/llmd-integration.md) +- [Component Integration Guide](../../../../docs/COMPONENT_INTEGRATION.md) diff --git a/internal/controller/components/llmd/llmd.go b/internal/controller/components/llmd/llmd.go new file mode 100644 index 000000000000..99adda27c37e --- /dev/null +++ b/internal/controller/components/llmd/llmd.go @@ -0,0 +1,126 @@ +package llmd + +import ( + "context" + "errors" + + operatorv1 "github.com/openshift/api/operator/v1" + k8serr "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/opendatahub-io/opendatahub-operator/v2/api/common" + componentApi "github.com/opendatahub-io/opendatahub-operator/v2/api/components/v1alpha1" + dscv2 "github.com/opendatahub-io/opendatahub-operator/v2/api/datasciencecluster/v2" + "github.com/opendatahub-io/opendatahub-operator/v2/internal/controller/components" + cr "github.com/opendatahub-io/opendatahub-operator/v2/internal/controller/components/registry" + "github.com/opendatahub-io/opendatahub-operator/v2/internal/controller/status" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/conditions" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/types" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/annotations" +) + +const ( + componentName = componentApi.LlmdComponentName + + // LegacyComponentName is the 
name of the component that is assigned to deployments + // via Kustomize. Since a deployment selector is immutable, we can't upgrade existing + // deployment to the new component name, so keep it around till we figure out a solution. + LegacyComponentName = "llm-d" + + ReadyConditionType = componentApi.LlmdKind + status.ReadySuffix +) + +var ( + conditionTypes = []string{ + status.ConditionDeploymentsAvailable, + } +) + +type componentHandler struct{} + +func init() { //nolint:gochecknoinits + cr.Add(&componentHandler{}) +} + +// Init initializes the llmd component. +func (s *componentHandler) Init(_ common.Platform) error { + // No special initialization needed for llmd as manifests are managed via Helm charts + return nil +} + +// GetName returns the llmd component name. +func (s *componentHandler) GetName() string { + return componentName +} + +// NewCRObject creates a new Llmd CR for the DataScienceCluster. +func (s *componentHandler) NewCRObject(dsc *dscv2.DataScienceCluster) common.PlatformObject { + return &componentApi.Llmd{ + TypeMeta: metav1.TypeMeta{ + Kind: componentApi.LlmdKind, + APIVersion: componentApi.GroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: componentApi.LlmdInstanceName, + Annotations: map[string]string{ + annotations.ManagementStateAnnotation: string(dsc.Spec.Components.Llmd.ManagementState), + }, + }, + Spec: componentApi.LlmdSpec{ + LlmdCommonSpec: dsc.Spec.Components.Llmd.LlmdCommonSpec, + }, + } +} + +// IsEnabled reports whether the DataScienceCluster sets the Llmd component's ManagementState to Managed. +func (s *componentHandler) IsEnabled(dsc *dscv2.DataScienceCluster) bool { + return dsc.Spec.Components.Llmd.ManagementState == operatorv1.Managed +} + +// UpdateDSCStatus copies the Llmd CR's management state, common status and Ready condition +// onto the DataScienceCluster held in rr.Instance and returns the resulting readiness status. +// A missing Llmd CR is tolerated; any other lookup error is returned to the caller. +func (s *componentHandler) UpdateDSCStatus(ctx context.Context, rr *types.ReconciliationRequest) (metav1.ConditionStatus, error) { + cs := metav1.ConditionUnknown + + c := componentApi.Llmd{} + c.Name = componentApi.LlmdInstanceName + + if err := rr.Client.Get(ctx, client.ObjectKeyFromObject(&c), &c); err != nil && !k8serr.IsNotFound(err) { + // Surface the lookup failure instead of reporting Unknown with a nil error. + return cs, err + } + + dsc, ok := 
rr.Instance.(*dscv2.DataScienceCluster) + if !ok { + return cs, errors.New("failed to convert to DataScienceCluster") + } + + ms := components.NormalizeManagementState(dsc.Spec.Components.Llmd.ManagementState) + + dsc.Status.Components.Llmd.ManagementState = ms + dsc.Status.Components.Llmd.LlmdCommonStatus = nil + + rr.Conditions.MarkFalse(ReadyConditionType) + + if s.IsEnabled(dsc) { + dsc.Status.Components.Llmd.LlmdCommonStatus = c.Status.LlmdCommonStatus.DeepCopy() + + if rc := conditions.FindStatusCondition(c.GetStatus(), status.ConditionTypeReady); rc != nil { + rr.Conditions.MarkFrom(ReadyConditionType, *rc) + cs = rc.Status + } else { + cs = metav1.ConditionFalse + } + } else { + rr.Conditions.MarkFalse( + ReadyConditionType, + conditions.WithReason(string(ms)), + conditions.WithMessage("Component ManagementState is set to %s", string(ms)), + conditions.WithSeverity(common.ConditionSeverityInfo), + ) + } + + return cs, nil +} diff --git a/internal/controller/components/llmd/llmd_controller.go b/internal/controller/components/llmd/llmd_controller.go new file mode 100644 index 000000000000..9bc97faaf0c0 --- /dev/null +++ b/internal/controller/components/llmd/llmd_controller.go @@ -0,0 +1,79 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package llmd + +import ( + "context" + + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + ctrl "sigs.k8s.io/controller-runtime" + + componentApi "github.com/opendatahub-io/opendatahub-operator/v2/api/components/v1alpha1" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster/gvk" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/deploy" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/gc" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/render/kustomize" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/status/deployments" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/status/releases" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/predicates/resources" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/reconciler" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/labels" +) + +// NewComponentReconciler builds the Llmd reconciler on mgr, registering the owned resource types and the ordered action chain, and returns any build error. +func (s *componentHandler) NewComponentReconciler(ctx context.Context, mgr ctrl.Manager) error { + _, err := reconciler.ReconcilerFor(mgr, &componentApi.Llmd{}). + // operands - owned + Owns(&corev1.Secret{}). + Owns(&corev1.Service{}). + Owns(&corev1.ConfigMap{}). + Owns(&corev1.ServiceAccount{}). + Owns(&rbacv1.Role{}). + Owns(&rbacv1.RoleBinding{}). + Owns(&rbacv1.ClusterRole{}). + Owns(&rbacv1.ClusterRoleBinding{}). + Owns(&monitoringv1.ServiceMonitor{}). + Owns(&appsv1.Deployment{}, reconciler.WithPredicates(resources.NewDeploymentPredicate())). + Owns(&appsv1.StatefulSet{}, reconciler.WithPredicates(resources.NewStatefulSetPredicate())). 
+ + // operands - dynamically owned, registered via reconciler.Dynamic with a CrdExists check. NOTE(review): chart v1.0.1 may render the v1 InferencePool API rather than v1alpha2 - confirm this GVK. + OwnsGVK(gvk.InferencePoolV1alpha2, reconciler.Dynamic(reconciler.CrdExists(gvk.InferencePoolV1alpha2))). + + // actions + WithAction(initialize). + WithAction(releases.NewAction()). + WithAction(kustomize.NewAction( + // These are the default labels added to the resources + kustomize.WithLabel(labels.ODH.Component(LegacyComponentName), labels.True), + kustomize.WithLabel(labels.K8SCommon.PartOf, LegacyComponentName), + )). + WithAction(deploy.NewAction( + deploy.WithCache(), + )). + WithAction(deployments.NewAction()). + // must be the final action + WithAction(gc.NewAction()). + // declares the list of additional, controller specific conditions that are + // contributing to the controller readiness status + WithConditions(conditionTypes...). + Build(ctx) + + return err +} diff --git a/internal/controller/components/llmd/llmd_controller_actions.go b/internal/controller/components/llmd/llmd_controller_actions.go new file mode 100644 index 000000000000..42dc5a36a7c5 --- /dev/null +++ b/internal/controller/components/llmd/llmd_controller_actions.go @@ -0,0 +1,30 @@ +package llmd + +import ( + "context" + + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/render" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/types" +) + +// initialize sets rr.Manifests to the single llm-d manifest location returned by +// llmdManifestInfo for the default overlay; it always returns nil. +func initialize(_ context.Context, rr *types.ReconciliationRequest) error { + rr.Manifests = []types.ManifestInfo{ + llmdManifestInfo(llmdManifestSourcePath), + } + + return nil +} + +var _ render.ResourceCustomizer = customizeLlmdResources + +// customizeLlmdResources is a no-op placeholder that always returns nil. It is referenced only by +// the ResourceCustomizer assertion above and is not passed to any action in NewComponentReconciler. +func customizeLlmdResources(_ context.Context, _ client.Object, _ *render.GVRKey, _ map[string]any) error { + // Add any resource customization logic here if needed + // For now, we rely on the Helm-templated manifests + return nil +} diff --git a/internal/controller/components/llmd/llmd_support.go b/internal/controller/components/llmd/llmd_support.go new 
file mode 100644 index 000000000000..bf14e6233a5c --- /dev/null +++ b/internal/controller/components/llmd/llmd_support.go @@ -0,0 +1,18 @@ +package llmd + +import ( + odhtypes "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/types" + odhdeploy "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" +) + +const ( + llmdManifestSourcePath = "overlays/default" // overlay passed to llmdManifestInfo as the ManifestInfo.SourcePath +) + +func llmdManifestInfo(sourcePath string) odhtypes.ManifestInfo { + return odhtypes.ManifestInfo{ + Path: odhdeploy.DefaultManifestPath, + ContextDir: componentName, // presumably the component's subdirectory under Path - confirm it matches opt/manifests/llmd + SourcePath: sourcePath, + } +} diff --git a/internal/controller/components/llmd/monitoring/llmd-prometheusrules.tmpl.yaml b/internal/controller/components/llmd/monitoring/llmd-prometheusrules.tmpl.yaml new file mode 100644 index 000000000000..339cf74dcb4d --- /dev/null +++ b/internal/controller/components/llmd/monitoring/llmd-prometheusrules.tmpl.yaml @@ -0,0 +1,34 @@ +apiVersion: monitoring.rhobs/v1 +kind: PrometheusRule +metadata: + name: llmd-prometheusrules + namespace: {{.Namespace}} # templated, while the rule selectors below hard-code namespace="llm-d" +spec: + groups: + - name: SLOs-llmd-availability + rules: + - alert: LlmdModelServiceAvailability # NOTE(review): expr is built from container CPU usage and kubelet up series - confirm it reflects availability + annotations: + message: 'LLM-D ModelService availability is degraded in namespace {{`{{`}}$labels.namespace{{`}}`}}.' + summary: LLM-D ModelService Availability Alert + expr: | + (1 - (sum(rate(container_cpu_usage_seconds_total{namespace="llm-d", pod=~".*modelservice.*"}[5m])) by (namespace) / count(up{job="kubelet", namespace="llm-d"}) by (namespace))) > 0.02 + for: 5m + labels: + severity: warning + - alert: LlmdInfraAvailability # NOTE(review): same expression shape as the ModelService alert, matched on pods containing "infra" - confirm + annotations: + message: 'LLM-D Infra availability is degraded in namespace {{`{{`}}$labels.namespace{{`}}`}}.' 
+ summary: LLM-D Infra Availability Alert + expr: | + (1 - (sum(rate(container_cpu_usage_seconds_total{namespace="llm-d", pod=~".*infra.*"}[5m])) by (namespace) / count(up{job="kubelet", namespace="llm-d"}) by (namespace))) > 0.02 + for: 5m + labels: + severity: warning + + # Recording rule: llmd:component:availability is the sum of up per namespace and pod within the llm-d namespace + - name: SLOs - LLM-D Components + rules: + - expr: | + sum(up{namespace="llm-d"}) by (namespace, pod) + record: llmd:component:availability