diff --git a/api/v1alpha1/minicluster_types.go b/api/v1alpha1/minicluster_types.go index 260abb50..43882934 100644 --- a/api/v1alpha1/minicluster_types.go +++ b/api/v1alpha1/minicluster_types.go @@ -47,6 +47,18 @@ type MiniClusterSpec struct { // +optional Interactive bool `json:"interactive"` + // Specify the name of the cluster service + // +kubebuilder:default="flux-service" + // +default="flux-service" + // +optional + ServiceName string `json:"serviceName,omitempty"` + + // Specify the name of the job selector. + // You would want to do this if you intend to connect miniclusters + // Multiple jobs can be selected under a single services + // +optional + JobSelector string `json:"jobSelector"` + // Flux options for the broker, shared across cluster // +optional Flux FluxSpec `json:"flux"` @@ -318,6 +330,16 @@ type FluxSpec struct { // +optional Wrap string `json:"wrap,omitempty"` + // Connect to this job in the same namespace (akin to BootServer but within cluster) + // +optional + Connection string `json:"connection,omitempty"` + + // Additional number of nodes to allow from external boot-server + // This currently only allows local MiniCluster but could be + // extended to any general URI + // +optional + ConnectionSize int `json:"connectionSize,omitempty"` + // Single user executable to provide to flux start // +kubebuilder:default="5s" // +default="5s" @@ -585,6 +607,17 @@ func (f *MiniCluster) Validate() bool { f.Spec.Flux.InstallRoot = "/usr" } + // If connected host or size is defined, both must be defined! + if f.Spec.Flux.Connection != "" && f.Spec.Flux.ConnectionSize <= 0 { + fmt.Printf("😥️ A Connection is defined by no nodes. Please define the size.\n") + return false + } + // Inverse of that... + if f.Spec.Flux.Connection == "" && f.Spec.Flux.ConnectionSize > 0 { + fmt.Printf("😥️ A Connection size is defined, but no MiniCluster name. Please define the flux->connection.\n") + return false + } + // If the MaxSize isn't set, ensure it's equal to the size if f.Spec.MaxSize == 0 { f.Spec.MaxSize = f.Spec.Size diff --git a/api/v1alpha1/swagger.json b/api/v1alpha1/swagger.json index 85dc420f..3c27977c 100644 --- a/api/v1alpha1/swagger.json +++ b/api/v1alpha1/swagger.json @@ -123,6 +123,15 @@ "type": "string", "default": "5s" }, + "connection": { + "description": "Connect to this job in the same namespace (akin to BootServer but within cluster)", + "type": "string" + }, + "connectionSize": { + "description": "Additional number of nodes to allow from external boot-server This currently only allows local MiniCluster but could be extended to any general URI", + "type": "integer", + "format": "int32" + }, "installRoot": { "description": "Install root location", "type": "string", @@ -480,6 +489,11 @@ "default": "" } }, + "jobSelector": { + "description": "Specify the name of the job selector. You would want to do this if you intend to connect miniclusters Multiple jobs can be selected under a single services", + "type": "string", + "default": "" + }, "logging": { "description": "Logging modes determine the output you see in the job log", "default": {}, @@ -495,6 +509,11 @@ "default": {}, "$ref": "#/definitions/PodSpec" }, + "serviceName": { + "description": "Specify the name of the cluster service", + "type": "string", + "default": "flux-service" + }, "services": { "description": "Services are one or more service containers to bring up alongside the MiniCluster.", "type": "array", diff --git a/api/v1alpha1/zz_generated.openapi.go b/api/v1alpha1/zz_generated.openapi.go index f6b8244a..a70ef665 100644 --- a/api/v1alpha1/zz_generated.openapi.go +++ b/api/v1alpha1/zz_generated.openapi.go @@ -258,6 +258,20 @@ func schema__api_v1alpha1__FluxSpec(ref common.ReferenceCallback) common.OpenAPI Format: "", }, }, + "connection": { + SchemaProps: spec.SchemaProps{ + Description: "Connect to this job in the same namespace (akin to BootServer but within cluster)", + Type: []string{"string"}, + Format: "", + }, + }, + "connectionSize": { + SchemaProps: spec.SchemaProps{ + Description: "Additional number of nodes to allow from external boot-server This currently only allows local MiniCluster but could be extended to any general URI", + Type: []string{"integer"}, + Format: "int32", + }, + }, "connectTimeout": { SchemaProps: spec.SchemaProps{ Description: "Single user executable to provide to flux start", @@ -860,6 +874,22 @@ func schema__api_v1alpha1__MiniClusterSpec(ref common.ReferenceCallback) common. Format: "", }, }, + "serviceName": { + SchemaProps: spec.SchemaProps{ + Description: "Specify the name of the cluster service", + Default: "flux-service", + Type: []string{"string"}, + Format: "", + }, + }, + "jobSelector": { + SchemaProps: spec.SchemaProps{ + Description: "Specify the name of the job selector. You would want to do this if you intend to connect miniclusters Multiple jobs can be selected under a single services", + Default: "", + Type: []string{"string"}, + Format: "", + }, + }, "flux": { SchemaProps: spec.SchemaProps{ Description: "Flux options for the broker, shared across cluster", diff --git a/chart/templates/minicluster-crd.yaml b/chart/templates/minicluster-crd.yaml index 33281617..7ceefa9f 100644 --- a/chart/templates/minicluster-crd.yaml +++ b/chart/templates/minicluster-crd.yaml @@ -249,6 +249,15 @@ spec: default: 5s description: Single user executable to provide to flux start type: string + connection: + description: Connect to this job in the same namespace (akin to + BootServer but within cluster) + type: string + connectionSize: + description: Additional number of nodes to allow from external boot-server + This currently only allows local MiniCluster but could be extended + to any general URI + type: integer installRoot: default: /usr description: Install root location @@ -302,6 +311,11 @@ spec: type: string description: Labels for the job type: object + jobSelector: + description: Specify the name of the job selector. You would want to + do this if you intend to connect miniclusters Multiple jobs can be + selected under a single services + type: string logging: description: Logging modes determine the output you see in the job log properties: @@ -362,6 +376,10 @@ spec: description: Service account name for the pod type: string type: object + serviceName: + default: flux-service + description: Specify the name of the cluster service + type: string services: description: Services are one or more service containers to bring up alongside the MiniCluster. diff --git a/config/crd/bases/flux-framework.org_miniclusters.yaml b/config/crd/bases/flux-framework.org_miniclusters.yaml index aa71c665..38bc2738 100644 --- a/config/crd/bases/flux-framework.org_miniclusters.yaml +++ b/config/crd/bases/flux-framework.org_miniclusters.yaml @@ -251,6 +251,15 @@ spec: default: 5s description: Single user executable to provide to flux start type: string + connection: + description: Connect to this job in the same namespace (akin to + BootServer but within cluster) + type: string + connectionSize: + description: Additional number of nodes to allow from external + boot-server This currently only allows local MiniCluster but + could be extended to any general URI + type: integer installRoot: default: /usr description: Install root location @@ -304,6 +313,11 @@ spec: type: string description: Labels for the job type: object + jobSelector: + description: Specify the name of the job selector. You would want + to do this if you intend to connect miniclusters Multiple jobs can + be selected under a single services + type: string logging: description: Logging modes determine the output you see in the job log @@ -365,6 +379,10 @@ spec: description: Service account name for the pod type: string type: object + serviceName: + default: flux-service + description: Specify the name of the cluster service + type: string services: description: Services are one or more service containers to bring up alongside the MiniCluster. diff --git a/controllers/flux/extra.go b/controllers/flux/extra.go index 7cfd90cd..a4f695ea 100644 --- a/controllers/flux/extra.go +++ b/controllers/flux/extra.go @@ -247,12 +247,12 @@ func (r *MiniClusterReconciler) createMiniClusterIngress( } err := ctrl.SetControllerReference(cluster, ingress, r.Scheme) if err != nil { - r.log.Error(err, "🔴 Create ingress", "Service", restfulServiceName) + r.log.Error(err, "🔴 Create ingress", "Service", cluster.Spec.ServiceName) return err } err = r.Client.Create(ctx, ingress) if err != nil { - r.log.Error(err, "🔴 Create ingress", "Service", restfulServiceName) + r.log.Error(err, "🔴 Create ingress", "Service", cluster.Spec.ServiceName) return err } return nil diff --git a/controllers/flux/job.go b/controllers/flux/job.go index 7125da73..d3cd16aa 100644 --- a/controllers/flux/job.go +++ b/controllers/flux/job.go @@ -31,9 +31,13 @@ func (r *MiniClusterReconciler) newMiniClusterJob( setAsFQDN := false // We add the selector for the horizontal auto scaler, if active - // We can't use the job-name selector, as this would include the - // external sidecar service! + // We can't use the job-name or job-group selector, as this + // would include the external sidecar service! podLabels["hpa-selector"] = cluster.Name + podLabels["job-group"] = cluster.Name + if cluster.Spec.JobSelector != "" { + podLabels["job-group"] = cluster.Spec.JobSelector + } // This is an indexed-job job := &batchv1.Job{ @@ -61,7 +65,7 @@ func (r *MiniClusterReconciler) newMiniClusterJob( }, Spec: corev1.PodSpec{ // matches the service - Subdomain: restfulServiceName, + Subdomain: cluster.Spec.ServiceName, SetHostnameAsFQDN: &setAsFQDN, Volumes: getVolumes(cluster), RestartPolicy: corev1.RestartPolicyOnFailure, diff --git a/controllers/flux/minicluster.go b/controllers/flux/minicluster.go index 3c23a0eb..dd704bfd 100644 --- a/controllers/flux/minicluster.go +++ b/controllers/flux/minicluster.go @@ -86,9 +86,15 @@ func (r *MiniClusterReconciler) ensureMiniCluster( } } - // Create headless service for the MiniCluster - selector := map[string]string{"job-name": cluster.Name} - result, err = r.exposeServices(ctx, cluster, restfulServiceName, selector) + // Create headless service for the MiniCluster. But... + // If we are expecting to connect to another one, use the service name + selector := map[string]string{"job-group": cluster.Name} + if cluster.Spec.JobSelector != "" { + selector["job-group"] = cluster.Spec.JobSelector + } + r.log.Info("MiniCluster", "ServiceSelector", selector["job-group"]) + + result, err = r.exposeServices(ctx, cluster, cluster.Spec.ServiceName, selector) if err != nil { return result, err } @@ -377,19 +383,32 @@ func (r *MiniClusterReconciler) getConfigMap( } // generateHostlist for a specific size given the cluster namespace and a size -func generateHostlist(cluster *api.MiniCluster, size int) string { +func generateHostlist(name string, size int) string { // The hosts are generated through the max size, so the cluster can expand - return fmt.Sprintf("%s-[%s]", cluster.Name, generateRange(size)) + return fmt.Sprintf("%s-[%s]", name, generateRange(size)) } // generateFluxConfig creates the broker.toml file used to boostrap flux func generateFluxConfig(cluster *api.MiniCluster) string { // The hosts are generated through the max size, so the cluster can expand - fqdn := fmt.Sprintf("%s.%s.svc.cluster.local", restfulServiceName, cluster.Namespace) + fqdn := fmt.Sprintf("%s.%s.svc.cluster.local", cluster.Spec.ServiceName, cluster.Namespace) hosts := fmt.Sprintf("[%s]", generateRange(int(cluster.Spec.MaxSize))) - fluxConfig := fmt.Sprintf(brokerConfigTemplate, cluster.FluxInstallRoot(), fqdn, cluster.Name, hosts) + + // If we are connecting another cluster, we need to register the hostnames here + connectedHosts := "" + if cluster.Spec.Flux.Connection != "" { + connectedRange := fmt.Sprintf("[%s]", generateRange(int(cluster.Spec.Flux.ConnectionSize))) + connectedHosts = fmt.Sprintf(",%s-%s", cluster.Spec.Flux.Connection, connectedRange) + } + + // TODO: clean this up and make into template + fluxConfig := fmt.Sprintf(brokerConfigTemplate, + cluster.FluxInstallRoot(), + fqdn, cluster.Name, hosts, + connectedHosts) + fluxConfig += "\n" + brokerArchiveSection return fluxConfig } @@ -416,7 +435,14 @@ func generateWaitScript(cluster *api.MiniCluster, containerIndex int) (string, e mainHost := fmt.Sprintf("%s-0", cluster.Name) // The resources size must also match the max size in the cluster - hosts := generateHostlist(cluster, int(cluster.Spec.MaxSize)) + hosts := generateHostlist(cluster.Name, int(cluster.Spec.MaxSize)) + + // And if we are adding external connected hosts (from a different MiniCluster) + // We need to account for them too. + if cluster.Spec.Flux.Connection != "" { + connectedHosts := generateHostlist(cluster.Spec.Flux.Connection, int(cluster.Spec.Flux.ConnectionSize)) + hosts = fmt.Sprintf("%s,%s", hosts, connectedHosts) + } // Ensure our requested users each each have a password for i, user := range cluster.Spec.Users { @@ -445,9 +471,11 @@ func generateWaitScript(cluster *api.MiniCluster, containerIndex int) (string, e // The token uuid is the same across images wt := WaitTemplate{ + TotalSize: cluster.Spec.MaxSize + int32(cluster.Spec.Flux.ConnectionSize), FluxUser: getFluxUser(cluster.Spec.FluxRestful.Username), FluxToken: getRandomToken(cluster.Spec.FluxRestful.Token), MainHost: mainHost, + Namespace: cluster.Namespace, Hosts: hosts, Cores: cores, Container: container, diff --git a/controllers/flux/pods.go b/controllers/flux/pods.go index b54deb9e..ff46f119 100644 --- a/controllers/flux/pods.go +++ b/controllers/flux/pods.go @@ -109,8 +109,11 @@ func (r *MiniClusterReconciler) newServicePod( podLabels := r.getPodLabels(cluster) podServiceName := cluster.Name + "-services" - // service selector? - podLabels["job-name"] = cluster.Name + // service selector + podLabels["job-group"] = cluster.Name + if cluster.Spec.JobSelector != "" { + podLabels["job-group"] = cluster.Spec.JobSelector + } // This is an indexed-job pod := &corev1.Pod{ @@ -122,7 +125,7 @@ func (r *MiniClusterReconciler) newServicePod( }, Spec: corev1.PodSpec{ // This is the headless service name - Subdomain: restfulServiceName, + Subdomain: cluster.Spec.ServiceName, Hostname: podServiceName, SetHostnameAsFQDN: &setAsFQDN, RestartPolicy: corev1.RestartPolicyOnFailure, diff --git a/controllers/flux/scale.go b/controllers/flux/scale.go index 4759d47d..1cb28a99 100644 --- a/controllers/flux/scale.go +++ b/controllers/flux/scale.go @@ -22,7 +22,7 @@ import ( ) // addScaleSelector populates the fields the horizontal auto scaler needs. -// Meaning: job-name is used to select pods to check. The size variable +// Meaning: hpa-selector is used to select pods to check. The size variable // is updated later. func (r *MiniClusterReconciler) addScaleSelector( ctx context.Context, diff --git a/controllers/flux/service.go b/controllers/flux/service.go index c905f715..cc9d7a91 100644 --- a/controllers/flux/service.go +++ b/controllers/flux/service.go @@ -25,8 +25,7 @@ import ( ) var ( - restfulServiceName = "flux-service" - servicePort = 5000 + servicePort = 5000 ) // exposeService will expose services - one for the port 5000 forward, and the other for job networking (headless) diff --git a/controllers/flux/templates.go b/controllers/flux/templates.go index 82672abb..b5bbaa59 100644 --- a/controllers/flux/templates.go +++ b/controllers/flux/templates.go @@ -28,9 +28,6 @@ var brokerArchiveSection string //go:embed templates/wait.sh var waitToStartTemplate string -//go:embed templates/cert-generate.sh -var generateCertTemplate string - // WaitTemplate populates wait.sh type WaitTemplate struct { FluxToken string // Token to log into the UI, should be consistent across containers @@ -38,8 +35,10 @@ type WaitTemplate struct { MainHost string // Main host identifier Hosts string // List of hosts Cores int32 + TotalSize int32 Container api.MiniClusterContainer Spec api.MiniClusterSpec + Namespace string // Broker initial quorum that must be online to start // This is used if the cluster MaxSize > Size diff --git a/controllers/flux/templates/broker.toml b/controllers/flux/templates/broker.toml index 9aeb9cd4..c3eb561c 100644 --- a/controllers/flux/templates/broker.toml +++ b/controllers/flux/templates/broker.toml @@ -16,5 +16,5 @@ default_port = 8050 default_bind = "tcp://eth0:%%p" default_connect = "tcp://%%h.%s:%%p" hosts = [ - { host="%s-%s"}, + { host="%s-%s%s"}, ] diff --git a/controllers/flux/templates/wait.sh b/controllers/flux/templates/wait.sh index ba0093bc..3ac2d2ab 100644 --- a/controllers/flux/templates/wait.sh +++ b/controllers/flux/templates/wait.sh @@ -91,6 +91,8 @@ brokerOptions="-Scron.directory=/etc/flux/system/cron.d \ -Srundir=/run/flux {{ if .Spec.Interactive }}-Sbroker.rc2_none {{ end }} \ -Sstatedir=${STATE_DIR} \ -Slocal-uri=local:///run/flux/local \ +{{ if .Spec.Flux.ConnectionSize }}-Ssize={{ .Spec.Flux.ConnectionSize }}{{ end }} \ +{{ if .Spec.Flux.Connection }}-Sbroker.boot-server={{ .Spec.Flux.Connection }}-0{{ end }} \ {{ if .Spec.Flux.ConnectTimeout }}-Stbon.connect_timeout={{ .Spec.Flux.ConnectTimeout }}{{ end }} \ {{ if .RequiredRanks }}-Sbroker.quorum={{ .RequiredRanks }}{{ end }} \ {{ if .Spec.Logging.Zeromq }}-Stbon.zmqdebug=1{{ end }} \ diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 9c90baec..ef4b000a 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -22,9 +22,18 @@ The following tutorials are provided from their respective directories (and are ### Experimental +These are experimental setups, either testing a feature not yet fully supported in Flux, or via +a custom container base build. + #### Nested - - [K3s](https://github.com/flux-framework/flux-operator/tree/main/examples/nested/k3s/basic): instiatiate k3s inside Flux, and deploy an app. + - [K3s](https://github.com/flux-framework/flux-operator/tree/main/examples/experimental/nested/k3s/basic): instiatiate k3s inside Flux, and deploy an app. + + +#### Trees + + - [Basic Tree](https://github.com/flux-framework/flux-operator/blob/main/examples/experimental/workflows/tree) + - [Instance Variables](https://github.com/flux-framework/flux-operator/blob/main/examples/experimental/workflows/tree-with-variables) ### Machine Learning @@ -46,15 +55,6 @@ The following tutorials are provided from their respective directories (and are - [Merlin Basic](https://github.com/flux-framework/flux-operator/blob/main/examples/launchers/merlin/basic) - [Merlin Singularity Openfoam](https://github.com/flux-framework/flux-operator/blob/main/examples/launchers/merlin/singularity-openfoam) -### Workflows - -Although some of the others above are also workflows, these examples are going to use `flux tree` (in various contexts) to -submit different job hierarchies and get around the etcd bottleneck in Kubernetes. - - - [Basic Tree](https://github.com/flux-framework/flux-operator/blob/main/examples/workflows/tree) - - [Instance Variables](https://github.com/flux-framework/flux-operator/blob/main/examples/workflows/tree-with-variables) - -We have just started this arm of our experiments and you can expect more as we go! ## Integrated Tutorials diff --git a/examples/dist/flux-operator.yaml b/examples/dist/flux-operator.yaml index a985c084..a86b8d62 100644 --- a/examples/dist/flux-operator.yaml +++ b/examples/dist/flux-operator.yaml @@ -257,6 +257,15 @@ spec: default: 5s description: Single user executable to provide to flux start type: string + connection: + description: Connect to this job in the same namespace (akin to + BootServer but within cluster) + type: string + connectionSize: + description: Additional number of nodes to allow from external + boot-server This currently only allows local MiniCluster but + could be extended to any general URI + type: integer installRoot: default: /usr description: Install root location @@ -310,6 +319,11 @@ spec: type: string description: Labels for the job type: object + jobSelector: + description: Specify the name of the job selector. You would want + to do this if you intend to connect miniclusters Multiple jobs can + be selected under a single services + type: string logging: description: Logging modes determine the output you see in the job log @@ -371,6 +385,10 @@ spec: description: Service account name for the pod type: string type: object + serviceName: + default: flux-service + description: Specify the name of the cluster service + type: string services: description: Services are one or more service containers to bring up alongside the MiniCluster. diff --git a/examples/experimental/flux-bootstrap/README.md b/examples/experimental/flux-bootstrap/README.md new file mode 100755 index 00000000..0f80b962 --- /dev/null +++ b/examples/experimental/flux-bootstrap/README.md @@ -0,0 +1,110 @@ +# Flux Bootrap + +> All about dem' FLUB! (FLUx Bootstrap) 🤪️ + +Flubbing? Flubbed? Flub-ified? So many ideas! + +This tests a [new feature](https://github.com/flux-framework/flux-core/pull/5184) to add support for Flux Bootstrap, +which means we can create a cluster with a max size that can support adding another broker, and then +create a second broker and easily connect the two. In other words, we would be connecting MiniClusters. + +## Usage + +First, let's create a kind cluster. From the context of this directory: + +```bash +$ kind create cluster --config ../../kind-config.yaml +``` + +And then install the operator, create the namespace, and apply the MiniCluster YAML here. Note +that you'll likely need to build the custom image for this branch and then apply the development CRD +(this feature is not merged into Flux and is considered experimental): + +```bash +$ make test-deploy +$ kubectl apply -f ../../dist/flux-operator-dev.yaml +$ kubectl create namespace flux-operator +``` + +### Create First MiniCluster + +Let's first create the base MiniCluster - number one! This cluster is going to be the one +we have up first, and we allow the second cluster to see and connect to. I'm not sure +if the ordering is important (e.g., we could create a cluster that knows about an external one +that doesn't exist yet) but for this first experiment this order seemed logical to me. +Here is how to create: + +```bash +$ kubectl apply -f ./minicluster-1st.yaml +``` + +If you look at the [minicluster-1st.yaml](minicluster-1st.yaml) you'll see it is just +an interactive mode cluster, meaning we start the broker and do nothing else. However, +we add one attribute to explicitly define the service selector label, and we do this +so the headless service for this MiniCluster is not scoped to the one job: + +```yaml +jobSelector: connected-service +``` + +You should wait for the pods to be running before moving on to the next step. + +### Create Second MiniCluster + +This second MiniCluster has extra (new!) attributes that will make it possible to +see (and connect to) the already existing MiniCluster. It could also be the case +that this doesn't have to exist, but I haven't tested that yet. Specifically, +this part of the custom resource definition is what is important: + +```yaml + + # Ensure this job can fall under the same networking namespace + jobSelector: connected-service + + # Broker options + flux: + + # Allow the minicluster-1st brokers to connect + bootServer: flux-sample-1-0.flux-service.flux-operator.svc.cluster.local + + # The number of nodes the first server has + bootServerSize: 2 +``` + +You can check the logs of the broker pods (the index 0s) to make sure that everything started OK. +Since we told cluster 2 about cluster 1, let's shell into 2: + +```bash +$ kubectl exec -it -n flux-operator flux-sample-2-0-xxx bash +``` + +Connect to the broker as the flux user: + +```bash +$ sudo -u flux -E $(env) -E HOME=/home/flux flux proxy local:///run/flux/local bash +``` + +I currently can see the other node! + +```console + STATE NNODES NCORES NGPUS NODELIST + free 2 8 0 flux-sample-2-[0-1] + allocated 0 0 0 + down 2 8 0 flux-sample-1-[0-1] +``` + +However they are down, so likely I have a value incorrect somewhere. I suspect this is an issue +with formatting the hostnames vs. service, and the value that is needed for the flux broker option. + +**Questions**: + +- Do we need to change the tbon topology to be kary:2 or similar? +- I am assuming the hosts (from the second cluster) need to be defined in `[[resource.config]]` AND the flux R resource spec +- What should be the format of the broker option? + +When you are done: + +```bash +$ kubectl delete -f minicluster-1st.yaml +$ kubectl delete -f minicluster-2nd.yaml +``` diff --git a/examples/experimental/flux-bootstrap/minicluster-1st.yaml b/examples/experimental/flux-bootstrap/minicluster-1st.yaml new file mode 100755 index 00000000..05624db5 --- /dev/null +++ b/examples/experimental/flux-bootstrap/minicluster-1st.yaml @@ -0,0 +1,36 @@ +apiVersion: flux-framework.org/v1alpha1 +kind: MiniCluster +metadata: + name: flux-sample-1 + namespace: flux-operator +spec: + # Number of pods to create for MiniCluster + size: 2 + + # Make this interactive so we can launch a bunch of jobs! + interactive: true + + # This tells the MiniCluster to be on a specific named network. + # Giving the same selector name to two different MiniClusters will + # put them under this same network + jobSelector: connected-service + + # We don't actually need this, but it gives the two MiniCluster a shared filesystem + volumes: + data: + storageClass: hostpath + path: /tmp/workflow + + # This is a list because a pod can support multiple containers + containers: + - image: ghcr.io/rse-ops/flub:flux-sched-bionic + workingDir: /tmp/workflow + + # Container will be pre-pulled here only by the broker + volumes: + data: + path: /tmp/workflow + + # I forgot to add the flux user, oups + commands: + pre: useradd -ms /bin/bash -u 1234 flux diff --git a/examples/experimental/flux-bootstrap/minicluster-2nd.yaml b/examples/experimental/flux-bootstrap/minicluster-2nd.yaml new file mode 100755 index 00000000..2c6f9ce8 --- /dev/null +++ b/examples/experimental/flux-bootstrap/minicluster-2nd.yaml @@ -0,0 +1,45 @@ +apiVersion: flux-framework.org/v1alpha1 +kind: MiniCluster +metadata: + name: flux-sample-2 + namespace: flux-operator +spec: + # Number of pods to create for MiniCluster + size: 2 + + # This tells the MiniCluster to be on a specific named network. + # Giving the same selector name to two different MiniClusters will + # put them under this same network + jobSelector: connected-service + + # Broker options for connecting to another MiniCluster - both of these are required! + flux: + + # Allow connection to the minicluster-1st brokers + connection: flux-sample-1 + + # The number of nodes the first server has + connectionSize: 2 + + # We don't actually need this, but it gives the two MiniCluster a shared filesystem + volumes: + data: + storageClass: hostpath + path: /tmp/workflow + + # Make this interactive so we can launch a bunch of jobs! + interactive: true + + # This is a list because a pod can support multiple containers + containers: + - image: ghcr.io/rse-ops/flub:flux-sched-bionic + workingDir: /tmp/workflow + + # Container will be pre-pulled here only by the broker + volumes: + data: + path: /tmp/workflow + + # I forgot to add the flux user, oups + commands: + pre: useradd -ms /bin/bash -u 1234 flux \ No newline at end of file diff --git a/examples/nested/k3s/basic/README.md b/examples/experimental/nested/k3s/basic/README.md similarity index 100% rename from examples/nested/k3s/basic/README.md rename to examples/experimental/nested/k3s/basic/README.md diff --git a/examples/nested/k3s/basic/kubeconfig.yaml b/examples/experimental/nested/k3s/basic/kubeconfig.yaml similarity index 100% rename from examples/nested/k3s/basic/kubeconfig.yaml rename to examples/experimental/nested/k3s/basic/kubeconfig.yaml diff --git a/examples/nested/k3s/basic/minicluster.yaml b/examples/experimental/nested/k3s/basic/minicluster.yaml similarity index 100% rename from examples/nested/k3s/basic/minicluster.yaml rename to examples/experimental/nested/k3s/basic/minicluster.yaml diff --git a/examples/nested/k3s/basic/my-echo.yaml b/examples/experimental/nested/k3s/basic/my-echo.yaml similarity index 100% rename from examples/nested/k3s/basic/my-echo.yaml rename to examples/experimental/nested/k3s/basic/my-echo.yaml diff --git a/examples/nested/k3s/basic/start.sh b/examples/experimental/nested/k3s/basic/start.sh similarity index 100% rename from examples/nested/k3s/basic/start.sh rename to examples/experimental/nested/k3s/basic/start.sh diff --git a/examples/workflows/tree-with-variables/README.md b/examples/experimental/workflows/tree-with-variables/README.md similarity index 100% rename from examples/workflows/tree-with-variables/README.md rename to examples/experimental/workflows/tree-with-variables/README.md diff --git a/examples/workflows/tree-with-variables/minicluster.yaml b/examples/experimental/workflows/tree-with-variables/minicluster.yaml similarity index 100% rename from examples/workflows/tree-with-variables/minicluster.yaml rename to examples/experimental/workflows/tree-with-variables/minicluster.yaml diff --git a/examples/workflows/tree-with-variables/run-on-instance.sh b/examples/experimental/workflows/tree-with-variables/run-on-instance.sh similarity index 100% rename from examples/workflows/tree-with-variables/run-on-instance.sh rename to examples/experimental/workflows/tree-with-variables/run-on-instance.sh diff --git a/examples/experimental/workflows/tree-with-variables/tree.1.1-output.txt b/examples/experimental/workflows/tree-with-variables/tree.1.1-output.txt new file mode 100644 index 00000000..040b584a --- /dev/null +++ b/examples/experimental/workflows/tree-with-variables/tree.1.1-output.txt @@ -0,0 +1,5 @@ +FLUX_TREE_ID tree.1.1 +FLUX_TREE_JOBSCRIPT_INDEX 1 +FLUX_TREE_NNODES 1 +FLUX_TREE_NCORES_PER_NODE 4 +FLUX_TREE_NGPUS_PER_NODE 0 diff --git a/examples/experimental/workflows/tree-with-variables/tree.1.2-output.txt b/examples/experimental/workflows/tree-with-variables/tree.1.2-output.txt new file mode 100644 index 00000000..86ca6135 --- /dev/null +++ b/examples/experimental/workflows/tree-with-variables/tree.1.2-output.txt @@ -0,0 +1,5 @@ +FLUX_TREE_ID tree.1.2 +FLUX_TREE_JOBSCRIPT_INDEX 1 +FLUX_TREE_NNODES 1 +FLUX_TREE_NCORES_PER_NODE 4 +FLUX_TREE_NGPUS_PER_NODE 0 diff --git a/examples/experimental/workflows/tree-with-variables/tree.2.1-output.txt b/examples/experimental/workflows/tree-with-variables/tree.2.1-output.txt new file mode 100644 index 00000000..7aafbee7 --- /dev/null +++ b/examples/experimental/workflows/tree-with-variables/tree.2.1-output.txt @@ -0,0 +1,5 @@ +FLUX_TREE_ID tree.2.1 +FLUX_TREE_JOBSCRIPT_INDEX 1 +FLUX_TREE_NNODES 1 +FLUX_TREE_NCORES_PER_NODE 4 +FLUX_TREE_NGPUS_PER_NODE 0 diff --git a/examples/experimental/workflows/tree-with-variables/tree.2.2-output.txt b/examples/experimental/workflows/tree-with-variables/tree.2.2-output.txt new file mode 100644 index 00000000..647e3cbb --- /dev/null +++ b/examples/experimental/workflows/tree-with-variables/tree.2.2-output.txt @@ -0,0 +1,5 @@ +FLUX_TREE_ID tree.2.2 +FLUX_TREE_JOBSCRIPT_INDEX 1 +FLUX_TREE_NNODES 1 +FLUX_TREE_NCORES_PER_NODE 4 +FLUX_TREE_NGPUS_PER_NODE 0 diff --git a/examples/workflows/tree/README.md b/examples/experimental/workflows/tree/README.md similarity index 100% rename from examples/workflows/tree/README.md rename to examples/experimental/workflows/tree/README.md diff --git a/examples/workflows/tree/minicluster.yaml b/examples/experimental/workflows/tree/minicluster.yaml similarity index 100% rename from examples/workflows/tree/minicluster.yaml rename to examples/experimental/workflows/tree/minicluster.yaml diff --git a/examples/experimental/workflows/tree/post-run.sh b/examples/experimental/workflows/tree/post-run.sh new file mode 100644 index 00000000..64ca5cdf --- /dev/null +++ b/examples/experimental/workflows/tree/post-run.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +echo "Checking log in minikube" +minikube ssh -- cat /tmp/data/tree.out +echo "Cleaning up /tmp/data in minikube" +minikube ssh -- sudo rm -rf /tmp/data + diff --git a/examples/experimental/workflows/tree/pre-run.sh b/examples/experimental/workflows/tree/pre-run.sh new file mode 100644 index 00000000..88b7f540 --- /dev/null +++ b/examples/experimental/workflows/tree/pre-run.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +HERE=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +TESTS=$(dirname ${HERE}) + +# The "data" volume will be mounted at /mnt/data +minikube ssh -- mkdir -p /tmp/data \ No newline at end of file diff --git a/sdk/python/v1alpha1/docs/FluxSpec.md b/sdk/python/v1alpha1/docs/FluxSpec.md index 79cc207c..27995cbf 100644 --- a/sdk/python/v1alpha1/docs/FluxSpec.md +++ b/sdk/python/v1alpha1/docs/FluxSpec.md @@ -5,6 +5,8 @@ Name | Type | Description | Notes ------------ | ------------- | ------------- | ------------- **connect_timeout** | **str** | Single user executable to provide to flux start | [optional] [default to '5s'] +**connection** | **str** | Connect to this job in the same namespace (akin to BootServer but within cluster) | [optional] +**connection_size** | **int** | Additional number of nodes to allow from external boot-server This currently only allows local MiniCluster but could be extended to any general URI | [optional] **install_root** | **str** | Install root location | [optional] [default to '/usr'] **log_level** | **int** | Log level to use for flux logging (only in non TestMode) | [optional] [default to 6] **option_flags** | **str** | Flux option flags, usually provided with -o optional - if needed, default option flags for the server These can also be set in the user interface to override here. This is only valid for a FluxRunner \"runFlux\" true | [optional] [default to ''] diff --git a/sdk/python/v1alpha1/docs/MiniClusterSpec.md b/sdk/python/v1alpha1/docs/MiniClusterSpec.md index 3ea154ef..98b0305a 100644 --- a/sdk/python/v1alpha1/docs/MiniClusterSpec.md +++ b/sdk/python/v1alpha1/docs/MiniClusterSpec.md @@ -13,9 +13,11 @@ Name | Type | Description | Notes **flux_restful** | [**FluxRestful**](FluxRestful.md) | | [optional] **interactive** | **bool** | Run a single-user, interactive minicluster | [optional] [default to False] **job_labels** | **dict(str, str)** | Labels for the job | [optional] +**job_selector** | **str** | Specify the name of the job selector. You would want to do this if you intend to connect miniclusters Multiple jobs can be selected under a single services | [optional] [default to ''] **logging** | [**LoggingSpec**](LoggingSpec.md) | | [optional] **max_size** | **int** | MaxSize (maximum number of pods to allow scaling to) | [optional] **pod** | [**PodSpec**](PodSpec.md) | | [optional] +**service_name** | **str** | Specify the name of the cluster service | [optional] [default to 'flux-service'] **services** | [**list[MiniClusterContainer]**](MiniClusterContainer.md) | Services are one or more service containers to bring up alongside the MiniCluster. | [optional] **size** | **int** | Size (number of job pods to run, size of minicluster in pods) This is also the minimum number required to start Flux | [optional] [default to 1] **tasks** | **int** | Total number of CPUs being run across entire cluster | [optional] [default to 1] diff --git a/sdk/python/v1alpha1/fluxoperator/models/flux_spec.py b/sdk/python/v1alpha1/fluxoperator/models/flux_spec.py index 5f0d64f5..d5d1fe98 100644 --- a/sdk/python/v1alpha1/fluxoperator/models/flux_spec.py +++ b/sdk/python/v1alpha1/fluxoperator/models/flux_spec.py @@ -34,6 +34,8 @@ class FluxSpec(object): """ openapi_types = { 'connect_timeout': 'str', + 'connection': 'str', + 'connection_size': 'int', 'install_root': 'str', 'log_level': 'int', 'option_flags': 'str', @@ -42,19 +44,23 @@ class FluxSpec(object): attribute_map = { 'connect_timeout': 'connectTimeout', + 'connection': 'connection', + 'connection_size': 'connectionSize', 'install_root': 'installRoot', 'log_level': 'logLevel', 'option_flags': 'optionFlags', 'wrap': 'wrap' } - def __init__(self, connect_timeout='5s', install_root='/usr', log_level=6, option_flags='', wrap=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, connect_timeout='5s', connection=None, connection_size=None, install_root='/usr', log_level=6, option_flags='', wrap=None, local_vars_configuration=None): # noqa: E501 """FluxSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration.get_default_copy() self.local_vars_configuration = local_vars_configuration self._connect_timeout = None + self._connection = None + self._connection_size = None self._install_root = None self._log_level = None self._option_flags = None @@ -63,6 +69,10 @@ def __init__(self, connect_timeout='5s', install_root='/usr', log_level=6, optio if connect_timeout is not None: self.connect_timeout = connect_timeout + if connection is not None: + self.connection = connection + if connection_size is not None: + self.connection_size = connection_size if install_root is not None: self.install_root = install_root if log_level is not None: @@ -95,6 +105,52 @@ def connect_timeout(self, connect_timeout): self._connect_timeout = connect_timeout + @property + def connection(self): + """Gets the connection of this FluxSpec. # noqa: E501 + + Connect to this job in the same namespace (akin to BootServer but within cluster) # noqa: E501 + + :return: The connection of this FluxSpec. # noqa: E501 + :rtype: str + """ + return self._connection + + @connection.setter + def connection(self, connection): + """Sets the connection of this FluxSpec. + + Connect to this job in the same namespace (akin to BootServer but within cluster) # noqa: E501 + + :param connection: The connection of this FluxSpec. # noqa: E501 + :type connection: str + """ + + self._connection = connection + + @property + def connection_size(self): + """Gets the connection_size of this FluxSpec. # noqa: E501 + + Additional number of nodes to allow from external boot-server This currently only allows local MiniCluster but could be extended to any general URI # noqa: E501 + + :return: The connection_size of this FluxSpec. # noqa: E501 + :rtype: int + """ + return self._connection_size + + @connection_size.setter + def connection_size(self, connection_size): + """Sets the connection_size of this FluxSpec. + + Additional number of nodes to allow from external boot-server This currently only allows local MiniCluster but could be extended to any general URI # noqa: E501 + + :param connection_size: The connection_size of this FluxSpec. # noqa: E501 + :type connection_size: int + """ + + self._connection_size = connection_size + @property def install_root(self): """Gets the install_root of this FluxSpec. # noqa: E501 diff --git a/sdk/python/v1alpha1/fluxoperator/models/mini_cluster_spec.py b/sdk/python/v1alpha1/fluxoperator/models/mini_cluster_spec.py index c693e0f5..4dc737af 100644 --- a/sdk/python/v1alpha1/fluxoperator/models/mini_cluster_spec.py +++ b/sdk/python/v1alpha1/fluxoperator/models/mini_cluster_spec.py @@ -41,9 +41,11 @@ class MiniClusterSpec(object): 'flux_restful': 'FluxRestful', 'interactive': 'bool', 'job_labels': 'dict(str, str)', + 'job_selector': 'str', 'logging': 'LoggingSpec', 'max_size': 'int', 'pod': 'PodSpec', + 'service_name': 'str', 'services': 'list[MiniClusterContainer]', 'size': 'int', 'tasks': 'int', @@ -60,9 +62,11 @@ class MiniClusterSpec(object): 'flux_restful': 'fluxRestful', 'interactive': 'interactive', 'job_labels': 'jobLabels', + 'job_selector': 'jobSelector', 'logging': 'logging', 'max_size': 'maxSize', 'pod': 'pod', + 'service_name': 'serviceName', 'services': 'services', 'size': 'size', 'tasks': 'tasks', @@ -70,7 +74,7 @@ class MiniClusterSpec(object): 'volumes': 'volumes' } - def __init__(self, archive=None, cleanup=False, containers=None, deadline_seconds=31500000, flux=None, flux_restful=None, interactive=False, job_labels=None, logging=None, max_size=None, pod=None, services=None, size=1, tasks=1, users=None, volumes=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, archive=None, cleanup=False, containers=None, deadline_seconds=31500000, flux=None, flux_restful=None, interactive=False, job_labels=None, job_selector='', logging=None, max_size=None, pod=None, service_name='flux-service', services=None, size=1, tasks=1, users=None, volumes=None, local_vars_configuration=None): # noqa: E501 """MiniClusterSpec - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration.get_default_copy() @@ -84,9 +88,11 @@ def __init__(self, archive=None, cleanup=False, containers=None, deadline_second self._flux_restful = None self._interactive = None self._job_labels = None + self._job_selector = None self._logging = None self._max_size = None self._pod = None + self._service_name = None self._services = None self._size = None self._tasks = None @@ -109,12 +115,16 @@ def __init__(self, archive=None, cleanup=False, containers=None, deadline_second self.interactive = interactive if job_labels is not None: self.job_labels = job_labels + if job_selector is not None: + self.job_selector = job_selector if logging is not None: self.logging = logging if max_size is not None: self.max_size = max_size if pod is not None: self.pod = pod + if service_name is not None: + self.service_name = service_name if services is not None: self.services = services if size is not None: @@ -306,6 +316,29 @@ def job_labels(self, job_labels): self._job_labels = job_labels + @property + def job_selector(self): + """Gets the job_selector of this MiniClusterSpec. # noqa: E501 + + Specify the name of the job selector. You would want to do this if you intend to connect miniclusters Multiple jobs can be selected under a single services # noqa: E501 + + :return: The job_selector of this MiniClusterSpec. # noqa: E501 + :rtype: str + """ + return self._job_selector + + @job_selector.setter + def job_selector(self, job_selector): + """Sets the job_selector of this MiniClusterSpec. + + Specify the name of the job selector. You would want to do this if you intend to connect miniclusters Multiple jobs can be selected under a single services # noqa: E501 + + :param job_selector: The job_selector of this MiniClusterSpec. # noqa: E501 + :type job_selector: str + """ + + self._job_selector = job_selector + @property def logging(self): """Gets the logging of this MiniClusterSpec. # noqa: E501 @@ -371,6 +404,29 @@ def pod(self, pod): self._pod = pod + @property + def service_name(self): + """Gets the service_name of this MiniClusterSpec. # noqa: E501 + + Specify the name of the cluster service # noqa: E501 + + :return: The service_name of this MiniClusterSpec. # noqa: E501 + :rtype: str + """ + return self._service_name + + @service_name.setter + def service_name(self, service_name): + """Sets the service_name of this MiniClusterSpec. + + Specify the name of the cluster service # noqa: E501 + + :param service_name: The service_name of this MiniClusterSpec. # noqa: E501 + :type service_name: str + """ + + self._service_name = service_name + @property def services(self): """Gets the services of this MiniClusterSpec. # noqa: E501