Commit 7553162

Merge remote-tracking branch 'upstream/main' into traefik-fallback-service

2 parents: 5d3f211 + 6df102b

File tree: 99 files changed, +1646 -1752 lines

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 3 additions & 1 deletion
@@ -18,6 +18,8 @@
 - [ ] Service has placement constraints or is global
 - [ ] Service is restartable
 - [ ] Service restart is zero-downtime
+- [ ] Service is monitored (via prometheus and grafana)
 - [ ] Service is not bound to one specific node (e.g. via files or volumes)
 - [ ] Relevant OPS E2E Test are added
-- [ ] Service's Public URL is included in maintenance mode -->
+- [ ] Service's Public URL is included in maintenance mode
+- [ ] Service's Public URL is included in testing mode -->

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ docs/_build
 /services/monitoring/pgsql_query_exporter_config.yaml
 /services/monitoring/docker-compose.yml
 /services/monitoring/smokeping_prober_config.yaml
-
+services/monitoring/tempo_config.yaml
 
 # Simcore: Contains location of repo.config file on the machine and of the whole config directory
 .config.location

.pre-commit-config.yaml

Lines changed: 4 additions & 0 deletions
@@ -79,6 +79,10 @@ repos:
     hooks:
       - id: shellcheck
         name: Shell scripts conform to shellcheck
+  - repo: https://github.com/antonbabenko/pre-commit-terraform
+    rev: v1.89.1 # Get the latest from: https://github.com/antonbabenko/pre-commit-terraform/releases
+    hooks:
+      - id: terraform_fmt
   - repo: local
     hooks:
       - id: run-pylint
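To try the new Terraform formatting hook locally, a minimal sketch (it assumes `pre-commit` and the `terraform` binary are already installed on the machine):

```
# Install the git hooks defined in .pre-commit-config.yaml
pre-commit install

# Run only the terraform_fmt hook against every file in the repo
pre-commit run terraform_fmt --all-files
```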

Makefile

Lines changed: 0 additions & 1 deletion
@@ -71,7 +71,6 @@ down-maintenance: ## Stop the maintenance mode
 	fi \
 ,)
 
-
 # Misc: info & clean
 .PHONY: info info-vars info-local
 info: ## Displays some important info

charts/Makefile

Lines changed: 0 additions & 1 deletion
@@ -49,7 +49,6 @@ helmfile-sync: .check-helmfile-installed helmfile.yaml ## Syncs the helmfile con
 	$(MAKE) -s .helmfile-local-post-install; \
 	fi
 
-
 .PHONY: configure-local-hosts
 configure-local-hosts: ## Adds local hosts entries for the machine
 	@echo "Adding $(MACHINE_FQDN) hosts to /etc/hosts ..."
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+## How to delete volumes with `reclaimPolicy: retain`
+1. Delete the PVC:
+```
+kubectl delete pvc <pvc-name>
+```
+
+2. Verify the PV is `Released`:
+```
+kubectl get pv <pv-name>
+```
+
+3. Manually remove the EBS volume in AWS:
+    1. Go to the AWS console and list EBS volumes
+    1. Filter by tag `ebs.csi.aws.com/cluster=true`
+    1. Identify the volume associated with your PV (check the `kubernetes.io/created-for/pv/name` tag of the EBS volume)
+    1. Verify that the EBS volume is `Available`
+    1. Delete the EBS volume
+
+4. Delete the PV:
+```
+kubectl delete pv <pv-name>
+```
+
+5. Remove finalizers (if necessary).
+If the PV remains in a `Terminating` state, remove its finalizers:
+```
+kubectl patch pv <pv-name> -p '{"metadata":{"finalizers":null}}'
+```
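For step 3, the AWS console lookup can also be done from the command line; a minimal sketch, assuming the AWS CLI is configured for the right account and region and that `<pv-name>` is the PV you just released:

```
# Find the EBS volume that backed the PV (the EBS CSI driver tags it)
aws ec2 describe-volumes \
  --filters "Name=tag:kubernetes.io/created-for/pv/name,Values=<pv-name>" \
  --query "Volumes[].{Id:VolumeId,State:State}"

# Delete the volume once its state is "available"
aws ec2 delete-volume --volume-id <volume-id>
```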

charts/aws-ebs-csi-driver/values.yaml.gotmpl

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ image:
   tag: "v1.38.1"
 
 storageClasses:
-  - name: "ebs-sc"
+  - name: "{{ .Values.ebsStorageClassName }}"
     parameters:
       type: "gp3"
     allowVolumeExpansion: true
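Since the storage class name is now templated, it can be worth rendering the chart locally to confirm what `{{ .Values.ebsStorageClassName }}` resolves to; a sketch, where the `name=aws-ebs-csi-driver` selector is an assumed release name:

```
# Render the release and inspect the resulting StorageClass manifest
helmfile -l name=aws-ebs-csi-driver template | grep -B1 -A5 "kind: StorageClass"
```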

charts/longhorn/README.md

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+# Longhorn (LH) Knowledge Base
+
+### Can LH be used for critical services (e.g., Databases)?
+
+No. We should not use it for volumes of critical services.
+
+As of now, we should avoid using LH for critical services. Instead, we should rely on easier-to-maintain solutions (e.g., application-level replication [Postgres Operators], S3, etc.). Once we have hands-on experience, extensive monitoring, and the ability to scale LH, we can consider using it for critical services.
+
+LH uses networking to keep replicas in sync, and IO-heavy workloads may easily overload it, leading to unpredictable consequences. Until we can extensively monitor LH and scale it properly on demand, it should not be used for critical or IO-heavy services.
+
+### How does LH decide which node's disk to use as storage?
+
+It depends on the configuration. There are three possibilities, described here:
+* https://longhorn.io/kb/tip-only-use-storage-on-a-set-of-nodes/
+
+When using the `Create Default Disk on Labeled Nodes` option, it relies on the `node.longhorn.io/create-default-disk` Kubernetes node label.
+
+Source: https://longhorn.io/docs/1.8.1/nodes-and-volumes/nodes/default-disk-and-node-config/#customizing-default-disks-for-new-nodes
+
+### Will LH pick up storage from a newly added node?
+
+By default, LH will use storage on all nodes (including newly created ones) where it runs. If `createDefaultDiskLabeledNodes` is configured, it depends on the label of the node.
+
+Sources:
+* https://longhorn.io/kb/tip-only-use-storage-on-a-set-of-nodes/
+* https://longhorn.io/docs/1.8.1/nodes-and-volumes/nodes/default-disk-and-node-config/#customizing-default-disks-for-new-nodes
+
+### Can workloads be run on nodes where LH is not installed?
+
+Workloads can run on nodes without LH as long as LH is not restricted to specific nodes via the `nodeSelector` or `systemManagedComponentsNodeSelector` settings. If LH is configured to run on specific nodes, workloads can only run on those nodes.
+
+Note: There is an [ongoing bug](https://github.com/longhorn/longhorn/discussions/7312#discussioncomment-13030581) where LH raises warnings when workloads run on nodes without LH. However, it still functions correctly.
+
+Source: https://longhorn.io/kb/tip-only-use-storage-on-a-set-of-nodes/
+
+### Adding new volumes to (PVs that rely on) LH
+
+Monitor carefully whether LH is capable of handling new volumes. Test the new volume under load (when many read/write operations occur) and ensure LH does not fail due to insufficient resource capacity (e.g., network or CPU). You can also consult the performance section of this README.
+
+LH's minimum recommended resource requirements:
+* https://longhorn.io/docs/1.8.1/best-practices/#minimum-recommended-hardware
+
+### LH's performance / resources
+
+Insights into LH's performance:
+* https://longhorn.io/blog/performance-scalability-report-aug-2020/
+* https://github.com/longhorn/longhorn/wiki/Performance-Benchmark
+
+Resource requirements:
+* https://github.com/longhorn/longhorn/issues/1691
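Regarding the `Create Default Disk on Labeled Nodes` question above, a minimal sketch of opting a node in via that label (it assumes the plain `true` value is enough for your setup; Longhorn also accepts `config` together with a disk-config annotation):

```
# Let Longhorn create its default disk on this node
kubectl label node <node-name> node.longhorn.io/create-default-disk=true

# Show which nodes currently carry the label
kubectl get nodes -L node.longhorn.io/create-default-disk
```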

charts/longhorn/values.yaml.gotmpl

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+# Values documentation:
+# https://github.com/longhorn/longhorn/tree/v1.8.1/chart#values
+
+global:
+  # Warning: updating node selectors (after installation) will cause downtime
+  # https://longhorn.io/docs/archives/1.2.2/advanced-resources/deploy/node-selector/#setting-up-node-selector-after-longhorn-has-been-installed
+  #
+  # Warning: using node selectors will restrict our workloads to the same nodes
+  # https://longhorn.io/kb/tip-only-use-storage-on-a-set-of-nodes/#deploy-longhorn-components-only-on-a-specific-set-of-nodes
+  nodeSelector: {}
+  systemManagedComponentsNodeSelector: {}
+
+defaultSettings:
+  replicaAutoBalance: best-effort
+
+  # control on which nodes LH will use disks
+  # use the `node.longhorn.io/create-default-disk` node label for control
+  createDefaultDiskLabeledNodes: true
+  # use a dedicated folder (disk) for storage
+  defaultDataPath: /longhorn
+
+  # https://longhorn.io/docs/1.8.1/best-practices/#minimal-available-storage-and-over-provisioning
+  storageMinimalAvailablePercentage: 10
+
+  # Prevent LH deletion. Set to true if you want to delete LH
+  deletingConfirmationFlag: false
+
+  # allow replicas to be scheduled on the same node
+  replicaSoftAntiAffinity: false
+
+  # we always use dedicated disks. 5% is a good value
+  storageReservedPercentageForDefaultDisk: 5
+
+persistence:
+  # use only for non-critical ops workloads
+  # for critical workloads (e.g. database)
+  # use application replication (e.g. postgres HA operator)
+  defaultClass: false
+
+  # https://longhorn.io/docs/1.8.1/best-practices/#io-performance
+  defaultDataLocality: best-effort
+  defaultClassReplicaCount: 2
+
+  # minimum volume size is 300Mi
+  # https://github.com/longhorn/longhorn/issues/8488
+  defaultFsType: xfs
+
+resources: # https://longhorn.io/docs/1.8.1/best-practices/#minimum-recommended-hardware
+  requests:
+    cpu: 0.5
+    memory: 128Mi
+  limits:
+    cpu: 4
+    memory: 4Gi
+
+ingress:
+  enabled: true
+  className: ""
+  annotations:
+    namespace: {{ .Release.Namespace }}
+    cert-manager.io/cluster-issuer: "cert-issuer"
+    traefik.ingress.kubernetes.io/router.entrypoints: websecure
+    traefik.ingress.kubernetes.io/router.middlewares: traefik-traefik-basic-auth@kubernetescrd,traefik-longhorn-strip-prefix@kubernetescrd # namespace + middleware name
+  tls: true
+  tlsSecret: monitoring-tls
+  host: {{ requiredEnv "K8S_MONITORING_FQDN" }}
+  path: /longhorn
+  pathType: Prefix
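Because `persistence.defaultClass` is `false`, workloads must request Longhorn storage explicitly. A minimal sketch of such a PVC (the claim name is hypothetical, and it assumes the chart keeps its default storage class name `longhorn`):

```
kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: example-longhorn-claim   # hypothetical name
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn     # must be set explicitly, since defaultClass is false
  resources:
    requests:
      storage: 1Gi               # above the 300Mi minimum noted in the values comments
EOF
```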
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 persistence:
   enabled: true
   size: "1Gi" # minimal size for gp3 is 1Gi
-  storageClass: "ebs-sc"
+  storageClass: "{{ .Values.ebsStorageClassName }}"
