#25 removed plugins for grafana, removed labels for service monitors, disable serviceMonitorSelectorNilUsesHelmValues

khalezin · khalezin · commit a561bee65fd8 · 2021-04-26T13:13:22.000+03:00
diff --git a/README-RU.md b/README-RU.md
@@ -22,11 +22,17 @@
 
 ## Описание
 
-В данном репозитории собраны наработки команды Mad Devs для быстрого развертывания Kubernetes кластера, вспомогательных сервисов и нижележащей инфраструктуры в облаке Amazon. Основным инструментом разработки и поставки является [terraform](https://www.terraform.io/)
+В данном репозитории собраны наработки команды Mad Devs для быстрого развертывания Kubernetes кластера, вспомогательных сервисов и нижележащей инфраструктуры в облаке Amazon. Основным инструментом разработки и поставки является [terraform](https://www.terraform.io/).
 
-За время работы компании мы перепробовали много инфраструктурных решений и сервисов, и прошли путь от on-premise железа до serverless. В итоге на текущий момент нашей стандартной платформой для развертывания приложений стал Kubernetes, а основным облаком - AWS. Тут стоит отметить, что несмотря на то, что 90% наших и клиентских проектов хостится на AWS, а в качестве Kubernetes платформы используется [AWS EKS](https://aws.amazon.com/eks/), мы не упираемся рогом, не тащим все подряд в Kubernetes и не заставляем хостится в AWS. Kubernetes предлагается только после сбора и анализа требований к архитектуре сервиса. А далее при выборе Kubernetes - приложениям почти не важно, как создан сам кластер - вручную, через kops или используя managed услуги облачных провайдеров - в своей основе платформа Kubernetes везде одинакова. И выбор конкретного провайдера уже складывается из дополнительный требований, экспертизы и т.д.
+За время работы компании мы перепробовали много инфраструктурных решений и сервисов, и прошли путь от on-premise железа до serverless. В итоге на текущий момент нашей стандартной платформой для развертывания приложений стал Kubernetes, а основным облаком - AWS. 
 
-Мы знаем, что текущая реализация далеко не идеальна. Например, в кластер мы деплоим сервисы с помощью `terraform` - это довольно топорно и против подходов кубера, но это удобно для бутстрапа - т.к. используя стейт и интерполяцию, мы передаем необходимые `ids`, `arns` и другие указатели на ресурсы и имена или секреты в шаблоны и генерим из них `values` для нужных чартов, не выходя за пределы терраформа. Есть более специфичные минусы: ресурсы `data "template_file"`, которые мы использовали для большинства шаблонов, крайне неудобны для разработки и отладки, особенно если это такие 500+ строчные рулоны, типа `terraform/layer2-k8s/templates/elk-values.yaml`. Также, смотря на `helm3` и избавление от `tiller` - большое количество helm-релизов все равно в какой-то момент приводит к зависанию плана. Частично, но не всегда решается путем таргетированного апплая `terraform apply -target`, но для консистентности стейта желательно выполнять `plan` и `apply` целиком на всей конфигурации. Если собираетесь использовать данный бойлер, желательно разбить слой `terraform/layer2-k8s` на несколько, вынеся крупные и комплексные релизы в отдельные подслои.
+Тут стоит отметить, что несмотря на то, что 90% наших и клиентских проектов хостится на AWS, а в качестве Kubernetes платформы используется [AWS EKS](https://aws.amazon.com/eks/), мы не упираемся рогом, не тащим все подряд в Kubernetes и не заставляем хоститься в AWS. Kubernetes предлагается только после сбора и анализа требований к архитектуре сервиса.
+
+А далее при выборе Kubernetes - приложениям почти не важно, как создан сам кластер - вручную, через kops или используя managed услуги облачных провайдеров - в своей основе платформа Kubernetes везде одинакова. И выбор конкретного провайдера уже складывается из дополнительных требований, экспертизы и т.д.
+
+Мы знаем, что текущая реализация далеко не идеальна. Например, в кластер мы деплоим сервисы с помощью `terraform` - это довольно топорно и против подходов кубера, но это удобно для бутстрапа - т.к. используя стейт и интерполяцию, мы передаем необходимые `ids`, `arns` и другие указатели на ресурсы и имена или секреты в шаблоны и генерим из них `values` для нужных чартов, не выходя за пределы терраформа. 
+
+Есть более специфичные минусы: ресурсы `data "template_file"`, которые мы использовали для большинства шаблонов, крайне неудобны для разработки и отладки, особенно если это такие 500+ строчные рулоны, типа `terraform/layer2-k8s/templates/elk-values.yaml`. Также, несмотря на `helm3` и избавление от `tiller`, большое количество helm-релизов все равно в какой-то момент приводит к зависанию плана. Частично, но не всегда решается путем таргетированного апплая `terraform apply -target`, но для консистентности стейта желательно выполнять `plan` и `apply` целиком на всей конфигурации. Если собираетесь использовать данный бойлер, желательно разбить слой `terraform/layer2-k8s` на несколько, вынеся крупные и комплексные релизы в отдельные подслои.
 
 Могут возникнуть справедливые вопросы к количеству `.tf` файлов. Оно конечно просится на рефакторинг и "обмодуливание". Чем мы и займемся в ближайшее время, разбивая этот монолит на микромодули и вводя `terragrunt`, попутно решая озвученные проблемы выше.
 
diff --git a/README.md b/README.md
@@ -22,13 +22,19 @@
 
 ## Description
 
-This repository contains the know-how of the Mad Devs team for the rapid deployment of a Kubernetes cluster, supporting services, and the underlying infrastructure in the Amazon cloud. The main development and delivery tool is [terraform](https://www.terraform.io/)
+This repository contains the know-how of the Mad Devs team for the rapid deployment of a Kubernetes cluster, supporting services, and the underlying infrastructure in the Amazon cloud. The main development and delivery tool is [terraform](https://www.terraform.io/).
 
-In our company’s work, we have tried many infrastructure solutions and services and traveled the path from on-premise hardware to serverless. As of today, Kubernetes has become our standard platform for deploying applications, and AWS has become the main cloud. It is worth noting here that although 90% of our and our clients’ projects are hosted on AWS and [AWS EKS](https://aws.amazon.com/eks/) is used as the Kubernetes platform, we do not insist, do not drag everything to Kubernetes, and do not force anyone to be hosted on AWS. Kubernetes is offered only after the collection and analysis of service architecture requirements. And then, when choosing Kubernetes, it makes almost no difference to applications how the cluster itself is created—manually, through kops or using managed services from cloud providers—in essence, the Kubernetes platform is the same everywhere. So the choice of a particular provider is then made based on additional requirements, expertise, etc.
+In our company’s work, we have tried many infrastructure solutions and services and traveled the path from on-premise hardware to serverless. As of today, Kubernetes has become our standard platform for deploying applications, and AWS has become the main cloud.
 
-We know that the current implementation is far from being perfect. For example, we deploy services to the cluster using `terraform`: it is rather clumsy and against the Kuber approaches, but it is convenient for bootstrap because, by using state and interpolation, we convey proper `IDs`, `ARNs`, and other attributes to resources and names or secrets to templates and generate values ​​from them for the required charts all within terraform. There are more specific drawbacks: the `data "template_file"` resources that we used for most templates are extremely inconvenient for development and debugging, especially if there are 500+ line rolls like `terraform/layer2-k8s/templates/elk-values.yaml`. Also, despite `helm3` got rid of the `tiller`, a large number of helm releases still at some point leads to plan hanging. Partially, but not always, it can be solved by `terraform apply -target`, but for the consistency of the state, it is desirable to execute `plan` and `apply` on the entire configuration. If you are going to use this boilerplate, it is advisable to split the `terraform/layer2-k8s` layer into several ones, taking out large and complex releases into separate modules.
+It is worth noting here that although 90% of our and our clients’ projects are hosted on AWS and [AWS EKS](https://aws.amazon.com/eks/) is used as the Kubernetes platform, we do not insist, do not drag everything to Kubernetes, and do not force anyone to be hosted on AWS. Kubernetes is offered only after the collection and analysis of service architecture requirements. 
 
-You may reasonably question the number of `.tf` files. This monolith certainly should be refactored and split into many micro-modules adopting `terragrunt` approach. This is exactly what we will do in the near future, solving along the way the problems described above.
+And then, when choosing Kubernetes, it makes almost no difference to applications how the cluster itself is created—manually, through kops or using managed services from cloud providers—in essence, the Kubernetes platform is the same everywhere. So the choice of a particular provider is then made based on additional requirements, expertise, etc.
+
+We know that the current implementation is far from being perfect. For example, we deploy services to the cluster using `terraform`: it is rather clumsy and against the Kuber approaches, but it is convenient for bootstrap because, by using state and interpolation, we pass the required `IDs`, `ARNs`, and other resource attributes, names, or secrets to templates and generate `values` from them for the required charts, all within terraform.
+
+There are more specific drawbacks: the `data "template_file"` resources that we used for most templates are extremely inconvenient for development and debugging, especially if there are 500+ line rolls like `terraform/layer2-k8s/templates/elk-values.yaml`. Also, even though `helm3` got rid of `tiller`, a large number of helm releases still causes the plan to hang at some point. Partially, but not always, it can be solved by `terraform apply -target`, but for the consistency of the state, it is desirable to execute `plan` and `apply` on the entire configuration. If you are going to use this boilerplate, it is advisable to split the `terraform/layer2-k8s` layer into several ones, taking out large and complex releases into separate modules.
+
+You may reasonably question the number of `.tf` files. This monolith certainly should be refactored and split into many micro-modules adopting the `terragrunt` approach. This is exactly what we will do in the near future, solving along the way the problems described above.
 
 ## Table of contents
 
diff --git a/terraform/layer1-aws/aws-eks.tf b/terraform/layer1-aws/aws-eks.tf
@@ -1,6 +1,6 @@
 module "eks" {
   source  = "terraform-aws-modules/eks/aws"
-  version = "14.0.0"
+  version = "15.1.0"
 
   cluster_name    = local.name
   cluster_version = var.eks_cluster_version
diff --git a/terraform/layer1-aws/demo.tfvars.example b/terraform/layer1-aws/demo.tfvars.example
@@ -18,7 +18,7 @@ single_nat_gateway = true
 ##########
 # EKS
 ##########
-eks_cluster_version = "1.18"
+eks_cluster_version = "1.19"
 
 eks_worker_groups = {
   spot = {
diff --git a/terraform/layer1-aws/variables.tf b/terraform/layer1-aws/variables.tf
@@ -91,7 +91,7 @@ variable "single_nat_gateway" {
 
 # EKS
 variable "eks_cluster_version" {
-  default     = "1.18"
+  default     = "1.19"
   description = "Version of the EKS K8S cluster"
 }
 
diff --git a/terraform/layer2-k8s/README.md b/terraform/layer2-k8s/README.md
@@ -24,21 +24,18 @@
 | additional\_allowed\_ips | IP addresses allowed to connect to private resources | `list(any)` | `[]` | no |
 | alb\_ingress\_chart\_version | Version of alb-ingress helm chart | `string` | `"1.0.4"` | no |
 | alb\_ingress\_image\_tag | Tag of docker image for alb-ingress controller | `string` | `"v1.1.5"` | no |
-| alertmanager\_slack\_channel | Slack channel for alertmanager alerts | `string` | `"madops-demo-alerts"` | no |
 | allowed\_account\_ids | List of allowed AWS account IDs | `list` | `[]` | no |
 | aws\_node\_termination\_handler\_version | Version of aws-node-termination-handler helm chart | `string` | `"0.13.3"` | no |
 | calico\_daemonset | Version of calico helm chart | `string` | `"0.3.4"` | no |
 | cert\_manager\_version | Version of cert-manager helm chart | `string` | `"1.1.0"` | no |
-| cluster\_autoscaler\_version | Version of autoscaler helm chart | `string` | `"1.1.0"` | no |
+| cluster\_autoscaler\_chart\_version | Version of cluster autoscaler helm chart | `string` | `"9.9.2"` | no |
+| cluster\_autoscaler\_version | Version of cluster autoscaler | `string` | `"v1.19.0"` | no |
 | elk\_index\_retention\_days | Days before remove index from system elasticsearch | `number` | `14` | no |
 | elk\_snapshot\_retention\_days | Days to capture index in snapshot | `number` | `90` | no |
 | elk\_version | Version of ELK helm chart | `string` | `"7.8.0"` | no |
 | external\_dns\_version | Version of external-dns helm chart | `string` | `"4.9.4"` | no |
 | external\_secrets\_version | Version of external-secrets helm chart | `string` | `"6.3.0"` | no |
 | gitlab\_runner\_version | Version of gitlab runner helm chart | `string` | `"0.26.0"` | no |
-| grafana\_gitlab\_group | Gitlab group for grafana oauth | `string` | `"madops"` | no |
-| kibana\_gitlab\_group | Gitlab group for kibana oauth2 | `string` | `"madops"` | no |
-| loki\_datasource\_for\_prometheus\_stack | Enable Loki Datasource in prometheus stack chart | `bool` | `false` | no |
 | loki\_stack | Version of Loki Stack helm chart | `string` | `"2.3.1"` | no |
 | nginx\_ingress\_controller\_version | Version of nginx-ingress helm chart | `string` | `"3.23.0"` | no |
 | nginx\_ingress\_ssl\_terminator | Select SSL termination type | `string` | `"lb"` | no |
@@ -48,7 +45,7 @@
 | redis\_version | Version of redis helm chart | `string` | `"12.7.3"` | no |
 | region | Default infrastructure region | `string` | `"us-east-1"` | no |
 | reloader\_version | Version of reloader helm chart | `string` | `"0.0.81"` | no |
-| remote\_state\_bucket | Name of the bucket with the state | `string` | `"madops-terraform-state-us-east-1"` | no |
+| remote\_state\_bucket | Name of the bucket for terraform state | `string` | n/a | yes |
 | remote\_state\_key | Key of the remote state for terraform\_remote\_state | `string` | `"layer1-aws"` | no |
 
 ## Outputs
diff --git a/terraform/layer2-k8s/eks-cluster-autoscaler.tf b/terraform/layer2-k8s/eks-cluster-autoscaler.tf
@@ -14,14 +14,15 @@ data "template_file" "cluster_autoscaler" {
     role_arn     = module.aws_iam_autoscaler.role_arn
     region       = local.region
     cluster_name = local.eks_cluster_id
+    version      = var.cluster_autoscaler_version
   }
 }
 
 resource "helm_release" "cluster_autoscaler" {
-  name       = "cluster-autoscaler-chart"
-  chart      = "cluster-autoscaler-chart"
+  name       = "cluster-autoscaler"
+  chart      = "cluster-autoscaler"
   repository = local.helm_repo_cluster_autoscaler
-  version    = var.cluster_autoscaler_version
+  version    = var.cluster_autoscaler_chart_version
   namespace  = kubernetes_namespace.sys.id
 
   values = [
diff --git a/terraform/layer2-k8s/eks-loki-stack.tf b/terraform/layer2-k8s/eks-loki-stack.tf
@@ -22,6 +22,7 @@ resource "helm_release" "loki_stack" {
   values = [
     local.loki_stack_template
   ]
+
 }
 
 resource "random_string" "grafana_loki_password" {
diff --git a/terraform/layer2-k8s/templates/cluster-autoscaler-values.yaml b/terraform/layer2-k8s/templates/cluster-autoscaler-values.yaml
@@ -1,3 +1,6 @@
+image:
+  tag: ${version}
+
 awsRegion: ${region}
 
 rbac:
@@ -13,9 +16,6 @@ serviceMonitor:
   enabled: true
   interval: 10s
   namespace: monitoring
-  selector:
-    app: kube-prometheus-stack-operator
-    release: kube-prometheus-stack
   path: /metrics
 
 affinity:
@@ -27,4 +27,3 @@ affinity:
           operator: In
           values:
             - ondemand
-
diff --git a/terraform/layer2-k8s/templates/external-dns.yaml b/terraform/layer2-k8s/templates/external-dns.yaml
@@ -18,9 +18,6 @@ metrics:
   serviceMonitor:
     enabled: false
     namespace: monitoring
-    selector:
-      app: kube-prometheus-stack-operator
-      release: kube-prometheus-stack
 
 sources:
   - service
diff --git a/terraform/layer2-k8s/templates/loki-stack-values.tmpl b/terraform/layer2-k8s/templates/loki-stack-values.tmpl
@@ -15,9 +15,6 @@ loki:
   serviceMonitor:
     enabled: true
     interval: ""
-    additionalLabels:
-      app: kube-prometheus-stack-operator
-      release: kube-prometheus-stack
     annotations: {}
     scrapeTimeout: 10s
 
@@ -29,9 +26,6 @@ promtail:
   serviceMonitor:
     enabled: true
     interval: ""
-    additionalLabels:
-      app: kube-prometheus-stack-operator
-      release: kube-prometheus-stack
     annotations: {}
     # scrapeTimeout: 10s
   tolerations:
@@ -122,5 +116,6 @@ grafana:
   dashboards:
     logs:
       logs:
+        ## Dashboard for quickly searching application logs in Loki, using two datasources (Loki and Prometheus) - https://grafana.com/grafana/dashboards/12019
         url: https://s3.amazonaws.com/grafana-dashboards.maddevs.org/common/aws-eks-base/loki-dashboard-quick-search.json
 
diff --git a/terraform/layer2-k8s/templates/nginx-ingress-certmanager-ssl-termination-values.yaml b/terraform/layer2-k8s/templates/nginx-ingress-certmanager-ssl-termination-values.yaml
@@ -31,9 +31,6 @@ controller:
     enabled: true
     serviceMonitor:
       enabled: true
-      additionalLabels:
-        app: kube-prometheus-stack-operator
-        release: kube-prometheus-stack
 
   affinity:
     nodeAffinity:
diff --git a/terraform/layer2-k8s/templates/nginx-ingress-values.yaml b/terraform/layer2-k8s/templates/nginx-ingress-values.yaml
@@ -22,9 +22,6 @@ controller:
     enabled: true
     serviceMonitor:
       enabled: true
-      additionalLabels:
-        app: kube-prometheus-stack-operator
-        release: kube-prometheus-stack
 
   podAnnotations:
     co.elastic.logs/module: nginx
diff --git a/terraform/layer2-k8s/templates/prometheus-mysql-exporter.yaml b/terraform/layer2-k8s/templates/prometheus-mysql-exporter.yaml
@@ -1,8 +1,5 @@
 serviceMonitor:
   enabled: true
-  additionalLabels:
-    app: kube-prometheus-stack-operator
-    release: kube-prometheus-stack
 
 mysql:
   existingSecret: mysql-exporter
diff --git a/terraform/layer2-k8s/templates/prometheus-postgresql-exporter.tmpl b/terraform/layer2-k8s/templates/prometheus-postgresql-exporter.tmpl
diff --git a/terraform/layer2-k8s/templates/prometheus-values.tmpl b/terraform/layer2-k8s/templates/prometheus-values.tmpl
diff --git a/terraform/layer2-k8s/variables.tf b/terraform/layer2-k8s/variables.tf

Original file line number	Diff line number	Diff line change
`@@ -91,7 +91,7 @@ variable "single_nat_gateway" {`
`91`	`91`
`92`	`92`	`# EKS`
`93`	`93`	`variable "eks_cluster_version" {`
`94`		`- default = "1.18"`
	`94`	`+ default = "1.19"`
`95`	`95`	`description = "Version of the EKS K8S cluster"`
`96`	`96`	`}`
`97`	`97`
Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@ resource "helm_release" "loki_stack" {`
`22`	`22`	`values = [`
`23`	`23`	`local.loki_stack_template`
`24`	`24`	`]`
	`25`	`+`
`25`	`26`	`}`
`26`	`27`
`27`	`28`	`resource "random_string" "grafana_loki_password" {`