Skip to content

Commit 4c461b8

Browse files
authored
feat(k3s): multi cluster setup (#1111)
1 parent d9b1cad commit 4c461b8

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+3933
-451
lines changed

ansible/play-k3s--clickhouse.yml

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
---
# Prepare k3s cluster for ClickHouse deployment
#
# Applies:
# - sysctl tuning for ClickHouse performance
# - Storage symlink from DO Block Storage to /data/clickhouse
# - local-path provisioner config for /data/clickhouse
#
# Usage:
#   ansible-playbook -i inventory/digitalocean.yml play-k3s--clickhouse.yml \
#     -e variable_host=<group>
#
# Example:
#   uv run ansible-playbook -i inventory/digitalocean.yml play-k3s--clickhouse.yml -e variable_host=logs_k3s


# Play 1: Apply ClickHouse tuning to all nodes
- name: ClickHouse - System tuning
  hosts: '{{ variable_host }}'
  gather_facts: true
  become: true

  tasks:
    - name: Apply sysctl settings
      ansible.posix.sysctl:
        name: "{{ item.key }}"
        value: "{{ item.value }}"
        sysctl_set: true
        reload: true
      loop:
        # Memory settings
        - { key: vm.max_map_count, value: "262144" }
        - { key: vm.swappiness, value: "1" }
        - { key: vm.dirty_background_ratio, value: "5" }
        - { key: vm.dirty_ratio, value: "10" }
        # Network settings
        - { key: net.core.somaxconn, value: "65535" }
        - { key: net.core.netdev_max_backlog, value: "65535" }
        - { key: net.ipv4.tcp_max_syn_backlog, value: "65535" }
        - { key: net.ipv4.tcp_fin_timeout, value: "15" }
        - { key: net.ipv4.tcp_keepalive_time, value: "300" }
        - { key: net.ipv4.tcp_keepalive_intvl, value: "30" }
        - { key: net.ipv4.tcp_keepalive_probes, value: "5" }

    # Re-applied on every run (cheap, idempotent); a systemd unit below makes
    # the setting persist across reboots.
    - name: Disable transparent huge pages
      ansible.builtin.shell: |
        echo never > /sys/kernel/mm/transparent_hugepage/enabled
        echo never > /sys/kernel/mm/transparent_hugepage/defrag
      changed_when: false

    # pipefail so a failing findmnt is not masked by grep/head; bash is
    # required for pipefail (targets are Ubuntu per the usage notes).
    - name: Detect mounted block storage
      ansible.builtin.shell: |
        set -o pipefail
        findmnt -rno TARGET -t ext4 | grep '/mnt' | head -1
      args:
        executable: /bin/bash
      register: storage_mount
      changed_when: false
      failed_when: false

    - name: Fail if no block storage detected
      ansible.builtin.fail:
        msg: "No DO Block Storage found mounted at /mnt/*. Ensure volume is attached and mounted."
      when: storage_mount.stdout | length == 0

    # Plain grep, no shell features needed -> command module.
    - name: Verify mount is in fstab (persistent)
      ansible.builtin.command: grep -q "{{ storage_mount.stdout }}" /etc/fstab
      register: fstab_check
      changed_when: false
      failed_when: false

    - name: Warn if mount not in fstab
      ansible.builtin.debug:
        msg: "WARNING: {{ storage_mount.stdout }} not in fstab - mount may not survive reboot"
      when: fstab_check.rc != 0

    - name: Get block device for mount
      ansible.builtin.command: findmnt -rno SOURCE "{{ storage_mount.stdout }}"
      register: block_device
      changed_when: false

    # Strips a trailing partition number to get the parent device name;
    # errors are deliberately ignored (|| true) for devices without a
    # writable scheduler knob.
    - name: Set I/O scheduler to none
      ansible.builtin.shell: |
        DEV=$(basename {{ block_device.stdout }} | sed 's/[0-9]*$//')
        echo none > /sys/block/${DEV}/queue/scheduler 2>/dev/null || true
      changed_when: false

    - name: Make I/O scheduler persistent via udev rule
      ansible.builtin.copy:
        dest: /etc/udev/rules.d/60-clickhouse-scheduler.rules
        content: |
          # Set I/O scheduler to none for DO block storage (SSD)
          ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="0", ATTR{queue/scheduler}="none"
          ACTION=="add|change", KERNEL=="vd[a-z]", ATTR{queue/rotational}=="0", ATTR{queue/scheduler}="none"
        mode: '0644'

    - name: Make THP disable persistent
      ansible.builtin.copy:
        dest: /etc/systemd/system/disable-thp.service
        content: |
          [Unit]
          Description=Disable Transparent Huge Pages
          DefaultDependencies=no
          After=sysinit.target local-fs.target
          Before=basic.target

          [Service]
          Type=oneshot
          ExecStart=/bin/sh -c 'echo never > /sys/kernel/mm/transparent_hugepage/enabled'
          ExecStart=/bin/sh -c 'echo never > /sys/kernel/mm/transparent_hugepage/defrag'

          [Install]
          WantedBy=basic.target
        mode: '0644'
      register: thp_service

    - name: Enable THP disable service
      ansible.builtin.systemd:
        name: disable-thp
        enabled: true
        daemon_reload: "{{ thp_service.changed }}"

    - name: Create /data directory
      ansible.builtin.file:
        path: /data
        state: directory
        mode: '0755'

    # /data/clickhouse resolves to the attached block storage mount; the
    # local-path provisioner (Play 2) is pointed at this path.
    - name: Create storage symlink
      ansible.builtin.file:
        src: "{{ storage_mount.stdout }}"
        dest: /data/clickhouse
        state: link
        force: true
# Play 2: Configure local-path provisioner for ClickHouse storage (runs on first server only)
- name: ClickHouse - Storage provisioner
  hosts: '{{ variable_host }}[0]'
  gather_facts: false
  become: true

  tasks:
    # Points the default node path of the local-path provisioner at
    # /data/clickhouse (symlinked to block storage in Play 1).
    - name: Update local-path provisioner config
      ansible.builtin.shell: |
        k3s kubectl patch configmap local-path-config -n kube-system --type merge \
          -p '{"data":{"config.json":"{\"nodePathMap\":[{\"node\":\"DEFAULT_PATH_FOR_NON_LISTED_NODES\",\"paths\":[\"/data/clickhouse\"]}]}"}}'
      register: patch_result
      # NOTE(review): some kubectl versions print "patched (no change)" when
      # nothing changed, which also matches this substring — confirm against
      # the kubectl bundled with this k3s version.
      changed_when: "'patched' in patch_result.stdout"

    # Only bounce the provisioner when the config actually changed, so
    # repeated playbook runs are a no-op.
    - name: Restart local-path provisioner
      ansible.builtin.command: k3s kubectl rollout restart deployment local-path-provisioner -n kube-system
      when: patch_result is changed

    - name: Wait for rollout
      ansible.builtin.command: k3s kubectl rollout status deployment local-path-provisioner -n kube-system --timeout=60s
      changed_when: false

    - name: Verify config
      ansible.builtin.command: k3s kubectl get configmap local-path-config -n kube-system -o jsonpath='{.data.config\.json}'
      register: config
      changed_when: false

    - name: Display
      ansible.builtin.debug:
        msg: "local-path provisioner configured: {{ config.stdout }}"

ansible/play-k3s--cluster.yml

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
---
# Deploy k3s HA cluster with Traefik + Gateway API
#
# Prerequisites (manual):
# - 3x Ubuntu VMs on DigitalOcean with VPC attached (eth1)
# - Tailscale installed and connected on all nodes
# - DO Load Balancer: HTTP:80->30080, HTTPS:443->30443 (TLS passthrough)
# - DO Firewall configured
#
# Usage:
#   ansible-playbook -i inventory/digitalocean.yml play-k3s--cluster.yml \
#     -e variable_host=<group>
#
# Examples:
#   ansible-playbook -i inventory/digitalocean.yml play-k3s--cluster.yml \
#     -e variable_host=tools_k3s


# Play 1: Validate and Prepare
- name: K3s - Validate Prerequisites
  hosts: '{{ variable_host }}'
  gather_facts: true
  become: true

  tasks:
    - name: Validate VPC interface exists (eth1)
      ansible.builtin.assert:
        that:
          - ansible_eth1 is defined
          - ansible_eth1.ipv4 is defined
          - ansible_eth1.ipv4.address is defined
        fail_msg: "VPC interface eth1 not found. Ensure VM is attached to DO VPC."

    - name: Validate Tailscale is connected
      ansible.builtin.assert:
        that:
          - ansible_tailscale0 is defined
          - ansible_tailscale0.ipv4 is defined
          - ansible_tailscale0.ipv4.address is defined
        fail_msg: "Tailscale interface not found. Ensure Tailscale is installed and connected."

    - name: Validate VPC IP is in expected range
      ansible.builtin.assert:
        that:
          - ansible_eth1.ipv4.address | regex_search('^10\\.')
        fail_msg: "VPC IP {{ ansible_eth1.ipv4.address }} not in 10.x.x.x range."

    # Cache both addresses for later plays: cluster traffic uses the VPC,
    # the Tailscale address is added as a TLS SAN in Play 3.
    - name: Set network facts
      ansible.builtin.set_fact:
        vpc_ip: "{{ ansible_eth1.ipv4.address }}"
        tailscale_ip: "{{ ansible_tailscale0.ipv4.address }}"

    - name: Display network configuration
      ansible.builtin.debug:
        msg: "{{ inventory_hostname }}: VPC={{ vpc_ip }}, Tailscale={{ tailscale_ip }}"

    # Dynamic groups consumed by the k3s.orchestration plays/roles below.
    - name: Build k3s_cluster group
      ansible.builtin.group_by:
        key: k3s_cluster

    - name: Build server group
      ansible.builtin.group_by:
        key: server
# Play 2: System Prerequisites
# Runs the k3s.orchestration collection's prereq role on every node that
# Play 1 gathered into the dynamic k3s_cluster group.
- name: K3s - System Prerequisites
  hosts: k3s_cluster
  become: true
  gather_facts: true
  roles:
    - role: k3s.orchestration.prereq
# Play 3: Deploy k3s Servers
- name: K3s - Deploy Cluster
  hosts: server
  gather_facts: true
  become: true
  vars:
    k3s_version: v1.32.11+k3s1
    # First server's VPC address is the shared API endpoint for the cluster.
    api_endpoint: "{{ hostvars[groups['server'][0]]['vpc_ip'] }}"
    # vpc_ip / tailscale_ip are per-host facts set in Play 1; cluster traffic
    # is pinned to the VPC interface, and the Tailscale address is added as a
    # TLS SAN so the API cert is valid when reached over Tailscale.
    extra_server_args: >-
      --node-ip={{ vpc_ip }}
      --advertise-address={{ vpc_ip }}
      --flannel-iface=eth1
      --tls-san={{ vpc_ip }}
      --tls-san={{ tailscale_ip }}
    server_group: server
  roles:
    - role: k3s.orchestration.k3s_server
# Play 4: Configure Ingress
- name: K3s - Configure Traefik and Gateway API
  hosts: server[0]
  gather_facts: false
  become: true

  tasks:
    # Manifests dropped into /var/lib/rancher/k3s/server/manifests are
    # auto-applied by k3s.
    - name: Apply Traefik HelmChartConfig
      ansible.builtin.copy:
        src: "{{ playbook_dir }}/../k3s/shared/traefik-config.yaml"
        dest: /var/lib/rancher/k3s/server/manifests/traefik-config.yaml
        mode: '0600'

    - name: Install Gateway API CRDs
      ansible.builtin.command: >
        k3s kubectl apply -f
        https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.4.0/standard-install.yaml
      register: gateway_result
      # kubectl apply reports per-resource "created"/"configured"/"unchanged".
      changed_when: "'created' in gateway_result.stdout or 'configured' in gateway_result.stdout"

    - name: Wait for all nodes ready
      ansible.builtin.command: k3s kubectl wait --for=condition=Ready nodes --all --timeout=300s
      changed_when: false

    - name: Display cluster status
      ansible.builtin.command: k3s kubectl get nodes -o wide
      register: cluster_status
      changed_when: false

    - name: Cluster ready
      ansible.builtin.debug:
        msg: "{{ cluster_status.stdout_lines }}"

ansible/play-k3s--longhorn.yml

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
---
# Install Longhorn distributed storage on k3s cluster
#
# For clusters running apps without built-in replication (Appsmith, Outline, etc.)
# Provides: replicated volumes, snapshots, backups
#
# Usage:
#   ansible-playbook -i inventory/digitalocean.yml play-k3s--longhorn.yml \
#     -e variable_host=<group>
#
# Example:
#   ansible-playbook -i inventory/digitalocean.yml play-k3s--longhorn.yml \
#     -e variable_host=k3s_tools
# NOTE(review): the cluster playbook's example uses the group name
# "tools_k3s" — confirm which inventory group name is correct.


# Play 1: Install Longhorn prerequisites on all nodes
- name: Longhorn - Prerequisites
  hosts: '{{ variable_host }}'
  gather_facts: true
  become: true

  tasks:
    # Host packages required by Longhorn (iSCSI initiator + NFS client).
    - name: Install required packages
      ansible.builtin.apt:
        name:
          - open-iscsi
          - nfs-common
        state: present
        update_cache: true

    - name: Enable and start iscsid
      ansible.builtin.systemd:
        name: iscsid
        enabled: true
        state: started
# Play 2: Deploy Longhorn via k3s HelmChart (runs on first server only)
- name: Longhorn - Deploy
  hosts: '{{ variable_host }}[0]'
  gather_facts: false
  become: true

  tasks:
    # A HelmChart CR in the manifests dir is installed by k3s's embedded
    # helm controller; no helm binary needed on the host.
    - name: Deploy Longhorn HelmChart
      ansible.builtin.copy:
        dest: /var/lib/rancher/k3s/server/manifests/longhorn.yaml
        content: |
          apiVersion: helm.cattle.io/v1
          kind: HelmChart
          metadata:
            name: longhorn
            namespace: default
          spec:
            chart: longhorn
            repo: https://charts.longhorn.io
            version: 1.10.1
            targetNamespace: longhorn-system
            createNamespace: true
            failurePolicy: abort
            valuesContent: |-
              persistence:
                defaultClassReplicaCount: 2
              defaultSettings:
                defaultReplicaCount: 2
                storageMinimalAvailablePercentage: 15
                storageOverProvisioningPercentage: 100
        mode: '0600'

    # Poll until the helm controller has created the namespace
    # (up to 30 x 10s = 5 minutes).
    - name: Wait for Longhorn namespace
      ansible.builtin.command: k3s kubectl get namespace longhorn-system
      register: ns_check
      until: ns_check.rc == 0
      retries: 30
      delay: 10
      changed_when: false

    - name: Wait for Longhorn manager
      ansible.builtin.command: >
        k3s kubectl -n longhorn-system wait --for=condition=Available
        deployment/longhorn-driver-deployer --timeout=300s
      changed_when: false

    - name: Verify StorageClass
      ansible.builtin.command: k3s kubectl get storageclass longhorn
      register: sc_check
      until: sc_check.rc == 0
      retries: 12
      delay: 10
      changed_when: false

    - name: Get Longhorn status
      ansible.builtin.command: k3s kubectl -n longhorn-system get pods
      register: status
      changed_when: false

    - name: Display
      ansible.builtin.debug:
        msg: "{{ status.stdout_lines }}"

0 commit comments

Comments (0)