diff --git a/.gitignore b/.gitignore index aed3842..1a29009 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ /tf/.terraform.lock.hcl /ansible/testnet/* /ansible/hosts +# These files are auto-generated during deployment +/ansible/secrets.yaml +/ansible/roles/telegraf/vars/secret.yaml diff --git a/Makefile b/Makefile index 102f338..48fee50 100644 --- a/Makefile +++ b/Makefile @@ -1,50 +1,34 @@ DO_INSTANCE_TAGNAME=v035-testnet LOAD_RUNNER_COMMIT_HASH ?= 51685158fe36869ab600527b852437ca0939d0cc LOAD_RUNNER_CMD=go run github.com/tendermint/tendermint/test/e2e/runner@$(LOAD_RUNNER_COMMIT_HASH) +E2E_RUNNER_VERSION=v0.35.5 export DO_INSTANCE_TAGNAME +export LOAD_RUNNER_CMD +export E2E_RUNNER_VERSION -.PHONY: terraform-init -terraform-init: +.PHONY: init +init: $(MAKE) -C ./tf/ init -.PHONY: terraform-apply -terraform-apply: +.PHONY: deploy +deploy: $(MAKE) -C ./tf/ apply + ./script/configgen.sh ./ansible/hosts + ./script/secretsgen.sh ./ansible/secrets.yaml + ANSIBLE_HOST_KEY_CHECKING=False \ + ansible-playbook -i ./ansible/hosts -u root ./ansible/deploy.yaml -f 10 -.PHONY: hosts -hosts: - echo "[validators]" > ./ansible/hosts - doctl compute droplet list --tag-name $(DO_INSTANCE_TAGNAME) --tag-name "testnet-node" | tail -n+2 | tr -s ' ' | cut -d' ' -f2,3 | sort -k1 | sed 's/\(.*\) \(.*\)/\2 name=\1/g' >> ./ansible/hosts - echo "[prometheus]" >> ./ansible/hosts - doctl compute droplet list --tag-name $(DO_INSTANCE_TAGNAME) --tag-name "testnet-observability" | tail -n+2 | tr -s ' ' | cut -d' ' -f3 >> ./ansible/hosts - -.PHONY: configgen -configgen: - ./script/configgen.sh `tail -n+2 ./ansible/hosts | head -n -2 |cut -d' ' -f1| paste -s -d, -` - -.PHONY: ansible-install -ansible-install: - cd ansible && \ - ansible-playbook -i hosts -u root base.yaml -f 10 && \ - ansible-playbook -i hosts -u root prometheus-node-exporter.yaml -f 10 && \ - ansible-playbook -i hosts -u root init-testapp.yaml -f 10 && \ - ansible-playbook -i hosts -u root update-testapp.yaml -f 10 - 
-.PHONY: prometheus-init -prometheus-init: - cd ansible && ansible-playbook -i hosts -u root prometheus.yaml -f 10 - -.PHONY: start-network -start-network: - cd ansible && ansible-playbook -i hosts -u root start-testapp.yaml -f 10 +.PHONY: update-testapp +update-testapp: + ./script/configgen.sh ./ansible/hosts + ANSIBLE_HOST_KEY_CHECKING=False \ + ansible-playbook -i ./ansible/hosts -u root ./ansible/update-testapp.yaml .PHONY: runload runload: - $(LOAD_RUNNER_CMD) load \ - --ip-list `tail -n+2 ./ansible/hosts | head -n -2 |cut -d' ' -f1| paste -s -d, -` \ - --seed-delta $(shell echo $$RANDOM) + ./script/runload.sh ./ansible/hosts -.PHONY: terraform-destroy -terraform-destroy: +.PHONY: destroy +destroy: $(MAKE) -C ./tf/ destroy diff --git a/README.md b/README.md index e454e59..38f8b88 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ test networks on Digital Ocean (DO). - [Ansible CLI][Ansible] - Go -## Instructions +## Deployment After you have all the prerequisites installed and have configured your [`testnet.toml`](./testnet.toml) file appropriately: @@ -32,30 +32,49 @@ ssh_keys = ["ab:cd:ef:01:23:45:67:89:ab:cd:ef:01:23:45:67:89"] EOF # 4. Initialize Terraform (only needed once) -make terraform-init +make init -# 5. Create the VMs for the validators and Prometheus as specified in ./testnet.toml -# Be sure to use your actual DO token and SSH key fingerprints for the DO_TOKEN -# and DO_SSH_KEYS variables. -make terraform-apply +# 5. Create the VMs for the validators and monitoring server as specified in +# ./testnet.toml +make deploy -# 6. Discover the IP addresses of the hosts for Ansible -make hosts +# 6. Execute a load test against the network +make runload +``` -# 7. Generate the testnet configuration -make configgen +## Data visualization -# 8. Install all necessary software on the created VMs using Ansible -make ansible-install +Once you have deployed a testnet, there will be a "monitor" server available +running an [InfluxDB] instance. 
Check the generated `ansible/hosts` file for the +IP address of the monitor and navigate to `http://MONITOR_IP:8086` in your web +browser to access the InfluxDB interface. -# 9. Initialize the Prometheus instance -make prometheus-init +The username is `admin` and the password is automatically generated during +deployment. The password can be found in the `ansible/secrets.yaml` file (not +committed to the repository). -# 10. Start the test application on all of the validators -make start-network +The UI is relatively straightforward, but if you need additional help please +see the [InfluxDB docs][InfluxDB]. -# 11. Execute a load test against the network -make runload +## Reloading the test app + +In cases where you don't want to tear down the infrastructure and only want to +reload the test app running across the network (say there are new changes on the +`v0.35.x` branch in the Git repo): + +```bash +make update-testapp +``` + +This will stop the test app, remove all config and data, redeploy the config, +and restart the test app. + +## Teardown + +To destroy all Digital Ocean infrastructure: + +```bash +make destroy ``` ## Metrics @@ -68,3 +87,4 @@ metrics and view their associated graphs. 
[Ansible]: https://docs.ansible.com/ansible/latest/index.html [Terraform]: https://www.terraform.io/docs [doctl]: https://docs.digitalocean.com/reference/doctl/how-to/install/ +[InfluxDB]: https://docs.influxdata.com/influxdb/v2.2/ diff --git a/ansible/base.yaml b/ansible/base.yaml deleted file mode 100644 index 34aea94..0000000 --- a/ansible/base.yaml +++ /dev/null @@ -1,22 +0,0 @@ -- name: base - hosts: validators,prometheus - gather_facts: yes - become_method: sudo - vars: - ansible_host_key_checking: false - - tasks: - - name: Update apt cache - ansible.builtin.apt: - update_cache: yes - cache_valid_time: 60 - - name: install deps - ansible.builtin.apt: - name: - - git - - gcc - - golang-1.17-go - - prometheus - - prometheus-node-exporter - state: latest - become: yes diff --git a/ansible/config-deploy.yaml b/ansible/config-deploy.yaml deleted file mode 100644 index 519e718..0000000 --- a/ansible/config-deploy.yaml +++ /dev/null @@ -1,19 +0,0 @@ -- name: initialize app - hosts: validators - become: false - gather_facts: yes - hosts: validators - vars: - tm_home: /root/.testapp/ - ansible_host_key_checking: false - - tasks: - - name: copy configuration files - ansible.builtin.copy: - src: ./testnet/{{ hostvars[inventory_hostname].name }}/config/config.toml - dest: "{{ tm_home }}/config/config.toml" - - name: restart app - ansible.builtin.systemd: - name: testappd - state: restarted - become: yes diff --git a/ansible/deploy.yaml b/ansible/deploy.yaml new file mode 100644 index 0000000..61c0922 --- /dev/null +++ b/ansible/deploy.yaml @@ -0,0 +1,25 @@ +--- +# This playbook must be executed as root. +# +# It's also critical that the monitor is deployed first before the nodes +# because the monitor deployment generates an API token for Telegraf instances +# on the nodes to access the InfluxDB database on the monitor. 
+- hosts: monitor + become: no + vars_files: + - ./vars.yaml + - ./secrets.yaml + roles: + - common + - influxdb + +- hosts: nodes + become: no + vars_files: + - ./vars.yaml + - ./secrets.yaml + roles: + - common + - telegraf + - tendermint + - testapp diff --git a/ansible/init-testapp.yaml b/ansible/init-testapp.yaml deleted file mode 100644 index 30c0384..0000000 --- a/ansible/init-testapp.yaml +++ /dev/null @@ -1,14 +0,0 @@ -- name: initialize app - hosts: validators - become: false - gather_facts: yes - hosts: validators - vars: - tm_home: /root/.testapp/ - ansible_host_key_checking: false - - tasks: - - name: copy configuration files - ansible.builtin.copy: - src: ./testnet/{{ hostvars[inventory_hostname].name }}/ - dest: "{{ tm_home }}/" diff --git a/ansible/prometheus-node-exporter.yaml b/ansible/prometheus-node-exporter.yaml deleted file mode 100644 index 18845a0..0000000 --- a/ansible/prometheus-node-exporter.yaml +++ /dev/null @@ -1,19 +0,0 @@ -- name: prometheus node exporter - hosts: validators,prometheus - gather_facts: yes - become_method: sudo - vars: - ansible_host_key_checking: false - - tasks: - - name: add node-exporter systemd unit file - ansible.builtin.copy: - src: templates/prometheus-node-exporter.service - dest: /etc/prometheus/prometheus-node-exporter.service - become: yes - - name: start the systemd unit - ansible.builtin.systemd: - name: prometheus-node-exporter - state: started - daemon_reload: yes - enabled: yes diff --git a/ansible/prometheus.yaml b/ansible/prometheus.yaml deleted file mode 100644 index 8fcd488..0000000 --- a/ansible/prometheus.yaml +++ /dev/null @@ -1,29 +0,0 @@ -- name: create prometheus - become: false - gather_facts: yes - hosts: prometheus - vars: - ansible_host_key_checking: false - tasks: - - name: create unit file - template: - src: templates/prometheus.service.j2 - dest: /lib/systemd/system/prometheus.service - become: yes - - name: create config - template: - src: templates/prometheus.yml.j2 - dest: 
/etc/prometheus/prometheus.yml - become: yes - - name: start the systemd-unit - ansible.builtin.systemd: - name: prometheus - state: started - daemon_reload: yes - enabled: yes - - name: restart the systemd-unit # not sure why this is necessary - ansible.builtin.systemd: - name: prometheus - state: restarted - daemon_reload: yes - enabled: yes diff --git a/ansible/remove-testapp-data.yaml b/ansible/remove-testapp-data.yaml deleted file mode 100644 index b5ef941..0000000 --- a/ansible/remove-testapp-data.yaml +++ /dev/null @@ -1,23 +0,0 @@ -- name: remove testapp data - hosts: validators - become_method: sudo - gather_facts: yes - vars: - tm_home: /root/.testapp/ - - tasks: - - name: stop app - ansible.builtin.systemd: - name: testappd - state: stopped - become: yes - - name: delete tm data - ansible.builtin.file: - path: "{{ tm_home }}" - state: absent - become: yes - - name: delete app data - ansible.builtin.file: - path: "{{ ansible_user_dir }}/data" - state: absent - become: yes diff --git a/ansible/restart-testapp.yaml b/ansible/restart-testapp.yaml deleted file mode 100644 index 1dc8c44..0000000 --- a/ansible/restart-testapp.yaml +++ /dev/null @@ -1,11 +0,0 @@ -- name: update testapp - hosts: validators - become_method: sudo - gather_facts: yes - - tasks: - - name: restart app - ansible.builtin.systemd: - name: testappd - state: restarted - become: yes diff --git a/ansible/roles/common/files/iptables-rules.v4 b/ansible/roles/common/files/iptables-rules.v4 new file mode 100644 index 0000000..4d55cb0 --- /dev/null +++ b/ansible/roles/common/files/iptables-rules.v4 @@ -0,0 +1,17 @@ +# Allow SSH on port 22 and related traffic. Rate-limit SSH login attempts. +# Log and drop failed SSH logins. 
+*filter +:INPUT ACCEPT [0:0] +:FORWARD ACCEPT [0:0] +:OUTPUT ACCEPT [368:94560] +:LOGDROP - [0:0] +-A INPUT -i lo -j ACCEPT +-A INPUT -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT +-A INPUT -p tcp -m tcp --dport 22 -m conntrack --ctstate NEW -m recent --update --seconds 60 --hitcount 11 --name DEFAULT --mask 255.255.255.255 --rsource -j LOGDROP +-A INPUT -p tcp -m tcp --dport 22 -m conntrack --ctstate NEW -m recent --set --name DEFAULT --mask 255.255.255.255 --rsource +-A INPUT -p tcp -m tcp --dport 22 -j ACCEPT +-A INPUT -m limit --limit 5/min -j LOG --log-prefix "iptables denied: " --log-level 7 +-A INPUT -j DROP +-A LOGDROP -j LOG --log-prefix "iptables denied ssh: " --log-level 7 +-A LOGDROP -j DROP +COMMIT diff --git a/ansible/roles/common/files/iptables-rules.v6 b/ansible/roles/common/files/iptables-rules.v6 new file mode 100644 index 0000000..cebb06c --- /dev/null +++ b/ansible/roles/common/files/iptables-rules.v6 @@ -0,0 +1,17 @@ +# Allow SSH on port 22 and related traffic. Rate-limit SSH login attempts. +# Log and drop failed SSH logins. 
+*filter +:INPUT ACCEPT [0:0] +:FORWARD ACCEPT [0:0] +:OUTPUT ACCEPT [0:0] +:LOGDROP - [0:0] +-A INPUT -i lo -j ACCEPT +-A INPUT -m state --state RELATED,ESTABLISHED -j ACCEPT +-A INPUT -p tcp -m tcp --dport 22 -m conntrack --ctstate NEW -m recent --update --seconds 60 --hitcount 11 --name DEFAULT --mask ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff --rsource -j LOGDROP +-A INPUT -p tcp -m tcp --dport 22 -m conntrack --ctstate NEW -m recent --set --name DEFAULT --mask ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff --rsource +-A INPUT -p tcp -m tcp --dport 22 -j ACCEPT +-A INPUT -m limit --limit 5/min -j LOG --log-prefix "ip6tables denied: " --log-level 7 +-A INPUT -j DROP +-A LOGDROP -j LOG --log-prefix "ip6tables denied ssh: " --log-level 7 +-A LOGDROP -j DROP +COMMIT diff --git a/ansible/roles/common/tasks/main.yaml b/ansible/roles/common/tasks/main.yaml new file mode 100644 index 0000000..4663c34 --- /dev/null +++ b/ansible/roles/common/tasks/main.yaml @@ -0,0 +1,27 @@ +- name: install common dependencies + ansible.builtin.apt: + name: + - iptables + - iptables-persistent + state: latest + update_cache: yes + cache_valid_time: 60 + +- name: ensure persistent iptables dir exists + ansible.builtin.file: + path: /etc/iptables + state: directory + +- name: copy base iptables rules + ansible.builtin.copy: + src: "iptables-{{ item }}" + dest: "/etc/iptables/{{ item }}" + loop: + - rules.v4 + - rules.v6 + +- name: apply base ipv4 iptables rules + ansible.builtin.shell: "iptables-restore /etc/iptables/rules.v4" + +- name: apply base ipv6 iptables rules + ansible.builtin.shell: "ip6tables-restore /etc/iptables/rules.v6" diff --git a/ansible/roles/influxdb/files/config.toml b/ansible/roles/influxdb/files/config.toml new file mode 100644 index 0000000..2fdcca6 --- /dev/null +++ b/ansible/roles/influxdb/files/config.toml @@ -0,0 +1,4 @@ +bolt-path = "/var/lib/influxdb/influxd.bolt" +engine-path = "/var/lib/influxdb/engine" +reporting-disabled = true +http-bind-address = ":8086" diff 
--git a/ansible/roles/influxdb/tasks/main.yaml b/ansible/roles/influxdb/tasks/main.yaml new file mode 100644 index 0000000..51065d9 --- /dev/null +++ b/ansible/roles/influxdb/tasks/main.yaml @@ -0,0 +1,89 @@ +- name: ensure jq and curl are present + ansible.builtin.apt: + name: + - jq + - curl + state: latest + update_cache: yes + cache_valid_time: 60 + +- name: enable WAN access to InfluxDB + ansible.builtin.iptables: + chain: INPUT + protocol: tcp + destination_port: "{{ influxdb.port }}" + jump: ACCEPT + action: insert + rule_num: 6 + +- name: fetch InfluxDB .deb package + ansible.builtin.get_url: + url: "https://dl.influxdata.com/influxdb/releases/influxdb2-{{ influxdb.version }}-amd64.deb" + checksum: "sha256:{{ influxdb.checksum }}" + dest: /tmp/influxdb.deb + +- name: fetch InfluxDB cli .deb package + ansible.builtin.get_url: + url: "https://dl.influxdata.com/influxdb/releases/influxdb2-client-{{ influxdb_cli.version }}-amd64.deb" + checksum: "sha256:{{ influxdb_cli.checksum }}" + dest: /tmp/influxdb-cli.deb + +- name: install InfluxDB + ansible.builtin.apt: + deb: /tmp/influxdb.deb + +- name: copy InfluxDB config + ansible.builtin.copy: + src: config.toml + dest: /etc/influxdb/config.toml + owner: root + group: root + +- name: install InfluxDB CLI + ansible.builtin.apt: + deb: /tmp/influxdb-cli.deb + +- name: enable and start InfluxDB + ansible.builtin.systemd: + name: influxdb + daemon_reload: true + enabled: true + state: restarted + +- name: perform InfluxDB initial setup + ansible.builtin.shell: | + influx setup \ + -u admin \ + -p "{{ generated_secrets.influxdb_admin_password }}" \ + -o tendermint \ + -b tendermint \ + -r 0 \ + -f + args: + creates: /root/.influxdbv2/configs + +# TODO(thane): Restrict telegraf user access when we know which specific +# permissions it needs. 
+- name: create Telegraf user and login for InfluxDB + ansible.builtin.shell: | + influx user create \ + -n telegraf \ + -p "{{ generated_secrets.influxdb_telegraf_password }}" \ + -o tendermint + influx auth create \ + -u telegraf \ + -o tendermint \ + --all-access + export TELEGRAF_TOKEN=$(influx auth ls --json | jq -r '.[] | select(.userName == "telegraf") | .token') + echo "influxdb_token: ${TELEGRAF_TOKEN}" > /root/telegraf-secret.yaml + args: + creates: /root/telegraf-secret.yaml + +- name: fetch InfluxDB telegraf token to distribute to nodes + ansible.builtin.fetch: + src: /root/telegraf-secret.yaml + dest: "{{ playbook_dir }}/roles/telegraf/vars/secret.yaml" + flat: true + +# TODO(thane): Enable TLS support for longer-running testnets +# See https://docs.influxdata.com/influxdb/v2.2/security/enable-tls/ diff --git a/ansible/roles/telegraf/tasks/main.yaml b/ansible/roles/telegraf/tasks/main.yaml new file mode 100644 index 0000000..188a07e --- /dev/null +++ b/ansible/roles/telegraf/tasks/main.yaml @@ -0,0 +1,26 @@ +- name: load Telegraf token + include_vars: + file: ../vars/secret.yaml + name: telegraf_secret + +- name: fetch Telegraf .deb package + ansible.builtin.get_url: + url: "https://dl.influxdata.com/telegraf/releases/telegraf_{{ telegraf.version }}-1_amd64.deb" + checksum: "sha256:{{ telegraf.checksum }}" + dest: /tmp/telegraf.deb + +- name: install Telegraf + ansible.builtin.apt: + deb: /tmp/telegraf.deb + +- name: deploy Telegraf config + ansible.builtin.template: + src: telegraf.conf.j2 + dest: /etc/telegraf/telegraf.conf + +- name: enable and start Telegraf + ansible.builtin.service: + name: telegraf + enabled: true + state: restarted + diff --git a/ansible/roles/telegraf/templates/telegraf.conf.j2 b/ansible/roles/telegraf/templates/telegraf.conf.j2 new file mode 100644 index 0000000..69ff805 --- /dev/null +++ b/ansible/roles/telegraf/templates/telegraf.conf.j2 @@ -0,0 +1,218 @@ +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. 
All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply surround +# them with ${}. For strings the variable must be within quotes (ie, "${STR_VAR}"), +# for numbers and booleans they should be plain (ie, ${INT_VAR}, ${BOOL_VAR}) + + +# Global tags can be specified here in key="value" format. +[global_tags] + # dc = "us-east-1" # will tag all metrics with dc=us-east-1 + # rack = "1a" + ## Environment variables can be used as tags, and throughout the config file + # user = "$USER" + + +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "10s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = true + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + metric_batch_size = 1000 + + ## Maximum number of unwritten metrics per output. Increasing this value + ## allows for longer periods of output downtime without dropping metrics at the + ## cost of higher maximum memory usage. + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Collection offset is used to shift the collection by the given amount. 
+ ## This can be used to avoid many plugins querying constrained devices + ## at the same time by manually scheduling them in time. + # collection_offset = "0s" + + ## Default flushing interval for all outputs. Maximum flush_interval will be + ## flush_interval + flush_jitter + flush_interval = "10s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "0s" + + ## Collected metrics are rounded to the precision specified. Precision is + ## specified as an interval with an integer + unit (e.g. 0s, 10ms, 2us, 4s). + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + ## + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s: + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + precision = "0s" + + ## Override default hostname, if empty use os.Hostname() + hostname = "" + ## If set to true, do not set the "host" tag in the telegraf agent. + omit_hostname = false + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +# Configuration for sending metrics to InfluxDB 2.0 +[[outputs.influxdb_v2]] + ## The URLs of the InfluxDB cluster nodes. + ## + ## Multiple URLs can be specified for a single cluster, only ONE of the + ## urls will be written to each interval. + ## ex: urls = ["https://us-west-2-1.aws.cloud2.influxdata.com"] + urls = ["http://{{ hostvars['monitor']['ansible_host'] }}:8086"] + + ## Token for authentication. 
+ token = "{{ telegraf_secret.influxdb_token }}" + + ## Organization is the name of the organization you wish to write to. + organization = "tendermint" + + ## Destination bucket to write into. + bucket = "tendermint" + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + + +# Read metrics about cpu usage +[[inputs.cpu]] + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## If true, collect raw CPU time metrics + collect_cpu_time = false + ## If true, compute and report the sum of all non-idle CPU states + report_active = false + ## If true and the info is available then add core_id and physical_id tags + core_tags = false + + +# Read metrics about disk usage by mount point +[[inputs.disk]] + ## By default stats will be gathered for all mount points. + ## Setting mount_points will restrict the stats to only the specified mount points. + # mount_points = ["/"] + + ## Ignore mount points by filesystem type. + ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] + + ## Ignore mount points by mount options. + ## The 'mount' command reports options of all mounts in parentheses. + ## Bind mounts can be ignored with the special 'bind' option. + # ignore_mount_opts = [] + + +# Read metrics about disk IO by device +[[inputs.diskio]] + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + # devices = ["sda", "sdb", "vd*"] + ## Uncomment the following line if you need disk serial numbers. + # skip_serial_number = false + # + ## On systems which support it, device metadata can be added in the form of + ## tags. + ## Currently only Linux is supported via udev properties. 
You can view + ## available properties for a device by running: + ## 'udevadm info -q property -n /dev/sda' + ## Note: Most, but not all, udev properties can be accessed this way. Properties + ## that are currently inaccessible include DEVTYPE, DEVNAME, and DEVPATH. + # device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"] + # + ## Using the same metadata source as device_tags, you can also customize the + ## name of the device via templates. + ## The 'name_templates' parameter is a list of templates to try and apply to + ## the device. The template may contain variables in the form of '$PROPERTY' or + ## '${PROPERTY}'. The first template which does not contain any variables not + ## present for the device is used as the device name tag. + ## The typical use case is for LVM volumes, to get the VG/LV name instead of + ## the near-meaningless DM-0 name. + # name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"] + + +# Get kernel statistics from /proc/stat +[[inputs.kernel]] + # no configuration + + +# Read metrics about memory usage +[[inputs.mem]] + # no configuration + + +# Get the number of processes and group them by status +[[inputs.processes]] + # no configuration + + +# Read metrics about swap memory usage +[[inputs.swap]] + # no configuration + + +# Read metrics about system load & uptime +[[inputs.system]] + # no configuration + + +# Read metrics from one or many prometheus clients +[[inputs.prometheus]] + urls = ["http://localhost:{{ tendermint.prometheus_port }}/metrics"] + +# Parse the new lines appended to a file +[[inputs.tail]] + ## File names or a pattern to tail. + ## These accept standard unix glob matching rules, but with the addition of + ## ** as a "super asterisk". 
ie: + ## "/var/log/**.log" -> recursively find all .log files in /var/log + ## "/var/log/*/*.log" -> find all .log files with a parent dir in /var/log + ## "/var/log/apache.log" -> just tail the apache log file + ## "/var/log/log[!1-2]* -> tail files without 1-2 + ## "/var/log/log[^1-2]* -> identical behavior as above + ## See https://github.com/gobwas/glob for more examples + ## + files = ["{{ tendermint.log_file }}"] + + ## Read file from beginning. + from_beginning = false + + ## Data format to consume. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "json" + json_strict = false + diff --git a/ansible/roles/telegraf/vars/.gitkeep b/ansible/roles/telegraf/vars/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ansible/roles/tendermint/tasks/main.yaml b/ansible/roles/tendermint/tasks/main.yaml new file mode 100644 index 0000000..39b4c1e --- /dev/null +++ b/ansible/roles/tendermint/tasks/main.yaml @@ -0,0 +1,34 @@ +- name: install Tendermint prerequisites + ansible.builtin.apt: + name: + - git + - gcc + - golang-1.18-go + state: latest + update_cache: yes + cache_valid_time: 60 + +- name: create tendermint group + ansible.builtin.group: + name: tendermint + state: present + +- name: create tendermint user + ansible.builtin.user: + name: tendermint + group: tendermint + home: "{{ tendermint.home_dir }}" + create_home: true + shell: /usr/bin/bash + state: present + +- name: allow TCP port access to SSH, P2P and RPC + ansible.builtin.iptables: + chain: INPUT + protocol: tcp + destination_ports: + - "26656" + - "26657" + action: insert + rule_num: 6 + jump: ACCEPT diff --git a/ansible/roles/testapp/tasks/main.yaml b/ansible/roles/testapp/tasks/main.yaml new file mode 100644 index 0000000..875e168 --- /dev/null +++ b/ansible/roles/testapp/tasks/main.yaml @@ -0,0 +1,67 @@ +- name: deploy testapp execution 
scripts + ansible.builtin.template: + src: "{{ item }}.j2" + dest: "{{ tendermint.home_dir }}/{{ item }}" + owner: tendermint + group: tendermint + mode: 0775 + loop: + - run-testapp.sh + - stop-testapp.sh + +- name: clone Tendermint git repo + become_user: tendermint + ansible.builtin.git: + repo: https://github.com/tendermint/tendermint + dest: "{{ tendermint.home_dir }}/src" + version: "{{ tendermint.version }}" + clone: yes + update: yes + +- name: build testapp + become_user: tendermint + ansible.builtin.shell: + cmd: /usr/lib/go-1.18/bin/go build -o /usr/local/bin/tendermint-testapp + chdir: "{{ tendermint.home_dir }}/src/test/e2e/node" + +- name: touch Tendermint log file + ansible.builtin.file: + path: "{{ tendermint.log_file }}" + state: touch + owner: tendermint + group: tendermint + +- name: stop testapp + become_user: tendermint + ansible.builtin.shell: + cmd: ./stop-testapp.sh + chdir: "{{ tendermint.home_dir }}" + +- name: wipe testapp config and data + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - "{{ tendermint.home_dir }}/config" + - "{{ tendermint.home_dir }}/data" + +- name: copy testapp config + ansible.builtin.copy: + src: "{{ playbook_dir }}/testnet/{{ inventory_hostname }}/" + dest: "{{ tendermint.home_dir }}/" + owner: tendermint + group: tendermint + +# We run the testapp as a background process instead of as a service +# specifically so we can control how we kill the process in testnet +# perturbations. +# +# This also enables easier log collection with Telegraf, as it can just tail +# the JSON entries in the log file and ship them off to the monitor. 
+- name: start testapp + become_user: tendermint + ansible.builtin.shell: | + nohup ./run-testapp.sh /dev/null + args: + chdir: "{{ tendermint.home_dir }}" + diff --git a/ansible/roles/testapp/templates/run-testapp.sh.j2 b/ansible/roles/testapp/templates/run-testapp.sh.j2 new file mode 100644 index 0000000..9c45804 --- /dev/null +++ b/ansible/roles/testapp/templates/run-testapp.sh.j2 @@ -0,0 +1,7 @@ +#!/bin/bash +set -euo pipefail + +export TMHOME="{{ tendermint.home_dir }}" +tendermint-testapp {{ tendermint.home_dir }}/config/app.toml > {{ tendermint.log_file }} 2>&1 & +echo "$!" > {{ tendermint.pid_file }} + diff --git a/ansible/roles/testapp/templates/stop-testapp.sh.j2 b/ansible/roles/testapp/templates/stop-testapp.sh.j2 new file mode 100644 index 0000000..94253eb --- /dev/null +++ b/ansible/roles/testapp/templates/stop-testapp.sh.j2 @@ -0,0 +1,7 @@ +#!/bin/bash +set -euo pipefail + +if [ -f "{{ tendermint.pid_file }}" ]; then + kill `cat {{ tendermint.pid_file }}` || true + rm -rf {{ tendermint.pid_file }} +fi diff --git a/ansible/start-testapp.yaml b/ansible/start-testapp.yaml deleted file mode 100644 index 637b932..0000000 --- a/ansible/start-testapp.yaml +++ /dev/null @@ -1,10 +0,0 @@ -- name: start testapp - hosts: validators - gather_facts: yes - - tasks: - - name: start the systemd-unit - ansible.builtin.systemd: - name: testappd - state: started - enabled: yes diff --git a/ansible/stop-testapp.yaml b/ansible/stop-testapp.yaml deleted file mode 100644 index 33d1a69..0000000 --- a/ansible/stop-testapp.yaml +++ /dev/null @@ -1,10 +0,0 @@ -- name: start testapp - hosts: validators - gather_facts: yes - - tasks: - - name: start the systemd-unit - ansible.builtin.systemd: - name: testappd - state: stopped - enabled: yes diff --git a/ansible/templates/prometheus-node-exporter.service b/ansible/templates/prometheus-node-exporter.service deleted file mode 100644 index e8256fe..0000000 --- a/ansible/templates/prometheus-node-exporter.service +++ /dev/null @@ 
-1,13 +0,0 @@ -[Unit] -Description=Node Exporter -Wants=network-online.target -After=network-online.target - -[Service] -User=prometheus -Group=prometheus -Type=simple -ExecStart=/usr/bin/prometheus-node-exporter - -[Install] -WantedBy=multi-user.target diff --git a/ansible/templates/prometheus.service.j2 b/ansible/templates/prometheus.service.j2 deleted file mode 100644 index 2b2d59b..0000000 --- a/ansible/templates/prometheus.service.j2 +++ /dev/null @@ -1,17 +0,0 @@ -[Unit] -Description=Prometheus -Wants=network-online.target -After=network-online.target - -[Service] -User=prometheus -Group=prometheus -Type=simple -ExecStart=/usr/bin/prometheus \ - --config.file /etc/prometheus/prometheus.yml \ - --storage.tsdb.path /var/lib/prometheus/ \ - --web.console.templates=/etc/prometheus/consoles \ - --web.console.libraries=/etc/prometheus/console_libraries - -[Install] -WantedBy=multi-user.target diff --git a/ansible/templates/prometheus.yml.j2 b/ansible/templates/prometheus.yml.j2 deleted file mode 100644 index 1dee48d..0000000 --- a/ansible/templates/prometheus.yml.j2 +++ /dev/null @@ -1,21 +0,0 @@ -global: - scrape_interval: 10s # By default, scrape targets every 10 seconds. - evaluation_interval: 15s # Evaluate rules every 15 seconds. 
- -scrape_configs: -{% for host in groups['validators'] %} - - job_name: {{ hostvars[host].name }} - - scrape_interval: 5s - - static_configs: - - targets: ['{{ hostvars[host].inventory_hostname }}:26660'] - - - job_name: {{ hostvars[host].name }}-node-exporter - - scrape_interval: 5s - - static_configs: - - targets: ['{{ hostvars[host].inventory_hostname }}:9100'] - -{% endfor %} diff --git a/ansible/templates/testappd.service.j2 b/ansible/templates/testappd.service.j2 deleted file mode 100644 index 1c4a655..0000000 --- a/ansible/templates/testappd.service.j2 +++ /dev/null @@ -1,17 +0,0 @@ -# /etc/systemd/system/testappd.service -[Unit] -Description=Testapp Node -After=network.target - -[Service] -Type=simple -User={{ ansible_user_id }} -WorkingDirectory={{ ansible_user_dir }} -ExecStart={{ansible_user_dir }}/go/bin/node {{ tm_home }}config/app.toml -Environment=TMHOME={{ tm_home }} -Restart=on-failure -RestartSec=3 -LimitNOFILE=4096 - -[Install] -WantedBy=multi-user.target diff --git a/ansible/update-testapp.yaml b/ansible/update-testapp.yaml index 6f7ecc4..a5ca37b 100644 --- a/ansible/update-testapp.yaml +++ b/ansible/update-testapp.yaml @@ -1,25 +1,7 @@ -- name: update testapp - hosts: validators - become_method: sudo - gather_facts: yes - vars: - version_tag: v0.35.x - tm_home: /root/.testapp/ - - tasks: - - name: clone tendermint repo - ansible.builtin.git: - repo: https://github.com/tendermint/tendermint - dest: "{{ ansible_user_dir }}/tendermint" - version: "{{ version_tag }}" - - name: rebuild testapp - shell: "cd tendermint/test/e2e/node && /usr/lib/go-1.17/bin/go install" - - name: update unit file - template: - src: templates/testappd.service.j2 - dest: /lib/systemd/system/testappd.service - become: yes - - name: reload systemd daemon - ansible.builtin.systemd: - daemon_reload: yes - become: yes +--- +- hosts: nodes + become: no + vars_files: + - ./vars.yaml + roles: + - testapp diff --git a/ansible/vars.yaml b/ansible/vars.yaml new file mode 100644 
index 0000000..83d68e4 --- /dev/null +++ b/ansible/vars.yaml @@ -0,0 +1,21 @@ +influxdb: + version: "2.2.0" + checksum: dccc6cbf8af734407488d9b91c71b72f49c8cf4da2746e891be09b16f9b510d6 + port: "8086" +influxdb_cli: + version: "2.3.0" + checksum: d88f9dd7707a4d1e1ecf6d04d9102626a7c72262dcf1ea28e5e795c2c01615aa +telegraf: + version: "1.23.0" + checksum: f00854dfaab40ecdda05eeab841cab77aff8242601b80310d46a97887c998c39 +tendermint: + # The Git branch/tag/commit hash of the Tendermint repo from which the E2E + # test app will be built. + version: "v0.35.x" + # This must match what has already been configured in the Tendermint nodes' + # config files. Changing this value will only affect the port that Telegraf + # polls for Prometheus metrics. + prometheus_port: "26660" + home_dir: /tendermint + log_file: /var/log/tendermint.log + pid_file: /tendermint/testapp.pid diff --git a/script/configgen.sh b/script/configgen.sh index b77e34d..44dded3 100755 --- a/script/configgen.sh +++ b/script/configgen.sh @@ -1,14 +1,26 @@ #!/bin/bash set -euo pipefail -NEW_IPS=$1 +ANSIBLE_HOSTS=$1 +E2E_RUNNER_VERSION=${E2E_RUNNER_VERSION:-v0.35.5} +E2E_RUNNER_URL="github.com/tendermint/tendermint/test/e2e/runner@${E2E_RUNNER_VERSION}" -go run github.com/tendermint/tendermint/test/e2e/runner@v0.35.5 setup -f ./testnet.toml +# Extract the IP addresses of all of the nodes (excluding the monitoring +# server) from the Ansible hosts file. IP addresses will be in the same order +# as those generated in the docker-compose.yml file, and will be separated by +# commas.
+NEW_IPS=`cat ${ANSIBLE_HOSTS} | grep -v 'monitor' | grep 'ansible_host' | awk -F' ansible_host=' '{print $2}' | head -c -1 | tr '\n' ','` + +go run ${E2E_RUNNER_URL} setup -f ./testnet.toml OLD_IPS=`grep -E '(ipv4_address|container_name)' ./testnet/docker-compose.yml | sed 's/^.*ipv4_address: \(.*\)/\1/g' | sed 's/.*container_name: \(.*\)/\1/g' | paste -sd ' \n' - | sort -k1 | cut -d ' ' -f2` while read old <&3 && read new <&4; do + echo "Swapping ${old} for ${new}" find ./testnet/ -type f | xargs -I{} sed -i "s/$old/$new/g" {} -done 3< <(echo $OLD_IPS | tr ' ' '\n') 4< <(echo $NEW_IPS | tr , '\n' ) +done 3< <(echo $OLD_IPS | tr ' ' '\n') 4< <(echo $NEW_IPS | tr , '\n' ) + +# Update configuration parameters +find ./testnet/ -name 'config.toml' | xargs -I{} sed -i "s/^log-format = .*$/log-format = \"json\"/g" {} rm -rf ./ansible/testnet mv ./testnet ./ansible diff --git a/script/runload.sh b/script/runload.sh new file mode 100755 index 0000000..e3145ad --- /dev/null +++ b/script/runload.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -euo pipefail + +ANSIBLE_HOSTS=$1 +LOAD_RUNNER_CMD=${LOAD_RUNNER_CMD:-"go run github.com/tendermint/tendermint/test/e2e/runner@51685158fe36869ab600527b852437ca0939d0cc"} +IP_LIST=`cat ${ANSIBLE_HOSTS} | grep -v 'monitor' | grep 'ansible_host' | awk -F' ansible_host=' '{print $2}' | head -c -1 | tr '\n' ','` + +${LOAD_RUNNER_CMD} load --ip-list ${IP_LIST} --seed-delta 42 diff --git a/script/secretsgen.sh b/script/secretsgen.sh new file mode 100755 index 0000000..f3cbf91 --- /dev/null +++ b/script/secretsgen.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -euo pipefail + +OUTPUT_FILE="$1" + +if [ !
-f "${OUTPUT_FILE}" ]; then + cat <<EOF > "${OUTPUT_FILE}" +generated_secrets: + influxdb_admin_password: "$(tr -cd '[:alnum:]' < /dev/urandom | fold -w30 | head -n1)" + influxdb_telegraf_password: "$(tr -cd '[:alnum:]' < /dev/urandom | fold -w30 | head -n1)" +EOF + echo "Generated secrets in ${OUTPUT_FILE}" +else + echo "${OUTPUT_FILE} already exists - reusing generated secrets" +fi + diff --git a/tf/Makefile b/tf/Makefile index 81236bd..0e34431 100644 --- a/tf/Makefile +++ b/tf/Makefile @@ -19,6 +19,7 @@ apply: terraform.tfvars -var='instance_names=[$(INSTANCE_NAMES)]' && \ terraform validate && \ terraform apply \ + -auto-approve \ -var='testnet_size=$(TESTNET_SIZE)' \ -var='instance_tags=["$(DO_INSTANCE_TAGNAME)"]' \ -var='instance_names=[$(INSTANCE_NAMES)]' @@ -26,6 +27,7 @@ apply: terraform.tfvars .PHONY: destroy destroy: terraform.tfvars terraform destroy \ + -auto-approve \ -var='testnet_size=$(TESTNET_SIZE)' \ -var='instance_tags=["$(DO_INSTANCE_TAGNAME)"]' \ -var='instance_names=[$(INSTANCE_NAMES)]' diff --git a/tf/nodes.tf b/tf/droplets.tf similarity index 52% rename from tf/nodes.tf rename to tf/droplets.tf index baba334..d85c958 100644 --- a/tf/nodes.tf +++ b/tf/droplets.tf @@ -1,22 +1,4 @@ -variable "testnet_size" { - type = number - default = 20 -} - -variable "ssh_keys" { - type = list(string) -} - -variable "instance_tags" { - type = list(string) - default = ["v035-testnet"] -} - -variable "instance_names" { - type = list(string) -} - -resource "digitalocean_droplet" "testnet-node" { +resource "digitalocean_droplet" "node" { count = var.testnet_size name = var.instance_names[count.index] image = "debian-11-x64" @@ -26,8 +8,8 @@ resource "digitalocean_droplet" "testnet-node" { ssh_keys = var.ssh_keys } -resource "digitalocean_droplet" "testnet-prometheus" { - name = "testnet-prometheus" +resource "digitalocean_droplet" "monitor" { + name = "monitor" image = "debian-11-x64" region = "fra1" tags = concat(var.instance_tags, ["testnet-observability"]) diff
--git a/tf/hosts.tftpl b/tf/hosts.tftpl new file mode 100644 index 0000000..083d4ef --- /dev/null +++ b/tf/hosts.tftpl @@ -0,0 +1,6 @@ +${monitor.name} ansible_host=${monitor.ipv4_address} + +[nodes] +%{ for node in nodes ~} +${node.name} ansible_host=${node.ipv4_address} +%{ endfor ~} diff --git a/tf/outputs.tf b/tf/outputs.tf new file mode 100644 index 0000000..660de05 --- /dev/null +++ b/tf/outputs.tf @@ -0,0 +1,7 @@ +resource "local_file" "ansible_inventory" { + content = templatefile("hosts.tftpl", { + nodes = digitalocean_droplet.node.*, + monitor = digitalocean_droplet.monitor, + }) + filename = "../ansible/hosts" +} diff --git a/tf/project.tf b/tf/project.tf index 4dd4973..5124e3a 100644 --- a/tf/project.tf +++ b/tf/project.tf @@ -1,5 +1,5 @@ resource "digitalocean_project" "tm-testnet" { name = "tm-testnet" description = "A project to test the Tendermint codebase." - resources = concat([for node in digitalocean_droplet.testnet-node: node.urn], [digitalocean_droplet.testnet-prometheus.urn]) + resources = concat([for node in digitalocean_droplet.node: node.urn], [digitalocean_droplet.monitor.urn]) } diff --git a/tf/variables.tf b/tf/variables.tf new file mode 100644 index 0000000..5731ba3 --- /dev/null +++ b/tf/variables.tf @@ -0,0 +1,17 @@ +variable "testnet_size" { + type = number + default = 20 +} + +variable "ssh_keys" { + type = list(string) +} + +variable "instance_tags" { + type = list(string) + default = ["v035-testnet"] +} + +variable "instance_names" { + type = list(string) +}