Skip to content

Commit dd37dfe

Browse files
author
Xuewei Zhang
committed
Add e2e tests for reporting filesystem problems
Also added support for running e2e tests in parallel.
1 parent b3f811d commit dd37dfe

File tree

7 files changed

+132
-55
lines changed

7 files changed

+132
-55
lines changed

Makefile

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ PKG:=k8s.io/node-problem-detector
4040
# PKG_SOURCES are all the go source code.
4141
PKG_SOURCES:=$(shell find pkg cmd -name '*.go')
4242

43+
# PARALLEL specifies the number of parallel test nodes to run for e2e tests.
44+
PARALLEL?=3
45+
4346
# TARBALL is the name of release tar. Include binary version by default.
4447
TARBALL?=node-problem-detector-$(VERSION).tar.gz
4548

@@ -122,8 +125,8 @@ test: vet fmt
122125
GO111MODULE=on go test -mod vendor -timeout=1m -v -race -short -tags "$(BUILD_TAGS)" ./...
123126

124127
e2e-test: vet fmt build-tar
125-
GO111MODULE=on go test -mod vendor -timeout=10m -v -tags "$(BUILD_TAGS)" \
126-
./test/e2e/metriconly/... \
128+
GO111MODULE=on ginkgo -nodes=$(PARALLEL) -mod vendor -timeout=10m -v -tags "$(BUILD_TAGS)" \
129+
./test/e2e/metriconly/... -- \
127130
-project=$(PROJECT) -zone=$(ZONE) \
128131
-image=$(VM_IMAGE) -image-family=$(IMAGE_FAMILY) -image-project=$(IMAGE_PROJECT) \
129132
-ssh-user=$(SSH_USER) -ssh-key=$(SSH_KEY) \

test/e2e-install.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ function install-npd() {
6767
echo "Installing NPD systemd service."
6868
cp "${workdir}"/config/systemd/node-problem-detector-metric-only.service /etc/systemd/system/node-problem-detector.service
6969

70+
echo "Installing problem maker binary, used only for e2e testing."
71+
cp "${workdir}"/test/bin/problem-maker "${BIN_DIR}"
72+
7073
rm -rf "${workdir}"
7174

7275
# Start systemd service.

test/e2e/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ Currently the tests only support Google Compute Engine (GCE) environment. Suppor
66

77
## Prerequisites
88

9-
1. Setup [Google Application Default Credentials](https://developers.google.com/identity/protocols/application-default-credentials), which is [required for authentication](https://godoc.org/google.golang.org/api/compute/v1#hdr-Creating_a_client) by the Compute Engine API.
9+
1. Set up [Google Application Default Credentials (ADC)](https://developers.google.com/identity/protocols/application-default-credentials), which is [required for authentication](https://godoc.org/google.golang.org/api/compute/v1#hdr-Creating_a_client) by the Compute Engine API.
1010
2. Set up a [project-wide SSH key](https://cloud.google.com/compute/docs/instances/adding-removing-ssh-keys#project-wide) that can be used to SSH into the GCE VMs.
1111

1212
## Running tests
@@ -21,5 +21,6 @@ export VM_IMAGE=[TESTED_OS_IMAGE:cos-73-11647-217-0]
2121
export IMAGE_PROJECT=[TESTED_OS_IMAGE_PROJECT:cos-cloud]
2222
export SSH_USER=${USER}
2323
export SSH_KEY=~/.ssh/id_rsa
24+
export ARTIFACTS=/tmp/npd
2425
make e2e-test
2526
```

test/e2e/lib/gce/instance.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
"k8s.io/node-problem-detector/test/e2e/lib/ssh"
2525

26+
. "github.com/onsi/gomega"
2627
compute "google.golang.org/api/compute/v1"
2728
)
2829

@@ -145,6 +146,14 @@ func (ins *Instance) RunCommand(cmd string) ssh.Result {
145146
return ssh.Run(cmd, ins.ExternalIP, ins.SshUser, ins.SshKey)
146147
}
147148

149+
// RunCommand runs a command on the GCE instance and returns the command result, and fails the test when the command failed.
150+
func (ins *Instance) RunCommandOrFail(cmd string) ssh.Result {
151+
result := ins.RunCommand(cmd)
152+
Expect(result.SSHError).ToNot(HaveOccurred(), "SSH-ing to the instance failed: %v\n", result)
153+
Expect(result.Code).To(Equal(0), "Running command failed: %v\n", result)
154+
return result
155+
}
156+
148157
// PushFile pushes a local file to a GCE instance.
149158
func (ins *Instance) PushFile(srcPath, destPath string) error {
150159
if ins.ExternalIP == "" {

test/e2e/lib/npd/npd.go

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,6 @@ import (
2929
"github.com/avast/retry-go"
3030
)
3131

32-
const npdMetricsFilename = "node-problem-detector-metrics.txt"
33-
const npdLogsFilename = "node-problem-detector.log"
34-
3532
// SetupNPD installs NPD from the test tarball onto the provided GCE instance.
3633
//
3734
// Here is how it works:
@@ -91,6 +88,20 @@ func FetchNPDMetrics(ins gce.Instance) ([]metrics.Float64MetricRepresentation, e
9188
return npdMetrics, nil
9289
}
9390

91+
// FetchNPDMetric fetches and parses a specific metric reported by NPD on the provided GCE instance.
92+
func FetchNPDMetric(ins gce.Instance, metricName string, labels map[string]string) (float64, error) {
93+
gotMetrics, err := FetchNPDMetrics(ins)
94+
if err != nil {
95+
return 0.0, err
96+
}
97+
metric, err := metrics.GetFloat64Metric(gotMetrics, metricName, labels, true)
98+
if err != nil {
99+
return 0.0, fmt.Errorf("Failed to find %s metric with label %v: %v.\nHere is all NPD exported metrics: %v",
100+
metricName, labels, err, gotMetrics)
101+
}
102+
return metric.Value, nil
103+
}
104+
94105
// WaitForNPD waits for NPD to become ready by waiting for expected metrics.
95106
func WaitForNPD(ins gce.Instance, metricNames []string, timeoutSeconds uint) error {
96107
verifyMetricExist := func() error {
@@ -116,30 +127,33 @@ func WaitForNPD(ins gce.Instance, metricNames []string, timeoutSeconds uint) err
116127
}
117128

118129
// SaveTestArtifacts saves debugging data from NPD.
119-
func SaveTestArtifacts(ins gce.Instance, directory string) []error {
130+
func SaveTestArtifacts(ins gce.Instance, artifactDirectory string, testID int) []error {
120131
var errs []error
121132

122-
npdMetrics := ins.RunCommand("curl http://localhost:20257/metrics")
123-
if npdMetrics.SSHError != nil || npdMetrics.Code != 0 {
124-
errs = append(errs, fmt.Errorf("Error fetching NPD metrics: %v\n", npdMetrics))
125-
} else {
126-
npdMetricsPath := path.Join(directory, npdMetricsFilename)
127-
err := ioutil.WriteFile(npdMetricsPath, []byte(npdMetrics.Stdout), 0644)
128-
if err != nil {
129-
errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdMetricsPath, err))
130-
}
133+
if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
134+
"curl http://localhost:20257/metrics", "node-problem-detector-metrics"); err != nil {
135+
errs = append(errs, err)
131136
}
132-
133-
npdLog := ins.RunCommand("sudo journalctl -u node-problem-detector.service")
134-
if npdLog.SSHError != nil || npdLog.Code != 0 {
135-
errs = append(errs, fmt.Errorf("Error fetching NPD logs: %v\n", npdLog))
136-
} else {
137-
npdLogsPath := path.Join(directory, npdLogsFilename)
138-
err := ioutil.WriteFile(npdLogsPath, []byte(npdLog.Stdout), 0644)
139-
if err != nil {
140-
errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdLogsPath, err))
141-
}
137+
if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
138+
"sudo journalctl -u node-problem-detector.service", "node-problem-detector"); err != nil {
139+
errs = append(errs, err)
140+
}
141+
if err := saveCommandResultAsArtifact(ins, artifactDirectory, testID,
142+
"sudo journalctl -k", "kernel-logs"); err != nil {
143+
errs = append(errs, err)
142144
}
143145

144146
return errs
145147
}
148+
149+
func saveCommandResultAsArtifact(ins gce.Instance, artifactDirectory string, testID int, command string, artifactPrefix string) error {
150+
artifactPath := path.Join(artifactDirectory, fmt.Sprintf("%v-%02d.txt", artifactPrefix, testID))
151+
result := ins.RunCommand(command)
152+
if result.SSHError != nil || result.Code != 0 {
153+
return fmt.Errorf("Error running command: %v\n", result)
154+
}
155+
if err := ioutil.WriteFile(artifactPath, []byte(result.Stdout), 0644); err != nil {
156+
return fmt.Errorf("Error writing artifact to %v: %v\n", artifactPath, err)
157+
}
158+
return nil
159+
}

test/e2e/metriconly/e2e_npd_test.go

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,12 @@ import (
2929
"k8s.io/test-infra/boskos/client"
3030

3131
"github.com/onsi/ginkgo"
32+
"github.com/onsi/ginkgo/config"
3233
"github.com/onsi/ginkgo/reporters"
34+
. "github.com/onsi/gomega"
3335
compute "google.golang.org/api/compute/v1"
3436
)
3537

36-
const junitFileName = "junit.xml"
37-
3838
var zone = flag.String("zone", "", "gce zone the hosts live in")
3939
var project = flag.String("project", "", "gce project the hosts live in")
4040
var image = flag.String("image", "", "image to test")
@@ -80,7 +80,7 @@ func TestNPD(t *testing.T) {
8080
}
8181

8282
// The junit formatted result output is for showing test results on testgrid.
83-
junitReporter := reporters.NewJUnitReporter(path.Join(*artifactsDir, junitFileName))
83+
junitReporter := reporters.NewJUnitReporter(path.Join(*artifactsDir, fmt.Sprintf("junit-%02d.xml", config.GinkgoConfig.ParallelNode)))
8484
ginkgo.RunSpecsWithDefaultAndCustomReporters(t, "NPD Metric-only Suite", []ginkgo.Reporter{junitReporter})
8585
}
8686

@@ -89,9 +89,8 @@ func acquireProjectOrDie(boskosClient *client.Client) string {
8989
ctx, cancel := context.WithTimeout(context.Background(), *boskosWaitDuration)
9090
defer cancel()
9191
p, err := boskosClient.AcquireWait(ctx, *boskosProjectType, "free", "busy")
92-
if err != nil {
93-
panic(fmt.Sprintf("Unable to rent project from Boskos: %v\n", err))
94-
}
92+
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to rent project from Boskos: %v\n", err))
93+
9594
fmt.Printf("Rented project %s from Boskos", p.Name)
9695

9796
go func(boskosClient *client.Client, projectName string) {
@@ -110,12 +109,11 @@ func releaseProjectOrDie(boskosClient *client.Client) {
110109
return
111110
}
112111
err := boskosClient.ReleaseAll("dirty")
113-
if err != nil {
114-
panic(fmt.Sprintf("Failed to release project to Boskos: %v", err))
115-
}
112+
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to release project to Boskos: %v", err))
116113
}
117114

118115
func TestMain(m *testing.M) {
116+
RegisterFailHandler(ginkgo.Fail)
119117
flag.Parse()
120118

121119
os.Exit(m.Run())

test/e2e/metriconly/metrics_test.go

Lines changed: 69 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,15 @@ import (
2121
"os"
2222
"path"
2323
"strings"
24+
"time"
2425

2526
"k8s.io/node-problem-detector/pkg/util/metrics"
2627
"k8s.io/node-problem-detector/test/e2e/lib/gce"
2728
"k8s.io/node-problem-detector/test/e2e/lib/npd"
2829

2930
"github.com/onsi/ginkgo"
31+
"github.com/onsi/ginkgo/config"
32+
. "github.com/onsi/gomega"
3033
"github.com/pborman/uuid"
3134
)
3235

@@ -57,42 +60,77 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
5760
},
5861
*image,
5962
*imageProject)
60-
if err != nil {
61-
ginkgo.Fail(fmt.Sprintf("Unable to create test instance: %v", err))
62-
}
63+
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to create test instance: %v", err))
6364

6465
err = npd.SetupNPD(instance, *npdBuildTar)
65-
if err != nil {
66-
ginkgo.Fail(fmt.Sprintf("Unable to setup NPD: %v", err))
67-
}
66+
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Unable to setup NPD: %v", err))
6867
})
6968

7069
ginkgo.Context("On a clean node", func() {
7170

7271
ginkgo.It("NPD should export host_uptime metric", func() {
7372
err := npd.WaitForNPD(instance, []string{"host_uptime"}, 120)
74-
if err != nil {
75-
ginkgo.Fail(fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
76-
}
73+
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
7774

7875
gotMetrics, err := npd.FetchNPDMetrics(instance)
79-
if err != nil {
80-
ginkgo.Fail(fmt.Sprintf("Error fetching NPD metrics: %v", err))
81-
}
76+
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Error fetching NPD metrics: %v", err))
77+
8278
_, err = metrics.GetFloat64Metric(gotMetrics, "host_uptime", map[string]string{}, false)
83-
if err != nil {
84-
ginkgo.Fail(fmt.Sprintf("Failed to find uptime metric: %v.\nHere is all NPD exported metrics: %v",
85-
err, gotMetrics))
86-
}
79+
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to find uptime metric: %v.\nHere is all NPD exported metrics: %v", err, gotMetrics))
80+
})
81+
82+
ginkgo.It("NPD should not report any problem", func() {
83+
err := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120)
84+
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
85+
86+
assertMetricValueInBound(instance,
87+
"problem_gauge", map[string]string{"reason": "DockerHung", "type": "KernelDeadlock"},
88+
0.0, 0.0)
89+
assertMetricValueInBound(instance,
90+
"problem_counter", map[string]string{"reason": "DockerHung"},
91+
0.0, 0.0)
92+
assertMetricValueInBound(instance,
93+
"problem_counter", map[string]string{"reason": "FilesystemIsReadOnly"},
94+
0.0, 0.0)
95+
assertMetricValueInBound(instance,
96+
"problem_counter", map[string]string{"reason": "KernelOops"},
97+
0.0, 0.0)
98+
assertMetricValueInBound(instance,
99+
"problem_counter", map[string]string{"reason": "OOMKilling"},
100+
0.0, 0.0)
101+
})
102+
})
103+
104+
ginkgo.Context("When ext4 filesystem error happens", func() {
105+
106+
ginkgo.BeforeEach(func() {
107+
err := npd.WaitForNPD(instance, []string{"problem_gauge"}, 120)
108+
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err))
109+
// This will trigger an ext4 error on the boot disk, causing the boot disk to be remounted read-only and systemd-journald to crash.
110+
instance.RunCommandOrFail("sudo /home/kubernetes/bin/problem-maker --problem Ext4FilesystemError")
111+
})
112+
113+
ginkgo.It("NPD should update problem_counter{reason:Ext4Error} and problem_gauge{type:ReadonlyFilesystem}", func() {
114+
time.Sleep(5 * time.Second)
115+
assertMetricValueInBound(instance,
116+
"problem_counter", map[string]string{"reason": "Ext4Error"},
117+
1.0, 2.0)
118+
assertMetricValueInBound(instance,
119+
"problem_gauge", map[string]string{"reason": "FilesystemIsReadOnly", "type": "ReadonlyFilesystem"},
120+
1.0, 1.0)
121+
})
122+
123+
ginkgo.It("NPD should remain healthy", func() {
124+
npdStates := instance.RunCommandOrFail("sudo systemctl show node-problem-detector -p ActiveState -p SubState")
125+
Expect(npdStates.Stdout).To(ContainSubstring("ActiveState=active"), "NPD is no longer active: %v", npdStates)
126+
Expect(npdStates.Stdout).To(ContainSubstring("SubState=running"), "NPD is no longer running: %v", npdStates)
87127
})
88128
})
89129

90130
ginkgo.AfterEach(func() {
91131
defer func() {
92132
err := instance.DeleteInstance()
93-
if err != nil {
94-
ginkgo.Fail(fmt.Sprintf("Failed to clean up the test VM: %v", err))
95-
}
133+
Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("Failed to clena up the test VM: %v", err))
96134
}()
97135

98136
artifactSubDir := ""
@@ -109,9 +147,20 @@ var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() {
109147
}
110148
}
111149

112-
errs := npd.SaveTestArtifacts(instance, artifactSubDir)
150+
errs := npd.SaveTestArtifacts(instance, artifactSubDir, config.GinkgoConfig.ParallelNode)
113151
if len(errs) != 0 {
114152
fmt.Printf("Error storing debugging data to test artifacts: %v", errs)
115153
}
116154
})
117155
})
156+
157+
func assertMetricValueInBound(instance gce.Instance, metricName string, labels map[string]string, lowBound float64, highBound float64) {
158+
value, err := npd.FetchNPDMetric(instance, metricName, labels)
159+
if err != nil {
160+
ginkgo.Fail(fmt.Sprintf("Failed to find %s metric with label %v: %v", metricName, labels, err))
161+
}
162+
Expect(value).Should(BeNumerically(">=", lowBound),
163+
"Got value for metric %s with label %v: %v, expect at least %v.", metricName, labels, value, lowBound)
164+
Expect(value).Should(BeNumerically("<=", highBound),
165+
"Got value for metric %s with label %v: %v, expect at most %v.", metricName, labels, value, highBound)
166+
}

0 commit comments

Comments
 (0)