2 changes: 1 addition & 1 deletion Makefile
@@ -130,7 +130,7 @@ install-crds: ## Install CRDs

.PHONY: install-gateway-crds
install-gateway-crds: ## Install Gateway API CRDs
kubectl kustomize $(SELF_DIR)config/crd/gateway-api/$(if $(filter true,$(ENABLE_EXPERIMENTAL)),experimental,standard) | kubectl apply -f -
kubectl kustomize $(SELF_DIR)config/crd/gateway-api/$(if $(filter true,$(ENABLE_EXPERIMENTAL)),experimental,standard) | kubectl apply --server-side -f -

.PHONY: uninstall-gateway-crds
uninstall-gateway-crds: ## Uninstall Gateway API CRDs
2 changes: 1 addition & 1 deletion tests/Makefile
@@ -176,7 +176,7 @@ HELM_PARAMETERS += --set nginxGateway.name=nginx-gateway --set nginx.service.typ
# it overrides the target in the main Makefile when the GW_API_VERSION is set to main
ifeq ($(GW_API_VERSION),main)
install-gateway-crds:
kubectl kustomize "https://github.com/kubernetes-sigs/gateway-api/config/crd/$(if $(filter true,$(ENABLE_EXPERIMENTAL)),experimental,)?timeout=120&ref=main" | kubectl apply -f -
kubectl kustomize "https://github.com/kubernetes-sigs/gateway-api/config/crd/$(if $(filter true,$(ENABLE_EXPERIMENTAL)),experimental,)?timeout=120&ref=main" | kubectl apply --server-side -f -
endif

.PHONY: install-ngf-local-no-build
28 changes: 16 additions & 12 deletions tests/framework/timeout.go
@@ -38,22 +38,26 @@ type TimeoutConfig struct {

// KubernetesClientTimeout represents the maximum time for Kubernetes client operations.
KubernetesClientTimeout time.Duration

// GatewayListenerUpdateTimeout represents the maximum time for Gateway Listener count to be updated.
GatewayListenerUpdateTimeout time.Duration
}

// DefaultTimeoutConfig populates a TimeoutConfig with the default values.
func DefaultTimeoutConfig() TimeoutConfig {
return TimeoutConfig{
CreateTimeout: 60 * time.Second,
UpdateTimeout: 60 * time.Second,
DeleteTimeout: 10 * time.Second,
DeleteNamespaceTimeout: 150 * time.Second,
GetTimeout: 10 * time.Second,
ManifestFetchTimeout: 10 * time.Second,
RequestTimeout: 10 * time.Second,
ContainerRestartTimeout: 10 * time.Second,
GetLeaderLeaseTimeout: 60 * time.Second,
GetStatusTimeout: 60 * time.Second,
TestForTrafficTimeout: 60 * time.Second,
KubernetesClientTimeout: 10 * time.Second,
CreateTimeout: 60 * time.Second,
UpdateTimeout: 60 * time.Second,
DeleteTimeout: 10 * time.Second,
DeleteNamespaceTimeout: 150 * time.Second,
GetTimeout: 10 * time.Second,
ManifestFetchTimeout: 10 * time.Second,
RequestTimeout: 30 * time.Second,
ContainerRestartTimeout: 10 * time.Second,
GetLeaderLeaseTimeout: 60 * time.Second,
GetStatusTimeout: 60 * time.Second,
TestForTrafficTimeout: 60 * time.Second,
KubernetesClientTimeout: 10 * time.Second,
GatewayListenerUpdateTimeout: 60 * time.Second,
}
}
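
For context, the new GatewayListenerUpdateTimeout is consumed by checkGatewayListeners in the zero-downtime scale test later in this PR. A minimal sketch of the polling pattern it bounds, assuming Gomega's Eventually as already used in the suite; getListenerCount and expectListenerCount are illustrative names, not helpers from this repo:

// Sketch only: poll until the Gateway reports the expected listener count,
// bounded by the new timeout. getListenerCount is a hypothetical helper
// standing in for the suite's Gateway status lookup.
expectListenerCount := func(expected int) {
	timeoutConfig := DefaultTimeoutConfig()
	Eventually(func() error {
		count, err := getListenerCount()
		if err != nil {
			return fmt.Errorf("failed to get gateway: %w", err)
		}
		if count != expected {
			return fmt.Errorf("expected %d listeners, got %d", expected, count)
		}
		return nil
	}).
		WithTimeout(timeoutConfig.GatewayListenerUpdateTimeout).
		WithPolling(1 * time.Second).
		Should(Succeed())
}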
18 changes: 18 additions & 0 deletions tests/suite/manifests/scale/httproute.yaml
@@ -0,0 +1,18 @@
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: route
spec:
parentRefs:
- name: gateway
sectionName: listener
hostnames:
- "*.example.com"
rules:
- matches:
- path:
type: PathPrefix
value: /
backendRefs:
- name: backend
port: 80
19 changes: 0 additions & 19 deletions tests/suite/manifests/scale/upstreams.yaml
@@ -13,25 +13,6 @@ spec:
port: 80
protocol: HTTP
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: route
spec:
parentRefs:
- name: gateway
sectionName: listener
hostnames:
- "*.example.com"
rules:
- matches:
- path:
type: PathPrefix
value: /
backendRefs:
- name: backend
port: 80
---
apiVersion: apps/v1
kind: Deployment
metadata:
28 changes: 23 additions & 5 deletions tests/suite/scale_test.go
@@ -3,6 +3,7 @@ package main
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"os"
@@ -40,6 +41,10 @@ var _ = Describe("Scale test", Ordered, Label("nfr", "scale"), func() {
"scale/upstreams.yaml",
}

httpRouteManifests = []string{
"scale/httproute.yaml",
}

namespace = "scale"

scrapeInterval = 15 * time.Second
@@ -468,6 +473,10 @@ The logs are attached only if there are errors.
Expect(resourceManager.ApplyFromFiles(upstreamsManifests, namespace)).To(Succeed())
Expect(resourceManager.WaitForAppsToBeReady(namespace)).To(Succeed())

// apply HTTPRoute after upstreams are ready
Expect(resourceManager.ApplyFromFiles(httpRouteManifests, namespace)).To(Succeed())
Expect(resourceManager.WaitForAppsToBeReady(namespace)).To(Succeed())

var nginxPodNames []string
var err error
Eventually(
@@ -835,7 +844,16 @@ var _ = Describe("Zero downtime scale test", Ordered, Label("nfr", "zero-downtim
AfterAll(func() {
_, err := fmt.Fprint(outFile)
Expect(err).ToNot(HaveOccurred())
Expect(outFile.Close()).To(Succeed())
Collaborator: I'm wondering how this could already be closed in any situation.

Contributor Author (@salonichf5, Sep 2, 2025): I don't know. I saw it as a possible issue in the pipeline runs. I tried to understand it by asking GitHub Copilot, but none of the answers made sense to me. I wanted to ensure we fix it for the pipeline, so I added these checks.

From what I understand, I don't think these tests run in parallel, looking at the workflow. But is it worth risking the NFR biweekly pipeline run failing for these reasons and not getting to the PR step?

Collaborator: Ideally I'd like to dig deeper into why this specific test could have this issue, rather than essentially just ignoring it, since all the tests should open and close their respective files as part of setup and teardown. This one shouldn't be any different.

Contributor Author: Maybe I didn't communicate it well earlier -- I actually tried to reproduce this bug with multiple runs, but I couldn't get the exact error to show up. I also dug into whether the file might be getting closed in any of the helper functions or if the sequence was off, but I didn't find anything unusual there either.

I added this check mainly as a safeguard so the pipeline wouldn't fail on what seems like a simple case, but I agree it's worth digging deeper. I'll keep investigating to see if I can pinpoint the real cause.

Contributor Author: Just an update -- I'm on my 5th local run of this and haven't had a file close error. One time I couldn't get the LB IP and it timed out, but I don't think that's related; it happens when GKE hasn't assigned the external IP.

Contributor Author: I checked the logs for "Retrying" and saw one entry for the test that failed. Confused :?

[Screenshot: 2025-09-04 at 1:05:27 PM]

Collaborator: Yeah, since we have flake-attempts=2, when there's a failure, it'll retry the test again, which is what is happening there.
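
(Aside: a minimal sketch of how that retry budget is expressed in Ginkgo v2 -- globally via the --flake-attempts CLI flag, or per spec with the FlakeAttempts decorator. The spec name below is illustrative, not taken from this suite.)

// Sketch only: with a flake budget of 2, a failed spec is re-run, and the
// suite only reports a failure if every attempt fails.
It("scales HTTP routes to 1000", FlakeAttempts(2), func() {
	// spec body
})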

Contributor Author (@salonichf5, Sep 4, 2025): No, I meant that I have never seen a flaky attempt pass for NFR -- so, to agree with your original point, why keep them if they will lead us to file issues like these?

Also, from one of the runs, I saw this error for the scale tests:

[FAILED] in [AfterAll] - /home/username/nginx-gateway-fabric/tests/suite/system_suite_test.go:264 @ 08/15/25 18:37:27.954
• [FAILED] [1090.394 seconds]
Scale test [BeforeEach] scales HTTP routes to 1000 [nfr, scale]
  [BeforeEach] /home/username/nginx-gateway-fabric/tests/suite/scale_test.go:99
  [It] /home/username/nginx-gateway-fabric/tests/suite/scale_test.go:570

  [FAILED] install.go:224: 2025-08-15 18:25:42.463026353 +0000 UTC m=+0.038108761 [debug] Original chart version: ""
  install.go:241: 2025-08-15 18:25:42.463081634 +0000 UTC m=+0.038164040 [debug] CHART PATH: /home/username/nginx-gateway-fabric/charts/nginx-gateway-fabric

  walk.go:75: found symbolic link in path: /home/username/nginx-gateway-fabric/charts/nginx-gateway-fabric/crds resolves to /home/username/nginx-gateway-fabric/config/crd/bases. Contents of linked file included and used
  client.go:142: 2025-08-15 18:25:42.5001725 +0000 UTC m=+0.075254914 [debug] creating 1 resource(s)
  client.go:142: 2025-08-15 18:25:42.536370754 +0000 UTC m=+0.111453183 [debug] creating 1 resource(s)
  client.go:142: 2025-08-15 18:25:42.596179176 +0000 UTC m=+0.171261591 [debug] creating 1 resource(s)
  client.go:142: 2025-08-15 18:25:43.007022036 +0000 UTC m=+0.582104435 [debug] creating 1 resource(s)
  client.go:142: 2025-08-15 18:25:43.085068053 +0000 UTC m=+0.660150467 [debug] creating 1 resource(s)
  client.go:142: 2025-08-15 18:25:43.14430961 +0000 UTC m=+0.719392046 [debug] creating 1 resource(s)
  wait.go:50: 2025-08-15 18:25:43.237721342 +0000 UTC m=+0.812803751 [debug] beginning wait for 6 resources with timeout of 1m0s
  install.go:212: 2025-08-15 18:25:45.666880466 +0000 UTC m=+3.241962882 [debug] Clearing REST mapper cache
  client.go:142: 2025-08-15 18:25:46.294039937 +0000 UTC m=+3.869122350 [debug] creating 1 resource(s)
  client.go:486: 2025-08-15 18:25:46.460743731 +0000 UTC m=+4.035826141 [debug] Starting delete for "ngf-test-nginx-gateway-fabric-cert-generator" ServiceAccount
  client.go:490: 2025-08-15 18:25:46.501964695 +0000 UTC m=+4.077047109 [debug] Ignoring delete failure for "ngf-test-nginx-gateway-fabric-cert-generator" /v1, Kind=ServiceAccount: serviceaccounts "ngf-test-nginx-gateway-fabric-cert-generator" not found
  wait.go:104: 2025-08-15 18:25:46.502099876 +0000 UTC m=+4.077182288 [debug] beginning wait for 1 resources to be deleted with timeout of 5m0s
  client.go:142: 2025-08-15 18:25:46.758832678 +0000 UTC m=+4.333915093 [debug] creating 1 resource(s)
  client.go:486: 2025-08-15 18:25:46.817617819 +0000 UTC m=+4.392700224 [debug] Starting delete for "ngf-test-nginx-gateway-fabric-cert-generator" Role
  client.go:490: 2025-08-15 18:25:46.851225828 +0000 UTC m=+4.426308242 [debug] Ignoring delete failure for "ngf-test-nginx-gateway-fabric-cert-generator" rbac.authorization.k8s.io/v1, Kind=Role: roles.rbac.authorization.k8s.io "ngf-test-nginx-gateway-fabric-cert-generator" not found
  wait.go:104: 2025-08-15 18:25:46.851272367 +0000 UTC m=+4.426354760 [debug] beginning wait for 1 resources to be deleted with timeout of 5m0s
  client.go:142: 2025-08-15 18:25:47.057280013 +0000 UTC m=+4.632362426 [debug] creating 1 resource(s)
  client.go:486: 2025-08-15 18:25:47.141462002 +0000 UTC m=+4.716544407 [debug] Starting delete for "ngf-test-nginx-gateway-fabric-cert-generator" RoleBinding
  client.go:490: 2025-08-15 18:25:47.174935325 +0000 UTC m=+4.750017744 [debug] Ignoring delete failure for "ngf-test-nginx-gateway-fabric-cert-generator" rbac.authorization.k8s.io/v1, Kind=RoleBinding: rolebindings.rbac.authorization.k8s.io "ngf-test-nginx-gateway-fabric-cert-generator" not found
  wait.go:104: 2025-08-15 18:25:47.175001271 +0000 UTC m=+4.750083677 [debug] beginning wait for 1 resources to be deleted with timeout of 5m0s
  client.go:142: 2025-08-15 18:25:47.379579502 +0000 UTC m=+4.954661916 [debug] creating 1 resource(s)
  client.go:486: 2025-08-15 18:25:47.454800946 +0000 UTC m=+5.029883351 [debug] Starting delete for "ngf-test-nginx-gateway-fabric-cert-generator" Job
  client.go:490: 2025-08-15 18:25:47.491223753 +0000 UTC m=+5.066306166 [debug] Ignoring delete failure for "ngf-test-nginx-gateway-fabric-cert-generator" batch/v1, Kind=Job: jobs.batch "ngf-test-nginx-gateway-fabric-cert-generator" not found
  wait.go:104: 2025-08-15 18:25:47.491272508 +0000 UTC m=+5.066354919 [debug] beginning wait for 1 resources to be deleted with timeout of 5m0s
  client.go:142: 2025-08-15 18:25:47.748791178 +0000 UTC m=+5.323873591 [debug] creating 1 resource(s)
  client.go:712: 2025-08-15 18:25:47.805465089 +0000 UTC m=+5.380547487 [debug] Watching for changes to Job ngf-test-nginx-gateway-fabric-cert-generator with timeout of 5m0s
  client.go:740: 2025-08-15 18:25:47.834789407 +0000 UTC m=+5.409871818 [debug] Add/Modify event for ngf-test-nginx-gateway-fabric-cert-generator: ADDED
  client.go:779: 2025-08-15 18:25:47.834843163 +0000 UTC m=+5.409925574 [debug] ngf-test-nginx-gateway-fabric-cert-generator: Jobs active: 0, jobs failed: 0, jobs succeeded: 0
  client.go:740: 2025-08-15 18:25:47.940051746 +0000 UTC m=+5.515134161 [debug] Add/Modify event for ngf-test-nginx-gateway-fabric-cert-generator: MODIFIED
  client.go:779: 2025-08-15 18:25:47.940094635 +0000 UTC m=+5.515177045 [debug] ngf-test-nginx-gateway-fabric-cert-generator: Jobs active: 1, jobs failed: 0, jobs succeeded: 0
  Error: INSTALLATION FAILED: failed pre-install: 1 error occurred:
  	* timed out waiting for the condition


  helm.go:86: 2025-08-15 18:30:47.996973087 +0000 UTC m=+305.572055494 [debug] failed pre-install: 1 error occurred:
  	* timed out waiting for the condition


  INSTALLATION FAILED
  main.newInstallCmd.func2
  	helm.sh/helm/v3/cmd/helm/install.go:158
  github.com/spf13/cobra.(*Command).execute
  	github.com/spf13/[email protected]/command.go:985
  github.com/spf13/cobra.(*Command).ExecuteC
  	github.com/spf13/[email protected]/command.go:1117
  github.com/spf13/cobra.(*Command).Execute
  	github.com/spf13/[email protected]/command.go:1041
  main.main
  	helm.sh/helm/v3/cmd/helm/helm.go:85
  runtime.main
  	runtime/proc.go:272
  runtime.goexit
  	runtime/asm_amd64.s:1700

  Unexpected error:
      <*exec.ExitError | 0xc005170000>: 
      exit status 1
      {
          ProcessState: {
              pid: 38573,
              status: 256,
              rusage: {
                  Utime: {Sec: 1, Usec: 412618},
                  Stime: {Sec: 0, Usec: 159787},
                  Maxrss: 3481956,
                  Ixrss: 0,
                  Idrss: 0,
                  Isrss: 0,
                  Minflt: 19401,
                  Majflt: 0,
                  Nswap: 0,
                  Inblock: 0,
                  Oublock: 792,
                  Msgsnd: 0,
                  Msgrcv: 0,
                  Nsignals: 0,
                  Nvcsw: 4897,
                  Nivcsw: 806,
              },
          },
          Stderr: nil,
      }
  occurred
Error: BeforeEach 08/15/25 18:30:48.001

  Full Stack Trace
    github.com/nginx/nginx-gateway-fabric/v2/tests/suite.createNGFInstallConfig({{0x31caa9a, 0x8}, {0xc0005dd640, 0x3f}, {0x7ffc112c3638, 0x5}, 0x1, 0x1, 0x1, 0x0}, ...)
    	/home/username/nginx-gateway-fabric/tests/suite/system_suite_test.go:264 +0xbec
    github.com/nginx/nginx-gateway-fabric/v2/tests/suite.setup({{0x31caa9a, 0x8}, {0xc0005dd640, 0x3f}, {0x7ffc112c3638, 0x5}, 0x1, 0x1, 0x1, 0x0}, ...)
    	/home/username/nginx-gateway-fabric/tests/suite/system_suite_test.go:180 +0x12e8
    github.com/nginx/nginx-gateway-fabric/v2/tests/suite.init.func10.2()
    	/home/username/nginx-gateway-fabric/tests/suite/scale_test.go:104 +0x10a

  There were additional failures detected.  To view them in detail run ginkgo -vv
------------------------------

It failed waiting on createNGFInstallConfig, similar to something Tina shared with us when images were not built and pushed to her cluster, which is concerning. I'll recheck the workflow, but this is my first time seeing this error.

Contributor Author (@salonichf5, Sep 5, 2025): Had the first successful run of the NFR pipelines. I ran it against my branch, but the test results don't look good for some cases compared against the March results.

I can remove the flaky attempts if we think they don't help.


if outFile != nil {
if err := outFile.Close(); err != nil {
if errors.Is(err, os.ErrClosed) || strings.Contains(err.Error(), "file already closed") {
Collaborator: Are these two checks redundant, or are they different?

Contributor Author: Only added this for sanity, because Ginkgo appends errors when there are multiple errors, and I wasn't able to reproduce the issue, so I wanted to be sure.

GinkgoWriter.Printf("Warning: attempted to close already closed file: %v\n", err)
} else {
Expect(err).ToNot(HaveOccurred())
Collaborator: You've already checked that err != nil, so this will never pass.

Contributor Author (@salonichf5, Sep 2, 2025): Aah, good catch. Just to clarify, the err != nil check is there so I can branch on the type of error. The intention is to only warn (and not fail the test) if the error is specifically "file already closed", while treating all other errors as failures.

}
}
}

// restoring NGF shared among tests in the suite
cfg := getDefaultSetupCfg()
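
Since the close handling above is interleaved with review comments, here is the same pattern collected in one place as a sketch only; closeOutFile is a hypothetical name, and the errors, os, and strings imports plus the Ginkgo/Gomega helpers are assumed to be in scope as elsewhere in this file:

// Sketch only: tolerate a double close of the results file, warn instead of
// failing the test, and still fail on any other close error.
closeOutFile := func(f *os.File) {
	if f == nil {
		return
	}
	if err := f.Close(); err != nil {
		if errors.Is(err, os.ErrClosed) || strings.Contains(err.Error(), "file already closed") {
			GinkgoWriter.Printf("Warning: attempted to close already closed file: %v\n", err)
			return
		}
		Expect(err).ToNot(HaveOccurred())
	}
}
closeOutFile(outFile)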
@@ -1012,13 +1030,13 @@ var _ = Describe("Zero downtime scale test", Ordered, Label("nfr", "zero-downtim
checkGatewayListeners := func(num int) {
Eventually(
func() error {
ctx, cancel := context.WithTimeout(context.Background(), timeoutConfig.GetTimeout)
ctx, cancel := context.WithTimeout(context.Background(), timeoutConfig.GetTimeout*2)
defer cancel()

var gw v1.Gateway
key := types.NamespacedName{Namespace: ns.Name, Name: "gateway"}
if err := resourceManager.K8sClient.Get(ctx, key, &gw); err != nil {
return err
return fmt.Errorf("failed to get gateway: %w", err)
}

if len(gw.Status.Listeners) != num {
@@ -1028,8 +1046,8 @@ var _ = Describe("Zero downtime scale test", Ordered, Label("nfr", "zero-downtim
return nil
},
).
WithTimeout(5 * time.Second).
WithPolling(100 * time.Millisecond).
WithTimeout(timeoutConfig.GatewayListenerUpdateTimeout).
WithPolling(1 * time.Second).
Should(Succeed())
}
