Skip to content

Commit 7c35b67

Browse files
Merge branch 'sonic-net:master' into master
2 parents f0a7f62 + 1628450 commit 7c35b67

File tree

595 files changed

+95867
-9929
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

595 files changed

+95867
-9929
lines changed
Lines changed: 97 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -1,134 +1,100 @@
11
jobs:
2-
- job: t0_elastictest
3-
displayName: "kvmtest-t0 by Elastictest"
4-
timeoutInMinutes: 240
5-
continueOnError: false
6-
pool: sonic-ubuntu-1c
7-
steps:
8-
- template: ../run-test-elastictest-template.yml
9-
parameters:
10-
TOPOLOGY: t0
11-
MIN_WORKER: $(T0_INSTANCE_NUM)
12-
MAX_WORKER: $(T0_INSTANCE_NUM)
13-
KVM_IMAGE_BRANCH: "master"
14-
MGMT_BRANCH: "master"
15-
BUILD_REASON: "BaselineTest"
16-
RETRY_TIMES: "0"
17-
STOP_ON_FAILURE: "False"
18-
TEST_PLAN_NUM: $(BASELINE_MGMT_PUBLIC_MASTER_TEST_NUM)
2+
- job: get_impacted_area
3+
cancelTimeoutInMinutes: 10
4+
displayName: "Get impacted area"
5+
timeoutInMinutes: 240
6+
continueOnError: false
7+
pool: sonic-ubuntu-1c
8+
steps:
9+
- template: ../impacted_area_testing/get-impacted-area.yml
1910

20-
- job: t0_2vlans_elastictest
21-
displayName: "kvmtest-t0-2vlans by Elastictest"
22-
timeoutInMinutes: 240
23-
continueOnError: false
24-
pool: sonic-ubuntu-1c
25-
steps:
26-
- template: ../run-test-elastictest-template.yml
27-
parameters:
28-
TOPOLOGY: t0
29-
TEST_SET: t0-2vlans
30-
MIN_WORKER: $(T0_2VLANS_INSTANCE_NUM)
31-
MAX_WORKER: $(T0_2VLANS_INSTANCE_NUM)
32-
DEPLOY_MG_EXTRA_PARAMS: "-e vlan_config=two_vlan_a"
33-
KVM_IMAGE_BRANCH: "master"
34-
MGMT_BRANCH: "master"
35-
BUILD_REASON: "BaselineTest"
36-
RETRY_TIMES: "0"
37-
STOP_ON_FAILURE: "False"
38-
TEST_PLAN_NUM: $(BASELINE_MGMT_PUBLIC_MASTER_TEST_NUM)
11+
- job: run_impacted_area_test
12+
cancelTimeoutInMinutes: 10
13+
dependsOn: get_impacted_area
14+
strategy:
15+
matrix:
16+
t0:
17+
TESTBED_PREP_TOPOLOGY: t0
18+
CHECKER: t0_checker
19+
TOPOLOGY: t0
20+
PREPARE_TIME: 30
21+
NUM_ASIC: 1
22+
DEPLOY_MG_EXTRA_PARAMS: ""
23+
COMMON_EXTRA_PARAMS: ""
24+
t0_2vlans:
25+
TESTBED_PREP_TOPOLOGY: t0-2vlans
26+
CHECKER: t0-2vlans_checker
27+
DEPLOY_MG_EXTRA_PARAMS: "-e vlan_config=two_vlan_a "
28+
TOPOLOGY: t0
29+
PREPARE_TIME: 30
30+
NUM_ASIC: 1
31+
COMMON_EXTRA_PARAMS: ""
32+
t1_lag:
33+
TESTBED_PREP_TOPOLOGY: t1
34+
CHECKER: t1_checker
35+
TOPOLOGY: t1-lag
36+
# 50 mins for preparing testbed, 30 mins for pre-test and post-test
37+
PREPARE_TIME: 80
38+
NUM_ASIC: 1
39+
COMMON_EXTRA_PARAMS: ""
40+
DEPLOY_MG_EXTRA_PARAMS: ""
41+
dualtor:
42+
TESTBED_PREP_TOPOLOGY: dualtor
43+
CHECKER: dualtor_checker
44+
COMMON_EXTRA_PARAMS: "--disable_loganalyzer "
45+
TOPOLOGY: dualtor
46+
# 30 mins for preparing testbed, 30 mins for pre-test and 20 mins for post-test
47+
PREPARE_TIME: 80
48+
NUM_ASIC: 1
49+
DEPLOY_MG_EXTRA_PARAMS: ""
50+
t1_multi_asic:
51+
TESTBED_PREP_TOPOLOGY: t1-multi-asic
52+
CHECKER: t1-multi-asic_checker
53+
TOPOLOGY: t1-8-lag
54+
NUM_ASIC: 4
55+
PREPARE_TIME: 30
56+
sonic_t0:
57+
TESTBED_PREP_TOPOLOGY: t0-sonic
58+
CHECKER: t0-sonic_checker
59+
COMMON_EXTRA_PARAMS: "--neighbor_type=sonic "
60+
TOPOLOGY: t0-64-32
61+
PREPARE_TIME: 40
62+
VM_TYPE: vsonic
63+
NUM_ASIC: 1
64+
DEPLOY_MG_EXTRA_PARAMS: ""
65+
dpu:
66+
TESTBED_PREP_TOPOLOGY: dpu
67+
CHECKER: dpu_checker
68+
TOPOLOGY: dpu
69+
PREPARE_TIME: 30
70+
NUM_ASIC: 1
71+
COMMON_EXTRA_PARAMS: ""
72+
DEPLOY_MG_EXTRA_PARAMS: ""
73+
condition: contains(dependencies.get_impacted_area.outputs['SetVariableTask.PR_CHECKERS'], variables['CHECKER'])
74+
variables:
75+
TEST_SCRIPTS: $[ dependencies.get_impacted_area.outputs['SetVariableTask.TEST_SCRIPTS'] ]
76+
timeoutInMinutes: 240
77+
continueOnError: false
78+
pool: sonic-ubuntu-1c
79+
steps:
80+
- template: ../impacted_area_testing/calculate-instance-numbers.yml
81+
parameters:
82+
TOPOLOGY: $(TESTBED_PREP_TOPOLOGY)
83+
BUILD_BRANCH: "master"
84+
PREPARE_TIME: $(PREPARE_TIME)
3985

40-
- job: t1_lag_elastictest
41-
displayName: "kvmtest-t1-lag by Elastictest"
42-
timeoutInMinutes: 240
43-
continueOnError: false
44-
pool: sonic-ubuntu-1c
45-
steps:
46-
- template: ../run-test-elastictest-template.yml
47-
parameters:
48-
TOPOLOGY: t1-lag
49-
MIN_WORKER: $(T1_LAG_INSTANCE_NUM)
50-
MAX_WORKER: $(T1_LAG_INSTANCE_NUM)
51-
KVM_IMAGE_BRANCH: "master"
52-
MGMT_BRANCH: "master"
53-
BUILD_REASON: "BaselineTest"
54-
RETRY_TIMES: "0"
55-
STOP_ON_FAILURE: "False"
56-
TEST_PLAN_NUM: $(BASELINE_MGMT_PUBLIC_MASTER_TEST_NUM)
57-
58-
- job: dualtor_elastictest
59-
displayName: "kvmtest-dualtor-t0 by Elastictest"
60-
timeoutInMinutes: 240
61-
continueOnError: false
62-
pool: sonic-ubuntu-1c
63-
steps:
64-
- template: ../run-test-elastictest-template.yml
65-
parameters:
66-
TOPOLOGY: dualtor
67-
MIN_WORKER: $(T0_DUALTOR_INSTANCE_NUM)
68-
MAX_WORKER: $(T0_DUALTOR_INSTANCE_NUM)
69-
COMMON_EXTRA_PARAMS: "--disable_loganalyzer "
70-
KVM_IMAGE_BRANCH: "master"
71-
MGMT_BRANCH: "master"
72-
BUILD_REASON: "BaselineTest"
73-
RETRY_TIMES: "0"
74-
STOP_ON_FAILURE: "False"
75-
TEST_PLAN_NUM: $(BASELINE_MGMT_PUBLIC_MASTER_TEST_NUM)
76-
77-
- job: multi_asic_elastictest
78-
displayName: "kvmtest-multi-asic-t1-lag by Elastictest"
79-
timeoutInMinutes: 240
80-
continueOnError: false
81-
pool: sonic-ubuntu-1c
82-
steps:
83-
- template: ../run-test-elastictest-template.yml
84-
parameters:
85-
TOPOLOGY: t1-8-lag
86-
TEST_SET: multi-asic-t1-lag
87-
MIN_WORKER: $(MULTI_ASIC_INSTANCE_NUM)
88-
MAX_WORKER: $(MULTI_ASIC_INSTANCE_NUM)
89-
NUM_ASIC: 4
90-
KVM_IMAGE_BRANCH: "master"
91-
MGMT_BRANCH: "master"
92-
BUILD_REASON: "BaselineTest"
93-
RETRY_TIMES: "0"
94-
STOP_ON_FAILURE: "False"
95-
TEST_PLAN_NUM: $(BASELINE_MGMT_PUBLIC_MASTER_TEST_NUM)
96-
97-
- job: sonic_t0_elastictest
98-
displayName: "kvmtest-t0-sonic by Elastictest"
99-
timeoutInMinutes: 240
100-
continueOnError: false
101-
pool: sonic-ubuntu-1c
102-
steps:
103-
- template: ../run-test-elastictest-template.yml
104-
parameters:
105-
TOPOLOGY: t0-64-32
106-
MIN_WORKER: $(T0_SONIC_INSTANCE_NUM)
107-
MAX_WORKER: $(T0_SONIC_INSTANCE_NUM)
108-
TEST_SET: t0-sonic
109-
COMMON_EXTRA_PARAMS: "--neighbor_type=sonic "
110-
VM_TYPE: vsonic
111-
KVM_IMAGE_BRANCH: "master"
112-
MGMT_BRANCH: "master"
113-
BUILD_REASON: "BaselineTest"
114-
RETRY_TIMES: "0"
115-
STOP_ON_FAILURE: "False"
116-
TEST_PLAN_NUM: $(BASELINE_MGMT_PUBLIC_MASTER_TEST_NUM)
117-
118-
- job: dpu_elastictest
119-
displayName: "kvmtest-dpu by Elastictest"
120-
timeoutInMinutes: 240
121-
continueOnError: false
122-
pool: sonic-ubuntu-1c
123-
steps:
124-
- template: ../run-test-elastictest-template.yml
125-
parameters:
126-
TOPOLOGY: dpu
127-
MIN_WORKER: $(T0_SONIC_INSTANCE_NUM)
128-
MAX_WORKER: $(T0_SONIC_INSTANCE_NUM)
129-
KVM_IMAGE_BRANCH: "master"
130-
MGMT_BRANCH: "master"
131-
BUILD_REASON: "BaselineTest"
132-
RETRY_TIMES: "0"
133-
STOP_ON_FAILURE: "False"
134-
TEST_PLAN_NUM: $(BASELINE_MGMT_PUBLIC_MASTER_TEST_NUM)
86+
- template: ../run-test-elastictest-template.yml
87+
parameters:
88+
TOPOLOGY: $(TOPOLOGY)
89+
SCRIPTS: $(TEST_SCRIPTS)
90+
MIN_WORKER: $(INSTANCE_NUMBER)
91+
MAX_WORKER: $(INSTANCE_NUMBER)
92+
DEPLOY_MG_EXTRA_PARAMS: $(DEPLOY_MG_EXTRA_PARAMS)
93+
COMMON_EXTRA_PARAMS: $(COMMON_EXTRA_PARAMS)
94+
KVM_IMAGE_BRANCH: "master"
95+
MGMT_BRANCH: "master"
96+
BUILD_REASON: "BaselineTest"
97+
RETRY_TIMES: "0"
98+
STOP_ON_FAILURE: "False"
99+
TEST_PLAN_NUM: $(BASELINE_MGMT_PUBLIC_MASTER_TEST_NUM)
100+
NUM_ASIC: $(NUM_ASIC)
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
## Background
2+
In the current PR testing process, a fixed set of test scripts is executed regardless of the change scope.
3+
This approach lacks flexibility. On the one hand, if changes are only related to a few lines of codebase,
4+
we may not need to run the whole scope. On the other hand, if there are newly added test scripts,
5+
we need to add them manually.
6+
7+
With approximately 570 test scripts running, the process has become excessively large and the runtime increased significantly.
8+
Due to the maximum execution time limit, more instances are needed to run the tests in parallel.
9+
For example, to meet this requirement, we need 20 instances for t0 and 25 instances for t1.
10+
The cost per PR has reached $35, and we will use $23,000 per month to run PR testing, which is considerably high.
11+
12+
To address these issues, we propose a new PR testing model called 'Impacted Area-Based PR Testing'.
13+
14+
## Preparation
15+
We can organize the codebase in this way:
16+
```
17+
sonic-mgmt
18+
| - .azure-pipelines
19+
| - ansible
20+
| - docs
21+
| - ......
22+
| - tests
23+
|
24+
| - common ---------- shared
25+
| - arp -----|
26+
| - ecmp | --- features
27+
| - vlan |
28+
| - ...... -----|
29+
```
30+
Under sonic-mgmt, there are several top-level folders such as `.azure-pipelines`, `ansible`, `docs`, `tests`, and more.
31+
Except for the `tests` folder, we classify all other folders as part of the shared section of the repo.
32+
33+
Within the `tests` folder, there are multiple second-level directories.
34+
Among them, the common folder is also considered part of the shared section.
35+
Other folders, such as `arp`, `ecmp`, and similar directories, are classified as feature-specific parts.
36+
37+
Scripts in the common folder fall under the shared section and can be utilized across different folders.
38+
In contrast, scripts in other folders belong to the features section, representing specific functionalities such as arp, ecmp, and vlan,
39+
and are intended for use within their respective folders.
40+
This hierarchy helps us more effectively identify the impacted areas for the new PR testing process.
41+
42+
However, the previous code had numerous cross-feature dependencies.
43+
To achieve our goal, we carried out some preparatory work by eliminating these cross-feature dependencies.
44+
45+
46+
## Design
47+
### Impacted Area
48+
To take advantage of such code structure, we introduce a new term called `impacted area`, which represents the scope of PR testing.
49+
The `impacted area` can be defined by specific features, so that we can narrow down the scope into folders.
50+
51+
This term can be elaborated as follows:
52+
- If the changes are confined to a specific feature folder, we can narrow the scope of testing to only include files within that folder.
53+
Files in other feature folders remain unaffected and do not require testing.
54+
- If the changes affect the common components, we cannot narrow the testing scope and must run all test scripts to ensure comprehensive coverage, as they are commonly used by other features.
55+
56+
We can determine the impacted area using the `git diff` command.
57+
58+
### Distribute scripts to PR checkers
59+
In our new PR test, we will have multiple PR checkers classified by topology type.
60+
We need to distribute all required scripts to each PR checker, which means
61+
these scripts should not only be within the scope that we changed, but also meet the topology requirement.
62+
63+
We suggest two approaches to achieve this:
64+
- One approach is by using the `--topology` parameter supported by pytest.
65+
It compares against the topology marked with `pytest.mark.topology` in script,
66+
and if the mark matches, the script is deemed necessary.
67+
However, this method triggers pytest's collection process for each script,
68+
leading to unnecessary time consumption, which is not expected.
69+
70+
- Another approach is to collect and analyze all scripts before execution.
71+
Each script includes the `pytest.mark.topology` marker to indicate the applicable topology it can run on.
72+
We will perform a global scan of all test scripts in the impacted area to identify this marker and extract its value,
73+
which represents the topology type compatible with the script.
74+
After determining the valid topology for each script, we can distribute the script to corresponding PR checkers.
75+
This method eliminates unnecessary processes by executing only the on-demand scripts, resulting in reduced running time.
76+
77+
### Implement dynamic instances
78+
Since the scope of PR testing is dynamic and determined by the impacted area,
79+
the number of instances required also needs to be dynamic to ensure cost efficiency.
80+
To achieve this, we must accurately estimate the total execution time in advance,
81+
allowing us to allocate the appropriate number of instances.
82+
This estimation can be achieved by analyzing historical data,
83+
which provides insights into execution times for similar scenarios.
84+
85+
We now have a Kusto table that logs details about the execution of test cases,
86+
including the running time, date, results, and more.
87+
To determine the preset running time for each test script,
88+
we will calculate the average running time of the latest five runs.
89+
If no relevant records are found in Kusto, a default value (1800s per script) will be used for the preset running time.
90+
This approach allows us to estimate the total execution time for our scripts accurately.
91+
92+
Using this information, we will evenly distribute the scripts across instances,
93+
ensuring that the workload is balanced across instances.
94+
Ideally, each instance will run its assigned scripts in approximately 1.5 hours,
95+
leaving additional time for tasks such as testbed preparation and clean-up and keeping the total runtime within 2 hours.
96+
97+
## Advantages
98+
Impacted area based PR testing runs test scripts on demand, reducing the overall scale of the PR test and saving execution time.
99+
And instances will be allocated as needed, resulting in more cost-efficient resource usage.
100+
Additionally, the PR testing will be more flexible as we can collect test scripts automatically rather than hard-coding them.
101+
102+
## Safeguard
103+
As impacted area based PR testing would not cover all test scripts, we need a safeguard to run all test scripts daily to prevent any unforeseen issues.
104+
Fortunately, we have Baseline testing to do so.
105+
Baseline testing involves running all test scripts in the test plan daily to ensure the overall stability of the system and identify potential issues.
106+
We conduct five rounds of baseline testing each day, and if any issues are detected, an ADO is automatically created, and email alerts are sent to notify the relevant teams.

0 commit comments

Comments
 (0)