diff --git a/contrib/kind-helm.sh b/contrib/kind-helm.sh index 343a6d40cf..7cd59aef61 100755 --- a/contrib/kind-helm.sh +++ b/contrib/kind-helm.sh @@ -52,8 +52,8 @@ set_default_params() { export SVC_CIDR_IPV6=${SVC_CIDR_IPV6:-fd00:10:96::/112} export JOIN_SUBNET_IPV4=${JOIN_SUBNET_IPV4:-100.64.0.0/16} export JOIN_SUBNET_IPV6=${JOIN_SUBNET_IPV6:-fd98::/64} - export TRANSIT_SWITCH_SUBNET_IPV4=${TRANSIT_SWITCH_SUBNET_IPV4:-100.88.0.0/16} - export TRANSIT_SWITCH_SUBNET_IPV6=${TRANSIT_SWITCH_SUBNET_IPV6:-fd97::/64} + export TRANSIT_SUBNET_IPV4=${TRANSIT_SUBNET_IPV4:-100.88.0.0/16} + export TRANSIT_SUBNET_IPV6=${TRANSIT_SUBNET_IPV6:-fd97::/64} export METALLB_CLIENT_NET_SUBNET_IPV4=${METALLB_CLIENT_NET_SUBNET_IPV4:-172.22.0.0/16} export METALLB_CLIENT_NET_SUBNET_IPV6=${METALLB_CLIENT_NET_SUBNET_IPV6:-fc00:f853:ccd:e792::/64} diff --git a/contrib/kind.sh b/contrib/kind.sh index cff76b68ef..34faaea1aa 100755 --- a/contrib/kind.sh +++ b/contrib/kind.sh @@ -593,8 +593,8 @@ set_default_params() { JOIN_SUBNET_IPV6=${JOIN_SUBNET_IPV6:-fd98::/64} MASQUERADE_SUBNET_IPV4=${MASQUERADE_SUBNET_IPV4:-169.254.0.0/17} MASQUERADE_SUBNET_IPV6=${MASQUERADE_SUBNET_IPV6:-fd69::/112} - TRANSIT_SWITCH_SUBNET_IPV4=${TRANSIT_SWITCH_SUBNET_IPV4:-100.88.0.0/16} - TRANSIT_SWITCH_SUBNET_IPV6=${TRANSIT_SWITCH_SUBNET_IPV6:-fd97::/64} + TRANSIT_SUBNET_IPV4=${TRANSIT_SUBNET_IPV4:-100.88.0.0/16} + TRANSIT_SUBNET_IPV6=${TRANSIT_SUBNET_IPV6:-fd97::/64} METALLB_CLIENT_NET_SUBNET_IPV4=${METALLB_CLIENT_NET_SUBNET_IPV4:-172.22.0.0/16} METALLB_CLIENT_NET_SUBNET_IPV6=${METALLB_CLIENT_NET_SUBNET_IPV6:-fc00:f853:ccd:e792::/64} BGP_SERVER_NET_SUBNET_IPV4=${BGP_SERVER_NET_SUBNET_IPV4:-172.26.0.0/16} @@ -631,7 +631,7 @@ set_default_params() { OVN_HOST_NETWORK_NAMESPACE=${OVN_HOST_NETWORK_NAMESPACE:-ovn-host-network} OVN_EGRESSIP_HEALTHCHECK_PORT=${OVN_EGRESSIP_HEALTHCHECK_PORT:-9107} OCI_BIN=${KIND_EXPERIMENTAL_PROVIDER:-docker} - OVN_DEPLOY_PODS=${OVN_DEPLOY_PODS:-"ovnkube-zone-controller ovnkube-control-plane ovnkube-master ovnkube-node"} + OVN_DEPLOY_PODS=${OVN_DEPLOY_PODS:-"ovnkube-identity ovnkube-zone-controller ovnkube-control-plane ovnkube-master ovnkube-node"} OVN_METRICS_SCALE_ENABLE=${OVN_METRICS_SCALE_ENABLE:-false} OVN_ISOLATED=${OVN_ISOLATED:-false} OVN_GATEWAY_OPTS=${OVN_GATEWAY_OPTS:-""} @@ -914,8 +914,8 @@ create_ovn_kube_manifests() { --v6-join-subnet="${JOIN_SUBNET_IPV6}" \ --v4-masquerade-subnet="${MASQUERADE_SUBNET_IPV4}" \ --v6-masquerade-subnet="${MASQUERADE_SUBNET_IPV6}" \ - --v4-transit-switch-subnet="${TRANSIT_SWITCH_SUBNET_IPV4}" \ - --v6-transit-switch-subnet="${TRANSIT_SWITCH_SUBNET_IPV6}" \ + --v4-transit-subnet="${TRANSIT_SUBNET_IPV4}" \ + --v6-transit-subnet="${TRANSIT_SUBNET_IPV6}" \ --ex-gw-network-interface="${OVN_EX_GW_NETWORK_INTERFACE}" \ --multi-network-enable="${ENABLE_MULTI_NET}" \ --network-segmentation-enable="${ENABLE_NETWORK_SEGMENTATION}" \ diff --git a/dist/images/daemonset.sh b/dist/images/daemonset.sh index f45f473a66..a2072b269e 100755 --- a/dist/images/daemonset.sh +++ b/dist/images/daemonset.sh @@ -79,8 +79,8 @@ OVN_V4_JOIN_SUBNET="" OVN_V6_JOIN_SUBNET="" OVN_V4_MASQUERADE_SUBNET="" OVN_V6_MASQUERADE_SUBNET="" -OVN_V4_TRANSIT_SWITCH_SUBNET="" -OVN_V6_TRANSIT_SWITCH_SUBNET="" +OVN_V4_TRANSIT_SUBNET="" +OVN_V6_TRANSIT_SUBNET="" OVN_NETFLOW_TARGETS="" OVN_SFLOW_TARGETS="" OVN_IPFIX_TARGETS="" @@ -302,11 +302,11 @@ while [ "$1" != "" ]; do --v6-masquerade-subnet) OVN_V6_MASQUERADE_SUBNET=$VALUE ;; - --v4-transit-switch-subnet) - OVN_V4_TRANSIT_SWITCH_SUBNET=$VALUE + --v4-transit-subnet) + 
OVN_V4_TRANSIT_SUBNET=$VALUE ;; - --v6-transit-switch-subnet) - OVN_V6_TRANSIT_SWITCH_SUBNET=$VALUE + --v6-transit-subnet) + OVN_V6_TRANSIT_SUBNET=$VALUE ;; --netflow-targets) OVN_NETFLOW_TARGETS=$VALUE @@ -536,10 +536,10 @@ ovn_v4_masquerade_subnet=${OVN_V4_MASQUERADE_SUBNET} echo "ovn_v4_masquerade_subnet: ${ovn_v4_masquerade_subnet}" ovn_v6_masquerade_subnet=${OVN_V6_MASQUERADE_SUBNET} echo "ovn_v6_masquerade_subnet: ${ovn_v6_masquerade_subnet}" -ovn_v4_transit_switch_subnet=${OVN_V4_TRANSIT_SWITCH_SUBNET} -echo "ovn_v4_transit_switch_subnet: ${ovn_v4_transit_switch_subnet}" -ovn_v6_transit_switch_subnet=${OVN_V6_TRANSIT_SWITCH_SUBNET} -echo "ovn_v6_transit_switch_subnet: ${ovn_v6_transit_switch_subnet}" +ovn_v4_transit_subnet=${OVN_V4_TRANSIT_SUBNET} +echo "ovn_v4_transit_subnet: ${ovn_v4_transit_subnet}" +ovn_v6_transit_subnet=${OVN_V6_TRANSIT_SUBNET} +echo "ovn_v6_transit_subnet: ${ovn_v6_transit_subnet}" ovn_netflow_targets=${OVN_NETFLOW_TARGETS} echo "ovn_netflow_targets: ${ovn_netflow_targets}" ovn_sflow_targets=${OVN_SFLOW_TARGETS} @@ -842,8 +842,8 @@ ovn_image=${ovnkube_image} \ ovn_enable_multi_external_gateway=${ovn_enable_multi_external_gateway} \ ovn_enable_ovnkube_identity=${ovn_enable_ovnkube_identity} \ ovn_network_qos_enable=${ovn_network_qos_enable} \ - ovn_v4_transit_switch_subnet=${ovn_v4_transit_switch_subnet} \ - ovn_v6_transit_switch_subnet=${ovn_v6_transit_switch_subnet} \ + ovn_v4_transit_subnet=${ovn_v4_transit_subnet} \ + ovn_v6_transit_subnet=${ovn_v6_transit_subnet} \ ovn_enable_persistent_ips=${ovn_enable_persistent_ips} \ ovn_enable_dnsnameresolver=${ovn_enable_dnsnameresolver} \ ovn_observ_enable=${ovn_observ_enable} \ diff --git a/dist/images/ovnkube.sh b/dist/images/ovnkube.sh index 1e0661f501..be4dedfc97 100755 --- a/dist/images/ovnkube.sh +++ b/dist/images/ovnkube.sh @@ -238,10 +238,10 @@ ovn_v6_join_subnet=${OVN_V6_JOIN_SUBNET:-} ovn_v4_masquerade_subnet=${OVN_V4_MASQUERADE_SUBNET:-} # OVN_V6_MASQUERADE_SUBNET - v6 masquerade subnet ovn_v6_masquerade_subnet=${OVN_V6_MASQUERADE_SUBNET:-} -# OVN_V4_TRANSIT_SWITCH_SUBNET - v4 Transit switch subnet -ovn_v4_transit_switch_subnet=${OVN_V4_TRANSIT_SWITCH_SUBNET:-} -# OVN_V6_TRANSIT_SWITCH_SUBNET - v6 Transit switch subnet -ovn_v6_transit_switch_subnet=${OVN_V6_TRANSIT_SWITCH_SUBNET:-} +# OVN_V4_TRANSIT_SUBNET - v4 Transit subnet +ovn_v4_transit_subnet=${OVN_V4_TRANSIT_SUBNET:-} +# OVN_V6_TRANSIT_SUBNET - v6 Transit subnet +ovn_v6_transit_subnet=${OVN_V6_TRANSIT_SUBNET:-} #OVN_REMOTE_PROBE_INTERVAL - ovn remote probe interval in ms (default 100000) ovn_remote_probe_interval=${OVN_REMOTE_PROBE_INTERVAL:-100000} #OVN_MONITOR_ALL - ovn-controller monitor all data in SB DB @@ -2356,17 +2356,17 @@ ovn-cluster-manager() { fi echo "ovn_v6_masquerade_subnet_opt=${ovn_v6_masquerade_subnet_opt}" - ovn_v4_transit_switch_subnet_opt= - if [[ -n ${ovn_v4_transit_switch_subnet} ]]; then - ovn_v4_transit_switch_subnet_opt="--cluster-manager-v4-transit-switch-subnet=${ovn_v4_transit_switch_subnet}" + ovn_v4_transit_subnet_opt= + if [[ -n ${ovn_v4_transit_subnet} ]]; then + ovn_v4_transit_subnet_opt="--cluster-manager-v4-transit-subnet=${ovn_v4_transit_subnet}" fi - echo "ovn_v4_transit_switch_subnet_opt=${ovn_v4_transit_switch_subnet}" + echo "ovn_v4_transit_subnet_opt=${ovn_v4_transit_subnet}" - ovn_v6_transit_switch_subnet_opt= - if [[ -n ${ovn_v6_transit_switch_subnet} ]]; then - ovn_v6_transit_switch_subnet_opt="--cluster-manager-v6-transit-switch-subnet=${ovn_v6_transit_switch_subnet}" + ovn_v6_transit_subnet_opt= + if 
[[ -n ${ovn_v6_transit_subnet} ]]; then
+    ovn_v6_transit_subnet_opt="--cluster-manager-v6-transit-subnet=${ovn_v6_transit_subnet}"
   fi
-  echo "ovn_v6_transit_switch_subnet_opt=${ovn_v6_transit_switch_subnet}"
+  echo "ovn_v6_transit_subnet_opt=${ovn_v6_transit_subnet}"
 
   multicast_enabled_flag=
   if [[ ${ovn_multicast_enable} == "true" ]]; then
@@ -2476,8 +2476,8 @@ ovn-cluster-manager() {
     ${ovn_v4_masquerade_subnet_opt} \
     ${ovn_v6_join_subnet_opt} \
     ${ovn_v6_masquerade_subnet_opt} \
-    ${ovn_v4_transit_switch_subnet_opt} \
-    ${ovn_v6_transit_switch_subnet_opt} \
+    ${ovn_v4_transit_subnet_opt} \
+    ${ovn_v6_transit_subnet_opt} \
     ${network_qos_enabled_flag} \
     ${ovn_enable_dnsnameresolver_flag} \
     --gateway-mode=${ovn_gateway_mode} \
diff --git a/dist/templates/ovnkube-control-plane.yaml.j2 b/dist/templates/ovnkube-control-plane.yaml.j2
index 0aaa47b262..3ef92bc461 100644
--- a/dist/templates/ovnkube-control-plane.yaml.j2
+++ b/dist/templates/ovnkube-control-plane.yaml.j2
@@ -172,10 +172,10 @@ spec:
             value: "{{ ovn_enable_interconnect }}"
           - name: OVN_ENABLE_MULTI_EXTERNAL_GATEWAY
             value: "{{ ovn_enable_multi_external_gateway }}"
-          - name: OVN_V4_TRANSIT_SWITCH_SUBNET
-            value: "{{ ovn_v4_transit_switch_subnet }}"
-          - name: OVN_V6_TRANSIT_SWITCH_SUBNET
-            value: "{{ ovn_v6_transit_switch_subnet }}"
+          - name: OVN_V4_TRANSIT_SUBNET
+            value: "{{ ovn_v4_transit_subnet }}"
+          - name: OVN_V6_TRANSIT_SUBNET
+            value: "{{ ovn_v6_transit_subnet }}"
           - name: OVN_ENABLE_PERSISTENT_IPS
             value: "{{ ovn_enable_persistent_ips }}"
           - name: OVN_NETWORK_QOS_ENABLE
diff --git a/docs/installation/ovnkube.1 b/docs/installation/ovnkube.1
index c393e928e0..fc11a1f3cd 100644
--- a/docs/installation/ovnkube.1
+++ b/docs/installation/ovnkube.1
@@ -154,10 +154,10 @@ Show help.
 \fB\--version\fR, \fB\-v\fR
 Print the version.
 .TP
-\fB\--cluster-manager-v4-transit-switch-subnet\fR string
+\fB\--cluster-manager-v4-transit-subnet\fR string
 The v4 transit switch subnet to use for assigning transit switch IPv4 addresses\fR.
 .TP
-\fB\--cluster-manager-v6-transit-switch-subnet\fR string
+\fB\--cluster-manager-v6-transit-subnet\fR string
 The v6 transit switch subnet to use for assigning transit switch IPv6 addresses\fR.
 .SH "SEE ALSO"
diff --git a/docs/okeps/okep-5494-ovn-kubernetes-mcp-server.md b/docs/okeps/okep-5494-ovn-kubernetes-mcp-server.md
new file mode 100644
index 0000000000..cffe1970d5
--- /dev/null
+++ b/docs/okeps/okep-5494-ovn-kubernetes-mcp-server.md
@@ -0,0 +1,798 @@
+# OKEP-5494: Model Context Protocol for Troubleshooting OVN-Kubernetes
+
+# Problem Statement
+
+Diagnosing an issue in the [OVN-Kubernetes](https://ovn-kubernetes.io/) network
+plugin is complex because the stack has many layers (Kubernetes,
+OVN-Kubernetes, OVN, OpenvSwitch, kernel - especially the Netfilter elements).
+Usually the person troubleshooting an issue has to approach it layer by layer
+and has to be fully aware of all the debugging tools each layer has to offer
+to be able to pinpoint where the problem is. That is time consuming and
+requires years of expertise at every depth of the stack and across all
+features. Sometimes it also involves working with the layered community
+project teams, such as OVN and OVS, since they hold more knowledge about their
+domain than engineers working on the OVN-Kubernetes plugin. Each
+troubleshooting session spent working out where a packet is getting
+blackholed or dropped takes a long time (unless the issue is trivial).
+
+# Goals
+
+The goal of this enhancement is to improve "troubleshooting time" and
+"tool usage/typing time", and to erase the need to know each of those tools
+down to parameter-level detail, by exposing these tools using the
+[Model Context Protocol](https://modelcontextprotocol.io/docs/learn/architecture#overview) (MCP)
+and leveraging the backend model's (Claude Sonnet4, Gemini2.5Pro, GPT-5, etc.)
+knowledge and context to troubleshoot issues on live clusters, or locally on
+containers with end users' databases loaded. This can go a long
+way in speeding up bug triage.
+
+* Phase1 targets only ovn-kubernetes community members as the audience
+* Build an OVN-Kubernetes MCP Server(s) that exposes all the tools required
+  to troubleshoot the OVN-Kubernetes network plugin
+  * This MCP Server(s) must also be aware of where to run these tools to
+    troubleshoot the issue in question on a cluster (i.e. which node? which
+    pod? which container?)
+* Add support to load an MCP Server against not just a real-time cluster
+  but also against a simulated environment constructed from gathered
+  information bundles or other debugging information extracted from a live
+  cluster. Examples are [must-gather](https://github.com/openshift/must-gather) and
+  [sos-reports](https://github.com/sosreport/sos) and how tools like
+  [omc](https://github.com/gmeghnag/omc) or [xsos](https://github.com/ryran/xsos)
+  can be leveraged
+* Ensure that the tools we expose have read-only permissions. Whether this
+  means we restrict the tools themselves to read rights only OR expose only
+  read actions via the tools is discussed in the proposal section.
+* Support troubleshooting all features of OVN-Kubernetes
+  * We must inject extensive failure-scenario and chaos tests to see
+    how well the LLM is able to troubleshoot
+  * We must form a list of all common ovn-kubernetes bugs we have hit across
+    features (for example, all possible ways of running into stale SNATs)
+  * We must add benchmarking tests for evaluating the quality of
+    troubleshooting specific to CNI networking.
+
+# Future Goals
+
+* Phase2 targets end-users and others using OVN-Kubernetes to troubleshoot
+  issues
+* Expand the MCP server to provide not just tools but also relevant context
+  around the OVN-Kubernetes implementation
+* Create a RAG system with all internal docs and integrate it with
+  the prompt client for better results (this would require improving our docs
+  first). This is especially important for the LLM to know how OVN-Kubernetes
+  implements each traffic flow and feature and what constructs are created
+  underneath in the layers. This might not be needed for older features
+  like EgressIPs that have plenty of online resources, but for newer features
+  like RouteAdvertisements the LLM may not know much without that
+  extra context.
+* Investigate the potential for converting it also into a remediation
+  system (this would need write access)
+* Work with the layered project community teams (OVN, OpenvSwitch, Kernel)
+  for each of those layers to also own an MCP server, since they know their
+  stack better. So instead of one OVN-Kubernetes MCP Server it would be a
+  set of servers, each owned by a specific upstream community project, for
+  better maintenance and ownership
+* The same MCP Server could also potentially be used as an
+  "OVN-Kubernetes network plugin - know it better" chatbot, but that is not
+  the initial goal here.
+
+# Future-Stretch-Goals
+
+* Phase3 includes this getting productized and shipped to end-users of
+  OVN-Kubernetes, in some far-fetched future, to run in production for
+  troubleshooting. But this would require:
+  * Having a better overall architecture for the complete agentic AI
+    troubleshooting solution on a cluster
+  * Solving the compute problem of how and where to run the model
+  * Having airtight security vetting.
+
+  Running this stack in production is out-of-scope for this OKEP; it is a
+  future-stretch goal contingent on security and testing milestones.
+
+# Non-Goals
+
+* We will not be solving the problem of which LLM should be used. Most of
+  the good ones (based on community member experience) were proprietary
+  (Claude Sonnet4 and Gemini2.5Pro), but testing all LLMs and determining
+  which works best is not in scope
+  * By not solving this problem as part of this enhancement, we risk having
+    to deal with "long context windows causing hallucinations", but that's
+    where the RAG system mentioned in the future goals could help.
+* We will also not be developing our own model to teach and maintain it
+  * By not solving this problem, we also risk relying on proprietary models
+    knowing and learning what we want them to learn while having no control
+    over how fast they learn it. Again, RAG could help here.
+  * So the quality here will heavily depend on how good the backing LLM is,
+    which is largely outside our control.
+
+# Introduction
+
+An engineer troubleshooting OVN-Kubernetes usually uses the following set of
+CLI tools in a layered fashion:
+
+* Kubernetes and OVN-Kubernetes layer - this cluster state information is
+  gathered as part of must-gather for offline troubleshooting
+  * **kubectl** commands like list, get, describe, logs, exec, events to know
+    everything about the Kubernetes API state of the feature and to know what
+    the ovnkube pods were doing through the logs they generate during that
+    window
+  * **ovnkube-trace**, which executes ovn-trace, ovs ofproto/trace and
+    detrace commands (this tool doesn't support all scenarios - it's not
+    maintained well)
+  * Future tool - ovnkube CLI (need to ask NVIDIA what's the status here)
+  * A K8s state-to-database syncer should/could potentially also live here
+* OVN Layer - OVN databases are gathered as part of must-gather for
+  offline troubleshooting
+  * **ovn-nbctl** commands that are executed on the ovnkube node pods to
+    understand what OVN-Kubernetes created in the northbound database via
+    libovsdbclient transactions
+  * **ovn-sbctl** commands that are executed on the ovnkube node pods to
+    understand what northd created in the southbound database
+  * **ovn-trace and detrace** commands that are executed on the ovnkube node
+    pods for simulated packet-flow tracing based on the flows in the
+    southbound database
+  * **ovn-appctl -t ovn-controller ct-zone-list** to list all the conntrack
+    zone to OVN construct mappings, for a better understanding of how the
+    conntrack commit of the packets happened
+* OpenvSwitch Layer - the openvswitch database is usually gathered as part
+  of sos-report for offline troubleshooting
+  * **ovs-ofctl dump-flows** to debug especially the breth0 openflows
+    that OVN-Kubernetes creates on the gateway of each node
+  * **ovs-appctl dpctl/dump-flows** to trace live packets, run on a specific
+    node's ovs container (KIND) or on the node (OpenShift)
+  * **ovs-appctl ofproto/trace** and detrace to run an ovs trace of the
+    packet based on the openflows
+  * **ovs-appctl dpctl/dump-conntrack** to
know all the conntrack zones used
+    for a specific connection
+  * **ovs-vsctl** commands to list interfaces and bridges
+  * **retis** to see packet drops in ovs
+* Netfilter/Kernel Layer - this information is usually gathered as part
+  of sos-report for offline troubleshooting
+  * **ip util commands** like **ip r** or **ip a** or **ip rule list** for
+    debugging VRFs, BGP-learnt routes, the routes OVN-Kubernetes creates
+    on the node, or custom routes that end users add
+  * **nft list ruleset** to understand what rules were created by
+    OVN-Kubernetes, especially in routingViaHost=true gateway mode
+  * **iptables-save** to list all iptables rules (given iptables is
+    deprecated we could skip this tool, though for now roughly 50% of
+    ovn-kubernetes is still on iptables)
+  * **conntrack** -L or -E on the host itself
+  * **ip xfrm policy** and **ip xfrm state** when using IPSEC
+* TCPDUMP - external open source tools, can't be used for offline
+  troubleshooting
+  * **tcpdump** is used for packet capture and analysis
+  * [**pwru**](https://github.com/cilium/pwru) is used to know the kernel drop reason
+  * **libreswan** `ipsec status` and `ipsec trafficstatus` commands
+  * **frr** router config and routes learnt by BGP
+
+There are also metrics and events that, via alerts, reach the dashboard,
+which is probably what most end-users use to troubleshoot. So when we get to
+phase3 we would need to reconsider this troubleshooting stack entirely to
+include other aspects, like the OVN-Kubernetes troubleshooting dashboard that
+the observability team created or the various packet-drop tools the
+observability team already exposes. But for the scope of this enhancement,
+for now, we will consider the above set of tools as the MVP.
+
+As we can see, that's a lot of tools! Remembering the syntax for each of
+them, executing them one by one, gathering the information at each layer,
+analysing it, and then moving to the next layer takes time for a human.
+During remote analysis of a bug report, the part that takes the longest is
+the RCA - combing through all the data; the same goes for troubleshooting a
+cluster (which is slightly easier when we have access to the cluster than
+analysing offline data). The fix is usually the easiest part (there are
+exceptions).
+ +``` + OVN-Kubernetes Architecture & Troubleshooting Tools + (Per Node Components) + +┌────────────────────────────────────────────────────────────────────────────┐ +│ ovnkube-node pod │ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ kubectl exec/logs │ +│ │ ovnkube │◄──►│ NBDB │◄─────── ovn-nbctl show/list │ +│ │ controller │ │ (northbound) │ │ +│ └─────────────────┘ └─────────────────┘ │ +│ │ │ │ +│ │ ┌─────────────────┐ │ +│ │ │ northd │ │ +│ │ └─────────────────┘ │ +│ │ │ │ +│ │ ┌─────────────────┐ │ +│ └─────────────►│ SBDB │◄─────── ovn-sbctl show/list │ +│ │ (southbound) │ ovn-trace/detrace │ +│ └─────────────────┘ │ +│ │ │ +│ ┌─────────────────┐ │ +│ │ ovn-controller │◄─────── ovn-appctl ct-zone │ +│ └─────────────────┘ │ +│ │ │ +└───────────────────────────────────┼────────────────────────────────────────┘ + │ + ┌─────────────────┐ + │ OVS │◄─────── ovs-vsctl list + │ (database) │ ovs-appctl dpctl/ + └─────────────────┘ ovs-ofctl dump-flows + │ retis (packet drops) + ┌─────────────────┐ + │ OVS bridge │◄─────── ovs-appctl ofproto/trace + │ br-int/breth0 │ + └─────────────────┘ + │ + ┌─────────────────┐ + │ NIC │ + │ (physical) │ + └─────────────────┘ + │ + ┌──────────────┴──────────────┐ + │ Host Network │ + │ │ + │ ip route/addr/rule ◄───────┼─────── ip commands + │ nft ruleset ◄───────┼─────── nft list ruleset + │ iptables rules ◄───────┼─────── iptables-save + │ conntrack zones ◄───────┼─────── conntrack -L/-E + │ │ + │ Network interfaces ◄───────┼─────── tcpdump/pwru + └─────────────────────────────┘ + + Problem: Engineers must know WHERE to run WHICH tool on WHICH component + to troubleshoot issues across this distributed architecture +``` + +This enhancement aims to solve this pain point of reducing the time taken to +execute these tools and analyse these results using MCP Servers and LLMs. + +# User Stories + +**As an OVN-Kubernetes developer**, **I want to** troubleshoot my stack +without needing to know every tool's parameter fields by-heart or by spending +time looking it up each time I need to troubleshoot a feature **so that** I +can spend my time efficiently. I just want to tell in plain english what I +want and for the MCP server to execute those specific commands. + +**As an OVN-Kubernetes engineer**, **I want to** troubleshoot my stack +without needing to analyse each flow output of these tools when I need to +troubleshoot a feature **so that** I can spend my time efficiently. I just +want to tell in plain english what I want and for the LLM to help me analyze +the output of the commands executed by the MCP Server. I understand that I +will need to verify the reasoning thoroughly before accepting the RCA from AI. + +**As a new engineer joining the OVN-Kubernetes team**, **I want to** retrieve +specific information from different parts of the stack without having +knowledge of the topology or tooling of the stack. + +# Proposed Solution + +We build a Golang MCP Server (could be split into a set of MCP Servers in the +future) that exposes these tools in read only fashion and the LLM backend +that has the required context will analyse the results of the execution and +provide a response back to the prompter who has to verify it thoroughly. This +MCP Server code will be in a new repo in +[ovn-kubernetes org](https://github.com/ovn-kubernetes) called +**ovn-kubernetes-mcp**. + +## Example Workflow for an end-user + +1. An end-user can use any MCP Client to start a troubleshooting session via + prompting. 
The client connects with all the available servers (in our case + the OVN-Kubernetes MCP Server and maybe in the future all layered community + MCP Severs) and gathers their available tools and presents this information + to the LLM along with their schemas. Example: Using Cursor AI as your MCP + Client +2. LLM will be able use its intelligence and analyze the end-user query and + choose the appropriate tools. Example: Using Claude Sonnet4 as your LLM + model +3. MCP Client then receives the LLM's tool call and routes it to the + corresponding MCP Server which executes the tool and client then relays + the response back to the LLM +4. LLM again uses its intelligence to analyze the responses +5. LLM provides a RCA back to the end user + +The steps 2, 3 and 4 is repeated by the LLM and it intelligently does a step-by-step +layered troubleshooting exercise to find the root cause. + +We may also include predefined and tested prompt steps in the documentation +of our repo for helping end-users to give the LLM good context around +OVN-Kubernetes and OVN and OVS. For example, provide the latest OVN man +pages so that it has the current knowledge of OVN DB schemas and tools usage. +Some standard preparation prompt can be maintained in the mcp-server repo +as reference. + +``` +OVN-Kubernetes Troubleshooting MCP Architecture +=============================================== + +Engineer Query: "Pod A can't reach Pod B on different nodes, consistent connection drops" + +┌──────────────────────────────────────────────────────────────────────────────┐ +│ MCP SERVERS (Layer-Specific Tools) │ +└──────────────────────────────────────────────────────────────────────────────┘ + +┌──────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ +│ Kubernetes/K8s │ │ OVN Layer │ │ OpenvSwitch │ │ Netfilter/ │ │ TCPDUMP/Debug │ +│ MCP Server │ │ MCP Server │ │ Layer MCP │ │ Kernel MCP │ │ MCP Server │ +│ │ │ │ │ Server │ │ Server │ │ │ +│ Tools: │ │ Tools: │ │ │ │ │ │ Tools: │ +│ • kubectl_get │ │ • ovn_nbctl_show │ │ Tools: │ │ Tools: │ │ • tcpdump_capture │ +│ • kubectl_describe │ │ • ovn_nbctl_list │ │ • ovs_ofctl_flows │ │ • ip_route_show │ │ • tcpdump_analyze │ +│ • kubectl_logs │ │ • ovn_sbctl_show │ │ • ovs_appctl_dpctl │ │ • ip_addr_show │ │ • pwru_trace │ +│ • kubectl_exec │ │ • ovn_sbctl_list │ │ • ovs_ofproto_trace │ │ • ip_rule_show │ │ │ +│ • kubectl_events │ │ • ovn_trace │ │ • ovs_appctl_conntr │ │ • nft_list_ruleset │ │ │ +│ • ovnkube_trace │ │ • ovn_detrace │ │ • ovs_vsctl_show │ │ • iptables_save │ │ │ +│ │ │ • ovn_controller_ct │ │ • retis_capture │ │ • conntrack_list │ │ │ +│ │ │ │ │ │ │ • conntrack_events │ │ │ +└──────────────────────┘ └──────────────────────┘ └──────────────────────┘ └──────────────────────┘ └──────────────────────┘ + + │ All tools aggregated + ▼ + +┌──────────────────────────────────────────────────────────────────────────────┐ +│ MCP CLIENT │ +│ (OVN-K8s Troubleshoot AI) │ +│ │ +│ Unified Tool Interface: │ +│ ┌──────────────────────────────────────────────────────────────────────────┐ │ +│ │ Layer 1 (K8s): kubectl_*, ovnkube_trace │ │ +│ │ Layer 2 (OVN): ovn_nbctl_*, ovn_sbctl_*, ovn_trace_* │ │ +│ │ Layer 3 (OVS): ovs_ofctl_*, ovs_appctl_*, ovs_vsctl_*, retis_* │ │ +│ │ Layer 4 (Kernel): ip_*, nft_*, iptables_*, conntrack_* │ │ +│ │ Layer 5 (Debug): tcpdump_*, pwru_* │ │ +│ └──────────────────────────────────────────────────────────────────────────┘ │ 
+└──────────────────────────────────────────────────────────────────────────────┘ + + │ Present unified interface + ▼ + +┌──────────────────────────────────────────────────────────────────────────────┐ +│ LLM (OVN-K8s Expert) │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` + +Doing multiple MCP Servers has clear advantages like each layer owning their +toolset, independent development and maintenance, granular RBAC, reusability, +one server can't affect the other. Given we would need to align with different +layered communities later on for a fully supported set of servers from their +side which might take several releases, for the phase1 here, we plan to build +our own monolithic server that could then be split into multiple servers later +on. A single unified server is simpler, faster to iterate on. + +So our current implementation design looks like this: + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ MCP Client │ +│ "Debug pod connectivity issues" │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + MCP Protocol + │ + ▼ +┌──────────────────────────────────────────────────────────────────────────────┐ +│ OVN-Kubernetes MCP Server │ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌──────────────────────────────┐ │ +│ │ Kubernetes │ │ Live Cluster │ │ Offline Bundle │ │ +│ │ Layer │ │ Execution │ │ Execution │ │ +│ │ │ │ │ │ │ │ +│ │ • kubectl get │ │ kubectl exec │ │ Offline artifacts parser │ │ +│ │ • kubectl desc │ │ │ │ tools. │ │ +│ │ • kubectl logs │ │ Direct API │ │ Example: xsos and omc │ │ +│ └─────────────────┘ └─────────────────┘ └──────────────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ Tool Categories │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌──────────────┐ ┌─────────────┐ │ │ +│ │ │ OVN Layer │ │ OVS Layer │ │Kernel Layer │ │External │ │ │ +│ │ │ │ │ │ │ │ │Tools │ │ │ +│ │ │ ovn-nbctl │ │ ovs-ofctl │ │ ip route │ │ tcpdump │ │ │ +│ │ │ ovn-sbctl │ │ ovs-appctl │ │ nft list │ │ pwru │ │ │ +│ │ │ ovn-trace │ │ ovs-vsctl │ │ conntrack │ │ retis │ │ │ +│ │ │ ovn-detrace │ │ ovs-dpctl │ │ iptables-save│ │ │ │ │ +│ │ │ ovn-appctl │ │ ovs-ofproto │ │ │ │ │ │ │ +│ │ └─────────────┘ └─────────────┘ └──────────────┘ └─────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────────────────────────────┐ │ +│ │ Security & RBAC │ │ +│ │ │ │ +│ │ • ovn-troubleshooter ClusterRole (read-only) │ │ +│ │ • Command parameter validation │ │ +│ │ • Node-specific targeting required │ │ +│ │ • Write operations blocked │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────────────────────┘ + │ + ┌───────────┼───────────┐ + │ │ │ + ▼ ▼ ▼ + ┌────────┐ ┌────────┐ ┌────────┐ + │ Node1 │ │ Node2 │ │ NodeN │ + │ │ │ │ │ │ + │ovnkube-│ │ovnkube-│ │ovnkube-│ + │node pod│ │node pod│ │node pod│ + │ │ │ │ │ │ + │ ovn-nb │ │ ovn-nb │ │ ovn-nb │ + │ ovn-sb │ │ ovn-sb │ │ ovn-sb │ + │ ovs │ │ ovs │ │ ovs │ + └────────┘ └────────┘ └────────┘ + +Data Flow Examples: +├─ Live: kubectl exec ovnkube-node-xyz -c nb-ovsdb -- ovn-nbctl show +├─ Live: kubectl debug node/worker-1 -- ovs-ofctl dump-flows br-int +├─ Live: kubectl debug node/worker-1 -- ip route show table all +├─ Offline: Parse must-gather/ovn-kubernetes/ovn-northbound.db +├─ Offline: 
Parse sos-report/node-1/openvswitch/ovs-ofctl_dump-flows
+└─ Offline: Parse sos-report/node-1/networking/ip_route
+```
+
+Note: Some of the layers, like Kubernetes and offline debugging, have
+existing servers such as [kubernetes-mcp-server](https://github.com/containers/kubernetes-mcp-server)
+and [mustgather-mcp-server](https://github.com/shivprakashmuley/mustgather-mcp-server)
+that can be re-used together with the ovn-kubernetes-mcp server for a
+holistic end-user experience. However, kubernetes-mcp-server exposes
+`kubectl-exec`, which has security implications (although it also has a
+read-only mode where only read commands are exposed).
+
+Note 2: Feeding container logs to the LLM will fill the context window
+pretty fast. We need to investigate a method to ensure we are
+feeding only the relevant, filtered logs.
+
+## Implementation Details of the OVN-Kubernetes MCP Server
+
+See the Alternatives section for other ideas that were discarded.
+
+### Chosen Approach: Direct CLI Tool Exposure (Idea1)
+
+The initial implementation takes a pragmatic approach by directly exposing the
+existing CLI tools (`ovn-nbctl`, `ovn-sbctl`, `ovs-vsctl`, etc.) as MCP tools.
+While this approach may seem less elegant than creating higher-level wrapper
+abstractions to connect to the database (see the discarded alternatives), it
+offers the fastest path to value for OVN-Kubernetes engineers who are already
+familiar with these tools but want to leverage LLM assistance for command
+construction and output analysis. The CLI-based approach is also optimal for
+exposing the complete troubleshooting toolkit across all layers of the stack.
+Most other discarded ideas only addressed OVSDB access and reusability
+concerns while failing to provide the holistic set of tools needed for
+comprehensive root cause analysis, such as running packet traces.
+
+The MCP server acts as a secure execution bridge, translating natural language
+troubleshooting requests into appropriate CLI commands.
+
+**Advantages**:
+
+* **Fastest Time to Value**: Leverages existing tools that engineers already
+  know
+* **Zero Deployment Overhead**: All required CLI binaries are already present
+  in the pods and nodes within the cluster
+* **Comprehensive Coverage**: The only approach that provides access to the
+  complete troubleshooting toolkit across all stack layers
+* **Security Controls**: Enables read-only access enforcement by exposing
+  only the tools with read-only access (get/list)
+* **No version compatibility issues** between the MCP server tools and the
+  cluster's installed versions when running on live cluster environments
+
+**Trade-offs**:
+
+* **Limited Reusability**: Somewhat specific to OVN-Kubernetes deployment
+  patterns and can't be reused in other layers like OVS
+
+This approach was selected as the optimal balance between security,
+functionality, and development effort.
+
+## Security Model and RBAC Constraints
+
+No matter how we approach this implementation, it is impossible to totally
+secure the execution of networking troubleshooting tools at this stage. Most
+networking tools require privileged access to function properly, creating
+inherent security trade-offs. The goal is therefore to minimize the blast
+radius through layered controls rather than achieve perfect isolation.
+
+**Kubernetes Layer Security:**
+
+* `kubectl exec` and `kubectl debug node` operations require cluster-admin level
+  privileges by design. Avoid exposing generic exec in the K8s layer.
+  Prefer direct Kubernetes API reads (`get`, `list`, `logs`, `events`).
+* Alternative approach using custom debug containers
+  (`kubectl debug node --image=`) with volume mounts to database
+  files reduces some attack surface but remains intrusive
+* **Mitigation**: Expose only Kubernetes API read operations (`get`, `list`,
+  `logs`, `events`); remove any generic exec tool in this layer.
+* **Constraint**: The MCP Server service account requires elevated privileges
+  (including `pods/exec`) despite being conceptually a "troubleshooter" role
+
+At first, for the Kubernetes layer, we thought of leveraging the open source
+[kubernetes-mcp-server](https://github.com/containers/kubernetes-mcp-server),
+so whatever security posture it uses could be adopted. It has a
+`read-only` mode and a mode where writes can be done via kubectl exec.
+Later, after reviews, this enhancement changed approach and pivoted
+towards the more secure option of adding a tool that is a wrapper on top of
+`kubectl-exec` without exposing kubectl-exec directly. So instead of relying
+on `kubernetes-mcp-server`, our `ovn-kubernetes-mcp` will take the more
+secure approach of using `kubectl_exec` only as an implementation detail and
+not directly exposing it. The downside is that get/list resource commands
+get duplicated into `ovn-kubernetes-mcp`, so we would need to implement the
+Kubernetes-layer tools we need ourselves as well.
+
+**OVN/OVS Database Layer Security:**
+
+* Unix socket-based database access prevents using SSL certificate-based
+  authentication and authorization
+* Database connections inherit the security context of the container executing
+  the commands
+* **Mitigation**: Command parameter validation to allow only read-only
+  database operations (`show`, `list`, `dump`) while blocking modification
+  commands (`set`, `add`, `remove`)
+* **Long-term Path**: Requires RBAC-enabled CLI execution even against the
+  local unix socket.
+
+**Host/Kernel Layer Security:**
+
+* Kernel-level networking tools (`ip`, `nft`, `conntrack`) inherently require
+  root system access
+* Current tooling lacks granular RBAC capabilities - tools are typically
+  all-or-nothing from a privilege perspective
+* **External Tools Note**: Tools like `tcpdump -i any` can be highly intrusive
+  as they capture all network traffic on the host, requiring careful
+  consideration of the privacy and performance impact when they are chosen
+  for execution.
+* **Short-term Mitigation**: Strict command allowlisting exposing only read
+  operations (`ip route show`, `nft list ruleset`) while blocking modification
+  commands (`ip route add`, `nft add rule`)
+* **Long-term Path**: Requires RBAC-enabled wrapper tools from the upstream
+  layered community teams (e.g. netfilter, kernel networking)
+
+## Distributed Execution Context
+
+The MCP Server will only be supported in interconnect mode.
+
+### **OVN-Kubernetes Architecture Challenge**
+
+In the OVN-Kubernetes interconnect architecture, each node maintains its own
+local instances of critical databases and services:
+
+* **Northbound Database**: Local OVN northbound database per node
+* **Southbound Database**: Local OVN southbound database per node
+* **OpenVSwitch Database**: Node-specific OVS database and flow tables
+* **Host Networking**: Node-specific routing tables, conntrack zones, and
+  kernel state
+
+This distributed architecture means that troubleshooting commands must be
+executed on the specific node where the relevant data resides.
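+
+To make this concrete before detailing the strategy in the next section, the
+following is a minimal client-go sketch (not a committed API - the namespace
+and pod label are assumptions for illustration) of how a tool implementation
+might resolve a `node_name` argument to the `ovnkube-node` pod whose
+containers must run the command:
+
+```go
+package mcptools
+
+import (
+	"context"
+	"fmt"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+)
+
+// ovnkubeNodePodFor returns the name of the ovnkube-node pod running on
+// nodeName; node-local commands (ovn-nbctl, ovs-ofctl, ...) must be routed
+// into containers of this pod.
+func ovnkubeNodePodFor(ctx context.Context, c kubernetes.Interface, nodeName string) (string, error) {
+	pods, err := c.CoreV1().Pods("ovn-kubernetes").List(ctx, metav1.ListOptions{
+		LabelSelector: "app=ovnkube-node",          // assumed pod label
+		FieldSelector: "spec.nodeName=" + nodeName, // pod.spec.nodeName matching
+	})
+	if err != nil {
+		return "", err
+	}
+	if len(pods.Items) != 1 {
+		return "", fmt.Errorf("expected one ovnkube-node pod on node %q, found %d", nodeName, len(pods.Items))
+	}
+	return pods.Items[0].Name, nil
+}
+```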
+
+### **Node-Targeted Command Execution**
+
+**Node Selection Strategy**: All tools requiring node-specific data accept a
+`node_name` parameter as a required argument. The MCP server uses this
+parameter to:
+
+1. **Pod Selection**: Locate the appropriate `ovnkube-node` pod running on the
+   specified node using `pod.spec.nodeName` matching
+2. **Container Targeting**: Route OVN database commands to the correct
+   container within the ovnkube-node pod (e.g., the `nb-ovsdb` and `sb-ovsdb`
+   containers)
+3. **Execution Context**: Execute host-level commands via
+   `kubectl debug node/ --image <>` for direct host access
+
+**LLM Responsibility**: The LLM must determine the appropriate target node(s)
+based on the troubleshooting context:
+
+* **Pod-specific issues**: Use the node where the problematic pod is scheduled
+  (`kubectl get pod -o wide`)
+* **Network flow analysis**: Target nodes along the packet path (source node,
+  destination node, gateway nodes)
+* **Cluster-wide analysis**: Potentially execute commands across multiple
+  nodes for correlation
+
+We need to account for testing this LLM responsibility: it is not something
+we can guarantee, but something we offload to the LLM rather than solve
+ourselves.
+
+## Deployment Strategy
+
+### **Flexible Deployment Modes**
+
+The MCP server is designed for flexible deployment without requiring elaborate
+cluster infrastructure. Multiple deployment modes support different use cases
+and security requirements:
+
+**CLI Tool Mode (Simplest)**:
+
+* Run the MCP server binary directly on a machine with `kubectl` access
+  to the target cluster
+* The server uses existing cluster credentials and executes commands via
+  standard CLI tools
+* Suitable for live cluster troubleshooting
+* Offline-data-based troubleshooting analysis would still need the
+  corresponding parser tools to be run locally where the debug artifact files
+  are hosted
+* No cluster deployment required - operates entirely through external API
+  access
+
+**Debug Container Mode**:
+
+* Package the MCP server as a container image that the LLM can select for
+  `kubectl debug node --image=` operations
+* This custom debug image contains the MCP server binary along with all
+  necessary troubleshooting tools
+* Reduces the blast radius compared to using default debug images with full
+  host access
+* The LLM chooses this image when it needs to execute commands requiring
+  direct host access
+
+**Future Considerations**:
+
+* More elaborate deployment patterns (DaemonSet, Deployment) can be considered
+  when we think of use cases beyond ovn-kubernetes developers
+
+## Testing Strategy
+
+**Unit Testing - MCP Server Tools**:
+
+* Straightforward validation of individual tool execution and parameter
+  handling. Use [mcp-inspector](https://github.com/modelcontextprotocol/inspector).
+* Mock cluster responses to test command routing and error handling
+* Verify security controls and command allowlisting functionality
+
+**Integration Testing - The Complex Challenge**:
+
+* **Real Failure Scenario Reproduction**: Design test scenarios based on past
+  bugs and commonly occurring incidents
+* **Chaos Engineering Integration**: Implement controlled failure injection to
+  create realistic troubleshooting scenarios
+* **LLM Reasoning Validation**: The most critical and challenging aspect -
+  verifying that the LLM can produce meaningful root cause analysis from tool
+  outputs
+
+### **Scenario-Based Test Design**
+
+**Historical Incident Replay**:
+
+* Collect must-gather and sos-report bundles from past bugs
+* Use these as offline test datasets to validate LLM troubleshooting accuracy
+* Build a regression test suite ensuring consistent analysis quality over time
+
+**Synthetic Failure Scenarios**:
+
+Some examples include:
+* Network policy and EgressIP misconfigurations
+* Pod connectivity failures across nodes
+* Gateway flow issues and routing problems
+* OVN database inconsistencies, especially around EgressIPs
+
+**LLM Capability Assessment**:
+
+* Measure the accuracy of root cause identification - this depends on how
+  much OVN-Kubernetes feature-specific context the LLM has
+* Evaluate the quality of troubleshooting step recommendations
+* Test correlation of multi-layer data analysis
+* Validate handling of incomplete or missing data scenarios
+
+### **Success Metrics**
+
+* **Accuracy**: Percentage of correct root cause identifications in known
+  failure scenarios
+* **Completeness**: Coverage of troubleshooting steps recommended vs. manual
+  expert analysis
+* **Efficiency**: Time reduction compared to manual troubleshooting workflows
+* **Safety**: Verification that only read-only operations are executed as
+  intended
+
+## Documentation Details
+
+The OKEP for the MCP Server will live on
+[https://ovn-kubernetes.io/](https://ovn-kubernetes.io/). All end-user
+documentation will live in the new repo's docs folder.
+
+* **Getting Started Guide**: MCP client setup and initial configuration
+* **Troubleshooting Scenarios**: Common use cases and example natural language
+  queries
+* **Tool Reference**: Available tools and their capabilities across all stack
+  layers
+* **Security Model**: Warnings around security considerations
+* **Deployment Options**: CLI mode vs. debug container mode setup instructions
+* **Offline Analysis**: Must-gather and sos-report analysis workflows
+
+## Alternative Implementation Ideas
+
+### Idea0: Using Existing kubernetes-mcp-server with Generic Bash Execution
+
+**Approach**: Use the existing kubernetes-mcp-server's `pods_exec` tool to run
+arbitrary bash commands like `ovn-nbctl show` or `ip route` directly through
+`kubectl exec` or `kubectl debug` sessions, without building any specialized
+tooling.
+
+**Rationale**: This approach would provide immediate access to all CLI tools
+without any development effort, leveraging the LLM's knowledge of command
+syntax to construct appropriate bash commands.
+
+**Why Discarded**:
+
+* **Security Risk**: Allowing arbitrary bash command execution creates
+  significant security vulnerabilities. There's no protection against
+  destructive commands like `ovn-nbctl set` operations that could modify live
+  database state.
+* **Lack of Access Control**: No way to enforce read-only operations or
+  validate command parameters before execution.
+* **Separation of Concerns**: The LLM should focus on analysis and
+  troubleshooting logic, not on understanding the security implications of
+  direct system access.
+* **Blast Radius**: Any compromise or LLM hallucination could potentially
+  execute dangerous commands on production systems.
+
+The fundamental principle that "each layer knows best about how/what tools to
+allow with proper read access" makes a controlled wrapper approach essential,
+rather than direct bash execution.
+
+### Idea1: Chosen Approach: Direct CLI Tool Exposure
+
+See the proposed solution section.
+
+### Idea2: libovsdb-client Golang Wrapper
+
+**Approach**: Build a Golang MCP server using `NewOVSDBClient` to directly
+query OVSDB instances, with proper RBAC controls implemented through OVSDB's
+native access control mechanisms.
+
+**Advantages**:
+* **High Reusability**: Could be shared across the OVN, OVS, and
+  OVN-Kubernetes projects
+* **Native RBAC**: Leverages OVSDB's built-in role-based access controls
+* **Structured Output**: Returns structured data rather than parsed CLI text
+
+**Why Discarded**:
+* **Deployment Model**: Would require running as a DaemonSet on each node or
+  shipping binaries to the ovnkube-node pods
+* **Scope Limitation**: Only addresses database access, missing ovn and ovs
+  flow trace simulation and host networking tools
+
+### Idea3: ovsdb-client Binary Wrapper
+
+**Approach**: Create a wrapper around the existing `ovsdb-client` binary
+(owned by the OVS team) to provide structured database access.
+
+**Advantages**:
+* **High Reusability**: Could be shared across the OVN, OVS, and
+  OVN-Kubernetes projects
+* **Native RBAC**: Leverages OVSDB's built-in role-based access controls
+
+**Why Discarded**:
+* **Ownership**: We could argue this wrapper belongs in the openvswitch
+  community rather than in the OVN-Kubernetes org
+* **Scope Limitation**: Only addresses database access, missing ovn and ovs
+  flow trace simulation and host networking tools
+* Once we reach out to the OVN and OVS communities in the future to see what
+  they plan to do, we could revisit it.
+
+### Idea4: Direct Database Access Wrapper
+
+**Approach**: Build a wrapper for direct database read operations, bypassing
+CLI and client tools entirely.
+
+**Advantages**:
+* **High Reusability**: Could be shared across the OVN, OVS, and
+  OVN-Kubernetes projects
+
+**Why Discarded**: Building from scratch when there's no real need to -
+existing CLI tools already provide all the necessary functionality with
+proven reliability.
+
+# Known Risks and Limitations
+
+* AI! We can trust it only as far as we can throw it.
+  * The quality of this troubleshooter depends on the LLM's intelligence
+  * The quality of the MCP Server itself is, however, in our own hands and
+    can be enhanced based on user experience
+* Security! We know that we cannot fully eliminate the risk
+* Performance/Scalability: MCPs are a relatively new concept, so aspects like
+  how many tools we can expose per server and up to what point it scales
+  are unknowns. We will need to try and test this in our PoCs as we develop.
+  * With bulky logs and debug details we also risk running out of context
+    window. We need to ensure we filter out irrelevant logs.
+  * Token consumption and context window bloating.
+  * Other potential problems we need to rule out during testing:
+    * Poor tool selection: LLMs struggle to choose the right tool from too
+      many options
+    * Parameter hallucination: Agents invoke tools with incorrect or
+      fabricated parameters
+    * Misinterpretation: Responses from tools are more likely to be
+      misunderstood
+    * Attention spreading: The model's attention gets distributed thinly
+      across many options
diff --git a/docs/okeps/okep-5552-dynamic-udn-node-allocation.md b/docs/okeps/okep-5552-dynamic-udn-node-allocation.md
new file mode 100644
index 0000000000..1a68b9efb6
--- /dev/null
+++ b/docs/okeps/okep-5552-dynamic-udn-node-allocation.md
@@ -0,0 +1,205 @@
+# OKEP-5552: Dynamic UDN Node Allocation
+
+* Issue: [#5552](https://github.com/ovn-org/ovn-kubernetes/issues/5552)
+
+## Problem Statement
+
+When scaling UDNs, the control-plane cost of rendering a topology is high. This is the core limiting factor to
+being able to scale to 1000s of UDNs. While there are plans to also improve network controller performance with UDNs,
+there are still valuable savings to be had by not rendering UDNs on nodes where they are not needed.
+
+An example use case where this makes sense is when a Kubernetes cluster has its node resources segmented per tenant. In
+this case, it only makes sense to run the tenant network (UDN) on the nodes where a tenant is allowed to run pods. This
+allows for horizontal scaling to a much higher number of overall UDNs running in a cluster.
+
+## Goals
+
+ * To dynamically allow the network to only be rendered on specific nodes.
+ * To increase the overall scalability of the number of UDNs in a Kubernetes cluster with this solution.
+ * To increase the efficiency of ovnkube operations on nodes where a UDN exists but is not needed.
+
+## Non-Goals
+
+ * To fully solve control plane performance issues with UDNs. There will be several other fixes done to address that
+   outside of this enhancement.
+ * To provide any type of network security guarantee about exposing UDNs to a limited subset of nodes.
+
+## Future Goals
+
+ * Potentially enabling this feature on a per-UDN basis, rather than globally.
+
+## Introduction
+
+The purpose of this feature is to add a configuration knob that users can turn on which will only render UDNs on nodes
+where pods exist on that UDN. This feature will allow for higher overall UDN scale and less per-node control plane resource usage
+under conditions where clusters do not have pods on every node with connections to all UDNs. For example, with
+1000 UDNs and 500 nodes, if a particular node only has pods connected to, say, 200 of those UDNs, then that node is only
+responsible for rendering 200 UDNs instead of 1000 UDNs as it is today.
+
+This can provide significant control plane savings, but it comes at a cost. Using the previous example, if a pod is now
+launched in UDN 201, the node will have to render UDN 201 before the pod can be wired. In other words, this introduces
+a one-time larger pod latency cost for the first pod wired to the UDN. Additionally, there are more tradeoffs with other
+feature limitations outlined later in this document.
+
+## User-Stories/Use-Cases
+
+Story 1: Segment groups of nodes per tenant
+
+As a cluster admin, I plan to dedicate groups of nodes to either a single tenant or a small group of tenants. I plan
+to create a CUDN per tenant, which means my network will only really need to exist on this group of nodes. I would
+like to be able to limit this network to only be rendered on that subset of nodes.
+This way I will be able to have less resource overhead from OVN-Kubernetes on each node,
+and be able to scale to a higher number of UDNs in my cluster.
+
+## Proposed Solution
+
+The proposed solution is to add a configuration knob to OVN-Kubernetes, "--dynamic-udn-allocation", which will enable
+this feature. Once enabled, NADs derived from CUDNs and UDNs will only be rendered on nodes where there is a pod
+scheduled in that respective network. Additionally, if the node is scheduled as an Egress IP Node for a UDN, this node
+will also render the UDN.
+
+When the last pod on the network is deleted from a node, OVNK will not immediately tear down the UDN.
+Instead, OVNK will rely on a dead timer expiring to conclude that this UDN is no longer in use and
+may be removed. This timer will also be configurable in OVN-Kubernetes as "--udn-deletion-grace-period".
+
+### API Details
+
+There will be no API changes. New status conditions are introduced in a section below.
+
+### Implementation Details
+
+In OVN-Kubernetes we have three main controllers that handle rendering of networking features for UDNs. They exist as
+ - Cluster Manager - runs on the control-plane, handles cluster-wide allocation and rendering of CUDNs/UDNs
+ - Controller Manager - runs on a per-zone basis, handles configuring OVN for all networking features
+ - Node Controller Manager - runs on a per-node basis, handles configuring node-specific things like nftables, VRFs, etc.
+
+With this change, Cluster Manager will be largely untouched, while Controller Manager and Node Controller Manager will be
+modified in a few places to filter out rendering of UDNs on nodes where no pod for the UDN exists.
+
+#### Internal Controller Details
+
+In OVN-Kubernetes we have many controllers that handle features for different networks, encompassed under three
+controller manager containers. The breakdown of how these will be modified is outlined below:
+
+* Cluster Manager
+  * UDN Controller — No change
+  * Route Advertisements Controller — No change
+  * Egress Service Cluster — Doesn't support UDN
+  * Endpoint Mirror Controller — No change
+  * EgressIP Controller — No change
+  * Unidling Controller — No change
+  * DNS Resolver — No change
+  * Network Cluster Controller — Modified to report status and exclude nodes not serving the UDN
+* Controller Manager (ovnkube-controller)
+  * Default Network — No change
+  * NAD Controller — Ignore NADs for UDNs that are not active on this node (no pods for the UDN and not an EIP node)
+* Node Controller Manager
+  * Default Network — No change
+  * NAD Controller — Ignore NADs for UDNs that are not active on this node (no pods for the UDN and not an EIP node)
+
+The resulting NAD Controller change will filter out NADs that do not apply to this node, stopping NAD keys from being
+enqueued to the Controller Manager/Node Controller Manager's Network Manager. Those Controller Managers will not need
+to create or run any sub-controllers on nodes that do not have the network. To do this cleanly, NAD Controller will be
+modified to hold a filterFunc field, which the respective controller manager can set in order to filter out NADs. For
+Cluster Manager this function will not apply, but for Controller Manager and Node Controller Manager it will be a function
+that filters based on whether the UDN is serving pods on this node.
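+
+A minimal sketch of what this hook could look like (the names are
+illustrative, not the final types):
+
+```go
+// filterFunc reports whether a NAD, identified by its namespaced key, is
+// relevant to the local node and should therefore be rendered.
+type filterFunc func(nadKey string) bool
+
+type nadController struct {
+	// filter is left nil by Cluster Manager (render everything); Controller
+	// Manager and Node Controller Manager set it to a function backed by
+	// their pod and egress IP trackers.
+	filter filterFunc
+}
+
+func (c *nadController) shouldEnqueue(nadKey string) bool {
+	if c.filter == nil {
+		return true // Cluster Manager: no filtering
+	}
+	// Skip NADs for UDNs that are inactive on this node (no local pods and
+	// not an Egress IP node), so no sub-controllers are created for them.
+	return c.filter(nadKey)
+}
+```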
+
+#### New Pod/EgressIP Tracker Controller
+
+In order to know whether the Managers should filter out a UDN, a pod controller and an egress IP controller will be used
+in the Managers to track this information in memory. The pod controller will be a new level-driven controller for
+each manager. For Egress IP, another new controller will be introduced that watches EgressIPs, Namespaces, and NADs in
+order to track which NAD maps to a node serving an Egress IP.
+
+When Managers are created, they will start these Pod/EgressIP Tracker Controllers and set a filterFunc on NAD Controller.
+The filterFunc will query the aforementioned controllers to determine if the NAD being synced matches the local node. If
+not, then NADController will not create the UDN controller for that network.
+
+Additionally, the Pod/EgressIP Tracker Controllers will expose a callback function, called "onNetworkRefChange". When
+the first pod is detected as coming up on a node + NAD combination, or the node activates as an Egress IP node for the
+first time, onNetworkRefChange will be triggered, which allows a callback mechanism to be leveraged for events. The
+Controller Manager and Node Controller Manager will leverage this callback so that they can trigger NAD Controller to
+reconcile the NAD for these events. This is important as it provides a way to signal that NADController should remove
+a UDN controller if it is no longer active, or alternatively, force the NAD Controller to reconcile a UDN Controller if,
+for example, a new remote node has activated.
+
+#### Other Controller Changes
+
+The Layer3 network controller will need to filter out nodes where the UDN is not rendered. Upon receiving events,
+it will query a Manager function called NodeHasNAD. Managers will export a Tracker interface that only contains this
+method for UDN Controllers to query. The implementation of NodeHasNAD will rely on the Manager querying its pod and
+egress IP trackers.
+
+Upon UDN activation on a remote node, these controllers will need to receive events in order to reconcile the new remote node.
+To do this, the corresponding tracker will trigger its callback, "onNetworkRefChange". That will trigger the Manager
+to ask NAD Controller to reconcile the UDN controller belonging to this NAD. Once that Layer 3 UDN controller reconciles,
+it will walk the nodes and determine what needs to be added or removed. It will take the applicable nodes, set their
+syncZoneICFailed status, then immediately queue the objects to the retry framework with no backoff. This will allow
+the Zone IC (ZIC) controller to properly configure the transit switch with the remote peers, or tear it down, if necessary.
+
+#### Status Condition and Metric Changes
+
+A new status condition will be added to CUDN/UDN that will indicate how many nodes are selected for a network:
+```yaml
+status:
+  conditions:
+  - type: NodesSelected
+    status: "True"
+    reason: DynamicAllocation
+    message: "5 nodes rendered with network"
+    lastTransitionTime: 2025-09-22T20:10:00Z
+```
+
+If the status is "False", then no nodes are currently allocated for the network - no pods or egress IPs assigned.
+
+Cluster Manager will leverage instances of the EgressIP and Pod Trackers in order to use that data for updating this status.
+A node serving a network is defined as one with at least one OVN-networked pod on the network, or with an Egress IP
+assigned to it on a NAD that maps to a UDN or CUDN.
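+
+A minimal sketch of the tracker bookkeeping that could back both this status
+reporting and the "onNetworkRefChange" callback described earlier (purely
+illustrative; the real controllers would also need locking and level-driven
+sync):
+
+```go
+// networkRefTracker records, per NAD key, the nodes currently serving the
+// network (at least one OVN-networked pod or an Egress IP assignment).
+type networkRefTracker struct {
+	nodesByNAD map[string]map[string]struct{}
+
+	// onNetworkRefChange fires when a node becomes active for a NAD for the
+	// first time (and, symmetrically, on deactivation), prompting the
+	// managers to reconcile that NAD.
+	onNetworkRefChange func(nadKey, node string)
+}
+
+func (t *networkRefTracker) markActive(nadKey, node string) {
+	nodes, ok := t.nodesByNAD[nadKey]
+	if !ok {
+		nodes = map[string]struct{}{}
+		t.nodesByNAD[nadKey] = nodes
+	}
+	if _, active := nodes[node]; !active {
+		nodes[node] = struct{}{} // first pod or Egress IP on this node+NAD
+		t.onNetworkRefChange(nadKey, node)
+	}
+}
+
+// nodesSelected is the count surfaced in the NodesSelected condition message.
+func (t *networkRefTracker) nodesSelected(nadKey string) int {
+	return len(t.nodesByNAD[nadKey])
+}
+```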
+
+Additionally, events will be posted to the corresponding UDN/CUDN when nodes become active or inactive for a network.
+This was chosen instead of per-node status entries, as those can lead to scale issues. Using events provides the audit
+trail without those scale implications. The one drawback of this approach pertains to UDN deactivation. There is a
+"udn-deletion-grace-period" timer used to delay deactivation of a UDN on a node. This is to prevent churn if a pod is
+deleted, then almost immediately re-added. Without storing the timestamp in the API, we are relying internally on
+in-memory data. While this is fine for normal operation, if the OVN-Kube pod restarts, we lose that context. However,
+this should be fine: on restart we have to walk and start all network controllers anyway, so this does not really
+create a lot of extra work for OVN-Kube.
+
+A metric will also be exposed that allows the user to track over time how many nodes were active for a particular
+network.
+
+### Testing Details
+
+* Unit Tests will be added to ensure the behavior works as expected, including checking that
+OVN switches/routers are not created when there is no pod/egress IP active on the node, etc.
+* E2E Tests will be added to create a CUDN/UDN with the feature enabled and ensure pod traffic works correctly between nodes.
+* Benchmark/Scale testing will be done to show the resource savings on clusters with 1000s of nodes and 1000s of UDNs.
+
+### Documentation Details
+
+* User-Defined Network feature documentation will be updated with a user guide for this new feature.
+
+## Risks, Known Limitations and Mitigations
+
+Risks:
+  * Additional first-pod cold-start latency per UDN/node, which could impact pod readiness SLOs.
+  * Burst reconcile load on large rollouts of pods on inactive nodes.
+
+Limitations:
+  * No OVN central support.
+  * NodePort/ExternalIP services with external traffic policy mode "cluster" will not work when sending traffic to inactive nodes.
+  * MetalLB must be configured on nodes where the UDN is rendered. This can be achieved by scheduling a daemonset on the nodes designated for the UDN.
+
+## OVN Kubernetes Version Skew
+
+Targeted for release 1.2.
+
+## Alternatives
+
+Specifying a NodeSelector in the CUDN/UDN CRD to determine where a network should be rendered. This was the initial
+idea for this enhancement, but it was evaluated as less desirable than dynamic allocation, which provides more
+flexibility without requiring a user/admin to intervene and update a CRD.
+ +## References + +None diff --git a/go-controller/.mockery.yaml b/go-controller/.mockery.yaml index 2ebe5c9937..7acd04759d 100644 --- a/go-controller/.mockery.yaml +++ b/go-controller/.mockery.yaml @@ -13,7 +13,6 @@ packages: github.com/ovn-org/ovn-kubernetes/go-controller/pkg/cni: interfaces: CNIPluginLibOps: - NetNS: config: dir: pkg/cni/mocks github.com/ovn-org/ovn-kubernetes/go-controller/pkg/kube: @@ -24,9 +23,9 @@ packages: config: all: true dir: pkg/kube/mocks - github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node: + github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/managementport: interfaces: - ManagementPort: + Interface: github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set: config: all: true diff --git a/go-controller/Makefile b/go-controller/Makefile index 27ebf94c8b..008accd36f 100644 --- a/go-controller/Makefile +++ b/go-controller/Makefile @@ -35,7 +35,7 @@ TOOLS_OUTPUT_DIR = ${CURDIR}/${OUT_DIR} MOCKERY = ${TOOLS_OUTPUT_DIR}/mockery-${MOCKERY_VERSION} ## Tool Versions -MOCKERY_VERSION ?= v2.43.2 +MOCKERY_VERSION ?= v2.53.4 export NOROOT diff --git a/go-controller/pkg/allocator/id/allocator.go b/go-controller/pkg/allocator/id/allocator.go index a2a08a3b3f..15a30fb501 100644 --- a/go-controller/pkg/allocator/id/allocator.go +++ b/go-controller/pkg/allocator/id/allocator.go @@ -2,9 +2,10 @@ package id import ( "fmt" - "sync" + "slices" bitmapallocator "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/allocator/bitmap" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/syncmap" ) const ( @@ -28,7 +29,7 @@ type NamedAllocator interface { // idAllocator is used to allocate id for a resource and store the resource - id in a map type idAllocator struct { - nameIdMap sync.Map + nameIdMap *syncmap.SyncMap[int] idBitmap *bitmapallocator.AllocationBitmap } @@ -37,7 +38,7 @@ func NewIDAllocator(name string, maxIds int) Allocator { idBitmap := bitmapallocator.NewRoundRobinAllocationMap(maxIds, name) return &idAllocator{ - nameIdMap: sync.Map{}, + nameIdMap: syncmap.NewSyncMap[int](), idBitmap: idBitmap, } } @@ -45,10 +46,12 @@ func NewIDAllocator(name string, maxIds int) Allocator { // AllocateID allocates an id for the resource 'name' and returns the id. // If the id for the resource is already allocated, it returns the cached id. func (idAllocator *idAllocator) AllocateID(name string) (int, error) { + idAllocator.nameIdMap.LockKey(name) + defer idAllocator.nameIdMap.UnlockKey(name) // Check the idMap and return the id if its already allocated v, ok := idAllocator.nameIdMap.Load(name) if ok { - return v.(int), nil + return v, nil } id, allocated, _ := idAllocator.idBitmap.AllocateNext() @@ -66,13 +69,15 @@ func (idAllocator *idAllocator) AllocateID(name string) (int, error) { // It also returns an error if the resource 'name' has a different 'id' // already reserved. func (idAllocator *idAllocator) ReserveID(name string, id int) error { + idAllocator.nameIdMap.LockKey(name) + defer idAllocator.nameIdMap.UnlockKey(name) v, ok := idAllocator.nameIdMap.Load(name) if ok { - if v.(int) == id { + if v == id { // All good. The id is already reserved by the same resource name. return nil } - return fmt.Errorf("can't reserve id %d for the resource %s. It is already allocated with a different id %d", id, name, v.(int)) + return fmt.Errorf("can't reserve id %d for the resource %s. 
It is already allocated with a different id %d", id, name, v) } reserved, _ := idAllocator.idBitmap.Allocate(id) @@ -86,9 +91,11 @@ func (idAllocator *idAllocator) ReserveID(name string, id int) error { // ReleaseID releases the id allocated for the resource 'name' func (idAllocator *idAllocator) ReleaseID(name string) { + idAllocator.nameIdMap.LockKey(name) + defer idAllocator.nameIdMap.UnlockKey(name) v, ok := idAllocator.nameIdMap.Load(name) if ok { - idAllocator.idBitmap.Release(v.(int)) + idAllocator.idBitmap.Release(v) idAllocator.nameIdMap.Delete(name) } } @@ -116,3 +123,109 @@ func (allocator *namedAllocator) ReserveID(id int) error { func (allocator *namedAllocator) ReleaseID() { allocator.allocator.ReleaseID(allocator.name) } + +// idsAllocator is used to allocate multiple ids for a resource and store the resource - ids in a map +type idsAllocator struct { + // idBitmap allocated ids in range [0, maxIds-1] + idBitmap *bitmapallocator.AllocationBitmap + // offset can be used to shift the range to [offset, offset+maxIds-1] + offset int + // nameIdsMap stores the final allocated ids in range [offset, offset+maxIds-1] for a resource name + nameIdsMap *syncmap.SyncMap[[]int] +} + +// newIDsAllocator returns an idsAllocator. +// If offset is non-zero, the allocated ids will be in the range [offset, offset+maxIds-1) +func newIDsAllocator(name string, maxIds int, offset int) *idsAllocator { + idBitmap := bitmapallocator.NewRoundRobinAllocationMap(maxIds, name) + return &idsAllocator{ + nameIdsMap: syncmap.NewSyncMap[[]int](), + idBitmap: idBitmap, + offset: offset, + } +} + +// AllocateIDs allocates numOfIDs for the resource 'name' and returns the ids. +// If less ids than numOfIDs are already allocated for the resource name, it will allocate the missing amount. +// If more ids than numOfIDs are already allocated for the resource name, it returns an error. +func (idsAllocator *idsAllocator) AllocateIDs(name string, numOfIDs int) ([]int, error) { + idsAllocator.nameIdsMap.LockKey(name) + defer idsAllocator.nameIdsMap.UnlockKey(name) + // Check the idMap and return the id if its already allocated + ids, ok := idsAllocator.nameIdsMap.Load(name) + if ok { + if len(ids) == numOfIDs { + return ids, nil + } + if len(ids) > numOfIDs { + return ids, fmt.Errorf("the resource %s already has more ids allocated %v than requested %v", name, ids, numOfIDs) + } + } else { + ids = make([]int, 0, numOfIDs) + } + previouslyAllocated := len(ids) + for len(ids) < numOfIDs { + id, allocated, _ := idsAllocator.idBitmap.AllocateNext() + if !allocated { + // release newly allocated ids + for _, id := range ids[previouslyAllocated:] { + idsAllocator.idBitmap.Release(id - idsAllocator.offset) + } + return ids, fmt.Errorf("failed to allocate the id for the resource %s", name) + } + ids = append(ids, id+idsAllocator.offset) + } + if len(ids) == 0 { + // don't store empty slice in the map + return ids, nil + } + idsAllocator.nameIdsMap.Store(name, ids) + return ids, nil +} + +// ReserveIDs reserves 'ids' for the resource 'name'. It returns an +// error if one of the 'ids' is already reserved by a resource other than 'name'. +// It also returns an error if the resource 'name' has a different 'ids' slice +// already reserved. Slice elements order is important for comparison. 
+func (idsAllocator *idsAllocator) ReserveIDs(name string, ids []int) error { + idsAllocator.nameIdsMap.LockKey(name) + defer idsAllocator.nameIdsMap.UnlockKey(name) + existingIDs, ok := idsAllocator.nameIdsMap.Load(name) + if ok { + if slices.Equal(existingIDs, ids) { + // All good. The ids are already reserved by the same resource name. + return nil + } + return fmt.Errorf("can't reserve ids %v for the resource %s. It is already allocated with different ids %v", + ids, name, existingIDs) + } + allocatedIDs := make([]int, 0, len(ids)) + for _, id := range ids { + // don't forget to adjust the id with the offset + reserved, _ := idsAllocator.idBitmap.Allocate(id - idsAllocator.offset) + if !reserved { + // cleanup previously allocated ids + for _, allocatedID := range allocatedIDs { + idsAllocator.idBitmap.Release(allocatedID - idsAllocator.offset) + } + return fmt.Errorf("id %d is already reserved by another resource", id) + } + allocatedIDs = append(allocatedIDs, id) + } + idsAllocator.nameIdsMap.Store(name, allocatedIDs) + return nil +} + +// ReleaseIDs releases all ids allocated for the resource 'name' +func (idsAllocator *idsAllocator) ReleaseIDs(name string) { + idsAllocator.nameIdsMap.LockKey(name) + defer idsAllocator.nameIdsMap.UnlockKey(name) + existingIDs, ok := idsAllocator.nameIdsMap.Load(name) + if !ok { + return + } + for _, id := range existingIDs { + idsAllocator.idBitmap.Release(id - idsAllocator.offset) + } + idsAllocator.nameIdsMap.Delete(name) +} diff --git a/go-controller/pkg/allocator/id/allocator_test.go b/go-controller/pkg/allocator/id/allocator_test.go new file mode 100644 index 0000000000..d520145625 --- /dev/null +++ b/go-controller/pkg/allocator/id/allocator_test.go @@ -0,0 +1,173 @@ +package id + +import ( + "slices" + "testing" +) + +func TestIDsAllocator(t *testing.T) { + // create allocator with range [3, 8] + allocator := newIDsAllocator("test", 6, 3) + ids, err := allocator.AllocateIDs("test1", 0) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if len(ids) != 0 { + t.Errorf("expect 0 ids allocated, but got %v", ids) + } + // test reserve IDs + err = allocator.ReserveIDs("test1", []int{4}) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + // ids: test1 = [4] + // test offset and multiple IDs allocation skipping allocated ID + ids, err = allocator.AllocateIDs("test2", 3) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !slices.Equal(ids, []int{3, 5, 6}) { + t.Errorf("expect ids [3,5,6] allocated, but got %v", ids) + } + // ids: test1 = [4] + // ids: test2 = [3,5,6] + // try to allocate more ids for test1 + ids, err = allocator.AllocateIDs("test1", 2) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !slices.Equal(ids, []int{4, 7}) { + t.Errorf("expect ids [4,7] allocated, but got %v", ids) + } + // ids: test1 = [4,7] + // ids: test2 = [3,5,6] + // request already existing IDs + ids, err = allocator.AllocateIDs("test1", 2) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !slices.Equal(ids, []int{4, 7}) { + t.Errorf("expect ids [4,7] allocated, but got %v", ids) + } + // ids: test1 = [4,7] + // ids: test2 = [3,5,6] + // try to allocate more ids than available + ids, err = allocator.AllocateIDs("test3", 2) + if err == nil { + t.Errorf("expect error allocating id for test3, but got ids %v", ids) + } + // try to reserve last available ID + err = allocator.ReserveIDs("test3", []int{8}) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + // ids: test1 = [4,7] + // ids: 
test2 = [3,5,6] + // ids: test3 = [8] + // try to reserve different IDs + err = allocator.ReserveIDs("test3", []int{7, 8}) + if err == nil { + t.Errorf("expect error reserving ids for test3") + } + // now release IDs for test1 + allocator.ReleaseIDs("test1") + // ids: test2 = [3,5,6] + // ids: test3 = [8] + // try to allocate more ids than available + ids, err = allocator.AllocateIDs("test3", 4) + if err == nil { + t.Errorf("expect error allocating id for test3, but got ids %v", ids) + } + ids, err = allocator.AllocateIDs("test3", 3) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !slices.Equal(ids, []int{8, 4, 7}) { + t.Errorf("expect ids [8,4,7] allocated, but got %v", ids) + } + // ids: test2 = [3,5,6] + // ids: test3 = [8,4,7] +} + +func TestTunnelKeysAllocator(t *testing.T) { + allocator := NewTunnelKeyAllocator("test") + transitSwitchBase := 16711683 + tunnelKeyBase := 16715779 + // allocate 1 key for networkID 1 (transit switch key is preserved) + ids, err := allocator.AllocateKeys("net1", 1, 1) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !slices.Equal(ids, []int{transitSwitchBase + 1}) { + t.Errorf("expect ids %v allocated, but got %v", []int{transitSwitchBase + 1}, ids) + } + // now add one more key for networkID 1 (should return the same transit switch key plus one new key) + ids, err = allocator.AllocateKeys("net1", 1, 2) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !slices.Equal(ids, []int{transitSwitchBase + 1, tunnelKeyBase}) { + t.Errorf("expect ids %v allocated, but got %v", []int{transitSwitchBase + 1, tunnelKeyBase}, ids) + } + // now ask for 1 key again for networkID 1 (reducing the number of requested keys is not expected and should return error) + ids, err = allocator.AllocateKeys("net1", 1, 1) + if err == nil { + t.Errorf("expect error allocating id for net1, but got ids %v", ids) + } + // check that 0 also works + ids, err = allocator.AllocateKeys("net1", 1, 0) + if err == nil { + t.Errorf("expect error allocating id for net1, but got ids %v", ids) + } + // same for reserve IDs + err = allocator.ReserveKeys("net1", []int{transitSwitchBase + 1}) + if err == nil { + t.Errorf("expect error reserving ids for net1") + } + // now reserve already allocated ids, should be ok + err = allocator.ReserveKeys("net1", []int{transitSwitchBase + 1, tunnelKeyBase}) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + + // allocate 3 keys for networkID 2 (transit switch key is preserved + 2 allocated keys) + ids, err = allocator.AllocateKeys("net2", 2, 3) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !slices.Equal(ids, []int{transitSwitchBase + 2, tunnelKeyBase + 1, tunnelKeyBase + 2}) { + t.Errorf("expect ids %v allocated, but got %v", []int{transitSwitchBase + 2, tunnelKeyBase + 1, tunnelKeyBase + 2}, ids) + } + // reserve next 2 keys for networkID 3 + err = allocator.ReserveKeys("net3", []int{tunnelKeyBase + 3, tunnelKeyBase + 4}) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + // allocate 2 keys for networkID 4 + ids, err = allocator.AllocateKeys("net4", 4, 2) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !slices.Equal(ids, []int{transitSwitchBase + 4, tunnelKeyBase + 5}) { + t.Errorf("expect ids %v allocated, but got %v", []int{transitSwitchBase + 4, tunnelKeyBase + 5}, ids) + } + // check network ID out of reserved range + ids, err = allocator.AllocateKeys("net5", 5000, 1) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !slices.Equal(ids, []int{tunnelKeyBase + 6}) { + t.Errorf("expect ids %v allocated, but got %v", []int{tunnelKeyBase + 6}, ids) + } + + totalKeys := 61437 + // we have already allocated 7 keys from the free range, request the rest of them + 1 + _, err = allocator.AllocateKeys("net6", 10000, totalKeys-7+1) + if err == nil { + t.Errorf("expect error allocating id for net6") + } + _, err = allocator.AllocateKeys("net6", 10000, totalKeys-7) + if err != nil { + t.Errorf("unexpected error: %v", err) + } +} diff --git a/go-controller/pkg/allocator/id/tunnelkeyallocator.go b/go-controller/pkg/allocator/id/tunnelkeyallocator.go new file mode 100644 index 0000000000..66730fe0c4 --- /dev/null +++ b/go-controller/pkg/allocator/id/tunnelkeyallocator.go @@ -0,0 +1,67 @@ +package id + +// TunnelKeysAllocator is used to allocate tunnel keys for distributed OVN datapaths. +// It preserves the first 4096 keys for the already-used transit switch IDs based on the networkID. +type TunnelKeysAllocator struct { + idsAllocator *idsAllocator + preservedRange int + idsOffset int } + +// NewTunnelKeyAllocator returns a TunnelKeysAllocator +func NewTunnelKeyAllocator(name string) *TunnelKeysAllocator { + // OVN-defined constants from + // https://github.com/ovn-org/ovn/blob/cfaf849c034469502fc97149f20676dec4d76595/lib/ovn-util.h#L159-L164 + // total number of datapath (switches and routers) keys + maxDPKey := (1 << 24) - 1 + // We have already used some keys for transit switch tunnels, the maximum tunnel key that is already allocated + // is BaseTransitSwitchTunnelKey + MaxNetworks. + // BaseTransitSwitchTunnelKey = 16711683 + // MaxNetworks = 4096 + rangeStart := 16711683 + 4096 + // this is how many keys are left for allocation + freeIDs := maxDPKey - rangeStart + 1 + + return &TunnelKeysAllocator{ + idsAllocator: newIDsAllocator(name, freeIDs, rangeStart), + preservedRange: 4096, + idsOffset: 16711683, + } +} + +// AllocateKeys allocates 'numOfKeys' for the resource 'name'. +// Previously allocated keys for 'name' are preserved in case of error. +// If networkID is less than 4096, the first key will come from the preserved range +// based on the networkID. +// If fewer keys than numOfKeys are already allocated for the resource name, it will allocate the missing amount. +// If more keys than numOfKeys are already allocated for the resource name, it returns an error. +func (allocator *TunnelKeysAllocator) AllocateKeys(name string, networkID, numOfKeys int) ([]int, error) { + allocatedIDs := make([]int, 0, numOfKeys) + if networkID < allocator.preservedRange && numOfKeys > 0 { + // transit switch tunnel key is preserved + allocatedIDs = append(allocatedIDs, allocator.idsOffset+networkID) + numOfKeys -= 1 + } + newIDs, err := allocator.idsAllocator.AllocateIDs(name, numOfKeys) + if err != nil { + return nil, err + } + return append(allocatedIDs, newIDs...), nil +} + +// ReserveKeys reserves 'tunnelKeys' for the resource 'name'. It returns an +// error if one of the 'tunnelKeys' is already reserved by a resource other than 'name'. +// It also returns an error if the resource 'name' has a different 'tunnelKeys' slice +// already reserved. Slice elements order is important for comparison. 
+func (allocator *TunnelKeysAllocator) ReserveKeys(name string, tunnelKeys []int) error { + if len(tunnelKeys) > 0 && tunnelKeys[0]-allocator.idsOffset < allocator.preservedRange { + // transit switch tunnel key is not allocated by the allocator + tunnelKeys = tunnelKeys[1:] + } + return allocator.idsAllocator.ReserveIDs(name, tunnelKeys) +} + +// ReleaseKeys releases the tunnelKeys allocated for the resource 'name' +func (allocator *TunnelKeysAllocator) ReleaseKeys(name string) { + allocator.idsAllocator.ReleaseIDs(name) +} diff --git a/go-controller/pkg/allocator/pod/pod_annotation.go b/go-controller/pkg/allocator/pod/pod_annotation.go index eed6bab488..1f69442176 100644 --- a/go-controller/pkg/allocator/pod/pod_annotation.go +++ b/go-controller/pkg/allocator/pod/pod_annotation.go @@ -524,6 +524,8 @@ func AddRoutesGatewayIP( if !util.IsNetworkSegmentationSupportEnabled() || !netinfo.IsPrimaryNetwork() { return nil } + var nodeLRPMAC net.HardwareAddr + var hasV4 bool for _, podIfAddr := range podAnnotation.IPs { isIPv6 := utilnet.IsIPv6CIDR(podIfAddr) nodeSubnet, err := util.MatchFirstIPNetFamily(isIPv6, nodeSubnets) @@ -538,18 +540,30 @@ func AddRoutesGatewayIP( if network != nil && len(network.GatewayRequest) == 0 { // if specific default route for pod was not requested then add gatewayIP podAnnotation.Gateways = append(podAnnotation.Gateways, gatewayIPnet.IP) } + if !isIPv6 { + hasV4 = true + nodeLRPMAC = util.IPAddrToHWAddr(gatewayIPnet.IP) + } else if !hasV4 { + nodeLRPMAC = util.IPAddrToHWAddr(gatewayIPnet.IP) + } } // Until https://github.com/ovn-kubernetes/ovn-kubernetes/issues/4876 is fixed, it is limited to IC only if config.OVNKubernetesFeature.EnableInterconnect { if _, isIPv6Mode := netinfo.IPMode(); isIPv6Mode { - joinAddrs, err := udn.GetGWRouterIPs(node, netinfo.GetNetInfo()) - if err != nil { - if util.IsAnnotationNotSetError(err) { - return types.NewSuppressedError(err) + var routerPortMac net.HardwareAddr + if !util.UDNLayer2NodeUsesTransitRouter(node) { + joinAddrs, err := udn.GetGWRouterIPs(node, netinfo.GetNetInfo()) + if err != nil { + if util.IsAnnotationNotSetError(err) { + return types.NewSuppressedError(err) + } + return fmt.Errorf("failed parsing node gateway router join addresses, network %q, %w", netinfo.GetNetworkName(), err) } - return fmt.Errorf("failed parsing node gateway router join addresses, network %q, %w", netinfo.GetNetworkName(), err) + routerPortMac = util.IPAddrToHWAddr(joinAddrs[0].IP) + } else { + routerPortMac = nodeLRPMAC } - podAnnotation.GatewayIPv6LLA = util.HWAddrToIPv6LLA(util.IPAddrToHWAddr(joinAddrs[0].IP)) + podAnnotation.GatewayIPv6LLA = util.HWAddrToIPv6LLA(routerPortMac) } } return nil diff --git a/go-controller/pkg/clustermanager/clustermanager.go b/go-controller/pkg/clustermanager/clustermanager.go index b382cb5212..49226315f4 100644 --- a/go-controller/pkg/clustermanager/clustermanager.go +++ b/go-controller/pkg/clustermanager/clustermanager.go @@ -5,10 +5,14 @@ import ( "fmt" "net" + networkattchmentdefclientset "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/clientset/versioned" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/record" "k8s.io/klog/v2" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/allocator/id" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/clustermanager/dnsnameresolver" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/clustermanager/egressservice" 
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/clustermanager/endpointslicemirror" @@ -22,6 +26,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/networkmanager" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/controller/unidling" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/healthcheck" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" ) @@ -85,7 +90,16 @@ func NewClusterManager( cm.networkManager = networkmanager.Default() if config.OVNKubernetesFeature.EnableMultiNetwork { - cm.networkManager, err = networkmanager.NewForCluster(cm, wf, ovnClient, recorder) + // tunnelKeysAllocator is now only used for NAD tunnel keys allocation, but will be reused + // for Connecting UDNs. So we initialize it here and pass it to the networkManager. + // The same instance should be initialized only once and passed to all the + // users of tunnel-keys. + tunnelKeysAllocator, err := initTunnelKeysAllocator(ovnClient.NetworkAttchDefClient) + if err != nil { + return nil, fmt.Errorf("failed to initialize tunnel keys allocator: %w", err) + } + + cm.networkManager, err = networkmanager.NewForCluster(cm, wf, ovnClient, recorder, tunnelKeysAllocator) if err != nil { return nil, err } @@ -275,3 +289,39 @@ func (cm *ClusterManager) Reconcile(name string, old, new util.NetInfo) error { } return nil } + +// initTunnelKeysAllocator reserves any existing tunnel keys to avoid re-allocation. +// It will be shared across multiple controllers and should account for different object types. +// Good news is that we don't care about missing events, because we only need to reserve ids that are already +// annotated, and no one else can annotate them except ClusterManager. 
+func initTunnelKeysAllocator(nadClient networkattchmentdefclientset.Interface) (*id.TunnelKeysAllocator, error) { + tunnelKeysAllocator := id.NewTunnelKeyAllocator("TunnelKeys") + + existingNADs, err := nadClient.K8sCniCncfIoV1().NetworkAttachmentDefinitions("").List(context.TODO(), metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to list existing NADs: %w", err) + } + for _, nad := range existingNADs.Items { + // reserve tunnel keys that are already allocated to make sure they are not handed out again + if nad.Annotations[types.OvnNetworkTunnelKeysAnnotation] != "" { + netconf, err := util.ParseNetConf(&nad) + if err != nil { + // ignore non-OVN NADs; otherwise log and continue + if err.Error() == util.ErrorAttachDefNotOvnManaged.Error() { + continue + } + klog.Warningf("Failed to parse NAD %s: %v", nad.Name, err) + continue + } + networkName := netconf.Name + tunnelKeys, err := util.ParseTunnelKeysAnnotation(nad.Annotations[types.OvnNetworkTunnelKeysAnnotation]) + if err != nil { + return nil, fmt.Errorf("failed to parse annotated tunnel keys: %w", err) + } + if err = tunnelKeysAllocator.ReserveKeys(networkName, tunnelKeys); err != nil { + return nil, fmt.Errorf("failed to reserve tunnel keys %v for network %s: %w", tunnelKeys, networkName, err) + } + } + } + return tunnelKeysAllocator, nil +} diff --git a/go-controller/pkg/clustermanager/clustermanager_test.go b/go-controller/pkg/clustermanager/clustermanager_test.go index 74c2e0d1bf..f1bcb28a16 100644 --- a/go-controller/pkg/clustermanager/clustermanager_test.go +++ b/go-controller/pkg/clustermanager/clustermanager_test.go @@ -21,6 +21,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/factory" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/generator/udn" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/kube" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing" ovntypes "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" ) @@ -60,6 +61,7 @@ var _ = ginkgo.Describe("Cluster Manager", func() { ginkgo.AfterEach(func() { if f != nil { f.Shutdown() + f = nil } wg.Wait() }) @@ -843,6 +845,50 @@ var _ = ginkgo.Describe("Cluster Manager", func() { }) }) + ginkgo.Context("tunnel keys allocations", func() { + ginkgo.It("check for tunnel keys allocations", func() { + app.Action = func(_ *cli.Context) error { + nad1 := testing.GenerateNAD("test1", "test1", "test", ovntypes.Layer2Topology, + "10.0.0.0/24", ovntypes.NetworkRolePrimary) + // start with test1 network that already has keys allocated + nad1.Annotations = map[string]string{ + ovntypes.OvnNetworkTunnelKeysAnnotation: "[16711685,16715780]", + } + // and test2 network without keys allocated + nad2 := testing.GenerateNAD("test2", "test2", "test", ovntypes.Layer2Topology, + "10.0.0.0/24", ovntypes.NetworkRolePrimary) + clientSet := util.GetOVNClientset(nad1, nad2) + + // init the allocator that should reserve already allocated keys for test1 + allocator, err := initTunnelKeysAllocator(clientSet.NetworkAttchDefClient) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // check that reserving different keys for test1 will fail + err = allocator.ReserveKeys("test1", []int{16711685, 16715779}) + gomega.Expect(err).To(gomega.HaveOccurred()) + gomega.Expect(err.Error()).To(gomega.ContainSubstring("can't reserve ids [16715779] for the resource test1. 
It is already allocated with different ids [16715780]")) + // now try to allocate correct number of keys for test1 and check that returned IDs are correct + ids, err := allocator.AllocateKeys("test1", 2, 2) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + gomega.Expect(ids).To(gomega.Equal([]int{16711685, 16715780})) + // now allocate ids for networkID 1 + ids, err = allocator.AllocateKeys("test2", 1, 2) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + gomega.Expect(ids).To(gomega.Equal([]int{16711684, 16715779})) + // now try networkID 3 to make sure IDs of nad test1 are not allocated again + ids, err = allocator.AllocateKeys("test3", 3, 2) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + gomega.Expect(ids).To(gomega.Equal([]int{16711686, 16715781})) + return nil + } + + err := app.Run([]string{ + app.Name, + "-cluster-subnets=" + clusterCIDR, + }) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + }) + }) + ginkgo.Context("Node gateway router port IP allocations", func() { ginkgo.It("verify the node annotations", func() { app.Action = func(ctx *cli.Context) error { @@ -919,8 +965,8 @@ var _ = ginkgo.Describe("Cluster Manager", func() { ginkgo.Context("Transit switch port IP allocations", func() { ginkgo.It("Interconnect enabled", func() { - config.ClusterManager.V4TransitSwitchSubnet = "100.89.0.0/16" - config.ClusterManager.V6TransitSwitchSubnet = "fd99::/64" + config.ClusterManager.V4TransitSubnet = "100.89.0.0/16" + config.ClusterManager.V6TransitSubnet = "fd99::/64" app.Action = func(ctx *cli.Context) error { nodes := []corev1.Node{ { @@ -984,12 +1030,12 @@ var _ = ginkgo.Describe("Cluster Manager", func() { return fmt.Errorf("transit switch ips for node %s not allocated", n.Name) } - _, transitSwitchV4Subnet, err := net.ParseCIDR(config.ClusterManager.V4TransitSwitchSubnet) + _, transitSwitchV4Subnet, err := net.ParseCIDR(config.ClusterManager.V4TransitSubnet) if err != nil { return fmt.Errorf("could not parse IPv4 transit switch subnet %v", err) } - _, transitSwitchV6Subnet, err := net.ParseCIDR(config.ClusterManager.V6TransitSwitchSubnet) + _, transitSwitchV6Subnet, err := net.ParseCIDR(config.ClusterManager.V6TransitSubnet) if err != nil { return fmt.Errorf("could not parse IPv6 transit switch subnet %v", err) } diff --git a/go-controller/pkg/clustermanager/egressip_controller.go b/go-controller/pkg/clustermanager/egressip_controller.go index 4cbd00d18f..70ec7a58e2 100644 --- a/go-controller/pkg/clustermanager/egressip_controller.go +++ b/go-controller/pkg/clustermanager/egressip_controller.go @@ -129,6 +129,15 @@ func (eIPC *egressIPClusterController) getAllocationTotalCount() float64 { return float64(count) } +func (e *egressNode) hasAllocatedEgressIP(name string, eip string) bool { + for ip, egressIPName := range e.allocations { + if egressIPName == name && ip == eip { + return true + } + } + return false +} + // nodeAllocator contains all the information required to manage EgressIP assignment to egress node. This includes assignment // of EgressIP IPs to nodes and ensuring the egress nodes are reachable. For cloud nodes, it also tracks limits for // IP assignment to each node. 
@@ -865,6 +874,7 @@ func (eIPC *egressIPClusterController) addAllocatorEgressIPAssignments(name stri defer eIPC.nodeAllocator.Unlock() for _, status := range statusAssignments { if eNode, exists := eIPC.nodeAllocator.cache[status.Node]; exists { + klog.V(5).Infof("Setting egress IP node allocation - node: %s, EIP name: %s, IP: %s", eNode.name, name, status.EgressIP) eNode.allocations[status.EgressIP] = name } } @@ -1423,6 +1433,10 @@ func (eIPC *egressIPClusterController) validateEgressIPStatus(name string, items klog.Errorf("Allocator error: EgressIP: %s claims multiple egress IPs on same node: %s, will attempt rebalancing", name, eIPStatus.Node) validAssignment = false } + if !eNode.hasAllocatedEgressIP(name, eIPStatus.EgressIP) { + klog.Errorf("Allocator error: EgressIP: %s has a mismatch between status and cache for node: %s with IP: %s", name, eIPStatus.Node, eIPStatus.EgressIP) + validAssignment = false + } if !eNode.isEgressAssignable { klog.Errorf("Allocator error: EgressIP: %s assigned to node: %s which does not have egress label, will attempt rebalancing", name, eIPStatus.Node) validAssignment = false diff --git a/go-controller/pkg/clustermanager/egressip_controller_test.go b/go-controller/pkg/clustermanager/egressip_controller_test.go index 7f47c2d25d..b19fee1f7b 100644 --- a/go-controller/pkg/clustermanager/egressip_controller_test.go +++ b/go-controller/pkg/clustermanager/egressip_controller_test.go @@ -3453,6 +3453,50 @@ var _ = ginkgo.Describe("OVN cluster-manager EgressIP Operations", func() { egressIPs, nodes := getEgressIPStatus(egressIPName) gomega.Expect(nodes).To(gomega.ConsistOf(egressNode1.name, egressNode2.name)) gomega.Expect(egressIPs).To(gomega.ConsistOf(eIP.Status.Items[0].EgressIP, eIP.Status.Items[1].EgressIP)) + // give some time for event handler to be added and finish processing initial updates + time.Sleep(3 * time.Second) + realEIP, err := fakeClusterManagerOVN.fakeClient.EgressIPClient.K8sV1().EgressIPs().Get(context.TODO(), egressIPName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + ginkgo.By("Simulate lagging informer and send an event with outdated status") + eIP.Annotations = make(map[string]string) + for k, v := range realEIP.Annotations { + eIP.Annotations[k] = v + } + eIP.Status = egressipv1.EgressIPStatus{ + Items: []egressipv1.EgressIPStatusItem{ + { + EgressIP: egressIP2, + Node: egressNode1.name, + }, + { + EgressIP: egressIP1, + Node: egressNode1.name, + }, + }, + } + _, err = fakeClusterManagerOVN.fakeClient.EgressIPClient.K8sV1().EgressIPs().Update(context.TODO(), &eIP, metav1.UpdateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // give some time for events to be processed + time.Sleep(3 * time.Second) + ginkgo.By("Simulate lagging informer and send an event with real updated status") + _, err = fakeClusterManagerOVN.fakeClient.EgressIPClient.K8sV1().EgressIPs().Update(context.TODO(), realEIP, metav1.UpdateOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + // give some time for events to be processed + time.Sleep(3 * time.Second) + gomega.Eventually(func() error { + defer ginkgo.GinkgoRecover() + tmp, err := fakeClusterManagerOVN.fakeClient.EgressIPClient.K8sV1().EgressIPs().Get(context.TODO(), egressIPName, metav1.GetOptions{}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + var egressIPs, nodes []string + for _, status := range tmp.Status.Items { + egressIPs = append(egressIPs, status.EgressIP) + nodes = append(nodes, status.Node) + } + 
gomega.Expect(nodes).To(gomega.ConsistOf(egressNode1.name, egressNode2.name)) + gomega.Expect(egressIPs).To(gomega.ConsistOf(eIP.Status.Items[0].EgressIP, eIP.Status.Items[1].EgressIP)) + return nil + }).Should(gomega.Succeed()) + return nil } diff --git a/go-controller/pkg/clustermanager/endpointslicemirror/endpointslice_mirror_controller_test.go b/go-controller/pkg/clustermanager/endpointslicemirror/endpointslice_mirror_controller_test.go index 508a9e5b55..58c1f41554 100644 --- a/go-controller/pkg/clustermanager/endpointslicemirror/endpointslice_mirror_controller_test.go +++ b/go-controller/pkg/clustermanager/endpointslicemirror/endpointslice_mirror_controller_test.go @@ -17,6 +17,7 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/allocator/id" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/factory" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/networkmanager" @@ -41,7 +42,7 @@ var _ = ginkgo.Describe("Cluster manager EndpointSlice mirror controller", func( fakeClient = util.GetOVNClientset(objects...).GetClusterManagerClientset() wf, err := factory.NewClusterManagerWatchFactory(fakeClient) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - networkManager, err = networkmanager.NewForCluster(&testnm.FakeControllerManager{}, wf, fakeClient, nil) + networkManager, err = networkmanager.NewForCluster(&testnm.FakeControllerManager{}, wf, fakeClient, nil, id.NewTunnelKeyAllocator("TunnelKeys")) gomega.Expect(err).NotTo(gomega.HaveOccurred()) controller, err = NewController(fakeClient, wf, networkManager.Interface()) gomega.Expect(err).NotTo(gomega.HaveOccurred()) diff --git a/go-controller/pkg/clustermanager/routeadvertisements/controller_test.go b/go-controller/pkg/clustermanager/routeadvertisements/controller_test.go index c03c851808..680ee18a37 100644 --- a/go-controller/pkg/clustermanager/routeadvertisements/controller_test.go +++ b/go-controller/pkg/clustermanager/routeadvertisements/controller_test.go @@ -24,6 +24,7 @@ import ( "k8s.io/client-go/util/workqueue" "k8s.io/utils/ptr" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/allocator/id" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" controllerutil "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/controller" eiptypes "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/crd/egressip/v1" @@ -1005,7 +1006,7 @@ func TestController_reconcile(t *testing.T) { wf, err := factory.NewClusterManagerWatchFactory(fakeClientset) g.Expect(err).ToNot(gomega.HaveOccurred()) - nm, err := networkmanager.NewForCluster(&nmtest.FakeControllerManager{}, wf, fakeClientset, nil) + nm, err := networkmanager.NewForCluster(&nmtest.FakeControllerManager{}, wf, fakeClientset, nil, id.NewTunnelKeyAllocator("TunnelKeys")) g.Expect(err).ToNot(gomega.HaveOccurred()) c := NewController(nm.Interface(), wf, fakeClientset) diff --git a/go-controller/pkg/clustermanager/userdefinednetwork/controller_test.go b/go-controller/pkg/clustermanager/userdefinednetwork/controller_test.go index 08791e9bf4..ae9b88f6f1 100644 --- a/go-controller/pkg/clustermanager/userdefinednetwork/controller_test.go +++ b/go-controller/pkg/clustermanager/userdefinednetwork/controller_test.go @@ -20,6 +20,7 @@ import ( "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/allocator/id" 
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/clustermanager/userdefinednetwork/template" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" udnv1 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/crd/userdefinednetwork/v1" @@ -61,7 +62,7 @@ var _ = Describe("User Defined Network Controller", func() { Expect(err).NotTo(HaveOccurred()) Expect(f.Start()).To(Succeed()) - networkManager, err := networkmanager.NewForCluster(&nmtest.FakeControllerManager{}, f, cs, nil) + networkManager, err := networkmanager.NewForCluster(&nmtest.FakeControllerManager{}, f, cs, nil, id.NewTunnelKeyAllocator("TunnelKeys")) Expect(err).NotTo(HaveOccurred()) return New(cs.NetworkAttchDefClient, f.NADInformer(), cs.UserDefinedNetworkClient, f.UserDefinedNetworkInformer(), f.ClusterUserDefinedNetworkInformer(), diff --git a/go-controller/pkg/clustermanager/userdefinednetwork/template/net-attach-def-template.go b/go-controller/pkg/clustermanager/userdefinednetwork/template/net-attach-def-template.go index c18cad708d..e451ed3923 100644 --- a/go-controller/pkg/clustermanager/userdefinednetwork/template/net-attach-def-template.go +++ b/go-controller/pkg/clustermanager/userdefinednetwork/template/net-attach-def-template.go @@ -174,6 +174,13 @@ func renderCNINetworkConfig(networkName, nadName string, spec SpecGetter) (map[s netConfSpec.DefaultGatewayIPs = ipString(cfg.DefaultGatewayIPs) } netConfSpec.JoinSubnet = cidrString(renderJoinSubnets(cfg.Role, cfg.JoinSubnets)) + // now generate transit subnet for layer2 topology + if cfg.Role == userdefinednetworkv1.NetworkRolePrimary { + err := util.SetTransitSubnets(netConfSpec) + if err != nil { + return nil, err + } + } case userdefinednetworkv1.NetworkTopologyLocalnet: cfg := spec.GetLocalnet() netConfSpec.Role = strings.ToLower(string(cfg.Role)) @@ -194,6 +201,7 @@ func renderCNINetworkConfig(networkName, nadName string, spec SpecGetter) (map[s if err := util.ValidateNetConf(nadName, netConfSpec); err != nil { return nil, err } + if _, err := util.NewNetInfo(netConfSpec); err != nil { return nil, err } @@ -218,6 +226,9 @@ func renderCNINetworkConfig(networkName, nadName string, spec SpecGetter) (map[s if len(netConfSpec.JoinSubnet) > 0 { cniNetConf["joinSubnet"] = netConfSpec.JoinSubnet } + if len(netConfSpec.TransitSubnet) > 0 { + cniNetConf["transitSubnet"] = netConfSpec.TransitSubnet + } if len(netConfSpec.Subnets) > 0 { cniNetConf["subnets"] = netConfSpec.Subnets } diff --git a/go-controller/pkg/clustermanager/userdefinednetwork/template/net-attach-def-template_test.go b/go-controller/pkg/clustermanager/userdefinednetwork/template/net-attach-def-template_test.go index c24b56503e..e44cee4366 100644 --- a/go-controller/pkg/clustermanager/userdefinednetwork/template/net-attach-def-template_test.go +++ b/go-controller/pkg/clustermanager/userdefinednetwork/template/net-attach-def-template_test.go @@ -1,14 +1,18 @@ package template import ( + "strings" + netv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" + ovncnitypes "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/cni/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" udnv1 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/crd/userdefinednetwork/v1" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" . 
"github.com/onsi/ginkgo/v2" @@ -373,6 +377,7 @@ var _ = Describe("NetAttachDefTemplate", func() { "role": "primary", "topology": "layer2", "joinSubnet": "100.65.0.0/16,fd99::/64", + "transitSubnet": "100.88.0.0/16,fd97::/64", "subnets": "192.168.100.0/24,2001:dbb::/64", "mtu": 1500, "allowPersistentIPs": true @@ -399,6 +404,7 @@ var _ = Describe("NetAttachDefTemplate", func() { "role": "primary", "topology": "layer2", "joinSubnet": "100.62.0.0/24,fd92::/64", + "transitSubnet": "100.88.0.0/16,fd97::/64", "subnets": "192.168.100.0/24,2001:dbb::/64", "mtu": 1500, "allowPersistentIPs": true @@ -509,6 +515,7 @@ var _ = Describe("NetAttachDefTemplate", func() { "role": "primary", "topology": "layer2", "joinSubnet": "100.65.0.0/16,fd99::/64", + "transitSubnet": "100.88.0.0/16,fd97::/64", "subnets": "192.168.100.0/24,2001:dbb::/64", "mtu": 1500, "allowPersistentIPs": true @@ -535,6 +542,7 @@ var _ = Describe("NetAttachDefTemplate", func() { "role": "primary", "topology": "layer2", "joinSubnet": "100.62.0.0/24,fd92::/64", + "transitSubnet": "100.88.0.0/16,fd97::/64", "subnets": "192.168.100.0/24,2001:dbb::/64", "mtu": 1500, "allowPersistentIPs": true @@ -624,4 +632,45 @@ var _ = Describe("NetAttachDefTemplate", func() { }`, ), ) + + It("should correctly assign transit Subnets", func() { + // check no overlap, use default values + netConf := &ovncnitypes.NetConf{ + Role: strings.ToLower(types.NetworkRolePrimary), + Topology: strings.ToLower(types.Layer2Topology), + Subnets: "10.12.0.0/16,fd12:dbba::/64", + } + err := util.SetTransitSubnets(netConf) + Expect(err).NotTo(HaveOccurred()) + Expect(netConf.TransitSubnet).To(Equal("100.88.0.0/16,fd97::/64")) + // check Subnet with the default Transit subnet overlap + netConf = &ovncnitypes.NetConf{ + Role: strings.ToLower(types.NetworkRolePrimary), + Topology: strings.ToLower(types.Layer2Topology), + Subnets: "100.88.0.0/15,fd97::/63", + } + err = util.SetTransitSubnets(netConf) + Expect(err).NotTo(HaveOccurred()) + Expect(netConf.TransitSubnet).To(Equal("100.90.0.0/16,fd97:0:0:2::/64")) + // check joinSubnet with the default Transit subnet overlap + netConf = &ovncnitypes.NetConf{ + Role: strings.ToLower(types.NetworkRolePrimary), + Topology: strings.ToLower(types.Layer2Topology), + Subnets: "10.12.0.0/16,fd12:dbba::/64", + JoinSubnet: "100.88.0.0/17,fd97::/65", + } + err = util.SetTransitSubnets(netConf) + Expect(err).NotTo(HaveOccurred()) + Expect(netConf.TransitSubnet).To(Equal("100.89.0.0/16,fd97:0:0:1::/64")) + // check Subnet with the default Transit subnet overlap, then joinSubnet overlaps with the next selected transit subnet + netConf = &ovncnitypes.NetConf{ + Role: strings.ToLower(types.NetworkRolePrimary), + Topology: strings.ToLower(types.Layer2Topology), + Subnets: "100.88.0.0/15,fd97::/65", + JoinSubnet: "100.90.0.0/16,fd97:0:0:1::/64", + } + err = util.SetTransitSubnets(netConf) + Expect(err).NotTo(HaveOccurred()) + Expect(netConf.TransitSubnet).To(Equal("100.91.0.0/16,fd97:0:0:2::/64")) + }) }) diff --git a/go-controller/pkg/clustermanager/zone_cluster_controller.go b/go-controller/pkg/clustermanager/zone_cluster_controller.go index 41452e9c80..29d6c82cf1 100644 --- a/go-controller/pkg/clustermanager/zone_cluster_controller.go +++ b/go-controller/pkg/clustermanager/zone_cluster_controller.go @@ -66,16 +66,16 @@ func newZoneClusterController(ovnClient *util.OVNClusterManagerClientset, wf *fa var err error if config.OVNKubernetesFeature.EnableInterconnect { if config.IPv4Mode { - transitSwitchIPv4Generator, err = 
ipgenerator.NewIPGenerator(config.ClusterManager.V4TransitSwitchSubnet) + transitSwitchIPv4Generator, err = ipgenerator.NewIPGenerator(config.ClusterManager.V4TransitSubnet) if err != nil { - return nil, fmt.Errorf("error creating IP Generator for v4 transit switch subnet %s: %w", config.ClusterManager.V4TransitSwitchSubnet, err) + return nil, fmt.Errorf("error creating IP Generator for v4 transit subnet %s: %w", config.ClusterManager.V4TransitSubnet, err) } } if config.IPv6Mode { - transitSwitchIPv6Generator, err = ipgenerator.NewIPGenerator(config.ClusterManager.V6TransitSwitchSubnet) + transitSwitchIPv6Generator, err = ipgenerator.NewIPGenerator(config.ClusterManager.V6TransitSubnet) if err != nil { - return nil, fmt.Errorf("error creating IP Generator for v6 transit switch subnet %s: %w", config.ClusterManager.V4TransitSwitchSubnet, err) + return nil, fmt.Errorf("error creating IP Generator for v6 transit subnet %s: %w", config.ClusterManager.V6TransitSubnet, err) } } } diff --git a/go-controller/pkg/cni/helper_linux_test.go b/go-controller/pkg/cni/helper_linux_test.go index 4efb711de0..fb744c3018 100644 --- a/go-controller/pkg/cni/helper_linux_test.go +++ b/go-controller/pkg/cni/helper_linux_test.go @@ -560,7 +560,7 @@ func TestSetupSriovInterface(t *testing.T) { t.Fatal("failed to get NameSpace for test") }*/ - netNsDoForward := &mocks.NetNS{} + netNsDoForward := &cni_ns_mocks.NetNS{} netNsDoForward.On("Fd", mock.Anything).Return(uintptr(0)) var netNsDoError error netNsDoForward.On("Do", mock.AnythingOfType("func(ns.NetNS) error")).Run(func(args mock.Arguments) { diff --git a/go-controller/pkg/cni/mocks/CNIPluginLibOps.go b/go-controller/pkg/cni/mocks/CNIPluginLibOps.go index 8ddbeb5fd0..43e437a01d 100644 --- a/go-controller/pkg/cni/mocks/CNIPluginLibOps.go +++ b/go-controller/pkg/cni/mocks/CNIPluginLibOps.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/cni/mocks/NetNS.go b/go-controller/pkg/cni/mocks/NetNS.go deleted file mode 100644 index 193359080b..0000000000 --- a/go-controller/pkg/cni/mocks/NetNS.go +++ /dev/null @@ -1,98 +0,0 @@ -// Code generated by mockery v2.14.0. DO NOT EDIT. 
- -package mocks - -import ( - ns "github.com/containernetworking/plugins/pkg/ns" - mock "github.com/stretchr/testify/mock" -) - -// NetNS is an autogenerated mock type for the NetNS type -type NetNS struct { - mock.Mock -} - -// Close provides a mock function with given fields: -func (_m *NetNS) Close() error { - ret := _m.Called() - - var r0 error - if rf, ok := ret.Get(0).(func() error); ok { - r0 = rf() - } else { - r0 = ret.Error(0) - } - - return r0 -} - -// Do provides a mock function with given fields: toRun -func (_m *NetNS) Do(toRun func(ns.NetNS) error) error { - ret := _m.Called(toRun) - - var r0 error - if rf, ok := ret.Get(0).(func(func(ns.NetNS) error) error); ok { - r0 = rf(toRun) - } else { - r0 = ret.Error(0) - } - - return r0 -} - -// Fd provides a mock function with given fields: -func (_m *NetNS) Fd() uintptr { - ret := _m.Called() - - var r0 uintptr - if rf, ok := ret.Get(0).(func() uintptr); ok { - r0 = rf() - } else { - r0 = ret.Get(0).(uintptr) - } - - return r0 -} - -// Path provides a mock function with given fields: -func (_m *NetNS) Path() string { - ret := _m.Called() - - var r0 string - if rf, ok := ret.Get(0).(func() string); ok { - r0 = rf() - } else { - r0 = ret.Get(0).(string) - } - - return r0 -} - -// Set provides a mock function with given fields: -func (_m *NetNS) Set() error { - ret := _m.Called() - - var r0 error - if rf, ok := ret.Get(0).(func() error); ok { - r0 = rf() - } else { - r0 = ret.Error(0) - } - - return r0 -} - -type mockConstructorTestingTNewNetNS interface { - mock.TestingT - Cleanup(func()) -} - -// NewNetNS creates a new instance of NetNS. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. -func NewNetNS(t mockConstructorTestingTNewNetNS) *NetNS { - mock := &NetNS{} - mock.Mock.Test(t) - - t.Cleanup(func() { mock.AssertExpectations(t) }) - - return mock -} diff --git a/go-controller/pkg/cni/types/types.go b/go-controller/pkg/cni/types/types.go index 1813dccf83..90fcb47ef0 100644 --- a/go-controller/pkg/cni/types/types.go +++ b/go-controller/pkg/cni/types/types.go @@ -50,6 +50,11 @@ type NetConf struct { // valid for UDN layer3/layer2 network topology // default value: 100.65.0.0/16,fd99::/64 if not provided JoinSubnet string `json:"joinSubnet,omitempty"` + // transit subnet cidr was previously internally set to the default value, + // but with the recent layer2 topology changes it may overlap with the network Subnet. + // To avoid that, transit subnet is now configurable. Only used by Primary Layer2 networks. 
+ // in case of dualstack cluster, please do a comma-separated list + TransitSubnet string `json:"transitSubnet,omitempty"` // comma-separated list of default gateway IPs for layer2 primary networks // in case of dualstack cluster, please do a comma-separated list // expected format: diff --git a/go-controller/pkg/config/config.go b/go-controller/pkg/config/config.go index 72a22defbd..1dd4dce6ef 100644 --- a/go-controller/pkg/config/config.go +++ b/go-controller/pkg/config/config.go @@ -203,9 +203,11 @@ var ( } ClusterManager = ClusterManagerConfig{ - V4TransitSwitchSubnet: "100.88.0.0/16", - V6TransitSwitchSubnet: "fd97::/64", + V4TransitSubnet: "100.88.0.0/16", + V6TransitSubnet: "fd97::/64", } + + Layer2UsesTransitRouter bool ) const ( @@ -561,10 +563,10 @@ type OvnKubeNodeConfig struct { // ClusterManagerConfig holds configuration for ovnkube-cluster-manager type ClusterManagerConfig struct { - // V4TransitSwitchSubnet to be used in the cluster for interconnecting multiple zones - V4TransitSwitchSubnet string `gcfg:"v4-transit-switch-subnet"` - // V6TransitSwitchSubnet to be used in the cluster for interconnecting multiple zones - V6TransitSwitchSubnet string `gcfg:"v6-transit-switch-subnet"` + // V4TransitSubnet to be used in the cluster for interconnecting multiple zones + V4TransitSubnet string `gcfg:"v4-transit-subnet"` + // V6TransitSubnet to be used in the cluster for interconnecting multiple zones + V6TransitSubnet string `gcfg:"v6-transit-subnet"` } // OvnDBScheme describes the OVN database connection transport method @@ -684,6 +686,7 @@ func PrepareTestConfig() error { if Gateway.Mode != GatewayModeDisabled { Gateway.EphemeralPortRange = DefaultEphemeralPortRange } + Layer2UsesTransitRouter = true if err := completeConfig(); err != nil { return err @@ -1659,16 +1662,16 @@ var OvnKubeNodeFlags = []cli.Flag{ // ClusterManagerFlags captures ovnkube-cluster-manager specific configurations var ClusterManagerFlags = []cli.Flag{ &cli.StringFlag{ - Name: "cluster-manager-v4-transit-switch-subnet", - Usage: "The v4 transit switch subnet used for assigning transit switch IPv4 addresses for interconnect", - Destination: &cliConfig.ClusterManager.V4TransitSwitchSubnet, - Value: ClusterManager.V4TransitSwitchSubnet, + Name: "cluster-manager-v4-transit-subnet", + Usage: "The v4 transit subnet used for assigning transit switch and transit router IPv4 addresses for interconnect", + Destination: &cliConfig.ClusterManager.V4TransitSubnet, + Value: ClusterManager.V4TransitSubnet, }, &cli.StringFlag{ - Name: "cluster-manager-v6-transit-switch-subnet", - Usage: "The v6 transit switch subnet used for assigning transit switch IPv6 addresses for interconnect", - Destination: &cliConfig.ClusterManager.V6TransitSwitchSubnet, - Value: ClusterManager.V6TransitSwitchSubnet, + Name: "cluster-manager-v6-transit-subnet", + Usage: "The v6 transit subnet used for assigning transit switch and transit router IPv6 addresses for interconnect", + Destination: &cliConfig.ClusterManager.V6TransitSubnet, + Value: ClusterManager.V6TransitSubnet, }, } @@ -2187,14 +2190,14 @@ func buildClusterManagerConfig(cli, file *config) error { // into their final form. 
func completeClusterManagerConfig(allSubnets *ConfigSubnets) error { // Validate v4 and v6 transit switch subnets - v4IP, v4TransitCIDR, err := net.ParseCIDR(ClusterManager.V4TransitSwitchSubnet) + v4IP, v4TransitCIDR, err := net.ParseCIDR(ClusterManager.V4TransitSubnet) if err != nil || utilnet.IsIPv6(v4IP) { - return fmt.Errorf("invalid transit switch v4 subnet specified, subnet: %s: error: %v", ClusterManager.V4TransitSwitchSubnet, err) + return fmt.Errorf("invalid transit switch v4 subnet specified, subnet: %s: error: %v", ClusterManager.V4TransitSubnet, err) } - v6IP, v6TransitCIDR, err := net.ParseCIDR(ClusterManager.V6TransitSwitchSubnet) + v6IP, v6TransitCIDR, err := net.ParseCIDR(ClusterManager.V6TransitSubnet) if err != nil || !utilnet.IsIPv6(v6IP) { - return fmt.Errorf("invalid transit switch v6 subnet specified, subnet: %s: error: %v", ClusterManager.V6TransitSwitchSubnet, err) + return fmt.Errorf("invalid transit switch v6 subnet specified, subnet: %s: error: %v", ClusterManager.V6TransitSubnet, err) } allSubnets.Append(ConfigSubnetTransit, v4TransitCIDR) allSubnets.Append(ConfigSubnetTransit, v6TransitCIDR) @@ -2507,7 +2510,7 @@ func completeConfig() error { return err } - if err := allSubnets.CheckForOverlaps(); err != nil { + if _, _, err := allSubnets.CheckForOverlaps(); err != nil { return err } diff --git a/go-controller/pkg/config/config_test.go b/go-controller/pkg/config/config_test.go index c5a032c92c..6eb6013ea0 100644 --- a/go-controller/pkg/config/config_test.go +++ b/go-controller/pkg/config/config_test.go @@ -236,8 +236,8 @@ enable-admin-network-policy=false enable-persistent-ips=false [clustermanager] -v4-transit-switch-subnet=100.89.0.0/16 -v6-transit-switch-subnet=fd98::/64 +v4-transit-subnet=100.89.0.0/16 +v6-transit-subnet=fd98::/64 ` var newData string @@ -703,8 +703,8 @@ var _ = Describe("Config Operations", func() { gomega.Expect(HybridOverlay.ClusterSubnets).To(gomega.Equal([]CIDRNetworkEntry{ {ovntest.MustParseIPNet("11.132.0.0/14"), 23}, })) - gomega.Expect(ClusterManager.V4TransitSwitchSubnet).To(gomega.Equal("100.89.0.0/16")) - gomega.Expect(ClusterManager.V6TransitSwitchSubnet).To(gomega.Equal("fd98::/64")) + gomega.Expect(ClusterManager.V4TransitSubnet).To(gomega.Equal("100.89.0.0/16")) + gomega.Expect(ClusterManager.V6TransitSubnet).To(gomega.Equal("fd98::/64")) return nil } @@ -815,8 +815,8 @@ var _ = Describe("Config Operations", func() { })) gomega.Expect(Default.MonitorAll).To(gomega.BeFalse()) gomega.Expect(Default.OfctrlWaitBeforeClear).To(gomega.Equal(5000)) - gomega.Expect(ClusterManager.V4TransitSwitchSubnet).To(gomega.Equal("100.90.0.0/16")) - gomega.Expect(ClusterManager.V6TransitSwitchSubnet).To(gomega.Equal("fd96::/64")) + gomega.Expect(ClusterManager.V4TransitSubnet).To(gomega.Equal("100.90.0.0/16")) + gomega.Expect(ClusterManager.V6TransitSubnet).To(gomega.Equal("fd96::/64")) return nil } @@ -891,8 +891,8 @@ var _ = Describe("Config Operations", func() { "-dns-service-namespace=kube-system-2", "-dns-service-name=kube-dns-2", "-disable-requestedchassis=true", - "-cluster-manager-v4-transit-switch-subnet=100.90.0.0/16", - "-cluster-manager-v6-transit-switch-subnet=fd96::/64", + "-cluster-manager-v4-transit-subnet=100.90.0.0/16", + "-cluster-manager-v6-transit-subnet=fd96::/64", } err = app.Run(cliArgs) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -1224,7 +1224,7 @@ enable-pprof=true } cliArgs := []string{ app.Name, - "-cluster-manager-v4-transit-switch-subnet=foobar", + "-cluster-manager-v4-transit-subnet=foobar", } err := 
app.Run(cliArgs) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -1237,7 +1237,7 @@ enable-pprof=true } cliArgs := []string{ app.Name, - "-cluster-manager-v6-transit-switch-subnet=100.89.0.0/16", + "-cluster-manager-v6-transit-subnet=100.89.0.0/16", } err := app.Run(cliArgs) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -1250,13 +1250,13 @@ enable-pprof=true } cliArgs := []string{ app.Name, - "-cluster-manager-v4-transit-switch-subnet=100.89.0.0/16", - "-cluster-manager-v6-transit-switch-subnet=fd99::/64", + "-cluster-manager-v4-transit-subnet=100.89.0.0/16", + "-cluster-manager-v6-transit-subnet=fd99::/64", } err := app.Run(cliArgs) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - gomega.Expect(ClusterManager.V4TransitSwitchSubnet).To(gomega.Equal("100.89.0.0/16")) - gomega.Expect(ClusterManager.V6TransitSwitchSubnet).To(gomega.Equal("fd99::/64")) + gomega.Expect(ClusterManager.V4TransitSubnet).To(gomega.Equal("100.89.0.0/16")) + gomega.Expect(ClusterManager.V6TransitSubnet).To(gomega.Equal("fd99::/64")) }) It("overrides config file and defaults with CLI options (multi-master)", func() { kubeconfigFile, _, err := createTempFile("kubeconfig") diff --git a/go-controller/pkg/config/utils.go b/go-controller/pkg/config/utils.go index 20f4e0b35c..8aed084d57 100644 --- a/go-controller/pkg/config/utils.go +++ b/go-controller/pkg/config/utils.go @@ -209,17 +209,18 @@ func (cs *ConfigSubnets) Append(subnetType ConfigSubnetType, subnet *net.IPNet) } } -// CheckForOverlaps checks if any of the subnets in cs overlap -func (cs *ConfigSubnets) CheckForOverlaps() error { +// CheckForOverlaps checks if any of the subnets in cs overlap, and returns the first overlapping subnets +// together with an error. +func (cs *ConfigSubnets) CheckForOverlaps() (*net.IPNet, *net.IPNet, error) { for i, si := range cs.Subnets { for j := 0; j < i; j++ { sj := cs.Subnets[j] if si.Subnet.Contains(sj.Subnet.IP) || sj.Subnet.Contains(si.Subnet.IP) { - return NewSubnetOverlapError(si, sj) + return si.Subnet, sj.Subnet, NewSubnetOverlapError(si, sj) } } } - return nil + return nil, nil, nil } func (cs *ConfigSubnets) describeSubnetType(subnetType ConfigSubnetType) string { diff --git a/go-controller/pkg/config/utils_test.go b/go-controller/pkg/config/utils_test.go index 0092dc34dd..804b179a99 100644 --- a/go-controller/pkg/config/utils_test.go +++ b/go-controller/pkg/config/utils_test.go @@ -308,7 +308,7 @@ func Test_checkForOverlap(t *testing.T) { allSubnets.Append(ConfigSubnetCluster, subnet) } - err := allSubnets.CheckForOverlaps() + _, _, err := allSubnets.CheckForOverlaps() if err == nil && tc.shouldError { t.Errorf("testcase \"%s\" failed to find overlap", tc.name) } else if err != nil && !tc.shouldError { diff --git a/go-controller/pkg/controller/controller.go b/go-controller/pkg/controller/controller.go index f486981bdd..6d518c3e0c 100644 --- a/go-controller/pkg/controller/controller.go +++ b/go-controller/pkg/controller/controller.go @@ -255,7 +255,7 @@ func (c *controller[T]) processNextQueueItem() bool { if err != nil { retry := c.config.MaxAttempts == InfiniteAttempts || c.queue.NumRequeues(key) < c.config.MaxAttempts if retry { - klog.Infof("Controller %s: error found while processing %s: %v", c.name, key, err) + klog.Errorf("Controller %s: error found while processing %s: %v", c.name, key, err) c.queue.AddRateLimited(key) return true } diff --git a/go-controller/pkg/controllermanager/controller_manager.go b/go-controller/pkg/controllermanager/controller_manager.go index 6597e381ca..fef5b54f24 100644 --- 
a/go-controller/pkg/controllermanager/controller_manager.go +++ b/go-controller/pkg/controllermanager/controller_manager.go @@ -9,6 +9,8 @@ import ( "github.com/containernetworking/cni/pkg/types" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" @@ -252,8 +254,8 @@ func NewControllerManager(ovnClient *util.OVNClientset, wf *factory.WatchFactory wg: wg, multicastSupport: config.EnableMulticast, } - var err error + + cm.networkManager = networkmanager.Default() if config.OVNKubernetesFeature.EnableMultiNetwork { cm.networkManager, err = networkmanager.NewForZone(config.Default.Zone, cm, wf) @@ -422,6 +424,10 @@ func (cm *ControllerManager) Start(ctx context.Context) error { } klog.Infof("Waiting for node in zone sync took: %s", time.Since(start)) + if err = cm.setTopologyType(); err != nil { + return fmt.Errorf("failed to set layer2 topology type: %w", err) + } + cm.configureMetrics(cm.stopChan) err = cm.configureSCTPSupport() @@ -536,3 +542,112 @@ func (cm *ControllerManager) configureAdvertisedNetworkIsolation() error { _, err := addressSetFactory.EnsureAddressSet(ovn.GetAdvertisedNetworkSubnetsAddressSetDBIDs()) return err } + +func (cm *ControllerManager) setTopologyType() error { + nodes, err := cm.kube.KClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("unable to list nodes to determine layer2 topology type: %w", err) + } + // set it to true and check if all the nodes in the zone already have the annotation + config.Layer2UsesTransitRouter = true + for _, node := range nodes.Items { + if util.GetNodeZone(&node) == config.Default.Zone && node.Annotations[util.Layer2TopologyVersion] != util.TransitRouterTopoVersion { + // at least one node doesn't have the annotation + config.Layer2UsesTransitRouter = false + break + } + } + if config.Layer2UsesTransitRouter { + // all nodes are already using new topology, no need to do anything extra + return nil + } + + // Transit router is not used yet, check if we can switch to the new topology now. + // Find all layer2 switches and check if they have any running pods.
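+ // To summarize the gate implemented below: the switch to the new topology only
+ // happens when (a) no layer2 switch still hosts running pods and (b) cluster
+ // manager is already annotating tunnel keys on NADs (that check is skipped when
+ // no layer2 switches exist at all); only then is Layer2UsesTransitRouter flipped
+ // and the topology-version annotation stamped on the zone's nodes.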
+ layer2Switches, err := libovsdbops.FindLogicalSwitchesWithPredicate(cm.nbClient, func(ls *nbdb.LogicalSwitch) bool { + return ls.ExternalIDs[ovntypes.TopologyExternalID] == ovntypes.Layer2Topology + }) + if err != nil { + return fmt.Errorf("failed to find layer2 switches: %w", err) + } + for _, sw := range layer2Switches { + hasRunningPods, err := cm.hasLocalPodsOnSwitch(sw) + if err != nil { + return fmt.Errorf("failed to check if there are running pods on switch %s: %w", sw.Name, err) + } + if hasRunningPods { + klog.Infof("Network %s has running pods, not switching to transit router topology yet", sw.Name) + return nil + } + } + // we checked all layer2 switches and none of them has running pods + // now make sure that cluster manager has upgraded and is assigning tunnel keys, otherwise new topology won't work + + // no layer2 switches means there are no layer2 networks (already handled, new ones are fine), so we won't find tunnel-keys annotations + if len(layer2Switches) != 0 { + existingNADs, err := cm.kube.NADClient.K8sCniCncfIoV1().NetworkAttachmentDefinitions("").List(context.TODO(), metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to list existing NADs: %w", err) + } + clusterManagerReady := false + for _, nad := range existingNADs.Items { + if nad.Annotations[ovntypes.OvnNetworkTunnelKeysAnnotation] != "" { + clusterManagerReady = true + break + } + } + if !clusterManagerReady { + klog.Infof("Cluster manager is not ready to assign tunnel keys yet, not switching to transit router topology yet") + return nil + } + } + + klog.Infof("Switching to transit router for layer2 networks") + config.Layer2UsesTransitRouter = true + return cm.setUDNLayer2NodeUsesTransitRouter(nodes) +} + +func hasPort(ports []string, port string) bool { + for _, p := range ports { + if p == port { + return true + } + } + return false +} + +func (cm *ControllerManager) hasLocalPodsOnSwitch(sw *nbdb.LogicalSwitch) (bool, error) { + if len(sw.Ports) == 0 { + return false, nil + } + + ports, err := libovsdbops.FindLogicalSwitchPortWithPredicate( + cm.nbClient, + func(lsp *nbdb.LogicalSwitchPort) bool { + return lsp.Type == "" && + lsp.ExternalIDs["pod"] == "true" && + hasPort(sw.Ports, lsp.UUID) + }) + if err != nil { + return false, err + } + if len(ports) > 0 { + return true, nil + } + return false, nil +} + +func (cm *ControllerManager) setUDNLayer2NodeUsesTransitRouter(nodeList *corev1.NodeList) error { + for _, node := range nodeList.Items { + if util.GetNodeZone(&node) == config.Default.Zone { + annotator := kube.NewNodeAnnotator(cm.kube, node.Name) + if err := annotator.Set(util.Layer2TopologyVersion, util.TransitRouterTopoVersion); err != nil { + return fmt.Errorf("failed to set node %s annotation %s: %w", node.Name, util.Layer2TopologyVersion, err) + } + if err := annotator.Run(); err != nil { + return fmt.Errorf("failed to run node %s annotator: %w", node.Name, err) + } + } + } + return nil +} diff --git a/go-controller/pkg/factory/mocks/NodeWatchFactory.go b/go-controller/pkg/factory/mocks/NodeWatchFactory.go index 11c39843de..3eb31c87ae 100644 --- a/go-controller/pkg/factory/mocks/NodeWatchFactory.go +++ b/go-controller/pkg/factory/mocks/NodeWatchFactory.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks @@ -32,7 +32,7 @@ type NodeWatchFactory struct { mock.Mock } -// APBRouteInformer provides a mock function with given fields: +// APBRouteInformer provides a mock function with no fields func (_m *NodeWatchFactory) APBRouteInformer() v1.AdminPolicyBasedExternalRouteInformer { ret := _m.Called() @@ -202,7 +202,7 @@ func (_m *NodeWatchFactory) AddServiceHandler(handlerFuncs cache.ResourceEventHa return r0, r1 } -// ClusterUserDefinedNetworkInformer provides a mock function with given fields: +// ClusterUserDefinedNetworkInformer provides a mock function with no fields func (_m *NodeWatchFactory) ClusterUserDefinedNetworkInformer() userdefinednetworkv1.ClusterUserDefinedNetworkInformer { ret := _m.Called() @@ -222,7 +222,7 @@ func (_m *NodeWatchFactory) ClusterUserDefinedNetworkInformer() userdefinednetwo return r0 } -// EgressIPInformer provides a mock function with given fields: +// EgressIPInformer provides a mock function with no fields func (_m *NodeWatchFactory) EgressIPInformer() egressipv1.EgressIPInformer { ret := _m.Called() @@ -242,7 +242,7 @@ func (_m *NodeWatchFactory) EgressIPInformer() egressipv1.EgressIPInformer { return r0 } -// GetAllPods provides a mock function with given fields: +// GetAllPods provides a mock function with no fields func (_m *NodeWatchFactory) GetAllPods() ([]*corev1.Pod, error) { ret := _m.Called() @@ -332,7 +332,7 @@ func (_m *NodeWatchFactory) GetNamespace(name string) (*corev1.Namespace, error) return r0, r1 } -// GetNamespaces provides a mock function with given fields: +// GetNamespaces provides a mock function with no fields func (_m *NodeWatchFactory) GetNamespaces() ([]*corev1.Namespace, error) { ret := _m.Called() @@ -392,7 +392,7 @@ func (_m *NodeWatchFactory) GetNode(name string) (*corev1.Node, error) { return r0, r1 } -// GetNodes provides a mock function with given fields: +// GetNodes provides a mock function with no fields func (_m *NodeWatchFactory) GetNodes() ([]*corev1.Node, error) { ret := _m.Called() @@ -542,7 +542,7 @@ func (_m *NodeWatchFactory) GetServiceEndpointSlices(namespace string, svcName s return r0, r1 } -// GetServices provides a mock function with given fields: +// GetServices provides a mock function with no fields func (_m *NodeWatchFactory) GetServices() ([]*corev1.Service, error) { ret := _m.Called() @@ -602,7 +602,7 @@ func (_m *NodeWatchFactory) ListNodes(selector labels.Selector) ([]*corev1.Node, return r0, r1 } -// LocalPodInformer provides a mock function with given fields: +// LocalPodInformer provides a mock function with no fields func (_m *NodeWatchFactory) LocalPodInformer() cache.SharedIndexInformer { ret := _m.Called() @@ -622,7 +622,7 @@ func (_m *NodeWatchFactory) LocalPodInformer() cache.SharedIndexInformer { return r0 } -// NADInformer provides a mock function with given fields: +// NADInformer provides a mock function with no fields func (_m *NodeWatchFactory) NADInformer() k8s_cni_cncf_iov1.NetworkAttachmentDefinitionInformer { ret := _m.Called() @@ -642,7 +642,7 @@ func (_m *NodeWatchFactory) NADInformer() k8s_cni_cncf_iov1.NetworkAttachmentDef return r0 } -// NamespaceInformer provides a mock function with given fields: +// NamespaceInformer provides a mock function with no fields func (_m *NodeWatchFactory) NamespaceInformer() informerscorev1.NamespaceInformer { ret := _m.Called() @@ -662,7 +662,7 @@ func (_m *NodeWatchFactory) NamespaceInformer() informerscorev1.NamespaceInforme return r0 } -// NodeCoreInformer provides a mock function with given fields: +// NodeCoreInformer 
provides a mock function with no fields func (_m *NodeWatchFactory) NodeCoreInformer() informerscorev1.NodeInformer { ret := _m.Called() @@ -682,7 +682,7 @@ func (_m *NodeWatchFactory) NodeCoreInformer() informerscorev1.NodeInformer { return r0 } -// NodeInformer provides a mock function with given fields: +// NodeInformer provides a mock function with no fields func (_m *NodeWatchFactory) NodeInformer() cache.SharedIndexInformer { ret := _m.Called() @@ -702,7 +702,7 @@ func (_m *NodeWatchFactory) NodeInformer() cache.SharedIndexInformer { return r0 } -// PodCoreInformer provides a mock function with given fields: +// PodCoreInformer provides a mock function with no fields func (_m *NodeWatchFactory) PodCoreInformer() informerscorev1.PodInformer { ret := _m.Called() @@ -742,7 +742,7 @@ func (_m *NodeWatchFactory) RemoveServiceHandler(handler *factory.Handler) { _m.Called(handler) } -// RouteAdvertisementsInformer provides a mock function with given fields: +// RouteAdvertisementsInformer provides a mock function with no fields func (_m *NodeWatchFactory) RouteAdvertisementsInformer() routeadvertisementsv1.RouteAdvertisementsInformer { ret := _m.Called() @@ -762,12 +762,12 @@ func (_m *NodeWatchFactory) RouteAdvertisementsInformer() routeadvertisementsv1. return r0 } -// Shutdown provides a mock function with given fields: +// Shutdown provides a mock function with no fields func (_m *NodeWatchFactory) Shutdown() { _m.Called() } -// Start provides a mock function with given fields: +// Start provides a mock function with no fields func (_m *NodeWatchFactory) Start() error { ret := _m.Called() @@ -785,7 +785,7 @@ func (_m *NodeWatchFactory) Start() error { return r0 } -// UserDefinedNetworkInformer provides a mock function with given fields: +// UserDefinedNetworkInformer provides a mock function with no fields func (_m *NodeWatchFactory) UserDefinedNetworkInformer() userdefinednetworkv1.UserDefinedNetworkInformer { ret := _m.Called() diff --git a/go-controller/pkg/factory/mocks/ObjectCacheInterface.go b/go-controller/pkg/factory/mocks/ObjectCacheInterface.go index b63736d685..8b6dcb5b7a 100644 --- a/go-controller/pkg/factory/mocks/ObjectCacheInterface.go +++ b/go-controller/pkg/factory/mocks/ObjectCacheInterface.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks @@ -15,7 +15,7 @@ type ObjectCacheInterface struct { mock.Mock } -// GetAllPods provides a mock function with given fields: +// GetAllPods provides a mock function with no fields func (_m *ObjectCacheInterface) GetAllPods() ([]*v1.Pod, error) { ret := _m.Called() @@ -75,7 +75,7 @@ func (_m *ObjectCacheInterface) GetNamespace(name string) (*v1.Namespace, error) return r0, r1 } -// GetNamespaces provides a mock function with given fields: +// GetNamespaces provides a mock function with no fields func (_m *ObjectCacheInterface) GetNamespaces() ([]*v1.Namespace, error) { ret := _m.Called() @@ -135,7 +135,7 @@ func (_m *ObjectCacheInterface) GetNode(name string) (*v1.Node, error) { return r0, r1 } -// GetNodes provides a mock function with given fields: +// GetNodes provides a mock function with no fields func (_m *ObjectCacheInterface) GetNodes() ([]*v1.Node, error) { ret := _m.Called() diff --git a/go-controller/pkg/generator/ip/ip_generator.go b/go-controller/pkg/generator/ip/ip_generator.go index 1344fc3d03..a3c096fe6b 100644 --- a/go-controller/pkg/generator/ip/ip_generator.go +++ b/go-controller/pkg/generator/ip/ip_generator.go @@ -5,6 +5,8 @@ import ( "math/big" "net" + iputils "github.com/containernetworking/plugins/pkg/ip" + utilnet "k8s.io/utils/net" ) @@ -40,3 +42,27 @@ func (ipGenerator *IPGenerator) GenerateIP(idx int) (*net.IPNet, error) { } return nil, fmt.Errorf("generated ip %s from the idx %d is out of range in the network %s", ip.String(), idx, ipGenerator.netCidr.String()) } + +// GenerateIPPair generates a pair of CIDRs in a subnet of size 2 (/31 or /127), carved from a supernet. +// idx determines the offset of the subnet chosen. For example, if the supernet was 100.88.0.0/16, +// the ordered list of subnets would be: +// [idx=0] 100.88.0.0 - 100.88.0.1 (100.88.0.0/31) +// [idx=1] 100.88.0.2 - 100.88.0.3 (100.88.0.2/31) +// [idx=2] 100.88.0.4 - 100.88.0.5 (100.88.0.4/31) +func (ipGenerator *IPGenerator) GenerateIPPair(idx int) (*net.IPNet, *net.IPNet, error) { + netMask := net.CIDRMask(31, 32) + if utilnet.IsIPv6CIDR(ipGenerator.netCidr) { + netMask = net.CIDRMask(127, 128) + } + numberOfIPs := 2 + // nodeIDs start from 1; netIP is the first IP of the subnet + firstIP := utilnet.AddIPOffset(ipGenerator.netBaseIP, idx*numberOfIPs) + if !ipGenerator.netCidr.Contains(firstIP) { + return nil, nil, fmt.Errorf("generated ip %s from the idx %d is out of range in the network %s", firstIP.String(), idx, ipGenerator.netCidr.String()) + } + secondIP := iputils.NextIP(firstIP) + if secondIP == nil || !ipGenerator.netCidr.Contains(secondIP) { + return nil, nil, fmt.Errorf("generated ip %s from the idx %d is out of range in the network %s", secondIP.String(), idx, ipGenerator.netCidr.String()) + } + return &net.IPNet{IP: firstIP, Mask: netMask}, &net.IPNet{IP: secondIP, Mask: netMask}, nil +} diff --git a/go-controller/pkg/generator/udn/join_ips.go b/go-controller/pkg/generator/udn/join_ips.go index 945799d245..f632365b37 100644 --- a/go-controller/pkg/generator/udn/join_ips.go +++ b/go-controller/pkg/generator/udn/join_ips.go @@ -88,3 +88,14 @@ func getGWRouterIP(subnet string, nodeID int) (*net.IPNet, error) { } return nodeGWRouterLRPIPGenerator.GenerateIP(nodeID) } + +// GetLastIPsFromJoinSubnet returns, for each enabled IP family, an address taken from the end of the network's join subnet. +func GetLastIPsFromJoinSubnet(netInfo util.NetInfo) []*net.IPNet { + var gwRouterAddrs []*net.IPNet + if config.IPv4Mode { + gwRouterAddrs = append(gwRouterAddrs, util.GetLastIPOfSubnet(netInfo.JoinSubnetV4(), 1)) + } + if config.IPv6Mode { + gwRouterAddrs = append(gwRouterAddrs,
util.GetLastIPOfSubnet(netInfo.JoinSubnetV6(), 1)) + } + return gwRouterAddrs +} diff --git a/go-controller/pkg/generator/udn/masquerade_ips.go b/go-controller/pkg/generator/udn/masquerade_ips.go index 5882fb809e..3cec3c3833 100644 --- a/go-controller/pkg/generator/udn/masquerade_ips.go +++ b/go-controller/pkg/generator/udn/masquerade_ips.go @@ -86,3 +86,23 @@ func GetUDNGatewayMasqueradeIPs(networkID int) ([]*net.IPNet, error) { } return masqIPs, nil } + +// GetUDNMgmtPortMasqueradeIPs returns the list of management port masqueradeIPs for the given UDN's networkID +func GetUDNMgmtPortMasqueradeIPs(networkID int) ([]*net.IPNet, error) { + var masqIPs []*net.IPNet + if config.IPv4Mode { + v4MasqIPs, err := AllocateV4MasqueradeIPs(networkID) + if err != nil { + return nil, fmt.Errorf("failed to get v4 masquerade IP, networkID %d: %v", networkID, err) + } + masqIPs = append(masqIPs, v4MasqIPs.ManagementPort) + } + if config.IPv6Mode { + v6MasqIPs, err := AllocateV6MasqueradeIPs(networkID) + if err != nil { + return nil, fmt.Errorf("failed to get v6 masquerade IP, networkID %d: %v", networkID, err) + } + masqIPs = append(masqIPs, v6MasqIPs.ManagementPort) + } + return masqIPs, nil +} diff --git a/go-controller/pkg/kube/mocks/Annotator.go b/go-controller/pkg/kube/mocks/Annotator.go index 082d7c43fc..3bc47c92c1 100644 --- a/go-controller/pkg/kube/mocks/Annotator.go +++ b/go-controller/pkg/kube/mocks/Annotator.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks @@ -14,7 +14,7 @@ func (_m *Annotator) Delete(key string) { _m.Called(key) } -// Run provides a mock function with given fields: +// Run provides a mock function with no fields func (_m *Annotator) Run() error { ret := _m.Called() diff --git a/go-controller/pkg/kube/mocks/HTTPServer.go b/go-controller/pkg/kube/mocks/HTTPServer.go index 56f2f9ff9f..1998cb8716 100644 --- a/go-controller/pkg/kube/mocks/HTTPServer.go +++ b/go-controller/pkg/kube/mocks/HTTPServer.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/kube/mocks/HTTPServerFactory.go b/go-controller/pkg/kube/mocks/HTTPServerFactory.go index dfe42ae0b7..be3dcb6df7 100644 --- a/go-controller/pkg/kube/mocks/HTTPServerFactory.go +++ b/go-controller/pkg/kube/mocks/HTTPServerFactory.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/kube/mocks/Interface.go b/go-controller/pkg/kube/mocks/Interface.go index 594d33d699..81dd42e2b3 100644 --- a/go-controller/pkg/kube/mocks/Interface.go +++ b/go-controller/pkg/kube/mocks/Interface.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks @@ -17,7 +17,7 @@ type Interface struct { mock.Mock } -// Events provides a mock function with given fields: +// Events provides a mock function with no fields func (_m *Interface) Events() v1.EventInterface { ret := _m.Called() @@ -37,7 +37,7 @@ func (_m *Interface) Events() v1.EventInterface { return r0 } -// GetNode provides a mock function with given fields: name +// GetNodeForWindows provides a mock function with given fields: name func (_m *Interface) GetNodeForWindows(name string) (*corev1.Node, error) { ret := _m.Called(name) @@ -67,7 +67,7 @@ func (_m *Interface) GetNodeForWindows(name string) (*corev1.Node, error) { return r0, r1 } -// GetNodesForWindows provides a mock function with given fields: +// GetNodesForWindows provides a mock function with no fields func (_m *Interface) GetNodesForWindows() ([]*corev1.Node, error) { ret := _m.Called() @@ -97,7 +97,7 @@ func (_m *Interface) GetNodesForWindows() ([]*corev1.Node, error) { return r0, r1 } -// GetPods provides a mock function with given fields: namespace, opts +// GetPodsForDBChecker provides a mock function with given fields: namespace, opts func (_m *Interface) GetPodsForDBChecker(namespace string, opts metav1.ListOptions) ([]*corev1.Pod, error) { ret := _m.Called(namespace, opts) diff --git a/go-controller/pkg/kube/mocks/InterfaceOVN.go b/go-controller/pkg/kube/mocks/InterfaceOVN.go index 18e93ed800..0243889b2f 100644 --- a/go-controller/pkg/kube/mocks/InterfaceOVN.go +++ b/go-controller/pkg/kube/mocks/InterfaceOVN.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks @@ -71,7 +71,7 @@ func (_m *InterfaceOVN) DeleteCloudPrivateIPConfig(name string) error { return r0 } -// Events provides a mock function with given fields: +// Events provides a mock function with no fields func (_m *InterfaceOVN) Events() corev1.EventInterface { ret := _m.Called() @@ -91,7 +91,7 @@ func (_m *InterfaceOVN) Events() corev1.EventInterface { return r0 } -// GetEgressFirewalls provides a mock function with given fields: +// GetEgressFirewalls provides a mock function with no fields func (_m *InterfaceOVN) GetEgressFirewalls() ([]*egressfirewallv1.EgressFirewall, error) { ret := _m.Called() @@ -151,7 +151,7 @@ func (_m *InterfaceOVN) GetEgressIP(name string) (*egressipv1.EgressIP, error) { return r0, r1 } -// GetEgressIPs provides a mock function with given fields: +// GetEgressIPs provides a mock function with no fields func (_m *InterfaceOVN) GetEgressIPs() ([]*egressipv1.EgressIP, error) { ret := _m.Called() @@ -181,7 +181,7 @@ func (_m *InterfaceOVN) GetEgressIPs() ([]*egressipv1.EgressIP, error) { return r0, r1 } -// GetNode provides a mock function with given fields: name +// GetNodeForWindows provides a mock function with given fields: name func (_m *InterfaceOVN) GetNodeForWindows(name string) (*apicorev1.Node, error) { ret := _m.Called(name) @@ -211,12 +211,12 @@ func (_m *InterfaceOVN) GetNodeForWindows(name string) (*apicorev1.Node, error) return r0, r1 } -// GetNodesForWindows provides a mock function with given fields: +// GetNodesForWindows provides a mock function with no fields func (_m *InterfaceOVN) GetNodesForWindows() ([]*apicorev1.Node, error) { ret := _m.Called() if len(ret) == 0 { - panic("no return value specified for GetNodes") + panic("no return value specified for GetNodesForWindows") } var r0 []*apicorev1.Node @@ -241,7 +241,7 @@ func (_m *InterfaceOVN) GetNodesForWindows() ([]*apicorev1.Node, error) { return r0, r1 } -// 
GetPods provides a mock function with given fields: namespace, opts +// GetPodsForDBChecker provides a mock function with given fields: namespace, opts func (_m *InterfaceOVN) GetPodsForDBChecker(namespace string, opts metav1.ListOptions) ([]*apicorev1.Pod, error) { ret := _m.Called(namespace, opts) diff --git a/go-controller/pkg/kube/mocks/Listener.go b/go-controller/pkg/kube/mocks/Listener.go index dc8fda0074..5c3802c71c 100644 --- a/go-controller/pkg/kube/mocks/Listener.go +++ b/go-controller/pkg/kube/mocks/Listener.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/kube/mocks/Server.go b/go-controller/pkg/kube/mocks/Server.go index a91ea7cd89..6d911fd71b 100644 --- a/go-controller/pkg/kube/mocks/Server.go +++ b/go-controller/pkg/kube/mocks/Server.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/kubevirt/pod.go b/go-controller/pkg/kubevirt/pod.go index 8cde9d713e..5a6be80c8f 100644 --- a/go-controller/pkg/kubevirt/pod.go +++ b/go-controller/pkg/kubevirt/pod.go @@ -15,6 +15,7 @@ import ( libovsdbclient "github.com/ovn-kubernetes/libovsdb/client" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/factory" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/generator/udn" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/kube" @@ -515,24 +516,29 @@ func (r *DefaultGatewayReconciler) ReconcileIPv4AfterLiveMigration(liveMigration if liveMigrationStatus.State != LiveMigrationTargetDomainReady { return nil } + var gwMAC net.HardwareAddr + if !config.Layer2UsesTransitRouter { + targetNode, err := r.watchFactory.GetNode(liveMigrationStatus.TargetPod.Spec.NodeName) + if err != nil { + return err + } - targetNode, err := r.watchFactory.GetNode(liveMigrationStatus.TargetPod.Spec.NodeName) - if err != nil { - return err - } + lrpJoinAddress, err := udn.GetGWRouterIPv4(targetNode, r.netInfo) + if err != nil { + return err + } - lrpJoinAddress, err := udn.GetGWRouterIPv4(targetNode, r.netInfo) - if err != nil { - return err + gwMAC = util.IPAddrToHWAddr(lrpJoinAddress) } - - lrpMAC := util.IPAddrToHWAddr(lrpJoinAddress) for _, subnet := range r.netInfo.Subnets() { gwIP := r.netInfo.GetNodeGatewayIP(subnet.CIDR).IP.To4() if gwIP == nil { continue } - garp := util.GARP{IP: gwIP, MAC: &lrpMAC} + if config.Layer2UsesTransitRouter { + gwMAC = util.IPAddrToHWAddr(gwIP) + } + garp := util.GARP{IP: gwIP, MAC: &gwMAC} if err := util.BroadcastGARP(r.interfaceName, garp); err != nil { return err } @@ -573,7 +579,7 @@ func (r *DefaultGatewayReconciler) ReconcileIPv6AfterLiveMigration(liveMigration ras := make([]ndp.RouterAdvertisement, 0, len(nodes)) for _, node := range nodes { - if node.Name == liveMigration.TargetPod.Spec.NodeName { + if !config.Layer2UsesTransitRouter && node.Name == liveMigration.TargetPod.Spec.NodeName { // skip the target node since this is the proper gateway continue } @@ -587,22 +593,47 @@ func (r *DefaultGatewayReconciler) ReconcileIPv6AfterLiveMigration(liveMigration // to signal the removal of the old default gateway. // NOTE: This is a workaround for the issue and may not be needed in the future, after // upgrading to a version that supports the new behavior. 
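+ // In effect the gateway swap is driven purely by RA lifetimes; conceptually
+ // (illustrative sketch only, identifiers hypothetical):
+ //   stale := newRouterAdvertisementFromIPAndLifetime(oldGWIP, dstMAC, dstIP, 0)     // lifetime 0: stop using as default router
+ //   fresh := newRouterAdvertisementFromIPAndLifetime(newGWIP, dstMAC, dstIP, 65535) // max lifetime: preferred router
+ //   ndp.SendRouterAdvertisements(ifaceName, stale, fresh)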
- ras = append(ras, newRouterAdvertisementFromJoinIPAndLifetime(nodeJoinAddrs[0].IP, destinationMAC, destinationIP.IP, 0)) - } - targetNode, err := r.watchFactory.GetNode(liveMigration.TargetPod.Spec.NodeName) - if err != nil { - return fmt.Errorf("failed fetching node %q to reconcile ipv6 gateway: %w", liveMigration.TargetPod.Spec.NodeName, err) + ras = append(ras, newRouterAdvertisementFromIPAndLifetime(nodeJoinAddrs[0].IP, destinationMAC, destinationIP.IP, 0)) } - targetNodeJoinAddrs, err := udn.GetGWRouterIPs(targetNode, r.netInfo) - if err != nil { - return ovntypes.NewSuppressedError(fmt.Errorf("failed parsing join addresss from live migration target node %q and network %q to reconcile ipv6 gateway: %w", targetNode.Name, r.netInfo.GetNetworkName(), err)) + if !config.Layer2UsesTransitRouter { + targetNode, err := r.watchFactory.GetNode(liveMigration.TargetPod.Spec.NodeName) + if err != nil { + return fmt.Errorf("failed fetching node %q to reconcile ipv6 gateway: %w", liveMigration.TargetPod.Spec.NodeName, err) + } + targetNodeJoinAddrs, err := udn.GetGWRouterIPs(targetNode, r.netInfo) + if err != nil { + return ovntypes.NewSuppressedError(fmt.Errorf("failed parsing join address from live migration target node %q and network %q to reconcile ipv6 gateway: %w", targetNode.Name, r.netInfo.GetNetworkName(), err)) + } + ras = append(ras, newRouterAdvertisementFromIPAndLifetime(targetNodeJoinAddrs[0].IP, destinationMAC, destinationIP.IP, 65535)) + } else { + if len(targetPodAnnotation.Gateways) == 0 { + return fmt.Errorf("missing gateways to calculate ipv6 gateway reconciler RA") + } + // The LRP MAC is calculated from the first address on the list. + gwIP := targetPodAnnotation.Gateways[0] + + // Create a Prefix Information option carrying the IPv6 join subnet + prefixNet := r.netInfo.JoinSubnetV6() + if prefixNet == nil { + return fmt.Errorf("no IPv6 join subnet available for network %q", r.netInfo.GetNetworkName()) + } + + prefixInfo := ndp.PrefixInformation{ + Prefix: *prefixNet, + ValidLifetime: 0, + PreferredLifetime: 0, // zero lifetimes deprecate addresses from this prefix immediately + OnLink: true, + Autonomous: true, + } + + ras = append(ras, newRouterAdvertisementWithPrefixInfos(gwIP, destinationMAC, destinationIP.IP, 65535, []ndp.PrefixInformation{prefixInfo})) } - ras = append(ras, newRouterAdvertisementFromJoinIPAndLifetime(targetNodeJoinAddrs[0].IP, destinationMAC, destinationIP.IP, 65535)) + return ndp.SendRouterAdvertisements(r.interfaceName, ras...) } -// newRouterAdvertisementFromJoinIPAndLifetime creates a new Router Advertisement (RA) message -// using the provided join IP address, destination MAC, destination IP, and lifetime. +// newRouterAdvertisementFromIPAndLifetime creates a new Router Advertisement (RA) message +// using the provided IP address, destination MAC, destination IP, and lifetime. // // This function performs the following: // - Derives the source MAC address from the given IP using util.IPAddrToHWAddr. @@ -611,14 +642,25 @@ func (r *DefaultGatewayReconciler) ReconcileIPv6AfterLiveMigration(liveMigration // - Sets the RA message's lifetime to the specified value. // // Parameters: -// - ip: The join IP address used to derive the source MAC and LLA. +// - ip: The IP address used to derive the source MAC and LLA. // - destinationMAC: The MAC address to which the RA message will be sent. // - destinationIP: The IP address to which the RA message will be sent. // - lifetime: The lifetime value for the RA message, in seconds.
// // Returns: // - An ndp.RouterAdvertisement object configured with the calculated source MAC, LLA, and the provided destination MAC, IP, and lifetime. -func newRouterAdvertisementFromJoinIPAndLifetime(ip net.IP, destinationMAC net.HardwareAddr, destinationIP net.IP, lifetime uint16) ndp.RouterAdvertisement { +func newRouterAdvertisementFromIPAndLifetime(ip net.IP, destinationMAC net.HardwareAddr, destinationIP net.IP, lifetime uint16) ndp.RouterAdvertisement { + sourceMAC := util.IPAddrToHWAddr(ip) + return ndp.RouterAdvertisement{ + SourceMAC: sourceMAC, + SourceIP: util.HWAddrToIPv6LLA(sourceMAC), + DestinationMAC: destinationMAC, + DestinationIP: destinationIP, + Lifetime: lifetime, + } +} + +// newRouterAdvertisementWithPrefixInfos is like newRouterAdvertisementFromIPAndLifetime, but additionally attaches the given NDP Prefix Information options to the RA. +func newRouterAdvertisementWithPrefixInfos(ip net.IP, destinationMAC net.HardwareAddr, destinationIP net.IP, lifetime uint16, prefixInfos []ndp.PrefixInformation) ndp.RouterAdvertisement { sourceMAC := util.IPAddrToHWAddr(ip) return ndp.RouterAdvertisement{ SourceMAC: sourceMAC, @@ -626,5 +668,6 @@ func newRouterAdvertisementFromJoinIPAndLifetime(ip net.IP, destinationMAC net.H DestinationMAC: destinationMAC, DestinationIP: destinationIP, Lifetime: lifetime, + PrefixInfos: prefixInfos, } } diff --git a/go-controller/pkg/libovsdb/ops/router.go b/go-controller/pkg/libovsdb/ops/router.go index 5f0ce594d4..abfac46ead 100644 --- a/go-controller/pkg/libovsdb/ops/router.go +++ b/go-controller/pkg/libovsdb/ops/router.go @@ -443,6 +443,45 @@ func CreateOrAddNextHopsToLogicalRouterPolicyWithPredicateOps(nbClient libovsdbc return m.CreateOrUpdateOps(ops, opModels...) } +// ReplaceNextHopForLogicalRouterPolicyWithPredicateOps replaces the Nexthop for logical router policies +// matching the given predicate. It first deletes the old Nexthop and then adds the new Nexthop for each policy. +// Returns the corresponding operations. +func ReplaceNextHopForLogicalRouterPolicyWithPredicateOps(nbClient libovsdbclient.Client, ops []ovsdb.Operation, p logicalRouterPolicyPredicate, + oldNextHop, newNextHop string) ([]ovsdb.Operation, error) { + lrps, err := FindLogicalRouterPoliciesWithPredicate(nbClient, p) + if err != nil { + return nil, err + } + for _, lrp := range lrps { + lrp.Nexthops = []string{oldNextHop} + opModel := operationModel{ + Model: lrp, + OnModelMutations: []interface{}{&lrp.Nexthops}, + ErrNotFound: false, + BulkOp: false, + } + + m := newModelClient(nbClient) + var err error + ops, err = m.DeleteOps(ops, opModel) + if err != nil { + return nil, fmt.Errorf("failed to get delete old nexthop %s ops: %w", oldNextHop, err) + } + lrp.Nexthops = []string{newNextHop} + opModel = operationModel{ + Model: lrp, + OnModelMutations: []interface{}{&lrp.Nexthops}, + ErrNotFound: false, + BulkOp: true, + } + ops, err = m.CreateOrUpdateOps(ops, opModel) + if err != nil { + return nil, fmt.Errorf("failed to get add new nexthop %s ops: %w", newNextHop, err) + } + } + return ops, nil +} + // DeleteNextHopsFromLogicalRouterPolicyOps removes the Nexthops from the // provided logical router policies.
func DeleteNextHopsFromLogicalRouterPolicyOps(nbClient libovsdbclient.Client, ops []ovsdb.Operation, routerName string, lrps []*nbdb.LogicalRouterPolicy, nextHops ...string) ([]ovsdb.Operation, error) { diff --git a/go-controller/pkg/libovsdb/util/router.go b/go-controller/pkg/libovsdb/util/router.go index b316fea0e3..6d301b19f8 100644 --- a/go-controller/pkg/libovsdb/util/router.go +++ b/go-controller/pkg/libovsdb/util/router.go @@ -33,7 +33,8 @@ import ( // (TODO: FIXME): With this route, we are officially breaking support for IC with zones that have multiple-nodes // NOTE: This route is exactly the same as what is added by pod-live-migration feature and we keep the route exactly // same across the 3 features so that if the route already exists on the node, this is just a no-op -func CreateDefaultRouteToExternal(nbClient libovsdbclient.Client, clusterRouter, gwRouterName string, clusterSubnets []config.CIDRNetworkEntry, gatewayIPs []*net.IPNet) error { +func CreateDefaultRouteToExternal(nbClient libovsdbclient.Client, clusterRouter, gwRouterName string, + clusterSubnets []config.CIDRNetworkEntry, gatewayIPs []*net.IPNet) error { for _, clusterSubnet := range clusterSubnets { isClusterSubnetIPV6 := utilnet.IsIPv6String(clusterSubnet.CIDR.IP.String()) gatewayIP, err := util.MatchFirstIPNetFamily(isClusterSubnetIPV6, gatewayIPs) diff --git a/go-controller/pkg/networkmanager/api.go b/go-controller/pkg/networkmanager/api.go index 79f131da71..79a19415dd 100644 --- a/go-controller/pkg/networkmanager/api.go +++ b/go-controller/pkg/networkmanager/api.go @@ -6,6 +6,7 @@ import ( "k8s.io/client-go/tools/record" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/allocator/id" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" ) @@ -69,6 +70,7 @@ func NewForCluster( wf watchFactory, ovnClient *util.OVNClusterManagerClientset, recorder record.EventRecorder, + tunnelKeysAllocator *id.TunnelKeysAllocator, ) (Controller, error) { return new( "clustermanager-nad-controller", @@ -78,6 +80,7 @@ func NewForCluster( wf, ovnClient, recorder, + tunnelKeysAllocator, ) } @@ -95,6 +98,7 @@ func NewForZone( wf, nil, nil, + nil, ) } @@ -112,6 +116,7 @@ func NewForNode( wf, nil, nil, + nil, ) } @@ -126,8 +131,9 @@ func new( wf watchFactory, ovnClient *util.OVNClusterManagerClientset, recorder record.EventRecorder, + tunnelKeysAllocator *id.TunnelKeysAllocator, ) (Controller, error) { - return newController(name, zone, node, cm, wf, ovnClient, recorder) + return newController(name, zone, node, cm, wf, ovnClient, recorder, tunnelKeysAllocator) } // ControllerManager manages controllers. 
Needs to be provided in order to build diff --git a/go-controller/pkg/networkmanager/nad_controller.go b/go-controller/pkg/networkmanager/nad_controller.go index 78c0fea60e..62c3c3c7af 100644 --- a/go-controller/pkg/networkmanager/nad_controller.go +++ b/go-controller/pkg/networkmanager/nad_controller.go @@ -73,8 +73,9 @@ type nadController struct { // primaryNADs holds a mapping of namespace to NAD of primary UDNs primaryNADs map[string]string - networkIDAllocator id.Allocator - nadClient nadclientset.Interface + networkIDAllocator id.Allocator + tunnelKeysAllocator *id.TunnelKeysAllocator + nadClient nadclientset.Interface } func newController( @@ -85,6 +86,7 @@ wf watchFactory, ovnClient *util.OVNClusterManagerClientset, recorder record.EventRecorder, + tunnelKeysAllocator *id.TunnelKeysAllocator, ) (*nadController, error) { c := &nadController{ name: fmt.Sprintf("[%s NAD controller]", name), @@ -100,7 +102,7 @@ c.nadClient = ovnClient.NetworkAttchDefClient } - // this is cluster network manager, so we allocate network IDs + // this is cluster network manager, so we allocate network IDs and tunnel keys if zone == "" && node == "" { c.networkIDAllocator = id.NewIDAllocator("NetworkIDs", MaxNetworks) // Reserve the ID of the default network @@ -108,6 +110,8 @@ if err != nil { return nil, fmt.Errorf("failed to allocate default network ID: %w", err) } + // tunnelKeysAllocator must be passed for cluster manager + c.tunnelKeysAllocator = tunnelKeysAllocator } config := &controller.ControllerConfig[nettypes.NetworkAttachmentDefinition]{ @@ -348,7 +352,7 @@ func (c *nadController) syncNAD(key string, nad *nettypes.NetworkAttachmentDefin } } - if err := c.handleNetworkID(oldNetwork, ensureNetwork, nad); err != nil { + if err := c.handleNetworkAnnotations(oldNetwork, ensureNetwork, nad); err != nil { return err } @@ -565,22 +569,21 @@ func (c *nadController) DoWithLock(f func(network util.NetInfo) error) error { return errors.Join(errs...) } -// handleNetworkID finds out what the network ID should be for a new network and -// sets it on 'new'. The network ID is primarily found annotated in the NAD. If -// not annotated, it means it is still to be allocated. If this is not the NAD -// controller running in cluster manager, then we don't do anything as we are -// expected to wait until it happens. If this is the NAD controller running in -// cluster manager then a new ID is allocated and annotated on the NAD. The NAD -// controller running in cluster manager also releases here the network ID of a -// network that is being deleted. +// handleNetworkAnnotations assigns or reads info from the NAD annotations. +// We store the network ID and tunnel keys in NAD annotations. This function +// finds out what these values should be for a new network and +// sets them on 'new'. If not annotated, it means they are still to be allocated. +// If this is not the NAD controller running in cluster manager, then we don't +// do anything as we are expected to wait until it happens. +// If this is the NAD controller running in cluster manager then a new ID +// is allocated and annotated on the NAD. The NAD controller running in +// cluster manager also releases the network ID and tunnel keys of a network that is being deleted.
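+// For illustration only (annotation keys shown symbolically; the concrete value
+// format comes from util.FormatTunnelKeysAnnotation): a primary layer2 NAD
+// processed by cluster manager ends up carrying a network ID annotation plus a
+// tunnel keys annotation, e.g. network ID 1 with tunnel keys 16711684 and
+// 16715779, the values exercised in the tests below.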
+func (c *nadController) handleNetworkAnnotations(old util.NetInfo, new util.MutableNetInfo, nad *nettypes.NetworkAttachmentDefinition) (err error) { if new != nil && new.IsDefault() { return nil } - var err error id := types.InvalidID - // check what ID is currently annotated if nad != nil && nad.Annotations[types.OvnNetworkIDAnnotation] != "" { annotated := nad.Annotations[types.OvnNetworkIDAnnotation] @@ -590,11 +593,21 @@ } } + tunnelKeys := []int{} + // check what tunnel keys are currently annotated + if nad != nil && nad.Annotations[types.OvnNetworkTunnelKeysAnnotation] != "" { + tunnelKeys, err = util.ParseTunnelKeysAnnotation(nad.Annotations[types.OvnNetworkTunnelKeysAnnotation]) + if err != nil { + return fmt.Errorf("failed to parse annotated tunnel keys: %w", err) + } + } + // this is not the cluster manager nad controller and we are not allocating // so just return what we got from the annotation if c.networkIDAllocator == nil { if new != nil { new.SetNetworkID(id) + new.SetTunnelKeys(tunnelKeys) } return nil } @@ -602,6 +615,7 @@ // release old ID if the network is being deleted if old != nil && !old.IsDefault() && len(old.GetNADs()) == 0 { c.networkIDAllocator.ReleaseID(old.GetNetworkName()) + c.tunnelKeysAllocator.ReleaseKeys(old.GetNetworkName()) } // nothing to allocate @@ -610,7 +624,7 @@ } name := new.GetNetworkName() - // an ID was annotated, check if it is free to use or stale + // a network ID was annotated, check if it is free to use or stale if id != types.InvalidID { err = c.networkIDAllocator.ReserveID(name, id) if err != nil { @@ -618,27 +632,56 @@ id = types.InvalidID } } + // the tunnel key annotation doesn't need the same check because the tunnel keys allocator is initialized outside the + // nad controller and has already ensured that all annotated tunnel keys are reserved. + + // we are about to allocate resources, so prepare a cleanup function + // in case of error to release them.
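+ // Note: the deferred rollback below inspects the named return value err, which
+ // is why this function declares (err error) in its signature.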
+ var allocatedNetworkID, allocatedTunnelKeys bool + defer func() { + if err != nil { + if allocatedNetworkID { + c.networkIDAllocator.ReleaseID(name) + } + if allocatedTunnelKeys { + c.tunnelKeysAllocator.ReleaseKeys(name) + } + } + }() // we don't have an ID, allocate a new one if id == types.InvalidID { id, err = c.networkIDAllocator.AllocateID(name) if err != nil { return fmt.Errorf("failed to allocate network ID: %w", err) } + allocatedNetworkID = true // check if there is still a network running with that ID in the process // of being stopped other := c.networkController.getRunningNetwork(id) if other != "" && c.networkController.getNetwork(other) == nil { - c.networkIDAllocator.ReleaseID(name) return fmt.Errorf("found other network %s being stopped with allocated ID %d, will retry", other, id) } } + // allocate tunnel keys + if len(tunnelKeys) != getNumberOfTunnelKeys(new) { + tunnelKeys, err = c.tunnelKeysAllocator.AllocateKeys(name, id, getNumberOfTunnelKeys(new)) + if err != nil { + return fmt.Errorf("failed to allocate tunnel keys: %w", err) + } + allocatedTunnelKeys = true + } + // set and annotate the network ID and tunnel keys + tunnelKeyAnno, err := util.FormatTunnelKeysAnnotation(tunnelKeys) + if err != nil { + return fmt.Errorf("failed to format tunnel keys annotation: %w", err) + } annotations := map[string]string{ - types.OvnNetworkNameAnnotation: name, - types.OvnNetworkIDAnnotation: strconv.Itoa(id), + types.OvnNetworkNameAnnotation: name, + types.OvnNetworkIDAnnotation: strconv.Itoa(id), + types.OvnNetworkTunnelKeysAnnotation: tunnelKeyAnno, } if nad.Annotations[types.OvnNetworkNameAnnotation] == annotations[types.OvnNetworkNameAnnotation] { delete(annotations, types.OvnNetworkNameAnnotation) @@ -646,8 +689,12 @@ if nad.Annotations[types.OvnNetworkIDAnnotation] == annotations[types.OvnNetworkIDAnnotation] { delete(annotations, types.OvnNetworkIDAnnotation) } + if nad.Annotations[types.OvnNetworkTunnelKeysAnnotation] == annotations[types.OvnNetworkTunnelKeysAnnotation] { + delete(annotations, types.OvnNetworkTunnelKeysAnnotation) + } if len(annotations) == 0 { new.SetNetworkID(id) + new.SetTunnelKeys(tunnelKeys) return nil } @@ -662,10 +709,10 @@ c.name, ) if err != nil { - c.networkIDAllocator.ReleaseID(name) - return fmt.Errorf("failed to annotate network ID on NAD: %w", err) + return fmt.Errorf("failed to annotate network ID and/or tunnel keys on NAD: %w", err) } new.SetNetworkID(id) + new.SetTunnelKeys(tunnelKeys) return nil } @@ -679,3 +726,18 @@ } return state.controller } + +func getNumberOfTunnelKeys(netInfo util.NetInfo) int { + if netInfo.IsDefault() { + // default network does not need tunnel keys allocation because it always uses network ID 0. + return 0 + } + // Layer3, secondary Layer2 and Localnet topologies need only one tunnel key for now, derived from the network ID + // and limited by MaxNetworks. Don't annotate any tunnel keys in that case until we decide to + // increase MaxNetworks.
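+ // In short: default network -> 0 annotated keys; layer3/localnet/secondary
+ // layer2 -> 0 annotated keys (their single tunnel key is derived from the
+ // network ID); primary layer2 UDN -> 2 annotated keys.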
+ if netInfo.TopologyType() != types.Layer2Topology || !netInfo.IsPrimaryNetwork() { + return 0 + } + // Primary Layer2 UDNs need 2 tunnel keys: one for the layer2 switch and one for the transit router + return 2 +} diff --git a/go-controller/pkg/networkmanager/nad_controller_test.go b/go-controller/pkg/networkmanager/nad_controller_test.go index 1ce5ad9168..6e083b785f 100644 --- a/go-controller/pkg/networkmanager/nad_controller_test.go +++ b/go-controller/pkg/networkmanager/nad_controller_test.go @@ -139,9 +139,10 @@ func TestNADController(t *testing.T) { Name: "networkAPrimary", Type: "ovn-k8s-cni-overlay", }, - Subnets: "10.1.130.0/24", - Role: types.NetworkRolePrimary, - MTU: 1400, + Subnets: "10.1.130.0/24", + TransitSubnet: config.ClusterManager.V4TransitSubnet, + Role: types.NetworkRolePrimary, + MTU: 1400, } networkAIncompatible := &ovncnitypes.NetConf{ Topology: types.LocalnetTopology, @@ -500,12 +501,13 @@ func TestNADController(t *testing.T) { } fakeClient := util.GetOVNClientset().GetClusterManagerClientset() nadController := &nadController{ - nads: map[string]string{}, - primaryNADs: map[string]string{}, - networkController: newNetworkController("", "", "", tcm, nil), - networkIDAllocator: id.NewIDAllocator("NetworkIDs", MaxNetworks), - nadClient: fakeClient.NetworkAttchDefClient, - namespaceLister: &fakeNamespaceLister{}, + nads: map[string]string{}, + primaryNADs: map[string]string{}, + networkController: newNetworkController("", "", "", tcm, nil), + networkIDAllocator: id.NewIDAllocator("NetworkIDs", MaxNetworks), + tunnelKeysAllocator: id.NewTunnelKeyAllocator("TunnelKeys"), + nadClient: fakeClient.NetworkAttchDefClient, + namespaceLister: &fakeNamespaceLister{}, } err = nadController.networkIDAllocator.ReserveID(types.DefaultNetworkName, types.DefaultNetworkID) g.Expect(err).ToNot(gomega.HaveOccurred()) @@ -563,7 +565,11 @@ func TestNADController(t *testing.T) { id, err := nadController.networkIDAllocator.AllocateID(name) g.Expect(err).ToNot(gomega.HaveOccurred()) g.Expect(netController.networks[name].GetNetworkID()).To(gomega.Equal(id)) - + if netInfo.TopologyType() == types.Layer2Topology && netInfo.IsPrimaryNetwork() { + tunnelKeys, err := nadController.tunnelKeysAllocator.AllocateKeys(name, id, 2) + g.Expect(err).ToNot(gomega.HaveOccurred()) + g.Expect(netController.networks[name].GetTunnelKeys()).To(gomega.Equal(tunnelKeys)) + } // test that the actual controllers have the expected config and NADs if !netInfo.IsDefault() { g.Expect(tcm.controllers).To(gomega.HaveKey(testNetworkKey)) @@ -696,6 +702,7 @@ func TestSyncAll(t *testing.T) { wf, fakeClient, nil, + id.NewTunnelKeyAllocator("TunnelKeys"), ) g.Expect(err).ToNot(gomega.HaveOccurred()) @@ -778,6 +785,62 @@ func TestSyncAll(t *testing.T) { } } +func TestResourceCleanup(t *testing.T) { + g := gomega.NewWithT(t) + err := config.PrepareTestConfig() + g.Expect(err).ToNot(gomega.HaveOccurred()) + config.OVNKubernetesFeature.EnableNetworkSegmentation = true + config.OVNKubernetesFeature.EnableMultiNetwork = true + tcm := &testControllerManager{ + controllers: map[string]NetworkController{}, + defaultNetwork: &testNetworkController{ + ReconcilableNetInfo: &util.DefaultNetInfo{}, + }, + } + fakeClient := util.GetOVNClientset().GetClusterManagerClientset() + nadController := &nadController{ + nads: map[string]string{}, + primaryNADs: map[string]string{}, + networkController: newNetworkController("", "", "", tcm, nil), + networkIDAllocator: id.NewIDAllocator("NetworkIDs", MaxNetworks), + tunnelKeysAllocator: 
id.NewTunnelKeyAllocator("TunnelKeys"), + nadClient: fakeClient.NetworkAttchDefClient, + namespaceLister: &fakeNamespaceLister{}, + } + err = nadController.networkIDAllocator.ReserveID(types.DefaultNetworkName, types.DefaultNetworkID) + g.Expect(err).ToNot(gomega.HaveOccurred()) + g.Expect(nadController.networkController.Start()).To(gomega.Succeed()) + defer nadController.networkController.Stop() + + nadNs := "test" + nadName := "nad_1" + nadKey := nadNs + "/" + nadName + networkAPrimary := &ovncnitypes.NetConf{ + Topology: types.Layer2Topology, + NetConf: cnitypes.NetConf{ + Name: "networkAPrimary", + Type: "ovn-k8s-cni-overlay", + }, + Subnets: "10.1.130.0/24", + Role: types.NetworkRolePrimary, + MTU: 1400, + NADName: nadKey, + } + nad, err := buildNAD(nadName, nadNs, networkAPrimary) + g.Expect(err).ToNot(gomega.HaveOccurred()) + + // make annotation update fail (nad doesn't exist), make sure networkID and tunnel keys are released + err = nadController.syncNAD(nadKey, nad) + g.Expect(err).To(gomega.HaveOccurred()) + g.Expect(err.Error()).To(gomega.ContainSubstring("failed to annotate network ID and/or tunnel keys")) + // we know the allocated network ID was 1 and tunnelKeys were [16711684, 16715779] (first available IDs after Default network) + // try to reserve these exact IDs for a different network to make sure they were released + err = nadController.networkIDAllocator.ReserveID("networkB", 1) + g.Expect(err).ToNot(gomega.HaveOccurred()) + err = nadController.tunnelKeysAllocator.ReserveKeys("networkB", []int{16711684, 16715779}) + g.Expect(err).ToNot(gomega.HaveOccurred()) +} + func buildNAD(name, namespace string, network *ovncnitypes.NetConf) (*nettypes.NetworkAttachmentDefinition, error) { config, err := json.Marshal(network) if err != nil { diff --git a/go-controller/pkg/node/controllers/egressip/egressip.go b/go-controller/pkg/node/controllers/egressip/egressip.go index 3c6b340bbf..dd1f15f3c9 100644 --- a/go-controller/pkg/node/controllers/egressip/egressip.go +++ b/go-controller/pkg/node/controllers/egressip/egressip.go @@ -45,6 +45,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/syncmap" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/egressip" utilerrors "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/errors" ) @@ -539,15 +540,15 @@ func (c *Controller) processEIP(eip *eipv1.EgressIP) (*eIPConfig, sets.Set[strin if isValid := isEIPStatusItemValid(status, c.nodeName); !isValid { continue } - eIPNet, err := util.GetIPNetFullMask(status.EgressIP) - if err != nil { + ip := net.ParseIP(status.EgressIP) + if ip == nil { return nil, selectedNamespaces, selectedPods, selectedNamespacesPodIPs, - fmt.Errorf("failed to generate mask for EgressIP %s IP %s: %v", eip.Name, status.EgressIP, err) + fmt.Errorf("failed to parse EgressIP %s IP %s", eip.Name, status.EgressIP) } - if util.IsOVNNetwork(parsedNodeEIPConfig, eIPNet.IP) { + if util.IsOVNNetwork(parsedNodeEIPConfig, ip) { continue } - found, link, err := findLinkOnSameNetworkAsIP(eIPNet.IP, c.v4, c.v6) + found, link, err := findLinkOnSameNetworkAsIP(ip, c.v4, c.v6) if err != nil { return nil, selectedNamespaces, selectedPods, selectedNamespacesPodIPs, fmt.Errorf("failed to find a network to host EgressIP %s IP %s: %v", eip.Name, status.EgressIP, err) @@ -560,7 +561,7 @@ func (c *Controller) processEIP(eip *eipv1.EgressIP) (*eIPConfig, sets.Set[strin if err != nil { return nil, 
selectedNamespaces, selectedPods, selectedNamespacesPodIPs, fmt.Errorf("failed to list namespaces: %w", err) } - isEIPV6 := utilnet.IsIPv6(eIPNet.IP) + isEIPV6 := utilnet.IsIPv6(ip) for _, namespace := range namespaces { netInfo, err := c.getActiveNetworkForNamespace(namespace.Name) if err != nil { @@ -593,13 +594,13 @@ func (c *Controller) processEIP(eip *eipv1.EgressIP) (*eIPConfig, sets.Set[strin if selectedNamespacesPodIPs[namespace.Name] == nil { selectedNamespacesPodIPs[namespace.Name] = make(map[ktypes.NamespacedName]*podIPConfigList) } - selectedNamespacesPodIPs[namespace.Name][podNamespaceName] = generatePodConfig(ips, link, eIPNet, isEIPV6) + selectedNamespacesPodIPs[namespace.Name][podNamespaceName] = generatePodConfig(ips, link, ip, isEIPV6) selectedPods.Insert(podNamespaceName) } } // ensure at least one pod is selected before generating config if len(selectedNamespacesPodIPs) > 0 { - eipSpecificConfig, err = generateEIPConfig(link, eIPNet, isEIPV6) + eipSpecificConfig, err = generateEIPConfig(link, ip, isEIPV6) if err != nil { return nil, selectedNamespaces, selectedPods, selectedNamespacesPodIPs, fmt.Errorf("failed to generate EIP configuration for EgressIP %s IP %s: %v", eip.Name, status.EgressIP, err) @@ -611,7 +612,7 @@ func (c *Controller) processEIP(eip *eipv1.EgressIP) (*eIPConfig, sets.Set[strin return eipSpecificConfig, selectedNamespaces, selectedPods, selectedNamespacesPodIPs, nil } -func generatePodConfig(podIPs []net.IP, link netlink.Link, eIPNet *net.IPNet, isEIPV6 bool) *podIPConfigList { +func generatePodConfig(podIPs []net.IP, link netlink.Link, eIP net.IP, isEIPV6 bool) *podIPConfigList { newPodIPConfigs := newPodIPConfigList() for _, podIP := range podIPs { isPodIPv6 := utilnet.IsIPv6(podIP) @@ -619,7 +620,7 @@ func generatePodConfig(podIPs []net.IP, link netlink.Link, eIPNet *net.IPNet, is continue } ipConfig := newPodIPConfig() - ipConfig.ipTableRule = generateIPTablesSNATRuleArg(podIP, isPodIPv6, link.Attrs().Name, eIPNet.IP.String()) + ipConfig.ipTableRule = generateIPTablesSNATRuleArg(podIP, isPodIPv6, link.Attrs().Name, eIP.String()) ipConfig.ipRule = generateIPRule(podIP, isPodIPv6, link.Attrs().Index) ipConfig.v6 = isPodIPv6 newPodIPConfigs.elems = append(newPodIPConfigs.elems, ipConfig) @@ -628,14 +629,14 @@ func generatePodConfig(podIPs []net.IP, link netlink.Link, eIPNet *net.IPNet, is } // generateEIPConfig generates configuration that isn't related to any pod EIPs to support config of a single EIP -func generateEIPConfig(link netlink.Link, eIPNet *net.IPNet, isEIPV6 bool) (*eIPConfig, error) { +func generateEIPConfig(link netlink.Link, eIP net.IP, isEIPV6 bool) (*eIPConfig, error) { eipConfig := newEIPConfig() linkRoutes, err := generateRoutesForLink(link, isEIPV6) if err != nil { return nil, err } eipConfig.routes = linkRoutes - eipConfig.addr = getNetlinkAddress(eIPNet, link.Attrs().Index) + eipConfig.addr = egressip.GetNetlinkAddress(eIP, link.Attrs().Index) return eipConfig, nil } @@ -1482,14 +1483,6 @@ func isLinkUp(flags string) bool { return strings.Contains(flags, "up") } -func getNetlinkAddress(addr *net.IPNet, ifindex int) *netlink.Addr { - return &netlink.Addr{ - IPNet: addr, - Scope: int(netlink.SCOPE_UNIVERSE), - LinkIndex: ifindex, - } -} - // generateIPRules generates IP rules at a predefined priority for each pod IP with a custom routing table based // from the links 'ifindex' func generateIPRule(srcIP net.IP, isIPv6 bool, ifIndex int) netlink.Rule { diff --git a/go-controller/pkg/node/egressip/gateway_egressip.go 
b/go-controller/pkg/node/egressip/gateway_egressip.go index 38bd2b058e..27700e026e 100644 --- a/go-controller/pkg/node/egressip/gateway_egressip.go +++ b/go-controller/pkg/node/egressip/gateway_egressip.go @@ -3,13 +3,9 @@ package egressip import ( "encoding/json" "fmt" - "math" "net" "sync" - "github.com/vishvananda/netlink" - "golang.org/x/sys/unix" - "k8s.io/apimachinery/pkg/util/sets" corev1informers "k8s.io/client-go/informers/core/v1" corev1listers "k8s.io/client-go/listers/core/v1" @@ -23,6 +19,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/kube" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/linkmanager" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/egressip" ) // markIPs contains packet mark and associated EgressIP IP for IPv4 / IPv6. Key is packet mark, value egress IP @@ -126,6 +123,7 @@ func (mic *MarkIPsCache) deleteMarkIP(pktMark util.EgressIPMark, ip net.IP) { func (mic *MarkIPsCache) replaceAll(markIPs markIPs) { mic.mu.Lock() mic.markToIPs = markIPs + mic.IPToMark = make(map[string]int, len(markIPs.v4)+len(markIPs.v6)) for mark, ipv4 := range markIPs.v4 { mic.IPToMark[ipv4] = mark } @@ -451,7 +449,7 @@ func (g *BridgeEIPAddrManager) addIPBridge(ip net.IP) error { if err != nil { return fmt.Errorf("failed to get link obj by name %s: %v", g.bridgeName, err) } - return g.addrManager.AddAddress(getEIPBridgeNetlinkAddress(ip, link.Attrs().Index)) + return g.addrManager.AddAddress(*egressip.GetNetlinkAddress(ip, link.Attrs().Index)) } func (g *BridgeEIPAddrManager) deleteIPBridge(ip net.IP) error { @@ -459,7 +457,7 @@ func (g *BridgeEIPAddrManager) deleteIPBridge(ip net.IP) error { if err != nil { return fmt.Errorf("failed to get link obj by name %s: %v", g.bridgeName, err) } - return g.addrManager.DelAddress(getEIPBridgeNetlinkAddress(ip, link.Attrs().Index)) + return g.addrManager.DelAddress(*egressip.GetNetlinkAddress(ip, link.Attrs().Index)) } // getAnnotationIPs retrieves the egress IP annotation from the current node Nodes object. If multiple users, callers must synchronise. @@ -514,29 +512,3 @@ func getIPsStr(ips ...net.IP) []string { } return ipsStr } - -func getEIPBridgeNetlinkAddress(ip net.IP, ifindex int) netlink.Addr { - return netlink.Addr{ - IPNet: &net.IPNet{IP: ip, Mask: util.GetIPFullMask(ip)}, - Flags: getEIPNetlinkAddressFlag(ip), - Scope: int(netlink.SCOPE_UNIVERSE), - ValidLft: getEIPNetlinkAddressValidLft(ip), - LinkIndex: ifindex, - } -} - -func getEIPNetlinkAddressFlag(ip net.IP) int { - // isV6? - if ip.To4() == nil && ip.To16() != nil { - return unix.IFA_F_NODAD - } - return 0 -} - -func getEIPNetlinkAddressValidLft(ip net.IP) int { - // isV6? 
- if ip.To4() == nil && ip.To16() != nil { - return math.MaxUint32 - } - return 0 -} diff --git a/go-controller/pkg/node/egressip/gateway_egressip_test.go b/go-controller/pkg/node/egressip/gateway_egressip_test.go index 07a03a87b6..1fe48a6f5b 100644 --- a/go-controller/pkg/node/egressip/gateway_egressip_test.go +++ b/go-controller/pkg/node/egressip/gateway_egressip_test.go @@ -20,6 +20,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/linkmanager" netlink_mocks "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing/mocks/github.com/vishvananda/netlink" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/egressip" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/mocks" ) @@ -63,7 +64,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() eip := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -74,11 +75,11 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("doesn't configure or fail when annotation mark isn't found", func() { - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() eip := getEIPAssignedToNode(nodeName, "", ipV4Addr) @@ -89,11 +90,11 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).ShouldNot(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertNotCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("fails when invalid annotation mark", func() { - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() eip := getEIPAssignedToNode(nodeName, "not-an-integer", ipV4Addr) @@ -104,7 +105,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") 
gomega.Expect(parseEIPsFromAnnotation(node)).ShouldNot(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertNotCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("configures annotations with existing entries", func() { @@ -113,7 +114,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, generateAnnotFromIPs(ipV4Addr2)) defer stopFn() eip := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -124,7 +125,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr, ipV4Addr2)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) }) @@ -135,7 +136,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() assignedEIP := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -147,7 +148,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("removes EgressIP previously assigned", func() { @@ -156,8 +157,8 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrDel", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrDel", nlLinkMock, 
egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() assignedEIP := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -172,9 +173,9 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).ShouldNot(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrDel", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("reconfigures from an old to a new IP", func() { @@ -183,9 +184,9 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrDel", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrDel", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() unassignedEIP := getEIPNotAssignedToNode(mark, ipV4Addr) @@ -201,11 +202,11 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr2)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrDel", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) }) @@ -216,8 +217,8 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - 
nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrDel", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrDel", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() eip := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -231,9 +232,9 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).ShouldNot(gomega.ConsistOf(ipV4Addr)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrDel", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("does not update when EIP is deleted that wasn't assigned to the node", func() { @@ -247,7 +248,7 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr2)) gomega.Expect(nlMock.AssertNotCalled(ginkgo.GinkgoT(), "AddrDel", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) }) }) @@ -258,8 +259,8 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, emptyAnnotation) defer stopFn() eipAssigned1 := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -271,9 +272,9 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr, ipV4Addr2)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - 
getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("delete previous configuration", func() { @@ -282,9 +283,9 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { nlMock.On("LinkByIndex", bridgeLinkIndex).Return(nlLinkMock, nil) nlMock.On("LinkList").Return([]netlink.Link{nlLinkMock}, nil) nlMock.On("AddrList", nlLinkMock, 0).Return([]netlink.Addr{}, nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrAdd", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) - nlMock.On("AddrDel", nlLinkMock, getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr3), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrAdd", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex)).Return(nil) + nlMock.On("AddrDel", nlLinkMock, egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr3), bridgeLinkIndex)).Return(nil) addrMgr, stopFn := initBridgeEIPAddrManager(nodeName, bridgeName, generateAnnotFromIPs(ipV4Addr3)) // previously configured IP defer stopFn() eipAssigned1 := getEIPAssignedToNode(nodeName, mark, ipV4Addr) @@ -295,11 +296,11 @@ var _ = ginkgo.Describe("Gateway EgressIP", func() { gomega.Expect(err).ShouldNot(gomega.HaveOccurred(), "node should be present within kapi") gomega.Expect(parseEIPsFromAnnotation(node)).Should(gomega.ConsistOf(ipV4Addr, ipV4Addr2)) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrAdd", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr2), bridgeLinkIndex))).Should(gomega.BeTrue()) gomega.Expect(nlMock.AssertCalled(ginkgo.GinkgoT(), "AddrDel", nlLinkMock, - getEIPBridgeNetlinkAddressPtr(net.ParseIP(ipV4Addr3), bridgeLinkIndex))).Should(gomega.BeTrue()) + egressip.GetNetlinkAddress(net.ParseIP(ipV4Addr3), bridgeLinkIndex))).Should(gomega.BeTrue()) }) ginkgo.It("no update or failure when mark is not set", func() { @@ -387,11 +388,6 @@ func generateAnnotFromIPs(ips ...string) string { return fmt.Sprintf("[%s]", strings.Join(ipsWithQuotes, ",")) } -func getEIPBridgeNetlinkAddressPtr(ip net.IP, ifindex int) *netlink.Addr { - addr := getEIPBridgeNetlinkAddress(ip, ifindex) - return &addr -} - func parseEIPsFromAnnotation(node *corev1.Node) []string { ips, err := util.ParseNodeBridgeEgressIPsAnnotation(node) if err != nil { diff --git a/go-controller/pkg/node/gateway_init_linux_test.go b/go-controller/pkg/node/gateway_init_linux_test.go index 87a49da908..6f245e906b 100644 --- a/go-controller/pkg/node/gateway_init_linux_test.go +++ b/go-controller/pkg/node/gateway_init_linux_test.go @@ -39,7 +39,7 @@ import ( nodenft "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/nftables" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/routemanager" ovntest "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing" - nodemocks 
"github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing/mocks/github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node" + mgmtportmock "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing/mocks/github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/managementport" linkMock "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing/mocks/github.com/vishvananda/netlink" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" @@ -769,7 +769,7 @@ func shareGatewayInterfaceDPUTest(app *cli.App, testNS ns.NetNS, // FIXME(mk): starting the gateway causing go routines to be spawned within sub functions and therefore they escape the // netns we wanted to set it to originally here. Refactor test cases to not spawn a go routine or just fake out everything // and remove need to create netns - mpmock := &nodemocks.ManagementPort{} + mpmock := &mgmtportmock.Interface{} err = testNS.Do(func(ns.NetNS) error { defer GinkgoRecover() diff --git a/go-controller/pkg/node/node_ip_handler_linux_test.go b/go-controller/pkg/node/node_ip_handler_linux_test.go index aa819cdb8a..c78307cca1 100644 --- a/go-controller/pkg/node/node_ip_handler_linux_test.go +++ b/go-controller/pkg/node/node_ip_handler_linux_test.go @@ -24,7 +24,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/bridgeconfig" nodenft "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/nftables" ovntest "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing" - nodemocks "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing/mocks/github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node" + mgmtportmock "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/testing/mocks/github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/managementport" ovntypes "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" @@ -399,7 +399,7 @@ func configureKubeOVNContext(nodeName string, useNetlink bool) *testCtx { _ = nodenft.SetFakeNFTablesHelper() - mpmock := &nodemocks.ManagementPort{} + mpmock := &mgmtportmock.Interface{} mpmock.On("GetAddresses").Return([]*net.IPNet{tc.mgmtPortIP4, tc.mgmtPortIP6}) fakeBridgeConfiguration := bridgeconfig.TestBridgeConfig("breth0") diff --git a/go-controller/pkg/ovn/address_set/mocks/AddressSet.go b/go-controller/pkg/ovn/address_set/mocks/AddressSet.go index f5dd89448f..abaaf6f47f 100644 --- a/go-controller/pkg/ovn/address_set/mocks/AddressSet.go +++ b/go-controller/pkg/ovn/address_set/mocks/AddressSet.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks @@ -108,7 +108,7 @@ func (_m *AddressSet) DeleteAddressesReturnOps(addresses []string) ([]ovsdb.Oper return r0, r1 } -// Destroy provides a mock function with given fields: +// Destroy provides a mock function with no fields func (_m *AddressSet) Destroy() error { ret := _m.Called() @@ -126,7 +126,7 @@ func (_m *AddressSet) Destroy() error { return r0 } -// GetASHashNames provides a mock function with given fields: +// GetASHashNames provides a mock function with no fields func (_m *AddressSet) GetASHashNames() (string, string) { ret := _m.Called() @@ -154,7 +154,7 @@ func (_m *AddressSet) GetASHashNames() (string, string) { return r0, r1 } -// GetAddresses provides a mock function with given fields: +// GetAddresses provides a mock function with no fields func (_m *AddressSet) GetAddresses() ([]string, []string) { ret := _m.Called() @@ -186,7 +186,7 @@ func (_m *AddressSet) GetAddresses() ([]string, []string) { return r0, r1 } -// GetName provides a mock function with given fields: +// GetName provides a mock function with no fields func (_m *AddressSet) GetName() string { ret := _m.Called() diff --git a/go-controller/pkg/ovn/address_set/mocks/AddressSetDoFunc.go b/go-controller/pkg/ovn/address_set/mocks/AddressSetDoFunc.go deleted file mode 100644 index 88385d570c..0000000000 --- a/go-controller/pkg/ovn/address_set/mocks/AddressSetDoFunc.go +++ /dev/null @@ -1,42 +0,0 @@ -// Code generated by mockery v2.16.0. DO NOT EDIT. - -package mocks - -import ( - addressset "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/address_set" - mock "github.com/stretchr/testify/mock" -) - -// AddressSetDoFunc is an autogenerated mock type for the AddressSetDoFunc type -type AddressSetDoFunc struct { - mock.Mock -} - -// Execute provides a mock function with given fields: as -func (_m *AddressSetDoFunc) Execute(as addressset.AddressSet) error { - ret := _m.Called(as) - - var r0 error - if rf, ok := ret.Get(0).(func(addressset.AddressSet) error); ok { - r0 = rf(as) - } else { - r0 = ret.Error(0) - } - - return r0 -} - -type mockConstructorTestingTNewAddressSetDoFunc interface { - mock.TestingT - Cleanup(func()) -} - -// NewAddressSetDoFunc creates a new instance of AddressSetDoFunc. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. -func NewAddressSetDoFunc(t mockConstructorTestingTNewAddressSetDoFunc) *AddressSetDoFunc { - mock := &AddressSetDoFunc{} - mock.Mock.Test(t) - - t.Cleanup(func() { mock.AssertExpectations(t) }) - - return mock -} diff --git a/go-controller/pkg/ovn/address_set/mocks/AddressSetFactory.go b/go-controller/pkg/ovn/address_set/mocks/AddressSetFactory.go index 0d18215185..b226488211 100644 --- a/go-controller/pkg/ovn/address_set/mocks/AddressSetFactory.go +++ b/go-controller/pkg/ovn/address_set/mocks/AddressSetFactory.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/ovn/address_set/mocks/AddressSetIterFunc.go b/go-controller/pkg/ovn/address_set/mocks/AddressSetIterFunc.go index 1c9ff3de62..6657de1d2e 100644 --- a/go-controller/pkg/ovn/address_set/mocks/AddressSetIterFunc.go +++ b/go-controller/pkg/ovn/address_set/mocks/AddressSetIterFunc.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks diff --git a/go-controller/pkg/ovn/address_set/mocks/removeFunc.go b/go-controller/pkg/ovn/address_set/mocks/removeFunc.go index 044f4b440a..d408953a3a 100644 --- a/go-controller/pkg/ovn/address_set/mocks/removeFunc.go +++ b/go-controller/pkg/ovn/address_set/mocks/removeFunc.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/ovn/base_network_controller.go b/go-controller/pkg/ovn/base_network_controller.go index ea72526b10..32d7538d04 100644 --- a/go-controller/pkg/ovn/base_network_controller.go +++ b/go-controller/pkg/ovn/base_network_controller.go @@ -383,6 +383,15 @@ func (bnc *BaseNetworkController) getOVNClusterRouterPortToJoinSwitchIfAddrs() ( return gwLRPIPs, nil } +// getCRToSwitchPortName returns the cluster router to switch port name for the layer3 topology and the transit router to switch port name for the layer2 topology. +// In the context of BaseNetworkController the two play the same role. +func (bnc *BaseNetworkController) getCRToSwitchPortName(switchName string) string { + if bnc.TopologyType() == types.Layer2Topology { + return types.TransitRouterToSwitchPrefix + switchName + } + return types.RouterToSwitchPrefix + switchName +} + // syncNodeClusterRouterPort ensures a node's LS to the cluster router's LRP is created. // NOTE: We could have created the router port in createNodeLogicalSwitch() instead of here, // but chassis ID is not available at that moment. We need the chassis ID to set the @@ -412,9 +421,9 @@ func (bnc *BaseNetworkController) syncNodeClusterRouterPort(node *corev1.Node, h } } - switchName := bnc.GetNetworkScopedName(node.Name) + switchName := bnc.GetNetworkScopedSwitchName(node.Name) logicalRouterName := bnc.GetNetworkScopedClusterRouterName() - lrpName := types.RouterToSwitchPrefix + switchName + lrpName := bnc.getCRToSwitchPortName(switchName) lrpNetworks := []string{} for _, hostSubnet := range hostSubnets { gwIfAddr := bnc.GetNodeGatewayIP(hostSubnet) @@ -440,6 +449,22 @@ func (bnc *BaseNetworkController) syncNodeClusterRouterPort(node *corev1.Node, h ChassisName: chassisID, Priority: 1, } + _, isNetIPv6 := bnc.IPMode() + if bnc.TopologyType() == types.Layer2Topology && + isNetIPv6 && + util.IsNetworkSegmentationSupportEnabled() && + bnc.IsPrimaryNetwork() { + logicalRouterPort.Ipv6RaConfigs = map[string]string{ + "address_mode": "dhcpv6_stateful", + "send_periodic": "true", + "max_interval": "900", // 15 minutes + "min_interval": "300", // 5 minutes + "router_preference": "LOW", // The static gateway configured by CNI is MEDIUM, so make this LOW so it has less effect for pods + } + if bnc.MTU() > 0 { + logicalRouterPort.Ipv6RaConfigs["mtu"] = fmt.Sprintf("%d", bnc.MTU()) + } + } err = libovsdbops.CreateOrUpdateLogicalRouterPort(bnc.nbClient, &logicalRouter, &logicalRouterPort, &gatewayChassis, &logicalRouterPort.MAC, &logicalRouterPort.Networks, &logicalRouterPort.Options) @@ -450,7 +475,8 @@ func (bnc *BaseNetworkController) syncNodeClusterRouterPort(node *corev1.Node, h if util.IsNetworkSegmentationSupportEnabled() && bnc.IsPrimaryNetwork() && !config.OVNKubernetesFeature.EnableInterconnect && - bnc.TopologyType() == types.Layer3Topology { + (bnc.TopologyType() == types.Layer3Topology || + bnc.TopologyType() == types.Layer2Topology) { // since in nonIC the ovn_cluster_router is distributed, we must specify the gatewayPort for the // conditional SNATs to signal OVN which gateway port should be chosen if there are multiple distributed // gateway ports.
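// This also covers layer 2 primary networks, whose cluster router is likewise distributed in nonIC deployments.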
Now that the LRP is created, let's update the NATs to reflect that. diff --git a/go-controller/pkg/ovn/base_secondary_layer2_network_controller.go b/go-controller/pkg/ovn/base_secondary_layer2_network_controller.go index 421314586b..fa9931dbba 100644 --- a/go-controller/pkg/ovn/base_secondary_layer2_network_controller.go +++ b/go-controller/pkg/ovn/base_secondary_layer2_network_controller.go @@ -11,6 +11,7 @@ import ( "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" libovsdbops "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/ops" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb" + zoneinterconnect "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/zone_interconnect" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" utilerrors "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/errors" @@ -179,7 +180,14 @@ func (oc *BaseLayer2UserDefinedNetworkController) initializeLogicalSwitch(switch } if oc.isLayer2Interconnect() { - err := oc.zoneICHandler.AddTransitSwitchConfig(&logicalSwitch) + tunnelKey := zoneinterconnect.BaseTransitSwitchTunnelKey + oc.GetNetworkID() + if config.Layer2UsesTransitRouter && oc.IsPrimaryNetwork() { + if len(oc.GetTunnelKeys()) != 2 { + return nil, fmt.Errorf("layer2 network %s with transit router enabled requires exactly 2 tunnel keys, got: %v", oc.GetNetworkName(), oc.GetTunnelKeys()) + } + tunnelKey = oc.GetTunnelKeys()[0] + } + err := oc.zoneICHandler.AddTransitSwitchConfig(&logicalSwitch, tunnelKey) if err != nil { return nil, err } diff --git a/go-controller/pkg/ovn/controller/services/services_controller.go b/go-controller/pkg/ovn/controller/services/services_controller.go index 3f4275e028..8d8cdbcd80 100644 --- a/go-controller/pkg/ovn/controller/services/services_controller.go +++ b/go-controller/pkg/ovn/controller/services/services_controller.go @@ -779,7 +779,7 @@ func (c *Controller) cleanupUDNEnabledServiceRoute(key string) error { var ops []ovsdb.Operation var err error - if c.netInfo.TopologyType() == types.Layer2Topology { + if c.netInfo.TopologyType() == types.Layer2Topology && !globalconfig.Layer2UsesTransitRouter { for _, node := range c.nodeInfos { if ops, err = libovsdbops.DeleteLogicalRouterStaticRoutesWithPredicateOps(c.nbClient, ops, c.netInfo.GetNetworkScopedGWRouterName(node.name), delPredicate); err != nil { return err @@ -824,7 +824,7 @@ func (c *Controller) configureUDNEnabledServiceRoute(service *corev1.Service) er ExternalIDs: extIDs, } routerName := c.netInfo.GetNetworkScopedClusterRouterName() - if c.netInfo.TopologyType() == types.Layer2Topology { + if c.netInfo.TopologyType() == types.Layer2Topology && !globalconfig.Layer2UsesTransitRouter { routerName = nodeInfo.gatewayRouterName } ops, err = libovsdbops.CreateOrUpdateLogicalRouterStaticRoutesWithPredicateOps(c.nbClient, nil, routerName, &staticRoute, func(item *nbdb.LogicalRouterStaticRoute) bool { diff --git a/go-controller/pkg/ovn/egressip.go b/go-controller/pkg/ovn/egressip.go index 31a48d0c99..6021f2737d 100644 --- a/go-controller/pkg/ovn/egressip.go +++ b/go-controller/pkg/ovn/egressip.go @@ -174,13 +174,14 @@ type EgressIPController struct { // used as a locking mechanism to serialize egress IP processing on a per egress IP basis // the order of locking should always be egressIPCache, then podAssignment, then nodeZoneState egressIPCache *syncmap.SyncMap[bool] - // nodeUpdateMutex is used for two reasons: + // nodeUpdateMutex is used for three reasons: // (1) to 
ensure safe handling of node ip address updates. VIP addresses are // dynamic and might move across nodes. // (2) used in ensureDefaultNoRerouteQoSRules function to ensure // creating QoS rules is thread safe since otherwise when two nodes are added // at the same time by two different threads we end up creating duplicate // QoS rules in database due to libovsdb cache race + // (3) to update nextHop during layer2 topology upgrade nodeUpdateMutex *sync.Mutex // podAssignment is a cache used for keeping track of which egressIP status // has been set up for each pod. The key is defined by getPodKey @@ -1515,14 +1516,9 @@ func (e *EgressIPController) syncPodAssignmentCache(egressIPCache egressIPCache) if ni == nil { return fmt.Errorf("failed to get active network for network name %q", networkName) } - routerName := ni.GetNetworkScopedClusterRouterName() - if ni.TopologyType() == types.Layer2Topology { - // no support for multiple Nodes per OVN zone, therefore pick the first local zone node - localNodeName, err := e.getALocalZoneNodeName() - if err != nil { - return err - } - routerName = ni.GetNetworkScopedGWRouterName(localNodeName) + routerName, err := e.getTopologyScopedLocalZoneRouterName(ni) + if err != nil { + return err } reRoutePolicies, err := libovsdbops.FindALogicalRouterPoliciesWithPredicate(e.nbClient, routerName, p1) if err != nil { @@ -1903,14 +1899,14 @@ func (e *EgressIPController) generateCacheForEgressIP() (egressIPCache, error) { if localZoneNodes.Has(node.Name) { if e.v4 { - if gatewayRouterIP, err := e.getGatewayNextHop(ni, node.Name, false); err != nil { + if gatewayRouterIP, err := e.getGatewayNextHop(ni, node, false); err != nil { klog.V(5).Infof("Unable to retrieve gateway IP for node: %s, protocol is IPv4: err: %v", node.Name, err) } else { r.v4Gateway = gatewayRouterIP.String() } } if e.v6 { - if gatewayRouterIP, err := e.getGatewayNextHop(ni, node.Name, true); err != nil { + if gatewayRouterIP, err := e.getGatewayNextHop(ni, node, true); err != nil { klog.V(5).Infof("Unable to retrieve gateway IP for node: %s, protocol is IPv6: err: %v", node.Name, err) } else { r.v6Gateway = gatewayRouterIP.String() @@ -2400,8 +2396,8 @@ func (e *EgressIPController) addPodEgressIPAssignment(ni util.NetInfo, egressIPN return fmt.Errorf("unable to create NAT rule ops for status: %v, err: %v", status, err) } - } else if ni.IsUserDefinedNetwork() && ni.TopologyType() == types.Layer3Topology { - // not required for L2 because we always have LRPs using reroute action to pkt mark + } else if ni.IsUserDefinedNetwork() && (ni.TopologyType() == types.Layer3Topology || + ni.TopologyType() == types.Layer2Topology && config.Layer2UsesTransitRouter) { ops, err = e.createGWMarkPolicyOps(ni, ops, podIPs, status, mark, pod.Namespace, pod.Name, egressIPName) if err != nil { return fmt.Errorf("unable to create GW router LRP ops to packet mark pod %s/%s: %v", pod.Namespace, pod.Name, err) @@ -2424,7 +2420,8 @@ func (e *EgressIPController) addPodEgressIPAssignment(ni util.NetInfo, egressIPN // For L2, we always attach an LRP with reroute action to the Nodes gateway router. If the pod is remote, use the local zone Node name to generate the GW router name. 
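+ // When Layer2UsesTransitRouter is enabled, the reroute policy is attached to the network scoped cluster (transit) router, so the per-node substitution below is skipped.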
nodeName := pod.Spec.NodeName - if loadedEgressNode && loadedPodNode && !isLocalZonePod && isLocalZoneEgressNode && ni.IsUserDefinedNetwork() && ni.TopologyType() == types.Layer2Topology { + if loadedEgressNode && loadedPodNode && !isLocalZonePod && isLocalZoneEgressNode && ni.IsUserDefinedNetwork() && + ni.TopologyType() == types.Layer2Topology && !config.Layer2UsesTransitRouter { nodeName = status.Node } routerName, err := getTopologyScopedRouterName(ni, nodeName) @@ -2492,7 +2489,8 @@ func (e *EgressIPController) deletePodEgressIPAssignment(ni util.NetInfo, egress } // For L2, we always attach an LRP with reroute action to the Nodes gateway router. If the pod is remote, use the local zone Node name to generate the GW router name. nodeName := pod.Spec.NodeName - if !isLocalZonePod && isLocalZoneEgressNode && ni.IsUserDefinedNetwork() && ni.TopologyType() == types.Layer2Topology { + if !isLocalZonePod && isLocalZoneEgressNode && ni.IsUserDefinedNetwork() && + ni.TopologyType() == types.Layer2Topology && !config.Layer2UsesTransitRouter { nodeName = status.Node } routerName, err := getTopologyScopedRouterName(ni, nodeName) @@ -2511,7 +2509,8 @@ func (e *EgressIPController) deletePodEgressIPAssignment(ni util.NetInfo, egress // Case 1 - node where pod is hosted is not known // Case 2 - pod is within the local zone // case 3 - a local zone node is egress node and pod is attached to layer 2. For layer2, there is always an LRP attached to the egress Node GW router - if !loadedPodNode || isLocalZonePod || (isLocalZoneEgressNode && ni.IsUserDefinedNetwork() && ni.TopologyType() == types.Layer2Topology) { + if !loadedPodNode || isLocalZonePod || (isLocalZoneEgressNode && ni.IsUserDefinedNetwork() && + ni.TopologyType() == types.Layer2Topology) { ops, err = e.deleteReroutePolicyOps(ni, ops, status, egressIPName, nextHopIP, routerName, pod.Namespace, pod.Name) if errors.Is(err, libovsdbclient.ErrNotFound) { // if the gateway router join IP setup is already gone, then don't count it as error. @@ -2534,7 +2533,8 @@ func (e *EgressIPController) deletePodEgressIPAssignment(ni util.NetInfo, egress if err != nil { return fmt.Errorf("unable to delete NAT rule for status: %v, err: %v", status, err) } - } else if ni.IsUserDefinedNetwork() && ni.TopologyType() == types.Layer3Topology { + } else if ni.IsUserDefinedNetwork() && (ni.TopologyType() == types.Layer3Topology || + ni.TopologyType() == types.Layer2Topology && config.Layer2UsesTransitRouter) { ops, err = e.deleteGWMarkPolicyOps(ni, ops, status, pod.Namespace, pod.Name, egressIPName) if err != nil { return fmt.Errorf("unable to create GW router packet mark LRPs delete ops for pod %s/%s: %v", pod.Namespace, pod.Name, err) @@ -2652,30 +2652,53 @@ func (e *EgressIPController) deleteExternalGWPodSNATOps(ni util.NetInfo, ops []o // getGatewayNextHop determines the next hop for a given Node considering the network topology // For layer 3, next hop is gateway routers 'router to join' port IP -// For layer 2, it's the callers responsibility to ensure that the egress node is remote because a LRP should not be created -func (e *EgressIPController) getGatewayNextHop(ni util.NetInfo, nodeName string, isIPv6 bool) (net.IP, error) { - // fetch gateway router 'router to join' port IP +// For layer 2 with transit router, next hop is the node's transit GR IP.
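+// The transit GR IP is derived from the Node object via getTransitRouterInfo.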
+// For layer 2 without transit router, it's the caller's responsibility to ensure that the egress node is remote because an LRP should not be created +func (e *EgressIPController) getGatewayNextHop(ni util.NetInfo, node *corev1.Node, isIPv6 bool) (net.IP, error) { if ni.TopologyType() == types.Layer3Topology { - return e.getRouterPortIP(types.GWRouterToJoinSwitchPrefix+ni.GetNetworkScopedGWRouterName(nodeName), isIPv6) - } - - // If egress node is local, retrieve the external default gateway next hops from the Node L3 gateway annotation. - // We must pick one of the next hops to add to the LRP reroute next hops to not break ECMP. - // If an egress node is remote, retrieve the remote Nodes gateway router 'router to switch' port IP - // from the Node annotation. - // FIXME: remove gathering the required information from a Node annotations as this approach does not scale - // FIXME: we do not respect multiple default gateway next hops and instead pick the first IP that matches the IP family of the EIP - if ni.TopologyType() == types.Layer2Topology { - node, err := e.watchFactory.GetNode(nodeName) + return e.getRouterPortIP(types.GWRouterToJoinSwitchPrefix+ni.GetNetworkScopedGWRouterName(node.Name), isIPv6) + } else if ni.TopologyType() == types.Layer2Topology { + if config.Layer2UsesTransitRouter { + upgradedNode := util.UDNLayer2NodeUsesTransitRouter(node) + if upgradedNode { + transitRouterInfo, err := getTransitRouterInfo(ni, node) + if err != nil { + return nil, err + } + nodeTransitIP, err := util.MatchFirstIPNetFamily(isIPv6, transitRouterInfo.gatewayRouterNets) + if err != nil { + return nil, fmt.Errorf("could not find transit router IP of node %s matching IP family (IPv6: %v): %v", node.Name, isIPv6, err) + } + return nodeTransitIP.IP, nil + } else { + gwIPs, err := udn.GetGWRouterIPs(node, ni) + if err != nil { + return nil, fmt.Errorf("failed to get gateway router IPs for node %s: %w", node.Name, err) + } + gwIP, err := util.MatchFirstIPNetFamily(isIPv6, gwIPs) + if err != nil { + return nil, fmt.Errorf("failed to find a gateway router IP for node %s that matches the EgressIP IP family (is IPv6: %v): %w", + node.Name, isIPv6, err) + } + return gwIP.IP, nil + } + } + // If egress node is local, retrieve the external default gateway next hops from the Node L3 gateway annotation. + // We must pick one of the next hops to add to the LRP reroute next hops to not break ECMP. + // If an egress node is remote, retrieve the remote Node's gateway router 'router to switch' port IP + // from the Node annotation. + // FIXME: remove gathering the required information from Node annotations as this approach does not scale + // FIXME: we do not respect multiple default gateway next hops and instead pick the first IP that matches the IP family of the EIP + node, err := e.watchFactory.GetNode(node.Name) if err != nil { - return nil, fmt.Errorf("failed to retrive node %s: %w", nodeName, err) + return nil, fmt.Errorf("failed to retrieve node %s: %w", node.Name, err) } localNode, err := e.getALocalZoneNodeName() if err != nil { return nil, err } // Node is local - if localNode == nodeName { + if localNode == node.Name { nextHopIPs, err := util.ParseNodeL3GatewayAnnotation(node) if err != nil { if util.IsAnnotationNotSetError(err) { @@ -2780,11 +2803,15 @@ func (e *EgressIPController) getTransitIP(nodeName string, wantsIPv6 bool) (stri // and no error returned. This means we searched successfully but could not find the information required to generate the next hop IP.
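+// The egress node is resolved to a full Node object up front because getGatewayNextHop reads topology state from the Node.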
func (e *EgressIPController) getNextHop(ni util.NetInfo, egressNodeName, egressIP, egressIPName string, isLocalZoneEgressNode, isOVNNetwork bool) (string, error) { isEgressIPv6 := utilnet.IsIPv6String(egressIP) + egressNode, err := e.watchFactory.GetNode(egressNodeName) + if err != nil { + return "", err + } if isLocalZoneEgressNode || ni.TopologyType() == types.Layer2Topology { // isOVNNetwork is true when an EgressIP is "assigned" to the Nodes primary interface (breth0). Ext traffic will egress breth0. // is OVNNetwork is false when the EgressIP is assigned to a host secondary interface (not breth0). Ext traffic will egress this interface. if isOVNNetwork { - gatewayRouterIP, err := e.getGatewayNextHop(ni, egressNodeName, isEgressIPv6) + gatewayRouterIP, err := e.getGatewayNextHop(ni, egressNode, isEgressIPv6) // return error only when we failed to retrieve the gateway IP. Do not return error when we can never get this IP (gw deleted) if err != nil && !errors.Is(err, libovsdbclient.ErrNotFound) { return "", fmt.Errorf("unable to retrieve gateway IP for node: %s, protocol is IPv6: %v, err: %w", @@ -3003,13 +3030,9 @@ func (e *EgressIPController) deleteEgressIPStatusSetup(ni util.NetInfo, name str } if nextHopIP != "" { - router := ni.GetNetworkScopedClusterRouterName() - if ni.TopologyType() == types.Layer2Topology { - nodeName, err := e.getALocalZoneNodeName() - if err != nil { - return err - } - router = ni.GetNetworkScopedGWRouterName(nodeName) + router, err := e.getTopologyScopedLocalZoneRouterName(ni) + if err != nil { + return err } ops, err = libovsdbops.DeleteNextHopFromLogicalRouterPoliciesWithPredicateOps(e.nbClient, ops, router, policyPredNextHop, nextHopIP) if err != nil { @@ -3230,6 +3253,56 @@ func (e *EgressIPController) ensureRouterPoliciesForNetwork(ni util.NetInfo, nod return nil } +// updateNodeNextHop updates the next hop IP for reroute policies on the node's logical router. 
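+// For each IP family, reroute next hops that still point at the node's gateway router IP are replaced with the node's transit router IP.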
+// Only used during the layer2 topology upgrade to change the gwIP to the transit router IP +func (e *EgressIPController) updateNodeNextHop(ni util.NetInfo, node *corev1.Node) error { + e.nodeUpdateMutex.Lock() + defer e.nodeUpdateMutex.Unlock() + transitRouterInfo, err := getTransitRouterInfo(ni, node) + if err != nil { + return err + } + gwIPs, err := udn.GetGWRouterIPs(node, ni) + if err != nil { + return fmt.Errorf("failed to get gateway router IPs for node %s: %w", node.Name, err) + } + for _, transitIP := range transitRouterInfo.gatewayRouterNets { + gwIP, err := util.MatchFirstIPNetFamily(utilnet.IsIPv6(transitIP.IP), gwIPs) + if err != nil { + return fmt.Errorf("failed to find a gateway router IP for node %s matching the IP family of transit IP %v: %w", + node.Name, transitIP, err) + } + // replace reroute policies with the new next hop IP + ops, err := libovsdbops.ReplaceNextHopForLogicalRouterPolicyWithPredicateOps( + e.nbClient, nil, func(policy *nbdb.LogicalRouterPolicy) bool { + if policy.Priority != types.EgressIPReroutePriority { + return false + } + // Restrict to this network and controller + if policy.ExternalIDs[libovsdbops.NetworkKey.String()] != ni.GetNetworkName() || + policy.ExternalIDs[libovsdbops.OwnerControllerKey.String()] != e.controllerName || + policy.ExternalIDs[libovsdbops.OwnerTypeKey.String()] != libovsdbops.EgressIPOwnerType { + return false + } + for _, nextHop := range policy.Nexthops { + if nextHop == gwIP.IP.String() { + return true + } + } + return false + }, gwIP.IP.String(), transitIP.IP.String()) + if err != nil { + return fmt.Errorf("failed to build update reroute policies ops for node %s with transit IP %s: %v", + node.Name, transitIP.IP.String(), err) + } + if _, err = libovsdbops.TransactAndCheck(e.nbClient, ops); err != nil { + return fmt.Errorf("failed to update reroute policies for node %s with transit IP %s: %v", + node.Name, transitIP.IP.String(), err) + } + } + return nil +} + func (e *EgressIPController) ensureSwitchPoliciesForNode(ni util.NetInfo, nodeName string) error { e.nodeUpdateMutex.Lock() defer e.nodeUpdateMutex.Unlock() @@ -3445,14 +3518,9 @@ func (e *EgressIPController) ensureDefaultNoRerouteNodePolicies() error { if network.GetNetworkName() == types.DefaultNetworkName { return nil } - routerName := network.GetNetworkScopedClusterRouterName() - if network.TopologyType() == types.Layer2Topology { - // assume one node per zone only. Multi nodes per zone not supported. - nodeName, err := e.getALocalZoneNodeName() - if err != nil { - return err - } - routerName = network.GetNetworkScopedGWRouterName(nodeName) + routerName, err := e.getTopologyScopedLocalZoneRouterName(network) + if err != nil { + return err } err = ensureDefaultNoRerouteNodePolicies(e.nbClient, e.addressSetFactory, network.GetNetworkName(), routerName, e.controllerName, nodeLister, e.v4, e.v6) @@ -3823,7 +3891,7 @@ func addPktMarkToLRPOptions(options map[string]string, mark string) { // getTopologyScopedRouterName returns the router name to which we attach policies to support EgressIP, depending on network topology // For Layer 3, we return the network scoped OVN "cluster router" name. For layer 2, we return a Node's network scoped OVN gateway router name.
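+// For layer 2 with the transit router enabled, the network scoped cluster router name is returned as well.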
func getTopologyScopedRouterName(ni util.NetInfo, nodeName string) (string, error) { - if ni.TopologyType() == types.Layer2Topology { + if ni.TopologyType() == types.Layer2Topology && !config.Layer2UsesTransitRouter { if nodeName == "" { return "", fmt.Errorf("node name is required to determine the Nodes gateway router name") } @@ -3832,6 +3900,19 @@ func getTopologyScopedRouterName(ni util.NetInfo, nodeName string) (string, erro return ni.GetNetworkScopedClusterRouterName(), nil } +func (e *EgressIPController) getTopologyScopedLocalZoneRouterName(ni util.NetInfo) (string, error) { + routerName := ni.GetNetworkScopedClusterRouterName() + if ni.TopologyType() == types.Layer2Topology && !config.Layer2UsesTransitRouter { + // no support for multiple Nodes per OVN zone, therefore pick the first local zone node + localNodeName, err := e.getALocalZoneNodeName() + if err != nil { + return "", err + } + routerName = ni.GetNetworkScopedGWRouterName(localNodeName) + } + return routerName, nil +} + func isEgressIPForUDNSupported() bool { return config.OVNKubernetesFeature.EnableInterconnect && config.OVNKubernetesFeature.EnableNetworkSegmentation diff --git a/go-controller/pkg/ovn/egressip_udn_l2_test.go b/go-controller/pkg/ovn/egressip_udn_l2_test.go index 581992ab6b..40913c4ef8 100644 --- a/go-controller/pkg/ovn/egressip_udn_l2_test.go +++ b/go-controller/pkg/ovn/egressip_udn_l2_test.go @@ -32,36 +32,32 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol ) const ( - nadName1 = "nad1" - networkName1 = "network1" - networkName1_ = networkName1 + "_" - node1Name = "node1" - v4Net1 = "20.128.0.0/14" - v4Node1Net1 = "20.128.0.0/16" - v4Pod1IPNode1Net1 = "20.128.0.5" - node1DefaultRtoJIP = "100.64.0.1" - node1DefaultRtoJIPCIDR = node1DefaultRtoJIP + "/16" - node1Network1RtoSIP = "100.65.0.1" - node1Network1RtoSIPCIDR = node1Network1RtoSIP + "/16" - podName3 = "egress-pod3" - v4Pod2IPNode1Net1 = "20.128.0.6" - v4Node1Tsp = "100.88.0.2" - node2Name = "node2" - v4Node2Net1 = "20.129.0.0/16" - v4Node2Tsp = "100.88.0.3" - podName4 = "egress-pod4" - v4Pod1IPNode2Net1 = "20.129.0.2" - v4Pod2IPNode2Net1 = "20.129.0.3" - node2DefaultRtoJIP = "100.64.0.2" - node2DefaultRtoJIPCIDR = node2DefaultRtoJIP + "/16" - node2Network1RtoSIP = "100.65.0.2" - node2Network1RtoSIPCIDR = node2Network1RtoSIP + "/16" - eIP1Mark = 50000 - eIP2Mark = 50001 - layer2SwitchName = "ovn_layer2_switch" - gwIP = "192.168.126.1" - gwIP2 = "192.168.127.1" - userDefinedNetworkID = "2" + nadName1 = "nad1" + networkName1 = "network1" + networkName1_ = networkName1 + "_" + node1Name = "node1" + v4Net1 = "20.128.0.0/14" + v4Node1Net1 = "20.128.0.0/16" + v4Pod1IPNode1Net1 = "20.128.0.5" + node1DefaultRtoJIP = "100.64.0.1" + node1DefaultRtoJIPCIDR = node1DefaultRtoJIP + "/16" + node1Network1JoinIP = "100.65.0.1" + node1Network1JoinCIDR = node1Network1JoinIP + "/16" + node1Network1TransitIP = "100.88.0.3" + node1Network1TransitCIDR = node1Network1TransitIP + "/31" + podName3 = "egress-pod3" + v4Node1Tsp = "100.88.0.2" + node2Name = "node2" + v4Node2Net1 = "20.129.0.0/16" + v4Node2Tsp = "100.88.0.3" + podName4 = "egress-pod4" + v4Pod2IPNode2Net1 = "20.129.0.3" + node2Network1JoinIP = "100.65.0.2" + node2Network1TransitIP = "100.88.0.5" + eIP1Mark = 50000 + layer2SwitchName = "ovn_layer2_switch" + gwIP = "192.168.126.1" + userDefinedNetworkID = "2" ) getEgressIPStatusLen := func(egressIPName string) func() int { @@ -145,10 +141,11 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol Name: 
networkName1,
Type: "ovn-k8s-cni-overlay",
},
- Role: ovntypes.NetworkRolePrimary,
- Topology: ovntypes.Layer2Topology,
- NADName: nadName,
- Subnets: v4Net1,
+ Role: ovntypes.NetworkRolePrimary,
+ Topology: ovntypes.Layer2Topology,
+ NADName: nadName,
+ Subnets: v4Net1,
+ TransitSubnet: config.ClusterManager.V4TransitSubnet,
}
nad, err := newNetworkAttachmentDefinition(
eipNamespace2,
@@ -156,7 +153,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
netconf,
)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
- nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID}
netInfo, err := util.NewNetInfo(&netconf)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
@@ -170,6 +166,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node1IPv4CIDR),
util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP),
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
labels := map[string]string{
"k8s.ovn.org/egress-assignable": "",
@@ -185,6 +182,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node2IPv4CIDR),
util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop":"192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP),
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
node2 := getNodeObj(node2Name, node2Annotations, labels)
eIP := egressipv1.EgressIP{
@@ -238,21 +236,26 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
Ports: []string{"k8s-" + node1Name + "-UUID"},
},
// UDN start
- getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP2, node2Network1RtoSIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), // stale gateway
- getReRoutePolicyForController(egressIPName, eipNamespace2, podName, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP2, node2Network1RtoSIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), // stale pod
+ getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1JoinCIDR, node2Network1JoinIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), // stale gateway
+ getReRoutePolicyForController(egressIPName, eipNamespace2, podName, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1JoinCIDR, node2Network1JoinIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), // stale pod
&nbdb.LogicalRouterPort{
- UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID",
- Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName,
- Networks: []string{node1Network1RtoSIPCIDR},
+ UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID",
+ Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name,
+ Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR},
},
&nbdb.LogicalRouter{
- UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
- Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
- Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"},
+ Name: netInfo.GetNetworkScopedClusterRouterName(),
+ UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID",
ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
Policies: []string{getReRoutePolicyUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()), getReRoutePolicyUUID(eipNamespace2, podName, IPFamilyValueV4, netInfo.GetNetworkName())}, // stale policies
},
+ &nbdb.LogicalRouter{
+ UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
+ Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
+ Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"},
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
+ },
&nbdb.LogicalSwitchPort{
UUID: "k8s-" + networkName1_ + node1Name + "-UUID",
Name: "k8s-" + networkName1_ + node1Name,
@@ -413,8 +416,12 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
egressNodeIPsASv4,
// UDN
- getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP, node2Network1RtoSIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
- getReRoutePolicyForController(egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP, node2Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getReRoutePolicyForController(egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, IPFamilyValueV4,
+ netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, IPFamilyValueV4,
+ netInfo.GetNetworkName(), DefaultNetworkControllerName),
getNoReRoutePolicyForUDNEnabledSvc(false, netInfo.GetNetworkName(), DefaultNetworkControllerName, egressIPServedPodsASUDNv4.Name, egressSVCServedPodsASv4.Name, udnEnabledSvcV4.Name),
&nbdb.LogicalRouterPolicy{
Priority: ovntypes.DefaultNoRereoutePriority,
@@ -440,20 +447,27 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
ExternalIDs: getEgressIPLRPNoReRoutePodToNodeDbIDs(IPFamilyValueV4, netInfo.GetNetworkName(), DefaultNetworkControllerName).GetExternalIDs(),
},
&nbdb.LogicalRouterPort{
- UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID",
- Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName,
- Networks: []string{node1Network1RtoSIPCIDR},
+ UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID",
+ Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name,
+ Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR},
+ },
+ &nbdb.LogicalRouter{
+ Name: netInfo.GetNetworkScopedClusterRouterName(),
+ UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID",
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
+ Policies: []string{
+ "udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID",
+ fmt.Sprintf("%s-no-reroute-reply-traffic", netInfo.GetNetworkName()),
+ getReRoutePolicyUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()),
+ getReRoutePolicyUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName())},
},
&nbdb.LogicalRouter{
UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
- Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"},
+ Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"},
ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
- Policies: []string{getReRoutePolicyUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()),
- getReRoutePolicyUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName()),
- "udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID",
- fmt.Sprintf("%s-no-reroute-reply-traffic", netInfo.GetNetworkName()), "udn-enabled-svc-no-reroute-UUID",
- },
+ Policies: []string{getGWPktMarkLRPUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()),
+ getGWPktMarkLRPUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName())},
},
&nbdb.LogicalSwitchPort{
UUID: "k8s-" + networkName1_ + node1Name + "-UUID",
@@ -513,10 +527,11 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
Name: networkName1,
Type: "ovn-k8s-cni-overlay",
},
- Role: ovntypes.NetworkRolePrimary,
- Topology: ovntypes.Layer2Topology,
- NADName: nadName,
- Subnets: v4Net1,
+ Role: ovntypes.NetworkRolePrimary,
+ Topology: ovntypes.Layer2Topology,
+ NADName: nadName,
+ Subnets: v4Net1,
+ TransitSubnet: config.ClusterManager.V4TransitSubnet,
}
nad, err := newNetworkAttachmentDefinition(
eipNamespace2,
@@ -524,7 +539,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
netconf,
)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
- nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID}
netInfo, err := util.NewNetInfo(&netconf)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
@@ -539,6 +553,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node1IPv4CIDR),
util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP),
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
labels := map[string]string{
"k8s.ovn.org/egress-assignable": "",
@@ -555,6 +570,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
ginkgo.Describe("EgressIP Operations for user defined network with topol util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node2IPv4CIDR), util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP), + util.Layer2TopologyVersion: util.TransitRouterTopoVersion, } node2 := getNodeObj(node2Name, node2Annotations, labels) twoNodeStatus := []egressipv1.EgressIPStatusItem{ @@ -611,14 +627,19 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol }, // UDN start &nbdb.LogicalRouterPort{ - UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID", - Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName, - Networks: []string{node1Network1RtoSIPCIDR}, + UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID", + Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name, + Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR}, + }, + &nbdb.LogicalRouter{ + Name: netInfo.GetNetworkScopedClusterRouterName(), + UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID", + ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, }, &nbdb.LogicalRouter{ UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID", Name: netInfo.GetNetworkScopedGWRouterName(node1.Name), - Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"}, + Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"}, ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, }, &nbdb.LogicalSwitchPort{ @@ -783,9 +804,14 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol egressNodeIPsASv4, // UDN - getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP, node2Network1RtoSIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), - getReRoutePolicyForController(egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), - getNoReRoutePolicyForUDNEnabledSvc(false, netInfo.GetNetworkName(), DefaultNetworkControllerName, egressIPServedPodsASUDNv4.Name, egressSVCServedPodsASv4.Name, udnEnabledSvcV4.Name), + getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP, node2Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), + getReRoutePolicyForController(egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), + getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, IPFamilyValueV4, + netInfo.GetNetworkName(), DefaultNetworkControllerName), + getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, IPFamilyValueV4, + netInfo.GetNetworkName(), 
DefaultNetworkControllerName), + getNoReRoutePolicyForUDNEnabledSvc(false, netInfo.GetNetworkName(), DefaultNetworkControllerName, + egressIPServedPodsASUDNv4.Name, egressSVCServedPodsASv4.Name, udnEnabledSvcV4.Name), &nbdb.LogicalRouterPolicy{ Priority: ovntypes.DefaultNoRereoutePriority, Match: fmt.Sprintf("ip4.src == %s && ip4.dst == %s", v4Net1, v4Net1), @@ -810,14 +836,13 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol ExternalIDs: getEgressIPLRPNoReRoutePodToNodeDbIDs(IPFamilyValueV4, netInfo.GetNetworkName(), DefaultNetworkControllerName).GetExternalIDs(), }, &nbdb.LogicalRouterPort{ - UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID", - Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName, - Networks: []string{node1Network1RtoSIPCIDR}, + UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID", + Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name, + Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR}, }, &nbdb.LogicalRouter{ - UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID", - Name: netInfo.GetNetworkScopedGWRouterName(node1.Name), - Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"}, + Name: netInfo.GetNetworkScopedClusterRouterName(), + UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID", ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, Policies: []string{ "udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID", @@ -825,6 +850,14 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol getReRoutePolicyUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()), getReRoutePolicyUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName())}, }, + &nbdb.LogicalRouter{ + UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID", + Name: netInfo.GetNetworkScopedGWRouterName(node1.Name), + Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"}, + ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, + Policies: []string{getGWPktMarkLRPUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()), + getGWPktMarkLRPUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName())}, + }, &nbdb.LogicalSwitchPort{ UUID: "k8s-" + networkName1_ + node1Name + "-UUID", Name: "k8s-" + networkName1_ + node1Name, @@ -917,7 +950,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol egressNodeIPsASv4, // UDN - getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{node2Network1RtoSIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), + getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{node2Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), getNoReRoutePolicyForUDNEnabledSvc(false, netInfo.GetNetworkName(), DefaultNetworkControllerName, egressIPServedPodsASUDNv4.Name, egressSVCServedPodsASv4.Name, udnEnabledSvcV4.Name), &nbdb.LogicalRouterPolicy{ 
Priority: ovntypes.DefaultNoRereoutePriority, @@ -943,17 +976,23 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol ExternalIDs: getEgressIPLRPNoReRoutePodToNodeDbIDs(IPFamilyValueV4, netInfo.GetNetworkName(), DefaultNetworkControllerName).GetExternalIDs(), }, &nbdb.LogicalRouterPort{ - UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID", - Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName, - Networks: []string{node1Network1RtoSIPCIDR}, + UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID", + Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name, + Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR}, }, &nbdb.LogicalRouter{ - UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID", - Name: netInfo.GetNetworkScopedGWRouterName(node1.Name), - Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"}, - Policies: []string{"udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID", + Name: netInfo.GetNetworkScopedClusterRouterName(), + UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID", + ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, + Policies: []string{ + "udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID", fmt.Sprintf("%s-no-reroute-reply-traffic", netInfo.GetNetworkName()), getReRoutePolicyUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName())}, + }, + &nbdb.LogicalRouter{ + UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID", + Name: netInfo.GetNetworkScopedGWRouterName(node1.Name), + Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"}, ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, }, &nbdb.LogicalSwitchPort{ @@ -1019,10 +1058,11 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol Name: networkName1, Type: "ovn-k8s-cni-overlay", }, - Role: ovntypes.NetworkRolePrimary, - Topology: ovntypes.Layer2Topology, - NADName: nadName, - Subnets: v4Net1, + Role: ovntypes.NetworkRolePrimary, + Topology: ovntypes.Layer2Topology, + NADName: nadName, + Subnets: v4Net1, + TransitSubnet: config.ClusterManager.V4TransitSubnet, } nad, err := newNetworkAttachmentDefinition( eipNamespace2, @@ -1030,7 +1070,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol netconf, ) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID} netInfo, err := util.NewNetInfo(&netconf) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -1045,6 +1084,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node1IPv4CIDR), util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, 
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
labels := map[string]string{
"k8s.ovn.org/egress-assignable": "",
@@ -1061,6 +1101,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node2IPv4CIDR),
util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP),
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
node2 := getNodeObj(node2Name, node2Annotations, labels)
twoNodeStatus := []egressipv1.EgressIPStatusItem{
@@ -1117,14 +1158,19 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
},
// UDN start
&nbdb.LogicalRouterPort{
- UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID",
- Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName,
- Networks: []string{node1Network1RtoSIPCIDR},
+ UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID",
+ Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name,
+ Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR},
+ },
+ &nbdb.LogicalRouter{
+ Name: netInfo.GetNetworkScopedClusterRouterName(),
+ UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID",
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
},
&nbdb.LogicalRouter{
UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
- Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"},
+ Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"},
ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
},
&nbdb.LogicalSwitchPort{
@@ -1283,8 +1329,12 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
egressNodeIPsASv4,
// UDN
- getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP, node2Network1RtoSIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
- getReRoutePolicyForController(egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP, node2Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getReRoutePolicyForController(egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, IPFamilyValueV4,
+ netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, IPFamilyValueV4,
+ netInfo.GetNetworkName(), DefaultNetworkControllerName),
getNoReRoutePolicyForUDNEnabledSvc(false, netInfo.GetNetworkName(), DefaultNetworkControllerName, egressIPServedPodsASUDNv4.Name, egressSVCServedPodsASv4.Name, udnEnabledSvcV4.Name),
&nbdb.LogicalRouterPolicy{
Priority: ovntypes.DefaultNoRereoutePriority,
@@ -1310,21 +1360,27 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
ExternalIDs: getEgressIPLRPNoReRoutePodToNodeDbIDs(IPFamilyValueV4, netInfo.GetNetworkName(), DefaultNetworkControllerName).GetExternalIDs(),
},
&nbdb.LogicalRouterPort{
- UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID",
- Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName,
- Networks: []string{node1Network1RtoSIPCIDR},
+ UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID",
+ Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name,
+ Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR},
},
&nbdb.LogicalRouter{
- UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
- Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
- Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"},
+ Name: netInfo.GetNetworkScopedClusterRouterName(),
+ UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID",
ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
Policies: []string{
"udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID",
fmt.Sprintf("%s-no-reroute-reply-traffic", netInfo.GetNetworkName()), getReRoutePolicyUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()),
- getReRoutePolicyUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName()),
- },
+ getReRoutePolicyUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName())},
+ },
+ &nbdb.LogicalRouter{
+ UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
+ Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
+ Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"},
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
+ Policies: []string{getGWPktMarkLRPUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()),
+ getGWPktMarkLRPUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName())},
},
&nbdb.LogicalSwitchPort{
UUID: "k8s-" + networkName1_ + node1Name + "-UUID",
@@ -1436,17 +1492,23 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
ExternalIDs: getEgressIPLRPNoReRoutePodToNodeDbIDs(IPFamilyValueV4, netInfo.GetNetworkName(), DefaultNetworkControllerName).GetExternalIDs(),
},
&nbdb.LogicalRouterPort{
- UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID",
- Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName,
- Networks: []string{node1Network1RtoSIPCIDR},
+ UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID",
+ Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name,
+ Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR},
},
&nbdb.LogicalRouter{
- UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
- Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
- Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"},
- Policies: []string{"udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID",
+ Name: netInfo.GetNetworkScopedClusterRouterName(),
+ UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID",
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
+ Policies: []string{
+ "udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID",
fmt.Sprintf("%s-no-reroute-reply-traffic", netInfo.GetNetworkName())},
- ExternalIDs: map[string]string{ovntypes.NetworkExternalID: secConInfo.bnc.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
+ },
+ &nbdb.LogicalRouter{
+ UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
+ Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
+ Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"},
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
},
&nbdb.LogicalSwitchPort{
UUID: "k8s-" + networkName1_ + node1Name + "-UUID",
@@ -1501,10 +1563,11 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
Name: networkName1,
Type: "ovn-k8s-cni-overlay",
},
- Role: ovntypes.NetworkRolePrimary,
- Topology: ovntypes.Layer2Topology,
- NADName: nadNsName,
- Subnets: v4Net1,
+ Role: ovntypes.NetworkRolePrimary,
+ Topology: ovntypes.Layer2Topology,
+ NADName: nadNsName,
+ Subnets: v4Net1,
+ TransitSubnet: config.ClusterManager.V4TransitSubnet,
}
nad, err := newNetworkAttachmentDefinition(
eipNamespace2,
@@ -1512,7 +1575,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
netconf,
)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
- nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID}
netInfo, err := util.NewNetInfo(&netconf)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
@@ -1527,6 +1589,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node1IPv4CIDR),
util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP),
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
labels := map[string]string{
"k8s.ovn.org/egress-assignable": "",
@@ -1543,6 +1606,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node2IPv4CIDR),
util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP),
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
node2 := getNodeObj(node2Name, node2Annotations, labels)
twoNodeStatus := []egressipv1.EgressIPStatusItem{
@@ -1599,14 +1663,19 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
},
// UDN start
&nbdb.LogicalRouterPort{
- UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID",
- Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName,
- Networks: []string{node1Network1RtoSIPCIDR},
+ UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID",
+ Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name,
+ Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR},
+ },
+ &nbdb.LogicalRouter{
+ Name: netInfo.GetNetworkScopedClusterRouterName(),
+ UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID",
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
},
&nbdb.LogicalRouter{
UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
- Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"},
+ Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"},
ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
},
&nbdb.LogicalSwitchPort{
@@ -1772,7 +1841,9 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
egressNodeIPsASv4,
// UDN
- getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP, node2Network1RtoSIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP, node2Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, IPFamilyValueV4,
+ netInfo.GetNetworkName(), DefaultNetworkControllerName),
getNoReRoutePolicyForUDNEnabledSvc(false, netInfo.GetNetworkName(), DefaultNetworkControllerName, egressIPServedPodsASUDNv4.Name, egressSVCServedPodsASv4.Name, udnEnabledSvcV4.Name),
&nbdb.LogicalRouterPolicy{
Priority: ovntypes.DefaultNoRereoutePriority,
@@ -1798,20 +1869,26 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
ExternalIDs: getEgressIPLRPNoReRoutePodToNodeDbIDs(IPFamilyValueV4, netInfo.GetNetworkName(), DefaultNetworkControllerName).GetExternalIDs(),
},
&nbdb.LogicalRouterPort{
- UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID",
- Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName,
- Networks: []string{node1Network1RtoSIPCIDR},
+ UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID",
+ Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name,
+ Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR},
},
&nbdb.LogicalRouter{
- UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
- Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
- Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"},
+ Name: netInfo.GetNetworkScopedClusterRouterName(),
+ UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID",
ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
- Policies: []string{"udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID",
- "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID",
+ Policies: []string{
+ "udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID",
fmt.Sprintf("%s-no-reroute-reply-traffic", netInfo.GetNetworkName()), getReRoutePolicyUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName())},
},
+ &nbdb.LogicalRouter{
+ UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
+ Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
+ Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"},
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
+ Policies: []string{getGWPktMarkLRPUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName())},
+ },
&nbdb.LogicalSwitchPort{
UUID: "k8s-" + networkName1_ + node1Name + "-UUID",
Name: "k8s-" + networkName1_ + node1Name,
@@ -1865,10 +1942,11 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
Name: networkName1,
Type: "ovn-k8s-cni-overlay",
},
- Role: ovntypes.NetworkRolePrimary,
- Topology: ovntypes.Layer2Topology,
- NADName: nadNsName,
- Subnets: v4Net1,
+ Role: ovntypes.NetworkRolePrimary,
+ Topology: ovntypes.Layer2Topology,
+ NADName: nadNsName,
+ Subnets: v4Net1,
+ TransitSubnet: config.ClusterManager.V4TransitSubnet,
}
nad, err := newNetworkAttachmentDefinition(
eipNamespace2,
@@ -1876,7 +1954,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
netconf,
)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
- nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID}
netInfo, err := util.NewNetInfo(&netconf)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
@@ -1891,6 +1968,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node1IPv4CIDR),
util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP),
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
labels := map[string]string{
"k8s.ovn.org/egress-assignable": "",
@@ -1907,6 +1985,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node2IPv4CIDR),
util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP),
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
node2 := getNodeObj(node2Name, node2Annotations, labels)
twoNodeStatus := []egressipv1.EgressIPStatusItem{
@@ -1963,14 +2042,19 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
},
// UDN start
&nbdb.LogicalRouterPort{
- UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID",
- Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName,
- Networks: []string{node1Network1RtoSIPCIDR},
+ UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID",
+ Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name,
+ Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR},
+ },
+ &nbdb.LogicalRouter{
+ Name: netInfo.GetNetworkScopedClusterRouterName(),
+ UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID",
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
},
&nbdb.LogicalRouter{
UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
- Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"},
+ Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"},
ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
},
&nbdb.LogicalSwitchPort{
@@ -2122,7 +2206,9 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
egressNodeIPsASv4,
// UDN
- getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP, node2Network1RtoSIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP, node2Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, IPFamilyValueV4,
+ netInfo.GetNetworkName(), DefaultNetworkControllerName),
getNoReRoutePolicyForUDNEnabledSvc(false, netInfo.GetNetworkName(), DefaultNetworkControllerName, egressIPServedPodsASUDNv4.Name, egressSVCServedPodsASv4.Name, udnEnabledSvcV4.Name),
&nbdb.LogicalRouterPolicy{
Priority: ovntypes.DefaultNoRereoutePriority,
@@ -2148,20 +2234,26 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
ExternalIDs: getEgressIPLRPNoReRoutePodToNodeDbIDs(IPFamilyValueV4, netInfo.GetNetworkName(), DefaultNetworkControllerName).GetExternalIDs(),
},
&nbdb.LogicalRouterPort{
- UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID",
- Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName,
- Networks: []string{node1Network1RtoSIPCIDR},
+ UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID",
+ Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name,
+ Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR},
},
&nbdb.LogicalRouter{
- UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
- Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
- Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"},
+ Name: netInfo.GetNetworkScopedClusterRouterName(),
+ UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID",
ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
- Policies: []string{"udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID",
- "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID",
+ Policies: []string{
+ "udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID",
fmt.Sprintf("%s-no-reroute-reply-traffic", netInfo.GetNetworkName()), getReRoutePolicyUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName())},
},
+ &nbdb.LogicalRouter{
+ UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
+ Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
+ Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"},
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
+ Policies: []string{getGWPktMarkLRPUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName())},
+ },
&nbdb.LogicalSwitchPort{
UUID: "k8s-" + networkName1_ + node1Name + "-UUID",
Name: "k8s-" + networkName1_ + node1Name,
@@ -2218,10 +2310,11 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
Name: networkName1,
Type: "ovn-k8s-cni-overlay",
},
- Role: ovntypes.NetworkRolePrimary,
- Topology: ovntypes.Layer2Topology,
- NADName: nadName,
- Subnets: v4Net1,
+ Role: ovntypes.NetworkRolePrimary,
+ Topology: ovntypes.Layer2Topology,
+ NADName: nadName,
+ Subnets: v4Net1,
+ TransitSubnet: config.ClusterManager.V4TransitSubnet,
}
nad, err := newNetworkAttachmentDefinition(
eipNamespace2,
@@ -2229,7 +2322,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
netconf,
)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
- nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID}
netInfo, err := util.NewNetInfo(&netconf)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
@@ -2244,6 +2336,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node1IPv4CIDR),
util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP),
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
labels := map[string]string{
"k8s.ovn.org/egress-assignable": "",
@@ -2260,6 +2353,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node2IPv4CIDR),
util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP),
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
node2 := getNodeObj(node2Name, node2Annotations, labels)
twoNodeStatus := []egressipv1.EgressIPStatusItem{
@@ -2316,14 +2410,19 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
},
// UDN start
&nbdb.LogicalRouterPort{
- UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID",
- Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName,
- Networks: []string{node1Network1RtoSIPCIDR},
+ UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID",
+ Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name,
+ Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR},
+ },
+ &nbdb.LogicalRouter{
+ Name: netInfo.GetNetworkScopedClusterRouterName(),
+ UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID",
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
},
&nbdb.LogicalRouter{
UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
- Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"},
+ Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"},
ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
},
&nbdb.LogicalSwitchPort{
@@ -2490,8 +2589,12 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
egressNodeIPsASv4,
// UDN
- getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP, node2Network1RtoSIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
- getReRoutePolicyForController(egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP, node2Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getReRoutePolicyForController(egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, IPFamilyValueV4,
+ netInfo.GetNetworkName(), DefaultNetworkControllerName),
+ getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, IPFamilyValueV4,
+ netInfo.GetNetworkName(), DefaultNetworkControllerName),
getNoReRoutePolicyForUDNEnabledSvc(false, netInfo.GetNetworkName(), DefaultNetworkControllerName, egressIPServedPodsASUDNv4.Name, egressSVCServedPodsASv4.Name, udnEnabledSvcV4.Name),
&nbdb.LogicalRouterPolicy{
Priority: ovntypes.DefaultNoRereoutePriority,
@@ -2517,21 +2620,27 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
ExternalIDs: getEgressIPLRPNoReRoutePodToNodeDbIDs(IPFamilyValueV4, netInfo.GetNetworkName(), DefaultNetworkControllerName).GetExternalIDs(),
},
&nbdb.LogicalRouterPort{
- UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID",
- Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName,
- Networks: []string{node1Network1RtoSIPCIDR},
+ UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID",
+ Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name,
+ Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR},
},
&nbdb.LogicalRouter{
- UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
- Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
- Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"},
+ Name: netInfo.GetNetworkScopedClusterRouterName(),
+ UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID",
ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
- Policies: []string{"udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID",
- "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID",
+ Policies: []string{
+ "udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID",
fmt.Sprintf("%s-no-reroute-reply-traffic", netInfo.GetNetworkName()), getReRoutePolicyUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()),
- getReRoutePolicyUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName()),
- },
+ getReRoutePolicyUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName())},
+ },
+ &nbdb.LogicalRouter{
+ UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID",
+ Name: netInfo.GetNetworkScopedGWRouterName(node1.Name),
+ Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"},
+ ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology},
+ Policies: []string{getGWPktMarkLRPUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()),
+ getGWPktMarkLRPUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName())},
},
&nbdb.LogicalSwitchPort{
UUID: "k8s-" + networkName1_ + node1Name + "-UUID",
@@ -2586,10 +2695,11 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
Name: networkName1,
Type: "ovn-k8s-cni-overlay",
},
- Role: ovntypes.NetworkRolePrimary,
- Topology: ovntypes.Layer2Topology,
- NADName: nadName,
- Subnets: v4Net1,
+ Role: ovntypes.NetworkRolePrimary,
+ Topology: ovntypes.Layer2Topology,
+ NADName: nadName,
+ Subnets: v4Net1,
+ TransitSubnet: config.ClusterManager.V4TransitSubnet,
}
nad, err := newNetworkAttachmentDefinition(
eipNamespace2,
@@ -2597,7 +2707,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
netconf,
)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
- nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID}
netInfo, err := util.NewNetInfo(&netconf)
gomega.Expect(err).NotTo(gomega.HaveOccurred())
@@ -2612,6 +2721,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol
util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node1IPv4CIDR),
util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP),
+ util.Layer2TopologyVersion: util.TransitRouterTopoVersion,
}
labels := map[string]string{
"k8s.ovn.org/egress-assignable": "", @@ -2628,6 +2738,7 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol util.OVNNodeHostCIDRs: fmt.Sprintf("[\"%s\"]", node2IPv4CIDR), util.OvnNodeL3GatewayConfig: fmt.Sprintf(`{"%s":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"%s", "next-hop":"%s", "next-hops": ["%s"]}, "default":{"mode":"local","mac-address":"7e:57:f8:f0:3c:49", "ip-address":"192.168.126.12/24", "next-hop": "192.168.126.1", "next-hops": ["192.168.126.1"]}}`, networkName1, v4Net1, gwIP, gwIP), + util.Layer2TopologyVersion: util.TransitRouterTopoVersion, } node2 := getNodeObj(node2Name, node2Annotations, nil) oneNodeStatus := []egressipv1.EgressIPStatusItem{ @@ -2680,14 +2791,19 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol }, // UDN start &nbdb.LogicalRouterPort{ - UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID", - Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName, - Networks: []string{node1Network1RtoSIPCIDR}, + UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID", + Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name, + Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR}, + }, + &nbdb.LogicalRouter{ + Name: netInfo.GetNetworkScopedClusterRouterName(), + UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID", + ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, }, &nbdb.LogicalRouter{ UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID", Name: netInfo.GetNetworkScopedGWRouterName(node1.Name), - Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"}, + Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"}, ExternalIDs: map[string]string{ovntypes.NetworkExternalID: networkName1, ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, }, &nbdb.LogicalSwitchPort{ @@ -2849,8 +2965,12 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol egressNodeIPsASv4, // UDN - getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), - getReRoutePolicyForController(egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, eIP1Mark, IPFamilyValueV4, []string{gwIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), + getReRoutePolicyForController(egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), + getReRoutePolicyForController(egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, eIP1Mark, IPFamilyValueV4, []string{node1Network1TransitIP}, netInfo.GetNetworkName(), DefaultNetworkControllerName), + getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName2, v4Pod1IPNode1Net1, IPFamilyValueV4, + netInfo.GetNetworkName(), DefaultNetworkControllerName), + getGWPktMarkLRPForController(eIP1Mark, egressIPName, eipNamespace2, podName4, v4Pod2IPNode2Net1, IPFamilyValueV4, + netInfo.GetNetworkName(), DefaultNetworkControllerName), getNoReRoutePolicyForUDNEnabledSvc(false, netInfo.GetNetworkName(), DefaultNetworkControllerName, egressIPServedPodsASUDNv4.Name, 
egressSVCServedPodsASv4.Name, udnEnabledSvcV4.Name), &nbdb.LogicalRouterPolicy{ Priority: ovntypes.DefaultNoRereoutePriority, @@ -2876,21 +2996,27 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol ExternalIDs: getEgressIPLRPNoReRoutePodToNodeDbIDs(IPFamilyValueV4, netInfo.GetNetworkName(), DefaultNetworkControllerName).GetExternalIDs(), }, &nbdb.LogicalRouterPort{ - UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID", - Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName, - Networks: []string{node1Network1RtoSIPCIDR}, + UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID", + Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name, + Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR}, }, &nbdb.LogicalRouter{ - UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID", - Name: netInfo.GetNetworkScopedGWRouterName(node1.Name), - Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"}, + Name: netInfo.GetNetworkScopedClusterRouterName(), + UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID", ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, - Policies: []string{"udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", - "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID", + Policies: []string{ + "udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID", fmt.Sprintf("%s-no-reroute-reply-traffic", netInfo.GetNetworkName()), getReRoutePolicyUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()), - getReRoutePolicyUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName()), - }, + getReRoutePolicyUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName())}, + }, + &nbdb.LogicalRouter{ + UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID", + Name: netInfo.GetNetworkScopedGWRouterName(node1.Name), + Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"}, + ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, + Policies: []string{getGWPktMarkLRPUUID(eipNamespace2, podName2, IPFamilyValueV4, netInfo.GetNetworkName()), + getGWPktMarkLRPUUID(eipNamespace2, podName4, IPFamilyValueV4, netInfo.GetNetworkName())}, }, &nbdb.LogicalSwitchPort{ UUID: "k8s-" + networkName1_ + node1Name + "-UUID", @@ -3006,19 +3132,23 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol ExternalIDs: getEgressIPLRPNoReRoutePodToNodeDbIDs(IPFamilyValueV4, netInfo.GetNetworkName(), DefaultNetworkControllerName).GetExternalIDs(), }, &nbdb.LogicalRouterPort{ - UUID: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID", - Name: ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName, - Networks: []string{node1Network1RtoSIPCIDR}, + UUID: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID", + Name: ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name, + Networks: []string{node1Network1JoinCIDR, node1Network1TransitCIDR}, + }, + &nbdb.LogicalRouter{ + Name: 
netInfo.GetNetworkScopedClusterRouterName(), + UUID: netInfo.GetNetworkScopedClusterRouterName() + "-UUID", + ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, + Policies: []string{ + "udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID", + fmt.Sprintf("%s-no-reroute-reply-traffic", netInfo.GetNetworkName())}, }, &nbdb.LogicalRouter{ UUID: netInfo.GetNetworkScopedGWRouterName(node1.Name) + "-UUID", Name: netInfo.GetNetworkScopedGWRouterName(node1.Name), - Ports: []string{ovntypes.RouterToSwitchPrefix + networkName1_ + layer2SwitchName + "-UUID"}, + Ports: []string{ovntypes.RouterToTransitRouterPrefix + ovntypes.GWRouterPrefix + networkName1_ + node1.Name + "-UUID"}, ExternalIDs: map[string]string{ovntypes.NetworkExternalID: netInfo.GetNetworkName(), ovntypes.TopologyExternalID: ovntypes.Layer2Topology}, - Policies: []string{"udn-default-no-reroute-node-UUID", "udn-default-no-reroute-UUID", - "udn-no-reroute-service-UUID", "udn-enabled-svc-no-reroute-UUID", - fmt.Sprintf("%s-no-reroute-reply-traffic", netInfo.GetNetworkName()), - }, }, &nbdb.LogicalSwitchPort{ UUID: "k8s-" + networkName1_ + node1Name + "-UUID", diff --git a/go-controller/pkg/ovn/egressip_udn_l3_test.go b/go-controller/pkg/ovn/egressip_udn_l3_test.go index 28035e8374..cf6875f446 100644 --- a/go-controller/pkg/ovn/egressip_udn_l3_test.go +++ b/go-controller/pkg/ovn/egressip_udn_l3_test.go @@ -157,7 +157,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol netconf, ) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID} netInfo, err := util.NewNetInfo(&netconf) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -534,7 +533,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol netconf, ) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID} netInfo, err := util.NewNetInfo(&netconf) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -1055,7 +1053,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol netconf, ) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID} netInfo, err := util.NewNetInfo(&netconf) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -1791,7 +1788,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol netconf, ) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID} netInfo, err := util.NewNetInfo(&netconf) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -2161,7 +2157,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol netconf, ) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID} netInfo, err := util.NewNetInfo(&netconf) gomega.Expect(err).NotTo(gomega.HaveOccurred()) @@ -2522,7 +2517,6 @@ var _ = ginkgo.Describe("EgressIP Operations for user defined network with topol netconf, ) gomega.Expect(err).NotTo(gomega.HaveOccurred()) - nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID} netInfo, err := 
util.NewNetInfo(&netconf) gomega.Expect(err).NotTo(gomega.HaveOccurred()) diff --git a/go-controller/pkg/ovn/gateway.go b/go-controller/pkg/ovn/gateway.go index 4c4f63d3fa..4ab5d5d467 100644 --- a/go-controller/pkg/ovn/gateway.go +++ b/go-controller/pkg/ovn/gateway.go @@ -14,6 +14,7 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" utilnet "k8s.io/utils/net" + "k8s.io/utils/ptr" libovsdbclient "github.com/ovn-kubernetes/libovsdb/client" @@ -54,6 +55,8 @@ type GatewayManager struct { // Cluster wide router Load_Balancer_Group UUID. // Includes all node gateway routers. routerLoadBalancerGroupUUID string + + transitRouterInfo *transitRouterInfo } type GatewayOption func(*GatewayManager) @@ -65,14 +68,18 @@ func NewGatewayManagerForLayer2Topology( nbClient libovsdbclient.Client, netInfo util.NetInfo, watchFactory *factory.WatchFactory, + useTransitRouter bool, opts ...GatewayOption, ) *GatewayManager { + routerName := "" + if useTransitRouter { + routerName = netInfo.GetNetworkScopedClusterRouterName() + } return newGWManager( nodeName, - "", - netInfo.GetNetworkScopedGWRouterName(nodeName), + routerName, netInfo.GetNetworkScopedExtSwitchName(nodeName), - netInfo.GetNetworkScopedName(types.OVNLayer2Switch), + netInfo.GetNetworkScopedSwitchName(""), coopUUID, kube, nbClient, @@ -94,7 +101,6 @@ func NewGatewayManager( return newGWManager( nodeName, netInfo.GetNetworkScopedClusterRouterName(), - netInfo.GetNetworkScopedGWRouterName(nodeName), netInfo.GetNetworkScopedExtSwitchName(nodeName), netInfo.GetNetworkScopedJoinSwitchName(), coopUUID, @@ -107,7 +113,7 @@ func NewGatewayManager( } func newGWManager( - nodeName, clusterRouterName, gwRouterName, extSwitchName, joinSwitchName string, + nodeName, clusterRouterName, extSwitchName, joinSwitchName string, coopUUID string, kube kube.InterfaceOVN, nbClient libovsdbclient.Client, @@ -117,7 +123,7 @@ func newGWManager( gwManager := &GatewayManager{ nodeName: nodeName, clusterRouterName: clusterRouterName, - gwRouterName: gwRouterName, + gwRouterName: netInfo.GetNetworkScopedGWRouterName(nodeName), extSwitchName: extSwitchName, joinSwitchName: joinSwitchName, coppUUID: coopUUID, @@ -241,7 +247,7 @@ func (gw *GatewayManager) cleanupStalePodSNATs(nodeName string, nodeIPs []*net.I return nil } -func (gw *GatewayManager) createGWRouter(l3GatewayConfig *util.L3GatewayConfig, gwLRPJoinIPs []*net.IPNet) (*nbdb.LogicalRouter, error) { +func (gw *GatewayManager) createGWRouter(gwConfig *GatewayConfig) (*nbdb.LogicalRouter, error) { // Create a gateway router. dynamicNeighRouters := "true" if config.OVNKubernetesFeature.EnableInterconnect { @@ -251,7 +257,7 @@ func (gw *GatewayManager) createGWRouter(l3GatewayConfig *util.L3GatewayConfig, logicalRouterOptions := map[string]string{ "always_learn_from_arp_request": "false", "dynamic_neigh_routers": dynamicNeighRouters, - "chassis": l3GatewayConfig.ChassisID, + "chassis": gwConfig.annoConfig.ChassisID, "lb_force_snat_ip": "router_ip", "mac_binding_age_threshold": types.GRMACBindingAgeThreshold, } @@ -267,14 +273,10 @@ func (gw *GatewayManager) createGWRouter(l3GatewayConfig *util.L3GatewayConfig, // when it comes to SNATing traffic after load balancing. 
// Hence for Layer2 UDPNs let's set the snat-ip explicitly to the // joinsubnetIP - joinIPDualStack := make([]string, len(gwLRPJoinIPs)) - for i, gwLRPJoinIP := range gwLRPJoinIPs { - joinIPDualStack[i] = gwLRPJoinIP.IP.String() - } - logicalRouterOptions["lb_force_snat_ip"] = strings.Join(joinIPDualStack, " ") + logicalRouterOptions["lb_force_snat_ip"] = strings.Join(util.IPNetsIPToStringSlice(gwConfig.gwRouterJoinCIDRs), " ") } - physicalIPs := make([]string, len(l3GatewayConfig.IPAddresses)) - for i, ip := range l3GatewayConfig.IPAddresses { + physicalIPs := make([]string, len(gwConfig.annoConfig.IPAddresses)) + for i, ip := range gwConfig.annoConfig.IPAddresses { physicalIPs[i] = ip.IP.String() } logicalRouterExternalIDs := map[string]string{ @@ -295,7 +297,7 @@ func (gw *GatewayManager) createGWRouter(l3GatewayConfig *util.L3GatewayConfig, if gw.clusterLoadBalancerGroupUUID != "" { gwRouter.LoadBalancerGroup = []string{gw.clusterLoadBalancerGroupUUID} - if l3GatewayConfig.NodePortEnable && gw.routerLoadBalancerGroupUUID != "" { + if gwConfig.annoConfig.NodePortEnable && gw.routerLoadBalancerGroupUUID != "" { // add routerLoadBalancerGroupUUID to the gateway router only if nodePort is enabled gwRouter.LoadBalancerGroup = append(gwRouter.LoadBalancerGroup, gw.routerLoadBalancerGroupUUID) } @@ -309,31 +311,29 @@ func (gw *GatewayManager) createGWRouter(l3GatewayConfig *util.L3GatewayConfig, return &gwRouter, nil } -func (gw *GatewayManager) getGWRouterPeerPortName() string { - // In Layer2 networks there is no join switch and the gw.joinSwitchName points to the cluster switch. - // Ensure that the ports are named appropriately, this is important for the logical router policies - // created for local node access. - // TODO(kyrtapz): Clean this up for clarity as part of https://github.com/ovn-org/ovn-kubernetes/issues/4689 +func (gw *GatewayManager) getGWRouterPeerRouterPortName() string { + return types.TransitRouterToRouterPrefix + gw.gwRouterName +} + +func (gw *GatewayManager) getGWRouterPeerSwitchPortName() string { if gw.netInfo.TopologyType() == types.Layer2Topology { return types.SwitchToRouterPrefix + gw.joinSwitchName } - return types.JoinSwitchToGWRouterPrefix + gw.gwRouterName } func (gw *GatewayManager) getGWRouterPortName() string { - // In Layer2 networks there is no join switch and the gw.joinSwitchName points to the cluster switch. - // Ensure that the ports are named appropriately, this is important for the logical router policies - // created for local node access. 
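// Aside (illustrative, standalone sketch): the helpers renamed here encode
// the port-pair naming for the two layer2 variants. "rtos-"/"stor-" are the
// long-standing router/switch patch-port prefixes; "rtotr-" appears in a
// route comment later in this diff, while "trtor-" is an assumed value for
// types.TransitRouterToRouterPrefix. All names below are hypothetical.

package main

import "fmt"

func main() {
	const gwRouter = "GR_tenantred_node1"          // hypothetical GR name
	const l2Switch = "tenantred_ovn_layer2_switch" // hypothetical switch name

	// Old layer2 topology: the GR patches straight into the cluster switch.
	fmt.Println("GR port:", "rtos-"+l2Switch, "<-> peer LSP:", "stor-"+l2Switch)

	// Transit-router topology: the GR peers with a router port instead.
	fmt.Println("GR port:", "rtotr-"+gwRouter, "<-> peer LRP:", "trtor-"+gwRouter)
}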
- // TODO(kyrtapz): Clean this up for clarity as part of https://github.com/ovn-org/ovn-kubernetes/issues/4689 if gw.netInfo.TopologyType() == types.Layer2Topology { + if gw.transitRouterInfo != nil { + return types.RouterToTransitRouterPrefix + gw.gwRouterName + } return types.RouterToSwitchPrefix + gw.joinSwitchName } return types.GWRouterToJoinSwitchPrefix + gw.gwRouterName } -func (gw *GatewayManager) createGWRouterPeerPort(nodeName string) error { - gwSwitchPort := gw.getGWRouterPeerPortName() +func (gw *GatewayManager) createGWRouterPeerSwitchPort(nodeName string) error { + gwSwitchPort := gw.getGWRouterPeerSwitchPortName() gwRouterPortName := gw.getGWRouterPortName() logicalSwitchPort := nbdb.LogicalSwitchPort{ @@ -375,25 +375,76 @@ func (gw *GatewayManager) createGWRouterPeerPort(nodeName string) error { return err } -func (gw *GatewayManager) createGWRouterPort(hostSubnets []*net.IPNet, gwLRPJoinIPs []*net.IPNet, - enableGatewayMTU bool, gwRouter *nbdb.LogicalRouter) ([]net.IP, error) { - gwLRPIPs := make([]net.IP, 0) +func (gw *GatewayManager) deleteGWRouterPeerSwitchPort() error { + // Remove the patch port that connects join switch to gateway router + lsp := nbdb.LogicalSwitchPort{Name: gw.getGWRouterPeerSwitchPortName()} + sw := nbdb.LogicalSwitch{Name: gw.joinSwitchName} + err := libovsdbops.DeleteLogicalSwitchPorts(gw.nbClient, &sw, &lsp) + if err != nil && !errors.Is(err, libovsdbclient.ErrNotFound) { + return fmt.Errorf("failed to delete logical switch port %s from switch %s: %w", lsp.Name, sw.Name, err) + } + return nil +} + +func (gw *GatewayManager) createGWRouterPeerRouterPort() error { + gwPeerPortName := gw.getGWRouterPeerRouterPortName() + gwRouterPortName := gw.getGWRouterPortName() + + ovnClusterRouterToGWRouterPort := nbdb.LogicalRouterPort{ + Name: gwPeerPortName, + MAC: util.IPAddrToHWAddr(gw.transitRouterInfo.transitRouterNets[0].IP).String(), + Networks: util.IPNetsToStringSlice(gw.transitRouterInfo.transitRouterNets), + Options: map[string]string{ + libovsdbops.RequestedTnlKey: fmt.Sprintf("%d", gw.transitRouterInfo.nodeID), + }, + Peer: ptr.To(gwRouterPortName), + ExternalIDs: map[string]string{ + types.NetworkExternalID: gw.netInfo.GetNetworkName(), + types.TopologyExternalID: gw.netInfo.TopologyType(), + }, + } + + ovnClusterRouter := nbdb.LogicalRouter{Name: gw.clusterRouterName} + err := libovsdbops.CreateOrUpdateLogicalRouterPort(gw.nbClient, &ovnClusterRouter, + &ovnClusterRouterToGWRouterPort, nil, &ovnClusterRouterToGWRouterPort.MAC, &ovnClusterRouterToGWRouterPort.Networks, + &ovnClusterRouterToGWRouterPort.Options, &ovnClusterRouterToGWRouterPort.Peer, &ovnClusterRouterToGWRouterPort.ExternalIDs) + if err != nil { + return fmt.Errorf("failed to create port %+v on router %+v: %v", ovnClusterRouterToGWRouterPort, ovnClusterRouter, err) + } + return nil +} + +func (gw *GatewayManager) deleteGWRouterPeerRouterPort() error { + ovnClusterRouterToGWRouterPort := nbdb.LogicalRouterPort{Name: gw.getGWRouterPeerRouterPortName()} + ovnClusterRouter := nbdb.LogicalRouter{Name: gw.clusterRouterName} + err := libovsdbops.DeleteLogicalRouterPorts(gw.nbClient, &ovnClusterRouter, &ovnClusterRouterToGWRouterPort) + if err != nil && !errors.Is(err, libovsdbclient.ErrNotFound) { + return fmt.Errorf("failed to delete router port %s from router %s: %w", ovnClusterRouterToGWRouterPort.Name, ovnClusterRouter.Name, err) + } + return nil +} + +func (gw *GatewayManager) createGWRouterPort(gwConfig *GatewayConfig, + enableGatewayMTU bool, gwRouter *nbdb.LogicalRouter) error 
{ gwLRPNetworks := []string{} - for _, gwLRPJoinIP := range gwLRPJoinIPs { - gwLRPIPs = append(gwLRPIPs, gwLRPJoinIP.IP) - gwLRPNetworks = append(gwLRPNetworks, gwLRPJoinIP.String()) + for _, gwRouterJoinNet := range gwConfig.gwRouterJoinCIDRs { + gwLRPNetworks = append(gwLRPNetworks, gwRouterJoinNet.String()) } - if gw.netInfo.TopologyType() == types.Layer2Topology { + if gw.netInfo.TopologyType() == types.Layer2Topology && gw.transitRouterInfo == nil { // At layer2 GR LRP acts as the layer3 ovn_cluster_router so we need // to configure here the .1 address, this will work only for IC with // one node per zone, since ARPs for .1 will not go beyond local switch. // This is being done to add the ICMP SNATs for .1 podSubnet that OVN GR generates - for _, subnet := range hostSubnets { - gwLRPIPs = append(gwLRPIPs, gw.netInfo.GetNodeGatewayIP(subnet).IP) + for _, subnet := range gwConfig.hostSubnets { gwLRPNetworks = append(gwLRPNetworks, gw.netInfo.GetNodeGatewayIP(subnet).String()) } } - gwLRPMAC := util.IPAddrToHWAddr(gwLRPIPs[0]) + if gw.netInfo.TopologyType() == types.Layer2Topology && gw.transitRouterInfo != nil { + for _, gatewayRouterTransitNetwork := range gw.transitRouterInfo.gatewayRouterNets { + gwLRPNetworks = append(gwLRPNetworks, gatewayRouterTransitNetwork.String()) + } + } + gwLRPMAC := util.IPAddrToHWAddr(gwConfig.gwRouterJoinCIDRs[0].IP) var options map[string]string if enableGatewayMTU { @@ -413,8 +464,12 @@ func (gw *GatewayManager) createGWRouterPort(hostSubnets []*net.IPNet, gwLRPJoin types.NetworkExternalID: gw.netInfo.GetNetworkName(), types.TopologyExternalID: gw.netInfo.TopologyType(), } + if gw.netInfo.TopologyType() == types.Layer2Topology && gw.transitRouterInfo != nil { + gwRouterPort.Peer = ptr.To(gw.getGWRouterPeerRouterPortName()) + } + _, isNetIPv6 := gw.netInfo.IPMode() - if gw.netInfo.TopologyType() == types.Layer2Topology && isNetIPv6 && config.IPv6Mode { + if gw.netInfo.TopologyType() == types.Layer2Topology && isNetIPv6 && config.IPv6Mode && gw.transitRouterInfo == nil { gwRouterPort.Ipv6RaConfigs = map[string]string{ "address_mode": "dhcpv6_stateful", "send_periodic": "true", @@ -432,16 +487,17 @@ func (gw *GatewayManager) createGWRouterPort(hostSubnets []*net.IPNet, gwLRPJoin &gwRouterPort, nil, &gwRouterPort.MAC, &gwRouterPort.Networks, &gwRouterPort.Options) if err != nil { - return nil, fmt.Errorf("failed to create port %+v on router %+v: %v", gwRouterPort, gwRouter, err) + return fmt.Errorf("failed to create port %+v on router %+v: %v", gwRouterPort, gwRouter, err) } - return gwLRPIPs, nil + return nil } -func (gw *GatewayManager) updateGWRouterStaticRoutes(clusterIPSubnet, drLRPIfAddrs []*net.IPNet, - l3GatewayConfig *util.L3GatewayConfig, externalRouterPort string, gwRouter *nbdb.LogicalRouter) error { - if len(drLRPIfAddrs) > 0 { - for _, entry := range clusterIPSubnet { - drLRPIfAddr, err := util.MatchFirstIPNetFamily(utilnet.IsIPv6CIDR(entry), drLRPIfAddrs) +func (gw *GatewayManager) updateGWRouterStaticRoutes(gwConfig *GatewayConfig, externalRouterPort string, + gwRouter *nbdb.LogicalRouter) error { + if len(gwConfig.ovnClusterLRPToJoinIfAddrs) > 0 { + // this is only the case for layer3 topology + for _, entry := range gwConfig.clusterSubnets { + drLRPIfAddr, err := util.MatchFirstIPNetFamily(utilnet.IsIPv6CIDR(entry), gwConfig.ovnClusterLRPToJoinIfAddrs) if err != nil { return fmt.Errorf("failed to add a static route in GR %s with distributed "+ "router as the nexthop: %v", @@ -482,6 +538,33 @@ func (gw *GatewayManager) 
updateGWRouterStaticRoutes(clusterIPSubnet, drLRPIfAdd } } } + // for layer2 topology with transit router, add pod subnet routes via transit router, like so: + // 10.10.0.0/24 100.88.0.8 dst-ip rtotr-GR_ + if gw.netInfo.TopologyType() == types.Layer2Topology && gw.transitRouterInfo != nil { + for _, subnet := range gwConfig.hostSubnets { + nexthop, err := util.MatchFirstIPNetFamily(utilnet.IsIPv6(subnet.IP), gw.transitRouterInfo.transitRouterNets) + if err != nil { + return err + } + subnetRoute := nbdb.LogicalRouterStaticRoute{ + IPPrefix: subnet.String(), + Nexthop: nexthop.IP.String(), + OutputPort: ptr.To(gw.getGWRouterPortName()), + } + subnetRoute.ExternalIDs = map[string]string{ + types.NetworkExternalID: gw.netInfo.GetNetworkName(), + types.TopologyExternalID: gw.netInfo.TopologyType(), + } + p := func(item *nbdb.LogicalRouterStaticRoute) bool { + return item.OutputPort != nil && *item.OutputPort == *subnetRoute.OutputPort && item.IPPrefix == subnetRoute.IPPrefix && + libovsdbops.PolicyEqualPredicate(subnetRoute.Policy, item.Policy) + } + if err := libovsdbops.CreateOrReplaceLogicalRouterStaticRouteWithPredicate(gw.nbClient, gw.gwRouterName, &subnetRoute, + p, &subnetRoute.Nexthop); err != nil { + return fmt.Errorf("error creating static route %+v in GW router %s: %v", subnetRoute, gw.gwRouterName, err) + } + } + } for _, nextHop := range node.DummyNextHopIPs() { // Add return service route for OVN back to host @@ -512,7 +595,7 @@ func (gw *GatewayManager) updateGWRouterStaticRoutes(clusterIPSubnet, drLRPIfAdd } } - nextHops := l3GatewayConfig.NextHops + nextHops := gwConfig.annoConfig.NextHops // Add default gateway routes in GR for _, nextHop := range nextHops { var allIPs string @@ -543,21 +626,38 @@ func (gw *GatewayManager) updateGWRouterStaticRoutes(clusterIPSubnet, drLRPIfAdd return fmt.Errorf("error creating static route %+v in GR %s: %v", lrsr, gw.gwRouterName, err) } } - return nil } -func (gw *GatewayManager) updateClusterRouterStaticRoutes(hostSubnets []*net.IPNet, gwLRPIPs []net.IP) error { +func (gw *GatewayManager) updateClusterRouterStaticRoutes(gwConfig *GatewayConfig, gwRouterIPs []net.IP) error { // We need to add a route to the Gateway router's IP, on the // cluster router, to ensure that the return traffic goes back // to the same gateway router // // This can be removed once https://bugzilla.redhat.com/show_bug.cgi?id=1891516 is fixed. // FIXME(trozet): if LRP IP is changed, we do not remove stale instances of these routes - for _, gwLRPIP := range gwLRPIPs { + nextHops := gwRouterIPs + if gw.netInfo.TopologyType() == types.Layer2Topology && gw.transitRouterInfo != nil { + nextHops = util.IPNetsToIPs(gw.transitRouterInfo.gatewayRouterNets) + } + + for _, gwRouterIP := range gwRouterIPs { + nextHop, err := util.MatchIPFamily(utilnet.IsIPv6(gwRouterIP), nextHops) + if err != nil { + if gw.transitRouterInfo != nil { + // for layer2 networks with transit router it is not an error. 
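// Aside (illustrative, standalone sketch): the per-family matching above is
// why a missing nexthop can be tolerated. A stdlib-only sketch with
// hypothetical addresses and a deliberately v4-only transit network:

package main

import (
	"fmt"
	"net"
)

// matchFirstFamily plays the role util.MatchFirstIPNetFamily has in the diff:
// return the first candidate of the requested IP family.
func matchFirstFamily(wantV6 bool, nets []*net.IPNet) (*net.IPNet, error) {
	for _, n := range nets {
		if (n.IP.To4() == nil) == wantV6 {
			return n, nil
		}
	}
	return nil, fmt.Errorf("no candidate of the requested family")
}

func main() {
	ip, transitNet, _ := net.ParseCIDR("100.88.0.8/16") // hypothetical transit leg
	transitNet.IP = ip
	candidates := []*net.IPNet{transitNet}

	_, podV4, _ := net.ParseCIDR("10.10.0.0/24")
	if nh, err := matchFirstFamily(false, candidates); err == nil {
		fmt.Printf("route %s via %s\n", podV4, nh.IP) // route 10.10.0.0/24 via 100.88.0.8
	}

	// No IPv6 transit address exists, so the v6 prefix is skipped, not fatal.
	_, podV6, _ := net.ParseCIDR("fd01::/64")
	if _, err := matchFirstFamily(true, candidates); err != nil {
		fmt.Printf("skip %s: %v\n", podV6, err)
	}
}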
+ // JoinIPs are allocated for both IP families always, but transit router IPs and routes + // are only created for the actual IP families of the network + continue + } + return fmt.Errorf("failed to add source IP address based "+ + "routes in distributed router %s: %v", + gw.clusterRouterName, err) + } + lrsr := nbdb.LogicalRouterStaticRoute{ - IPPrefix: gwLRPIP.String(), - Nexthop: gwLRPIP.String(), + IPPrefix: gwRouterIP.String(), + Nexthop: nextHop[0].String(), } if gw.netInfo.IsUserDefinedNetwork() { lrsr.ExternalIDs = map[string]string{ @@ -571,21 +671,21 @@ func (gw *GatewayManager) updateClusterRouterStaticRoutes(hostSubnets []*net.IPN } if gw.clusterRouterName != "" { - err := libovsdbops.CreateOrReplaceLogicalRouterStaticRouteWithPredicate(gw.nbClient, + err = libovsdbops.CreateOrReplaceLogicalRouterStaticRouteWithPredicate(gw.nbClient, gw.clusterRouterName, &lrsr, p, &lrsr.Nexthop) if err != nil { return fmt.Errorf("error creating static route %+v in %s: %v", lrsr, gw.clusterRouterName, err) } } } + if gw.clusterRouterName == "" { + return nil + } // Add source IP address based routes in distributed router // for this gateway router. - for _, hostSubnet := range hostSubnets { - if gw.clusterRouterName == "" { - break - } - gwLRPIP, err := util.MatchIPFamily(utilnet.IsIPv6CIDR(hostSubnet), gwLRPIPs) + for _, hostSubnet := range gwConfig.hostSubnets { + nextHop, err := util.MatchIPFamily(utilnet.IsIPv6CIDR(hostSubnet), nextHops) if err != nil { return fmt.Errorf("failed to add source IP address based "+ "routes in distributed router %s: %v", @@ -595,7 +695,7 @@ func (gw *GatewayManager) updateClusterRouterStaticRoutes(hostSubnets []*net.IPN lrsr := nbdb.LogicalRouterStaticRoute{ Policy: &nbdb.LogicalRouterStaticRoutePolicySrcIP, IPPrefix: hostSubnet.String(), - Nexthop: gwLRPIP[0].String(), + Nexthop: nextHop[0].String(), } if config.Gateway.Mode != config.GatewayModeLocal { @@ -627,18 +727,16 @@ func (gw *GatewayManager) updateClusterRouterStaticRoutes(hostSubnets []*net.IPN // If migrating from shared to local gateway, let's remove the static routes towards // join switch for the hostSubnet prefix and any potential routes for UDN enabled services. 
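// Aside (illustrative, compiles within the ovn-kubernetes tree): the two
// route flavors this function maintains per node on the cluster/transit
// router, written out as literals. All addresses are made up; the fields
// mirror the nbdb schema used throughout this file.

package ovn

import "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb"

func exampleNodeRoutes() []nbdb.LogicalRouterStaticRoute {
	return []nbdb.LogicalRouterStaticRoute{
		{
			// Pin return traffic for a GR's join IP to that same GR; with a
			// transit router the nexthop is the GR's transit-side address.
			IPPrefix: "100.65.0.4", // GR join IP (made up)
			Nexthop:  "100.88.0.8", // GR transit address (made up)
		},
		{
			// Source-based route steering pod-sourced traffic out the GR.
			Policy:   &nbdb.LogicalRouterStaticRoutePolicySrcIP,
			IPPrefix: "10.244.0.0/16", // pod subnet (made up)
			Nexthop:  "100.88.0.8",
		},
	}
}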
// Note syncManagementPort happens before gateway sync so only remove things pointing to join subnet - if gw.clusterRouterName != "" { - p := func(item *nbdb.LogicalRouterStaticRoute) bool { - if _, ok := item.ExternalIDs[types.UDNEnabledServiceExternalID]; ok { - return true - } - return item.IPPrefix == lrsr.IPPrefix && item.Policy != nil && *item.Policy == *lrsr.Policy && - gw.containsJoinIP(net.ParseIP(item.Nexthop)) - } - err := libovsdbops.DeleteLogicalRouterStaticRoutesWithPredicate(gw.nbClient, gw.clusterRouterName, p) - if err != nil { - return fmt.Errorf("error deleting static route %+v in GR %s: %v", lrsr, gw.clusterRouterName, err) + p := func(item *nbdb.LogicalRouterStaticRoute) bool { + if _, ok := item.ExternalIDs[types.UDNEnabledServiceExternalID]; ok { + return true } + return item.IPPrefix == lrsr.IPPrefix && item.Policy != nil && *item.Policy == *lrsr.Policy && + gw.containsJoinIP(net.ParseIP(item.Nexthop)) + } + err := libovsdbops.DeleteLogicalRouterStaticRoutesWithPredicate(gw.nbClient, gw.clusterRouterName, p) + if err != nil { + return fmt.Errorf("error deleting static route %+v in GR %s: %v", lrsr, gw.clusterRouterName, err) } } } @@ -659,7 +757,7 @@ func (gw *GatewayManager) updateClusterRouterStaticRoutes(hostSubnets []*net.IPN // This function also updates SNAT created by `updateGWRouterNAT`, because NATs don't use ExternalIDs, // and their fields are used to find equivalent NATs. That means on gateway IPs change, instead of updating // the old NAT, we would create a new one. FIXME: add externalIDs to NATs -func (gw *GatewayManager) syncNATsForGRIPChange(externalIPs, oldExtIPs, gwLRPIPs []net.IP, +func (gw *GatewayManager) syncNATsForGRIPChange(gwConfig *GatewayConfig, oldExtIPs, gwRouterIPs []net.IP, gwRouter, oldGWRouter *nbdb.LogicalRouter) error { // if config.Gateway.DisabledSNATMultipleGWs is not set (by default it is not), // the NAT rules for pods not having annotations to route through either external @@ -685,7 +783,7 @@ func (gw *GatewayManager) syncNATsForGRIPChange(externalIPs, oldExtIPs, gwLRPIPs } // check external ip changed - for _, externalIP := range externalIPs { + for _, externalIP := range gwConfig.externalIPs { oldExternalIP, err := util.MatchFirstIPFamily(utilnet.IsIPv6(externalIP), oldExtIPs) if err != nil { return fmt.Errorf("failed to update GW SNAT rule for pods on router %s error: %v", gw.gwRouterName, err) @@ -707,10 +805,10 @@ func (gw *GatewayManager) syncNATsForGRIPChange(externalIPs, oldExtIPs, gwLRPIPs // check if join ip changed if gw.containsJoinIP(parsedLogicalIP) { // is a join SNAT, check if IP needs updating - joinIP, err := util.MatchFirstIPFamily(utilnet.IsIPv6(parsedLogicalIP), gwLRPIPs) + joinIP, err := util.MatchFirstIPFamily(utilnet.IsIPv6(parsedLogicalIP), gwRouterIPs) if err != nil { return fmt.Errorf("failed to find valid IP family match for join subnet IP: %s on "+ - "gateway router: %s, provided IPs: %#v", parsedLogicalIP, gw.gwRouterName, gwLRPIPs) + "gateway router: %s, provided IPs: %#v", parsedLogicalIP, gw.gwRouterName, gwRouterIPs) } if nat.LogicalIP != joinIP.String() { // needs to be updated @@ -732,8 +830,7 @@ func (gw *GatewayManager) syncNATsForGRIPChange(externalIPs, oldExtIPs, gwLRPIPs return nil } -func (gw *GatewayManager) updateGWRouterNAT(nodeName string, clusterIPSubnet []*net.IPNet, l3GatewayConfig *util.L3GatewayConfig, - externalIPs, gwLRPIPs []net.IP, gwRouter *nbdb.LogicalRouter) error { +func (gw *GatewayManager) updateGWRouterNAT(nodeName string, gwConfig *GatewayConfig, gwLRPIPs 
[]net.IP, gwRouter *nbdb.LogicalRouter) error { // REMOVEME(trozet) workaround - create join subnet SNAT to handle ICMP needs frag return var extIDs map[string]string if gw.netInfo.IsUserDefinedNetwork() { @@ -744,7 +841,7 @@ func (gw *GatewayManager) updateGWRouterNAT(nodeName string, clusterIPSubnet []* } joinNATs := make([]*nbdb.NAT, 0, len(gwLRPIPs)) for _, gwLRPIP := range gwLRPIPs { - externalIP, err := util.MatchIPFamily(utilnet.IsIPv6(gwLRPIP), externalIPs) + externalIP, err := util.MatchIPFamily(utilnet.IsIPv6(gwLRPIP), gwConfig.externalIPs) if err != nil { return fmt.Errorf("failed to find valid external IP family match for join subnet IP: %s on "+ "gateway router: %s", gwLRPIP, gw.gwRouterName) @@ -761,15 +858,15 @@ func (gw *GatewayManager) updateGWRouterNAT(nodeName string, clusterIPSubnet []* return fmt.Errorf("failed to create SNAT rule for join subnet on router %s error: %v", gw.gwRouterName, err) } - nats := make([]*nbdb.NAT, 0, len(clusterIPSubnet)) + nats := make([]*nbdb.NAT, 0, len(gwConfig.clusterSubnets)) var nat *nbdb.NAT // DisableSNATMultipleGWs is only applicable to cluster default network and not to user defined networks. // For user defined networks, we always add SNAT rules regardless of whether the network is advertised or not. if !config.Gateway.DisableSNATMultipleGWs || gw.netInfo.IsPrimaryNetwork() { // Default SNAT rules. DisableSNATMultipleGWs=false in LGW (traffic egresses via mp0) always. // We are not checking for gateway mode to be shared explicitly to reduce topology differences. - for _, entry := range clusterIPSubnet { - externalIP, err := util.MatchIPFamily(utilnet.IsIPv6CIDR(entry), externalIPs) + for _, entry := range gwConfig.clusterSubnets { + externalIP, err := util.MatchIPFamily(utilnet.IsIPv6CIDR(entry), gwConfig.externalIPs) if err != nil { return fmt.Errorf("failed to create default SNAT rules for gateway router %s: %v", gw.gwRouterName, err) @@ -780,7 +877,8 @@ func (gw *GatewayManager) updateGWRouterNAT(nodeName string, clusterIPSubnet []* if utilnet.IsIPv6CIDR(entry) { ipFamily = utilnet.IPv6 } - snatMatch, err := GetNetworkScopedClusterSubnetSNATMatch(gw.nbClient, gw.netInfo, nodeName, gw.isRoutingAdvertised(nodeName), ipFamily) + snatMatch, err := GetNetworkScopedClusterSubnetSNATMatch(gw.nbClient, gw.netInfo, nodeName, + gw.isRoutingAdvertised(nodeName), ipFamily) if err != nil { return fmt.Errorf("failed to get SNAT match for node %s for network %s: %w", nodeName, gw.netInfo.GetNetworkName(), err) } @@ -794,7 +892,7 @@ func (gw *GatewayManager) updateGWRouterNAT(nodeName string, clusterIPSubnet []* } } else { // ensure we do not have any leftover SNAT entries after an upgrade - for _, logicalSubnet := range clusterIPSubnet { + for _, logicalSubnet := range gwConfig.clusterSubnets { nat = libovsdbops.BuildSNAT(nil, logicalSubnet, "", extIDs) nats = append(nats, nat) } @@ -804,7 +902,7 @@ func (gw *GatewayManager) updateGWRouterNAT(nodeName string, clusterIPSubnet []* } } - if err = gw.cleanupStalePodSNATs(nodeName, l3GatewayConfig.IPAddresses, gwLRPIPs); err != nil { + if err = gw.cleanupStalePodSNATs(nodeName, gwConfig.annoConfig.IPAddresses, gwLRPIPs); err != nil { return fmt.Errorf("failed to sync stale SNATs on node %s: %v", nodeName, err) } return nil @@ -818,6 +916,18 @@ func (gw *GatewayManager) gatewayInit( enableGatewayMTU bool, ) error { + if gw.netInfo.TopologyType() == types.Layer2Topology && gw.clusterRouterName != "" { + // layer2 network uses transit router, so we need to set the transit router info + // in all the 
other operations, we can use either `gw.clusterRouterName == ""` or `gw.transitRouterInfo == nil` + // as an indicator of the old topology. + err := gw.setTransitRouterInfo(nodeName) + if err != nil { + return fmt.Errorf("failed to initialize layer2 info for gateway on node %s: %v", nodeName, err) + } + if err = gw.oldLayer2TopoCleanup(); err != nil { + return fmt.Errorf("failed to cleanup old layer2 topology for gateway on node %s: %v", nodeName, err) + } + } // If l3gatewayAnnotation.IPAddresses changed, we need to update the perPodSNATs, // so let's save the old value before we update the router for later use var oldExtIPs []net.IP @@ -844,16 +954,21 @@ func (gw *GatewayManager) gatewayInit( } } - gwRouter, err := gw.createGWRouter(gwConfig.annoConfig, gwConfig.gwLRPJoinIPs) + gwRouter, err := gw.createGWRouter(gwConfig) if err != nil { return err } - if err = gw.createGWRouterPeerPort(nodeName); err != nil { + if gw.netInfo.TopologyType() == types.Layer2Topology && gw.transitRouterInfo != nil { + err = gw.createGWRouterPeerRouterPort() + } else { + err = gw.createGWRouterPeerSwitchPort(nodeName) + } + if err != nil { return err } - gwLRPIPs, err := gw.createGWRouterPort(gwConfig.hostSubnets, gwConfig.gwLRPJoinIPs, enableGatewayMTU, gwRouter) + err = gw.createGWRouterPort(gwConfig, enableGatewayMTU, gwRouter) if err != nil { return err } @@ -890,20 +1005,20 @@ } externalRouterPort := types.GWRouterToExtSwitchPrefix + gw.gwRouterName - if err = gw.updateGWRouterStaticRoutes(gwConfig.clusterSubnets, gwConfig.ovnClusterLRPToJoinIfAddrs, gwConfig.annoConfig, externalRouterPort, - gwRouter); err != nil { + if err = gw.updateGWRouterStaticRoutes(gwConfig, externalRouterPort, gwRouter); err != nil { return err } - if err = gw.updateClusterRouterStaticRoutes(gwConfig.hostSubnets, gwLRPIPs); err != nil { + gwRouterIPs := util.IPNetsToIPs(gwConfig.gwRouterJoinCIDRs) + if err = gw.updateClusterRouterStaticRoutes(gwConfig, gwRouterIPs); err != nil { return err } - if err = gw.syncNATsForGRIPChange(gwConfig.externalIPs, oldExtIPs, gwLRPIPs, gwRouter, oldLogicalRouter); err != nil { + if err = gw.syncNATsForGRIPChange(gwConfig, oldExtIPs, gwRouterIPs, gwRouter, oldLogicalRouter); err != nil { return err } - if err = gw.updateGWRouterNAT(nodeName, gwConfig.clusterSubnets, gwConfig.annoConfig, gwConfig.externalIPs, gwLRPIPs, gwRouter); err != nil { + if err = gw.updateGWRouterNAT(nodeName, gwConfig, gwRouterIPs, gwRouter); err != nil { return err } @@ -921,9 +1036,11 @@ // If the network is advertised: // - For Layer2 topology, the match is the output port of the GR to the join switch and the destination must be a nodeIP in the cluster. // - For Layer3 topology, the match is the destination must be a nodeIP in the cluster. 
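// Aside (illustrative): concrete match strings this function can produce for
// a node "node1" on a hypothetical network "tenantred". "rtoe-" is the
// GR-to-external-switch port prefix; the address-set hash is copied from the
// SNAT example comment elsewhere in this diff and stands in for the node-IP
// address set.
//
//   old layer2, not advertised:
//     outport == "rtoe-GR_tenantred_node1"
//   old layer2, advertised:
//     outport == "rtoe-GR_tenantred_node1" && ip4.dst == $a712973235162149816
//   layer3 or transit-router layer2, advertised:
//     ip4.dst == $a712973235162149816
//   layer3 or transit-router layer2, not advertised:
//     "" (empty match: the SNAT applies unconditionally)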
-func GetNetworkScopedClusterSubnetSNATMatch(nbClient libovsdbclient.Client, netInfo util.NetInfo, nodeName string, isNetworkAdvertised bool, ipFamily utilnet.IPFamily) (string, error) { +func GetNetworkScopedClusterSubnetSNATMatch(nbClient libovsdbclient.Client, netInfo util.NetInfo, nodeName string, + isNetworkAdvertised bool, ipFamily utilnet.IPFamily) (string, error) { + layer2OldTopo := netInfo.TopologyType() == types.Layer2Topology && !config.Layer2UsesTransitRouter if !isNetworkAdvertised { - if netInfo.TopologyType() != types.Layer2Topology { + if !layer2OldTopo { return "", nil } return fmt.Sprintf("outport == %q", types.GWRouterToExtSwitchPrefix+netInfo.GetNetworkScopedGWRouterName(nodeName)), nil @@ -940,7 +1057,7 @@ func GetNetworkScopedClusterSubnetSNATMatch(nbClient libovsdbclient.Client, netI if destinationMatch == "" { return "", fmt.Errorf("could not build a destination based SNAT match because no addressSet %v exists for IP family %v", dbIDs, ipFamily) } - if netInfo.TopologyType() != types.Layer2Topology { + if !layer2OldTopo { return destinationMatch, nil } return fmt.Sprintf("outport == %q && %s", types.GWRouterToExtSwitchPrefix+netInfo.GetNetworkScopedGWRouterName(nodeName), destinationMatch), nil @@ -1197,15 +1314,14 @@ func (gw *GatewayManager) Cleanup() error { // Get the gateway router port's IP address (connected to join switch) var nextHops []net.IP - gwRouterToJoinSwitchPortName := gw.getGWRouterPortName() - portName := gw.getGWRouterPeerPortName() + gwRouterPortName := gw.getGWRouterPortName() - gwIPAddrs, err := libovsdbutil.GetLRPAddrs(gw.nbClient, gwRouterToJoinSwitchPortName) + gwIPAddrs, err := libovsdbutil.GetLRPAddrs(gw.nbClient, gwRouterPortName) if err != nil && !errors.Is(err, libovsdbclient.ErrNotFound) { return fmt.Errorf( "failed to get gateway IPs for network %q from LRP %s: %v", gw.netInfo.GetNetworkName(), - gwRouterToJoinSwitchPortName, + gwRouterPortName, err, ) } @@ -1216,12 +1332,13 @@ func (gw *GatewayManager) Cleanup() error { gw.staticRouteCleanup(nextHops, nil) gw.policyRouteCleanup(nextHops) - // Remove the patch port that connects join switch to gateway router - lsp := nbdb.LogicalSwitchPort{Name: portName} - sw := nbdb.LogicalSwitch{Name: gw.joinSwitchName} - err = libovsdbops.DeleteLogicalSwitchPorts(gw.nbClient, &sw, &lsp) - if err != nil && !errors.Is(err, libovsdbclient.ErrNotFound) { - return fmt.Errorf("failed to delete logical switch port %s from switch %s: %w", portName, sw.Name, err) + if gw.netInfo.TopologyType() == types.Layer2Topology && gw.transitRouterInfo != nil { + err = gw.deleteGWRouterPeerRouterPort() + } else { + err = gw.deleteGWRouterPeerSwitchPort() + } + if err != nil { + return err } // Remove the static mac bindings of the gateway router @@ -1417,7 +1534,7 @@ func (gw *GatewayManager) SyncGateway( if err := pbrMngr.AddSameNodeIPPolicy(node.Name, mgmtIfAddr.IP.String(), l3GatewayConfigIP, relevantHostIPs); err != nil { return fmt.Errorf("failed to configure the policy based routes for network %q: %v", gw.netInfo.GetNetworkName(), err) } - if gw.netInfo.TopologyType() == types.Layer2Topology && config.Gateway.Mode == config.GatewayModeLocal { + if gw.netInfo.TopologyType() == types.Layer2Topology && gw.transitRouterInfo == nil && config.Gateway.Mode == config.GatewayModeLocal { if err := pbrMngr.AddHostCIDRPolicy(node, mgmtIfAddr.IP.String(), subnet.String()); err != nil { return fmt.Errorf("failed to configure the hostCIDR policy for L2 network %q on local gateway: %v", gw.netInfo.GetNetworkName(), err) @@ 
-1434,3 +1551,47 @@ func physNetName(netInfo util.NetInfo) string { } return netInfo.GetNetworkName() } + +func (gw *GatewayManager) setTransitRouterInfo(nodeName string) error { + node, err := gw.watchFactory.GetNode(nodeName) + if err != nil { + return err + } + gw.transitRouterInfo, err = getTransitRouterInfo(gw.netInfo, node) + if err != nil { + return err + } + return nil +} + +// oldLayer2TopoCleanup cleans up the old layer2 topology for the gateway on the node. +// Idempotent, will check if nbdb needs cleanup. +func (gw *GatewayManager) oldLayer2TopoCleanup() error { + // Check if the stale gateway router port exists. + // We delete the GR as the last operation in this cleanup, hence if it doesn't exist, we can skip the cleanup. + gwRouterPort := &nbdb.LogicalRouterPort{ + Name: types.RouterToSwitchPrefix + gw.joinSwitchName, + } + var err error + gwRouterPort, err = libovsdbops.GetLogicalRouterPort(gw.nbClient, gwRouterPort) + if err != nil && errors.Is(err, libovsdbclient.ErrNotFound) { + // cleanup not needed, old port does not exist + return nil + } + + // 1. Delete old port from the switch + if err := gw.deleteGWRouterPeerSwitchPort(); err != nil { + return fmt.Errorf("failed to delete peer switch port %s: %v", gw.getGWRouterPeerSwitchPortName(), err) + } + // 2. Remove the static mac bindings of the gateway router (otherwise you can't delete the router) + err = gateway.DeleteDummyGWMacBindings(gw.nbClient, gw.gwRouterName, gw.netInfo) + if err != nil { + return fmt.Errorf("failed to delete GR dummy mac bindings for node %s: %w", gw.nodeName, err) + } + + // 3. Delete the stale GR; this removes its stale ports, NATs, routes and routing policies + if err := libovsdbops.DeleteLogicalRouter(gw.nbClient, &nbdb.LogicalRouter{Name: gw.gwRouterName}); err != nil { + return fmt.Errorf("failed to delete GR %s: %v", gw.gwRouterName, err) + } + return nil +} diff --git a/go-controller/pkg/ovn/gateway_test.go b/go-controller/pkg/ovn/gateway_test.go index 893d17ad09..2513f0ab84 100644 --- a/go-controller/pkg/ovn/gateway_test.go +++ b/go-controller/pkg/ovn/gateway_test.go @@ -500,7 +500,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterIPSubnets, - gwLRPJoinIPs: joinLRPIPs, + gwRouterJoinCIDRs: joinLRPIPs, hostAddrs: nil, externalIPs: extractExternalIPs(l3GatewayConfig), ovnClusterLRPToJoinIfAddrs: defLRPIPs, @@ -617,7 +617,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { annoConfig: l3GatewayConfig, 
hostSubnets: hostSubnets, clusterSubnets: clusterIPSubnets, - gwLRPJoinIPs: joinLRPIPs, + gwRouterJoinCIDRs: joinLRPIPs, hostAddrs: nil, externalIPs: extractExternalIPs(l3GatewayConfig), ovnClusterLRPToJoinIfAddrs: defLRPIPs, @@ -1007,7 +1007,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterIPSubnets, - gwLRPJoinIPs: joinLRPIPs, + gwRouterJoinCIDRs: joinLRPIPs, hostAddrs: nil, externalIPs: extractExternalIPs(l3GatewayConfig), ovnClusterLRPToJoinIfAddrs: defLRPIPs, @@ -1038,7 +1038,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { ginkgo.By("modifying the node join IP") oldJoinLRPIPs := joinLRPIPs joinLRPIPs = ovntest.MustParseIPNets("100.64.0.99/16") - gwConfig.gwLRPJoinIPs = joinLRPIPs + gwConfig.gwRouterJoinCIDRs = joinLRPIPs expectedOVNClusterRouter.StaticRoutes = []string{} err = newGatewayManager(fakeOvn, nodeName).gatewayInit( nodeName, @@ -1117,7 +1117,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterIPSubnets, - gwLRPJoinIPs: joinLRPIPs, + gwRouterJoinCIDRs: joinLRPIPs, hostAddrs: nil, externalIPs: extractExternalIPs(l3GatewayConfig), ovnClusterLRPToJoinIfAddrs: defLRPIPs, @@ -1197,7 +1197,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterIPSubnets, - gwLRPJoinIPs: joinLRPIPs, + gwRouterJoinCIDRs: joinLRPIPs, hostAddrs: nil, externalIPs: extractExternalIPs(l3GatewayConfig), ovnClusterLRPToJoinIfAddrs: defLRPIPs, @@ -1284,7 +1284,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterIPSubnets, - gwLRPJoinIPs: joinLRPIPs, + gwRouterJoinCIDRs: joinLRPIPs, hostAddrs: nil, externalIPs: extractExternalIPs(l3GatewayConfig), ovnClusterLRPToJoinIfAddrs: defLRPIPs, @@ -1365,7 +1365,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterIPSubnets, - gwLRPJoinIPs: joinLRPIPs, + gwRouterJoinCIDRs: joinLRPIPs, hostAddrs: nil, externalIPs: extractExternalIPs(l3GatewayConfig), ovnClusterLRPToJoinIfAddrs: defLRPIPs, @@ -1479,7 +1479,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterIPSubnets, - gwLRPJoinIPs: joinLRPIPs, + gwRouterJoinCIDRs: joinLRPIPs, hostAddrs: nil, externalIPs: extractExternalIPs(l3GatewayConfig), ovnClusterLRPToJoinIfAddrs: defLRPIPs, @@ -1596,7 +1596,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterIPSubnets, - gwLRPJoinIPs: joinLRPIPs, + gwRouterJoinCIDRs: joinLRPIPs, hostAddrs: nil, externalIPs: extractExternalIPs(l3GatewayConfig), ovnClusterLRPToJoinIfAddrs: defLRPIPs, @@ -1686,7 +1686,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterIPSubnets, - gwLRPJoinIPs: joinLRPIPs, + gwRouterJoinCIDRs: joinLRPIPs, hostAddrs: nil, externalIPs: extractExternalIPs(l3GatewayConfig), ovnClusterLRPToJoinIfAddrs: defLRPIPs, @@ -1805,7 +1805,7 @@ var _ = ginkgo.Describe("Gateway Init Operations", func() { annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterIPSubnets, - gwLRPJoinIPs: joinLRPIPs, + gwRouterJoinCIDRs: joinLRPIPs, 
hostAddrs: nil, externalIPs: extractExternalIPs(l3GatewayConfig), ovnClusterLRPToJoinIfAddrs: defLRPIPs, diff --git a/go-controller/pkg/ovn/layer2_user_defined_network_controller.go b/go-controller/pkg/ovn/layer2_user_defined_network_controller.go index eb7bb05abd..67d163fdfe 100644 --- a/go-controller/pkg/ovn/layer2_user_defined_network_controller.go +++ b/go-controller/pkg/ovn/layer2_user_defined_network_controller.go @@ -2,6 +2,7 @@ package ovn import ( "context" + "errors" "fmt" "net" "reflect" @@ -11,7 +12,11 @@ import ( "time" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" + utilnet "k8s.io/utils/net" + + libovsdbclient "github.com/ovn-kubernetes/libovsdb/client" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/allocator/pod" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config" @@ -26,6 +31,7 @@ import ( svccontroller "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/controller/services" lsm "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/logical_switch_manager" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/routeimport" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/topology" zoneinterconnect "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/ovn/zone_interconnect" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/persistentips" "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/retry" @@ -115,16 +121,19 @@ func (h *layer2UserDefinedNetworkControllerEventHandler) AddResource(obj interfa _, syncMgmtPort := h.oc.mgmtPortFailed.Load(node.Name) _, syncGw := h.oc.gatewaysFailed.Load(node.Name) _, syncReroute := h.oc.syncEIPNodeRerouteFailed.Load(node.Name) + _, syncNodeClusterRouterPort := h.oc.nodeClusterRouterPortFailed.Load(node.Name) nodeParams = &nodeSyncs{ - syncMgmtPort: syncMgmtPort, - syncGw: syncGw, - syncReroute: syncReroute, + syncMgmtPort: syncMgmtPort, + syncGw: syncGw, + syncReroute: syncReroute, + syncClusterRouterPort: syncNodeClusterRouterPort, } } else { nodeParams = &nodeSyncs{ - syncMgmtPort: true, - syncGw: true, - syncReroute: true, + syncMgmtPort: true, + syncGw: true, + syncReroute: true, + syncClusterRouterPort: true, } } return h.oc.addUpdateLocalNodeEvent(node, nodeParams) @@ -181,25 +190,31 @@ func (h *layer2UserDefinedNetworkControllerEventHandler) UpdateResource(oldObj, nodeGatewayMTUSupportChanged(oldNode, newNode) _, syncRerouteFailed := h.oc.syncEIPNodeRerouteFailed.Load(newNode.Name) shouldSyncReroute := syncRerouteFailed || util.NodeHostCIDRsAnnotationChanged(oldNode, newNode) + _, clusterRouterPortFailed := h.oc.nodeClusterRouterPortFailed.Load(newNode.Name) nodeSyncsParam = &nodeSyncs{ - syncMgmtPort: shouldSyncMgmtPort, - syncGw: shouldSyncGW, - syncReroute: shouldSyncReroute, + syncMgmtPort: shouldSyncMgmtPort, + syncGw: shouldSyncGW, + syncReroute: shouldSyncReroute, + syncClusterRouterPort: clusterRouterPortFailed, } } else { klog.Infof("Node %s moved from the remote zone %s to local zone %s.", newNode.Name, util.GetNodeZone(oldNode), util.GetNodeZone(newNode)) // The node is now a local zone node. Trigger a full node sync. 
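// Aside (illustrative, standalone sketch): the per-aspect failure maps
// consulted by these handlers follow a simple retry contract. A condensed
// stdlib-only sketch of that contract:

package main

import (
	"fmt"
	"sync"
)

var gatewaysFailed sync.Map // nodeName -> true while the last sync failed

// needsSync: re-run an aspect if it failed last time or its inputs changed.
func needsSync(nodeName string, inputsChanged bool) bool {
	_, failedBefore := gatewaysFailed.Load(nodeName)
	return failedBefore || inputsChanged
}

// record: store the flag on failure, clear it on success.
func record(nodeName string, err error) {
	if err != nil {
		gatewaysFailed.Store(nodeName, true)
		return
	}
	gatewaysFailed.Delete(nodeName)
}

func main() {
	record("node1", fmt.Errorf("transient")) // first attempt fails
	fmt.Println(needsSync("node1", false))   // true: retried even without input changes
	record("node1", nil)                     // retry succeeds
	fmt.Println(needsSync("node1", false))   // false: nothing left to do
}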
nodeSyncsParam = &nodeSyncs{ - syncMgmtPort: true, - syncGw: true, - syncReroute: true, + syncMgmtPort: true, + syncGw: true, + syncReroute: true, + syncClusterRouterPort: true, } } return h.oc.addUpdateLocalNodeEvent(newNode, nodeSyncsParam) } else { _, syncZoneIC := h.oc.syncZoneICFailed.Load(newNode.Name) + if h.oc.remoteNodesNoRouter.Has(oldNode.Name) && util.UDNLayer2NodeUsesTransitRouter(newNode) { + syncZoneIC = true + } return h.oc.addUpdateRemoteNodeEvent(newNode, syncZoneIC) } case factory.PodType: @@ -266,15 +281,17 @@ type Layer2UserDefinedNetworkController struct { BaseLayer2UserDefinedNetworkController // Node-specific syncMaps used by node event handler - mgmtPortFailed sync.Map - gatewaysFailed sync.Map - syncZoneICFailed sync.Map - syncEIPNodeRerouteFailed sync.Map + mgmtPortFailed sync.Map + gatewaysFailed sync.Map + syncZoneICFailed sync.Map + syncEIPNodeRerouteFailed sync.Map + nodeClusterRouterPortFailed sync.Map // Cluster-wide router default Control Plane Protection (COPP) UUID defaultCOPPUUID string - gatewayManagers sync.Map + gatewayManagers sync.Map + gatewayTopologyFactory *topology.GatewayTopologyFactory // Cluster wide Load_Balancer_Group UUID. // Includes the cluster switch and all node gateway routers. @@ -296,6 +313,8 @@ type Layer2UserDefinedNetworkController struct { // reconcile the virtual machine default gateway sending GARPs and RAs defaultGatewayReconciler *kubevirt.DefaultGatewayReconciler + + remoteNodesNoRouter sets.Set[string] } // NewLayer2UserDefinedNetworkController create a new OVN controller for the given layer2 NAD @@ -352,10 +371,12 @@ func NewLayer2UserDefinedNetworkController( }, }, }, - mgmtPortFailed: sync.Map{}, - syncZoneICFailed: sync.Map{}, - gatewayManagers: sync.Map{}, - eIPController: eIPController, + mgmtPortFailed: sync.Map{}, + syncZoneICFailed: sync.Map{}, + gatewayTopologyFactory: topology.NewGatewayTopologyFactory(cnci.nbClient), + gatewayManagers: sync.Map{}, + eIPController: eIPController, + remoteNodesNoRouter: sets.New[string](), } if config.OVNKubernetesFeature.EnableInterconnect { @@ -464,6 +485,21 @@ func (oc *Layer2UserDefinedNetworkController) Cleanup() error { return true }) + // now delete the cluster router + if config.Layer2UsesTransitRouter { + ops, err := libovsdbops.DeleteLogicalRouterOps(oc.nbClient, nil, + &nbdb.LogicalRouter{ + Name: oc.GetNetworkScopedClusterRouterName(), + }) + if err != nil { + return fmt.Errorf("failed to get ops for deleting routers of network %s: %v", oc.GetNetworkName(), err) + } + _, err = libovsdbops.TransactAndCheck(oc.nbClient, ops) + if err != nil { + return fmt.Errorf("failed to delete routers/switches of network %s: %v", oc.GetNetworkName(), err) + } + } + // remove load balancer groups lbGroups := make([]*nbdb.LoadBalancerGroup, 0, 3) for _, lbGroupUUID := range []string{oc.switchLoadBalancerGroupUUID, oc.clusterLoadBalancerGroupUUID, oc.routerLoadBalancerGroupUUID} { @@ -484,6 +520,15 @@ func (oc *Layer2UserDefinedNetworkController) init() error { } oc.defaultCOPPUUID = defaultCOPPUUID + if config.Layer2UsesTransitRouter && oc.IsPrimaryNetwork() { + if len(oc.GetTunnelKeys()) != 2 { + return fmt.Errorf("layer2 network %s with transit router enabled requires exactly 2 tunnel keys, got: %v", oc.GetNetworkName(), oc.GetTunnelKeys()) + } + if _, err = oc.newTransitRouter(oc.GetTunnelKeys()[1]); err != nil { + return fmt.Errorf("failed to create OVN transit router for network %q: %v", oc.GetNetworkName(), err) + } + } + clusterLBGroupUUID, switchLBGroupUUID, routerLBGroupUUID, 
err := initLoadBalancerGroups(oc.nbClient, oc.GetNetInfo()) if err != nil { return err @@ -581,8 +626,22 @@ func (oc *Layer2UserDefinedNetworkController) newRetryFramework( func (oc *Layer2UserDefinedNetworkController) addUpdateLocalNodeEvent(node *corev1.Node, nSyncs *nodeSyncs) error { var errs []error + var err error + + hostSubnets := make([]*net.IPNet, 0, len(oc.Subnets())) + for _, subnet := range oc.Subnets() { + hostSubnets = append(hostSubnets, subnet.CIDR) + } if util.IsNetworkSegmentationSupportEnabled() && oc.IsPrimaryNetwork() { + if nSyncs.syncClusterRouterPort && config.Layer2UsesTransitRouter { + if err = oc.syncClusterRouterPorts(node, hostSubnets); err != nil { + errs = append(errs, err) + oc.nodeClusterRouterPortFailed.Store(node.Name, true) + } else { + oc.nodeClusterRouterPortFailed.Delete(node.Name) + } + } if nSyncs.syncGw { gwManager := oc.gatewayManagerForNode(node.Name) oc.gatewayManagers.Store(node.Name, gwManager) @@ -599,7 +658,7 @@ func (oc *Layer2UserDefinedNetworkController) addUpdateLocalNodeEvent(node *core return err } isUDNAdvertised := util.IsPodNetworkAdvertisedAtNode(oc, node.Name) - err = oc.addOrUpdateUDNClusterSubnetEgressSNAT(gwConfig.hostSubnets, gwManager.gwRouterName, isUDNAdvertised) + err = oc.addOrUpdateUDNClusterSubnetEgressSNAT(gwConfig.hostSubnets, node.Name, isUDNAdvertised) if err != nil { return err } @@ -624,16 +683,12 @@ func (oc *Layer2UserDefinedNetworkController) addUpdateLocalNodeEvent(node *core } if nSyncs.syncMgmtPort { - // Layer 2 networks have a single, large subnet, that's the one - // associated to the controller. Take the management port IP from - // there. - subnets := oc.Subnets() - hostSubnets := make([]*net.IPNet, 0, len(subnets)) - for _, subnet := range oc.Subnets() { - hostSubnets = append(hostSubnets, subnet.CIDR) + routerName := oc.GetNetworkScopedClusterRouterName() + if !config.Layer2UsesTransitRouter { + routerName = oc.GetNetworkScopedGWRouterName(node.Name) } if _, err := oc.syncNodeManagementPort(node, oc.GetNetworkScopedSwitchName(types.OVNLayer2Switch), - oc.GetNetworkScopedGWRouterName(node.Name), hostSubnets); err != nil { + routerName, hostSubnets); err != nil { errs = append(errs, err) oc.mgmtPortFailed.Store(node.Name, true) } else { @@ -661,7 +716,7 @@ func (oc *Layer2UserDefinedNetworkController) addUpdateLocalNodeEvent(node *core errs = append(errs, oc.BaseLayer2UserDefinedNetworkController.addUpdateLocalNodeEvent(node)) - err := utilerrors.Join(errs...) + err = utilerrors.Join(errs...) 
if err != nil { oc.recordNodeErrorEvent(node, err) } @@ -673,7 +728,11 @@ func (oc *Layer2UserDefinedNetworkController) addUpdateRemoteNodeEvent(node *cor if util.IsNetworkSegmentationSupportEnabled() && oc.IsPrimaryNetwork() { if syncZoneIC && config.OVNKubernetesFeature.EnableInterconnect { - if err := oc.addPortForRemoteNodeGR(node); err != nil { + portUpdateFn := oc.addRouterSetupForRemoteNodeGR + if !config.Layer2UsesTransitRouter { + portUpdateFn = oc.addSwitchPortForRemoteNodeGR + } + if err := portUpdateFn(node); err != nil { err = fmt.Errorf("failed to add the remote zone node %s's remote LRP, %w", node.Name, err) errs = append(errs, err) oc.syncZoneICFailed.Store(node.Name, true) @@ -692,7 +751,7 @@ func (oc *Layer2UserDefinedNetworkController) addUpdateRemoteNodeEvent(node *cor return err } -func (oc *Layer2UserDefinedNetworkController) addPortForRemoteNodeGR(node *corev1.Node) error { +func (oc *Layer2UserDefinedNetworkController) addSwitchPortForRemoteNodeGR(node *corev1.Node) error { nodeJoinSubnetIPs, err := udn.GetGWRouterIPs(node, oc.GetNetInfo()) if err != nil { if util.IsAnnotationNotSetError(err) { @@ -745,7 +804,135 @@ func (oc *Layer2UserDefinedNetworkController) addPortForRemoteNodeGR(node *corev return nil } +func (oc *Layer2UserDefinedNetworkController) cleanupSwitchPortForRemoteNodeGR(nodeName string) error { + logicalSwitchPort := &nbdb.LogicalSwitchPort{ + Name: types.SwitchToRouterPrefix + oc.GetNetworkScopedSwitchName(types.OVNLayer2Switch) + "_" + nodeName, + } + sw := &nbdb.LogicalSwitch{Name: oc.GetNetworkScopedSwitchName(types.OVNLayer2Switch)} + return libovsdbops.DeleteLogicalSwitchPorts(oc.nbClient, sw, logicalSwitchPort) +} + +func (oc *Layer2UserDefinedNetworkController) addRouterSetupForRemoteNodeGR(node *corev1.Node) error { + if oc.remoteNodesNoRouter.Has(node.Name) { + // remote node uses old topology + if util.UDNLayer2NodeUsesTransitRouter(node) { + // node has just been upgraded + // upgrade remote node connection + // delete old switch port + if err := oc.cleanupSwitchPortForRemoteNodeGR(node.Name); err != nil { + return fmt.Errorf("failed to cleanup port for remote node %s: %v", node.Name, err) + } + if err := oc.eIPController.updateNodeNextHop(oc.GetNetInfo(), node); err != nil { + return fmt.Errorf("failed to ensure EgressIP switch policies for network %s: %v", oc.GetNetworkName(), err) + } + oc.remoteNodesNoRouter.Delete(node.Name) + } else { + // node is still using old topology + if err := oc.addSwitchPortForRemoteNodeGR(node); err != nil { + return err + } + gwRouterJoinIPs, err := udn.GetGWRouterIPs(node, oc.GetNetInfo()) + if err != nil { + return err + } + // create joinIP-via-joinIP routes to send traffic via the switch port + return oc.addTransitRouterRoutes(node, gwRouterJoinIPs) + } + } + transitRouterInfo, err := getTransitRouterInfo(oc.GetNetInfo(), node) + if err != nil { + return err + } + transitPort := nbdb.LogicalRouterPort{ + Name: types.TransitRouterToRouterPrefix + oc.GetNetworkScopedGWRouterName(node.Name), + MAC: util.IPAddrToHWAddr(transitRouterInfo.transitRouterNets[0].IP).String(), + Networks: util.IPNetsToStringSlice(transitRouterInfo.transitRouterNets), + Options: map[string]string{ + libovsdbops.RequestedTnlKey: strconv.Itoa(transitRouterInfo.nodeID), + libovsdbops.RequestedChassis: node.Name, + }, + ExternalIDs: map[string]string{ + types.NetworkExternalID: oc.GetNetworkName(), + types.TopologyExternalID: oc.TopologyType(), + types.NodeExternalID: node.Name, + }, + } + transitRouter := nbdb.LogicalRouter{Name: 
oc.GetNetworkScopedClusterRouterName()} + if err := libovsdbops.CreateOrUpdateLogicalRouterPort(oc.nbClient, &transitRouter, + &transitPort, nil, &transitPort.MAC, &transitPort.Networks, + &transitPort.Options, &transitPort.ExternalIDs); err != nil { + return fmt.Errorf("failed to create remote port %+v on router %+v: %v", transitPort, transitRouter, err) + } + return oc.addTransitRouterRoutes(node, transitRouterInfo.gatewayRouterNets) +} + +func (oc *Layer2UserDefinedNetworkController) addTransitRouterRoutes(node *corev1.Node, nextHops []*net.IPNet) error { + gwRouterJoinIPs, err := udn.GetGWRouterIPs(node, oc.GetNetInfo()) + if err != nil { + return err + } + for _, nextHop := range nextHops { + gwRouterJoinIP, err := util.MatchFirstIPNetFamily(utilnet.IsIPv6CIDR(nextHop), gwRouterJoinIPs) + if err != nil { + return fmt.Errorf("failed to add remote node join ip based "+ + "routes in distributed router %s: %v", + oc.GetNetworkScopedClusterRouterName(), err) + } + lrsr := nbdb.LogicalRouterStaticRoute{ + ExternalIDs: map[string]string{ + types.NodeExternalID: node.Name, + types.NetworkExternalID: oc.GetNetworkName(), + types.TopologyExternalID: oc.TopologyType(), + }, + IPPrefix: gwRouterJoinIP.IP.String(), + Nexthop: nextHop.IP.String(), + } + p := func(item *nbdb.LogicalRouterStaticRoute) bool { + return item.IPPrefix == lrsr.IPPrefix && + libovsdbops.PolicyEqualPredicate(lrsr.Policy, item.Policy) + } + + if err := libovsdbops.CreateOrReplaceLogicalRouterStaticRouteWithPredicate(oc.nbClient, + oc.GetNetworkScopedClusterRouterName(), &lrsr, p, &lrsr.Nexthop); err != nil { + return fmt.Errorf("error creating static route %+v in %s: %v", lrsr, oc.GetNetworkScopedClusterRouterName(), err) + } + } + return nil +} + +func (oc *Layer2UserDefinedNetworkController) cleanupRouterSetupForRemoteNodeGR(nodeName string) error { + transitPort := &nbdb.LogicalRouterPort{ + Name: types.TransitRouterToRouterPrefix + oc.GetNetworkScopedGWRouterName(nodeName), + } + var err error + transitPort, err = libovsdbops.GetLogicalRouterPort(oc.nbClient, transitPort) + if err != nil { + // logical router port doesn't exist. So nothing to cleanup. + return nil + } + + transitRouter := nbdb.LogicalRouter{ + Name: oc.GetNetworkScopedClusterRouterName(), + } + + if err = libovsdbops.DeleteLogicalRouterPorts(oc.nbClient, &transitRouter, transitPort); err != nil { + return fmt.Errorf("failed to delete logical router port %s from router %s for the node %s, error: %w", + transitPort.Name, transitRouter.Name, nodeName, err) + } + + // Delete any static routes in the transit router for this node. + p := func(lrsr *nbdb.LogicalRouterStaticRoute) bool { + return lrsr.ExternalIDs[types.NetworkExternalID] == oc.GetNetworkName() && lrsr.ExternalIDs[types.NodeExternalID] == nodeName + } + if err := libovsdbops.DeleteLogicalRouterStaticRoutesWithPredicate(oc.nbClient, oc.GetNetworkScopedClusterRouterName(), p); err != nil { + return fmt.Errorf("failed to cleanup static routes for the node %s: %w", nodeName, err) + } + + return nil +} + func (oc *Layer2UserDefinedNetworkController) deleteNodeEvent(node *corev1.Node) error { + // GatewayManager only exists for local nodes. 
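	// Aside (illustrative): for a remote node, teardown mirrors
	// addRouterSetupForRemoteNodeGR above; two kinds of NB objects go away:
	//
	//   - the LogicalRouterPort "trtor-GR_<net>_<node>" on the transit router
	//     ("trtor-" is an assumed value for types.TransitRouterToRouterPrefix)
	//   - any LogicalRouterStaticRoute whose ExternalIDs carry this network's
	//     types.NetworkExternalID and the node's types.NodeExternalID
	//
	// cleanupRouterSetupForRemoteNodeGR above performs exactly these two steps.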
if err := oc.gatewayManagerForNode(node.Name).Cleanup(); err != nil { return fmt.Errorf("failed to cleanup gateway on node %q: %w", node.Name, err) } @@ -753,6 +940,14 @@ func (oc *Layer2UserDefinedNetworkController) deleteNodeEvent(node *corev1.Node) oc.localZoneNodes.Delete(node.Name) oc.mgmtPortFailed.Delete(node.Name) oc.syncEIPNodeRerouteFailed.Delete(node.Name) + + if config.Layer2UsesTransitRouter { + // this is a no-op for local nodes + if err := oc.cleanupRouterSetupForRemoteNodeGR(node.Name); err != nil { + return fmt.Errorf("failed to cleanup remote node %q gateway: %w", node.Name, err) + } + oc.syncZoneICFailed.Delete(node.Name) + } return nil } @@ -770,8 +965,14 @@ func (oc *Layer2UserDefinedNetworkController) deleteNodeEvent(node *corev1.Node) // If isUDNAdvertised is true, then we want to SNAT all packets that are coming from pods on this network // leaving towards nodeIPs on the cluster to masqueradeIP. If network is advertise then the SNAT looks like this: // "eth.dst == 0a:58:5d:5d:00:02 && (ip4.dst == $a712973235162149816)" "169.254.0.36" "93.93.0.0/16" -func (oc *Layer2UserDefinedNetworkController) addOrUpdateUDNClusterSubnetEgressSNAT(localPodSubnets []*net.IPNet, gwRouterName string, isUDNAdvertised bool) error { - outputPort := types.GWRouterToJoinSwitchPrefix + gwRouterName +func (oc *Layer2UserDefinedNetworkController) addOrUpdateUDNClusterSubnetEgressSNAT(localPodSubnets []*net.IPNet, + nodeName string, isUDNAdvertised bool) error { + outputPort := oc.getCRToSwitchPortName(oc.GetNetworkScopedSwitchName("")) + routerName := oc.GetNetworkScopedClusterRouterName() + if !config.Layer2UsesTransitRouter { + routerName = oc.GetNetworkScopedGWRouterName(nodeName) + outputPort = types.GWRouterToJoinSwitchPrefix + routerName + } nats, err := oc.buildUDNEgressSNAT(localPodSubnets, outputPort, isUDNAdvertised) if err != nil { return err @@ -779,12 +980,12 @@ func (oc *Layer2UserDefinedNetworkController) addOrUpdateUDNClusterSubnetEgressS if len(nats) == 0 { return nil // nothing to do } - gwRouter := &nbdb.LogicalRouter{ - Name: gwRouterName, + router := &nbdb.LogicalRouter{ + Name: routerName, } - if err := libovsdbops.CreateOrUpdateNATs(oc.nbClient, gwRouter, nats...); err != nil { + if err := libovsdbops.CreateOrUpdateNATs(oc.nbClient, router, nats...); err != nil { return fmt.Errorf("failed to update SNAT for cluster on router: %q for network %q, error: %w", - gwRouterName, oc.GetNetworkName(), err) + routerName, oc.GetNetworkName(), err) } return nil } @@ -822,7 +1023,7 @@ func (oc *Layer2UserDefinedNetworkController) nodeGatewayConfig(node *corev1.Nod // at layer2 the GR LRP should be different per node same we do for layer3 // since they should not collide at the distributed switch later on - gwLRPJoinIPs, err := udn.GetGWRouterIPs(node, oc.GetNetInfo()) + gwRouterJoinCIDRs, err := udn.GetGWRouterIPs(node, oc.GetNetInfo()) if err != nil { return nil, fmt.Errorf("failed composing LRP addresses for layer2 network %s: %w", oc.GetNetworkName(), err) } @@ -833,13 +1034,20 @@ func (oc *Layer2UserDefinedNetworkController) nodeGatewayConfig(node *corev1.Nod annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: hostSubnets, - gwLRPJoinIPs: gwLRPJoinIPs, + gwRouterJoinCIDRs: gwRouterJoinCIDRs, hostAddrs: nil, externalIPs: externalIPs, ovnClusterLRPToJoinIfAddrs: nil, }, nil } +func (oc *Layer2UserDefinedNetworkController) newTransitRouter(tunnelKey int) (*nbdb.LogicalRouter, error) { + return oc.gatewayTopologyFactory.NewTransitRouter( + oc.GetNetInfo(), + 
oc.defaultCOPPUUID, strconv.Itoa(tunnelKey), + ) +} + func (oc *Layer2UserDefinedNetworkController) newGatewayManager(nodeName string) *GatewayManager { return NewGatewayManagerForLayer2Topology( nodeName, @@ -848,6 +1056,7 @@ func (oc *Layer2UserDefinedNetworkController) newGatewayManager(nodeName string) oc.nbClient, oc.GetNetInfo(), oc.watchFactory, + config.Layer2UsesTransitRouter, oc.gatewayOptions()..., ) } @@ -927,3 +1136,242 @@ func (oc *Layer2UserDefinedNetworkController) reconcileLiveMigrationTargetZone(k } return nil } + +// syncClusterRouterPorts connects the network switch to the transit router +func (oc *Layer2UserDefinedNetworkController) syncClusterRouterPorts(node *corev1.Node, hostSubnets []*net.IPNet) error { + switchName := oc.GetNetworkScopedSwitchName("") + + // Connect the switch to the router. + logicalSwitchPort := nbdb.LogicalSwitchPort{ + Name: types.SwitchToTransitRouterPrefix + switchName, + Type: "router", + Addresses: []string{"router"}, + Options: map[string]string{ + libovsdbops.RouterPort: types.TransitRouterToSwitchPrefix + switchName, + }, + ExternalIDs: map[string]string{ + types.NetworkExternalID: oc.GetNetworkName(), + types.TopologyExternalID: oc.TopologyType(), + }, + } + sw := nbdb.LogicalSwitch{Name: switchName} + err := libovsdbops.CreateOrUpdateLogicalSwitchPortsOnSwitch(oc.nbClient, &sw, &logicalSwitchPort) + if err != nil { + klog.Errorf("Failed to add logical port %+v to switch %s: %v", logicalSwitchPort, switchName, err) + return err + } + + if err = oc.syncNodeClusterRouterPort(node, hostSubnets); err != nil { + return err + } + + if len(oc.remoteNodesNoRouter) > 0 { + // now add upgrade-only connection using IP-less port + if err = oc.ensureUpgradeTopology(node); err != nil { + return fmt.Errorf("failed to ensure upgrade topology for node %s: %w", node.Name, err) + } + } else { + // cleanup upgrade topology if it exists + if err = oc.cleanupUpgradeTopology(); err != nil { + return fmt.Errorf("failed to cleanup upgrade topology for network %s: %w", oc.GetNetworkName(), err) + } + } + return nil +} + +func (oc *Layer2UserDefinedNetworkController) ensureUpgradeTopology(node *corev1.Node) error { + switchName := oc.GetNetworkScopedSwitchName("") + sw := nbdb.LogicalSwitch{Name: switchName} + + // create switch to router connection with GR MAC and dummy join IPs + upgradeRouterPortName := types.TransitRouterToSwitchPrefix + switchName + "-upgrade" + // create switch port + upgradeSwitchPort := nbdb.LogicalSwitchPort{ + Name: types.SwitchToTransitRouterPrefix + switchName + "-upgrade", + Type: "router", + Addresses: []string{"router"}, + Options: map[string]string{ + libovsdbops.RouterPort: upgradeRouterPortName, + }, + ExternalIDs: map[string]string{ + types.NetworkExternalID: oc.GetNetworkName(), + types.TopologyExternalID: oc.TopologyType(), + }, + } + tunnelID, err := util.ParseUDNLayer2NodeGRLRPTunnelIDs(node, oc.GetNetworkName()) + if err != nil { + if util.IsAnnotationNotSetError(err) { + // wait for the annotation to be assigned + return types.NewSuppressedError(err) + } + return fmt.Errorf("failed to fetch tunnelID annotation from the node %s for network %s, err: %w", + node.Name, oc.GetNetworkName(), err) + } + upgradeSwitchPort.Options[libovsdbops.RequestedTnlKey] = strconv.Itoa(tunnelID) + + err = libovsdbops.CreateOrUpdateLogicalSwitchPortsOnSwitch(oc.nbClient, &sw, &upgradeSwitchPort) + if err != nil { + klog.Errorf("Failed to add logical port %+v to switch %s: %v", upgradeSwitchPort, switchName, err) + return err + } + // create 
+	// create the router port
+	// find the GW MAC
+	gwRouterJoinNets, err := udn.GetGWRouterIPs(node, oc.GetNetInfo())
+	if err != nil {
+		return fmt.Errorf("failed composing LRP addresses for layer2 network %s: %w", oc.GetNetworkName(), err)
+	}
+	// add fake joinIPs
+	fakeJoinIPs := udn.GetLastIPsFromJoinSubnet(oc.GetNetInfo())
+
+	gwLRPMAC := util.IPAddrToHWAddr(gwRouterJoinNets[0].IP)
+	logicalRouterPort := nbdb.LogicalRouterPort{
+		Name:     upgradeRouterPortName,
+		MAC:      gwLRPMAC.String(),
+		Networks: util.IPNetsToStringSlice(fakeJoinIPs),
+	}
+	logicalRouter := nbdb.LogicalRouter{Name: oc.GetNetworkScopedClusterRouterName()}
+
+	err = libovsdbops.CreateOrUpdateLogicalRouterPort(oc.nbClient, &logicalRouter, &logicalRouterPort,
+		nil, &logicalRouterPort.MAC, &logicalRouterPort.Networks, &logicalRouterPort.Options)
+	if err != nil {
+		return fmt.Errorf("failed to add logical router port %s, error: %v", upgradeRouterPortName, err)
+	}
+
+	// now add the masq subnet to the router port; this ensures that only one port responds to the
+	// ARP/NDP requests for the masq IPs
+	lrpName := oc.getCRToSwitchPortName(switchName)
+	trRouterPort, err := libovsdbops.GetLogicalRouterPort(oc.nbClient, &nbdb.LogicalRouterPort{Name: lrpName})
+	if err != nil {
+		return fmt.Errorf("failed to get logical router port %s: %w", lrpName, err)
+	}
+	masqSubnets, err := udn.GetUDNMgmtPortMasqueradeIPs(oc.GetNetworkID())
+	if err != nil {
+		return fmt.Errorf("failed to get masquerade IPs, network %s (%d): %w", oc.GetNetworkName(), oc.GetNetworkID(), err)
+	}
+
+	existingNetworkSet := sets.New[string](trRouterPort.Networks...)
+	newNetworksSet := sets.New[string](util.IPNetsToStringSlice(masqSubnets)...)
+	// Only add masq IPs if they are not already present
+	if existingNetworkSet.IsSuperset(newNetworksSet) {
+		return nil
+	}
+	trRouterPort.Networks = append(trRouterPort.Networks, newNetworksSet.UnsortedList()...)
+	err = libovsdbops.CreateOrUpdateLogicalRouterPort(oc.nbClient, &logicalRouter, trRouterPort, nil, &trRouterPort.Networks)
+	if err != nil {
+		return fmt.Errorf("failed to update logical router port %s with masq IPs: %w", lrpName, err)
+	}
+	return nil
+}
+
+func (oc *Layer2UserDefinedNetworkController) cleanupUpgradeTopology() error {
+	// 1. Delete the switch-to-router connection with the GR MAC and dummy join IPs
+	switchName := oc.GetNetworkScopedSwitchName("")
+	sw := nbdb.LogicalSwitch{Name: switchName}
+	logicalRouter := nbdb.LogicalRouter{Name: oc.GetNetworkScopedClusterRouterName()}
+
+	upgradeRouterPortName := types.TransitRouterToSwitchPrefix + switchName + "-upgrade"
+	upgradeSwitchPortName := types.SwitchToTransitRouterPrefix + switchName + "-upgrade"
+	if err := libovsdbops.DeleteLogicalSwitchPorts(oc.nbClient, &sw, &nbdb.LogicalSwitchPort{Name: upgradeSwitchPortName}); err != nil {
+		return fmt.Errorf("failed to delete logical switch port %s: %w", upgradeSwitchPortName, err)
+	}
+	if err := libovsdbops.DeleteLogicalRouterPorts(oc.nbClient, &logicalRouter, &nbdb.LogicalRouterPort{Name: upgradeRouterPortName}); err != nil {
+		return fmt.Errorf("failed to delete logical router port %s: %w", upgradeRouterPortName, err)
+	}
+	// 2. Delete the masq IPs from the router port, as they are no longer needed
+	lrpName := oc.getCRToSwitchPortName(switchName)
+	masqSubnets, err := udn.GetUDNMgmtPortMasqueradeIPs(oc.GetNetworkID())
+	if err != nil {
+		return fmt.Errorf("failed to get masquerade IPs, network %s (%d): %w", oc.GetNetworkName(), oc.GetNetworkID(), err)
+	}
+	trRouterPort, err := libovsdbops.GetLogicalRouterPort(oc.nbClient, &nbdb.LogicalRouterPort{Name: lrpName})
+	if err != nil {
+		return fmt.Errorf("failed to get logical router port %s: %w", lrpName, err)
+	}
+	updatedNetworks := sets.New(trRouterPort.Networks...)
+	staleNetworksSet := sets.New[string](util.IPNetsToStringSlice(masqSubnets)...)
+	if updatedNetworks.Intersection(staleNetworksSet).Len() == 0 {
+		// No masq IPs to remove, nothing to do
+		return nil
+	}
+	for network := range staleNetworksSet {
+		updatedNetworks.Delete(network)
+	}
+	trRouterPort.Networks = updatedNetworks.UnsortedList()
+	err = libovsdbops.CreateOrUpdateLogicalRouterPort(oc.nbClient, &logicalRouter, trRouterPort, nil, &trRouterPort.Networks)
+	if err != nil {
+		return fmt.Errorf("failed to update logical router port %s with masq IPs: %w", lrpName, err)
+	}
+	return nil
+}
+
+// syncNodes finds nodes that still have an LRP on the transit router even though the node no longer exists,
+// and cleans them up.
+// TODO add tests
+func (oc *Layer2UserDefinedNetworkController) syncNodes(nodes []interface{}) error {
+	if err := oc.BaseLayer2UserDefinedNetworkController.syncNodes(nodes); err != nil {
+		return err
+	}
+	foundNodeNames := sets.New[string]()
+	foundNodes := make([]*corev1.Node, len(nodes))
+	for i, obj := range nodes {
+		node, ok := obj.(*corev1.Node)
+		if !ok {
+			return fmt.Errorf("spurious object in syncNodes: %v", obj)
+		}
+		foundNodeNames.Insert(node.Name)
+		foundNodes[i] = node
+	}
+	oc.setRemoteNodesNoRouter(foundNodes)
+	// Get the transit router. If it's not present, there is no cleanup to do.
+	tr := &nbdb.LogicalRouter{
+		Name: oc.GetNetworkScopedClusterRouterName(),
+	}
+
+	tr, err := libovsdbops.GetLogicalRouter(oc.nbClient, tr)
+	if err != nil {
+		if errors.Is(err, libovsdbclient.ErrNotFound) {
+			return nil
+		}
+		return err
+	}
+
+	staleNodeNames := []string{}
+	for _, p := range tr.Ports {
+		lp := &nbdb.LogicalRouterPort{
+			UUID: p,
+		}
+
+		lp, err = libovsdbops.GetLogicalRouterPort(oc.nbClient, lp)
+		if err != nil {
+			continue
+		}
+
+		if lp.ExternalIDs == nil {
+			continue
+		}
+
+		lportNode := lp.ExternalIDs[types.NodeExternalID]
+		if !foundNodeNames.Has(lportNode) {
+			staleNodeNames = append(staleNodeNames, lportNode)
+		}
+	}
+
+	for _, staleNodeName := range staleNodeNames {
+		if err = oc.cleanupRouterSetupForRemoteNodeGR(staleNodeName); err != nil {
+			klog.Errorf("Failed to cleanup the transit router resources from OVN Northbound db for the stale node %s: %v", staleNodeName, err)
+		}
+	}
+	return nil
+}
+
+// setRemoteNodesNoRouter finds remote nodes that do not use the transit router.
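+// While remoteNodesNoRouter is non-empty, syncClusterRouterPorts keeps the
+// upgrade topology in place (see ensureUpgradeTopology above); once every
+// remote node has moved to the transit router, it is torn down again.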
+func (oc *Layer2UserDefinedNetworkController) setRemoteNodesNoRouter(nodes []*corev1.Node) { + for _, node := range nodes { + if oc.isLocalZoneNode(node) { + continue + } + if !util.UDNLayer2NodeUsesTransitRouter(node) { + oc.remoteNodesNoRouter.Insert(node.Name) + } + } +} diff --git a/go-controller/pkg/ovn/layer2_user_defined_network_controller_test.go b/go-controller/pkg/ovn/layer2_user_defined_network_controller_test.go index 352ef1497f..6bd0c41373 100644 --- a/go-controller/pkg/ovn/layer2_user_defined_network_controller_test.go +++ b/go-controller/pkg/ovn/layer2_user_defined_network_controller_test.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "net" - "strconv" "time" ipamclaimsapi "github.com/k8snetworkplumbingwg/ipamclaims/pkg/crd/ipamclaims/v1alpha1" @@ -370,7 +369,6 @@ var _ = Describe("OVN Multi-Homed pod operations for layer 2 network", func() { *netConf, ) Expect(err).NotTo(HaveOccurred()) - nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID} const nodeIPv4CIDR = "192.168.126.202/24" testNode, err := newNodeWithUserDefinedNetworks(nodeName, nodeIPv4CIDR) @@ -388,7 +386,10 @@ var _ = Describe("OVN Multi-Homed pod operations for layer 2 network", func() { Expect(err).NotTo(HaveOccurred()) initialDB.NBData = append( initialDB.NBData, - expectedLayer2EgressEntities(networkConfig, *gwConfig, nodeName)...) + expectedGWEntitiesLayer2(nodeName, networkConfig, *gwConfig)...) + initialDB.NBData = append( + initialDB.NBData, + expectedLayer2EgressEntities(networkConfig, *gwConfig, networkConfig.Subnets()[0].CIDR)...) } initialDB.NBData = append(initialDB.NBData, nbZone) @@ -542,112 +543,118 @@ func dummyL2TestPod(nsName string, info userDefinedNetInfo, podIdx, udnNetIdx in return pod } -func expectedLayer2EgressEntities(netInfo util.NetInfo, gwConfig util.L3GatewayConfig, nodeName string) []libovsdbtest.TestData { - const ( - nat1 = "nat1-UUID" - nat2 = "nat2-UUID" - nat3 = "nat3-UUID" - perPodSNAT = "pod-snat-UUID" - sr1 = "sr1-UUID" - sr2 = "sr2-UUID" - lrsr1 = "lrsr1-UUID" - routerPolicyUUID1 = "lrp1-UUID" - hostCIDRPolicyUUID = "host-cidr-policy-UUID" - masqSNATUUID1 = "masq-snat1-UUID" - ) - gwRouterName := fmt.Sprintf("GR_%s_test-node", netInfo.GetNetworkName()) - staticRouteOutputPort := ovntypes.GWRouterToExtSwitchPrefix + gwRouterName - gwRouterToNetworkSwitchPortName := ovntypes.RouterToSwitchPrefix + netInfo.GetNetworkScopedName(ovntypes.OVNLayer2Switch) - gwRouterToExtSwitchPortName := fmt.Sprintf("%s%s", ovntypes.GWRouterToExtSwitchPrefix, gwRouterName) - masqSNAT := newMasqueradeManagementNATEntry(masqSNATUUID1, netInfo) - - var nat []string - nat = append(nat, nat1, nat2, nat3, masqSNATUUID1) - gr := &nbdb.LogicalRouter{ - Name: gwRouterName, - UUID: gwRouterName + "-UUID", - Nat: nat, - Ports: []string{gwRouterToNetworkSwitchPortName + "-UUID", gwRouterToExtSwitchPortName + "-UUID"}, - StaticRoutes: []string{sr1, sr2}, - ExternalIDs: gwRouterExternalIDs(netInfo, gwConfig), - Options: gwRouterOptions(gwConfig), - Policies: []string{routerPolicyUUID1}, - } - gr.Options["lb_force_snat_ip"] = gwRouterJoinIPAddress().IP.String() - expectedEntities := []libovsdbtest.TestData{ - gr, - expectedGWToNetworkSwitchRouterPort(gwRouterToNetworkSwitchPortName, netInfo, gwRouterJoinIPAddress(), layer2SubnetGWAddr()), - expectedGRStaticRoute(sr1, dummyMasqueradeSubnet().String(), nextHopMasqueradeIP().String(), nil, &staticRouteOutputPort, netInfo), - expectedGRStaticRoute(sr2, ipv4DefaultRoute().String(), nodeGateway().IP.String(), nil, 
&staticRouteOutputPort, netInfo), - expectedGRToExternalSwitchLRP(gwRouterName, netInfo, nodePhysicalIPAddress(), udnGWSNATAddress()), - masqSNAT, - expectedLogicalRouterPolicy(routerPolicyUUID1, netInfo, nodeName, nodeIP().IP.String(), managementPortIP(layer2Subnet()).String()), - } +func getTestTransitRouterInfo(netInfo util.NetInfo) *transitRouterInfo { + transitRouterInfo, err := getTransitRouterInfo(netInfo, &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + // this is hardcoded in newNodeWithSecondaryNets + ovnNodeID: "4", + }, + }, + }) + Expect(err).NotTo(HaveOccurred()) + return transitRouterInfo +} +func expectedGWEntitiesLayer2(nodeName string, netInfo util.NetInfo, gwConfig util.L3GatewayConfig) []libovsdbtest.TestData { + gwRouterName := fmt.Sprintf("GR_%s_%s", netInfo.GetNetworkName(), nodeName) + trInfo := getTestTransitRouterInfo(netInfo) + expectedEntities := append( + expectedGWRouterPlusNATAndStaticRoutes(nodeName, gwRouterName, netInfo, gwConfig), + expectedGRToTransitRouterLRPLayer2(gwRouterName, gwRouterJoinIPAddress(), netInfo, trInfo), + expectedGRToExternalSwitchLRP(gwRouterName, netInfo, nodePhysicalIPAddress(), udnGWSNATAddress()), + ) expectedEntities = append(expectedEntities, expectedStaticMACBindings(gwRouterName, staticMACBindingIPs())...) - - if config.Gateway.Mode == config.GatewayModeLocal { - l2LGWLRP := expectedLogicalRouterPolicy(hostCIDRPolicyUUID, netInfo, nodeName, nodeCIDR().String(), managementPortIP(layer2Subnet()).String()) - l2LGWLRP.Match = fmt.Sprintf(`ip4.dst == %s && ip4.src == %s`, nodeCIDR().String(), layer2Subnet().String()) - l2LGWLRP.Priority, _ = strconv.Atoi(ovntypes.UDNHostCIDRPolicyPriority) - expectedEntities = append(expectedEntities, l2LGWLRP) - gr.Policies = append(gr.Policies, hostCIDRPolicyUUID) - lrsr := expectedGRStaticRoute(lrsr1, layer2Subnet().String(), managementPortIP(layer2Subnet()).String(), - &nbdb.LogicalRouterStaticRoutePolicySrcIP, nil, netInfo) - expectedEntities = append(expectedEntities, lrsr) - gr.StaticRoutes = append(gr.StaticRoutes, lrsr1) - } - expectedEntities = append(expectedEntities, expectedExternalSwitchAndLSPs(netInfo, gwConfig, nodeName)...) - expectedEntities = append(expectedEntities, newNATEntry(nat1, dummyMasqueradeIP().IP.String(), gwRouterJoinIPAddress().IP.String(), standardNonDefaultNetworkExtIDs(netInfo), "")) - expectedEntities = append(expectedEntities, newNATEntry(nat2, dummyMasqueradeIP().IP.String(), layer2Subnet().String(), standardNonDefaultNetworkExtIDs(netInfo), fmt.Sprintf("outport == %q", gwRouterToExtSwitchPortName))) - expectedEntities = append(expectedEntities, newNATEntry(nat3, dummyMasqueradeIP().IP.String(), layer2SubnetGWAddr().IP.String(), standardNonDefaultNetworkExtIDs(netInfo), "")) return expectedEntities } -func expectedGWToNetworkSwitchRouterPort(name string, netInfo util.NetInfo, networks ...*net.IPNet) *nbdb.LogicalRouterPort { +func expectedGRToTransitRouterLRPLayer2(gatewayRouterName string, gwRouterLRPIP *net.IPNet, netInfo util.NetInfo, + transitRouterInfo *transitRouterInfo) *nbdb.LogicalRouterPort { + lrpName := fmt.Sprintf("%s%s", ovntypes.RouterToTransitRouterPrefix, gatewayRouterName) options := map[string]string{libovsdbops.GatewayMTU: fmt.Sprintf("%d", 1400)} - lrp := expectedLogicalRouterPort(name, netInfo, options, networks...) 
- - if config.IPv6Mode { - lrp.Ipv6RaConfigs = map[string]string{ - "address_mode": "dhcpv6_stateful", - "mtu": "1400", - "send_periodic": "true", - "max_interval": "900", - "min_interval": "300", - "router_preference": "LOW", - } - } - return lrp -} -func layer2Subnet() *net.IPNet { - return &net.IPNet{ - IP: net.ParseIP("100.200.0.0"), - Mask: net.CIDRMask(16, 32), + var ips []string + ips = append(ips, gwRouterLRPIP.String()) + ips = append(ips, transitRouterInfo.gatewayRouterNets[0].String()) + mac := util.IPAddrToHWAddr(gwRouterLRPIP.IP).String() + return &nbdb.LogicalRouterPort{ + UUID: lrpName + "-UUID", + Name: lrpName, + Networks: ips, + MAC: mac, + Options: options, + ExternalIDs: map[string]string{ + ovntypes.TopologyExternalID: netInfo.TopologyType(), + ovntypes.NetworkExternalID: netInfo.GetNetworkName(), + }, + Peer: ptr.To(ovntypes.TransitRouterToRouterPrefix + gatewayRouterName), } } -func layer2SubnetGWAddr() *net.IPNet { - return &net.IPNet{ - IP: net.ParseIP("100.200.0.1"), - Mask: net.CIDRMask(16, 32), +func expectedLayer2EgressEntities(netInfo util.NetInfo, gwConfig util.L3GatewayConfig, nodeSubnet *net.IPNet) []libovsdbtest.TestData { + const ( + routerPolicyUUID1 = "lrpol1-UUID" + staticRouteUUID1 = "sr1-UUID" + staticRouteUUID2 = "sr2-UUID" + masqSNATUUID1 = "masq-snat1-UUID" + ) + trInfo := getTestTransitRouterInfo(netInfo) + transitRouterName := fmt.Sprintf("%s_transit_router", netInfo.GetNetworkName()) + + rtosLRPName := fmt.Sprintf("%s%s", ovntypes.TransitRouterToSwitchPrefix, netInfo.GetNetworkScopedName(ovntypes.OVNLayer2Switch)) + rtosLRPUUID := rtosLRPName + "-UUID" + gwRouterName := fmt.Sprintf("GR_%s_%s", netInfo.GetNetworkName(), nodeName) + + rtorLRPName := fmt.Sprintf("%s%s", ovntypes.TransitRouterToRouterPrefix, gwRouterName) + rtorLRPUUID := rtorLRPName + "-UUID" + nodeIP := gwConfig.IPAddresses[0].IP.String() + masqSNAT := newNATEntry(masqSNATUUID1, "169.254.169.14", nodeSubnet.String(), standardNonDefaultNetworkExtIDs(netInfo), "") + masqSNAT.Match = getMasqueradeManagementIPSNATMatch(util.IPAddrToHWAddr(managementPortIP(nodeSubnet)).String()) + masqSNAT.LogicalPort = ptr.To(fmt.Sprintf("trtos-%s", netInfo.GetNetworkScopedName(ovntypes.OVNLayer2Switch))) + if !config.OVNKubernetesFeature.EnableInterconnect { + masqSNAT.GatewayPort = nil } -} - -func nodeGateway() *net.IPNet { - return &net.IPNet{ - IP: net.ParseIP("192.168.126.1"), - Mask: net.CIDRMask(24, 32), + gwChassisName := fmt.Sprintf("%s-%s", rtosLRPName, gwConfig.ChassisID) + gatewayChassisUUID := gwChassisName + "-UUID" + lrsrNextHop := trInfo.gatewayRouterNets[0].IP.String() + if config.Gateway.Mode == config.GatewayModeLocal { + lrsrNextHop = managementPortIP(nodeSubnet).String() } -} - -func ipv4DefaultRoute() *net.IPNet { - return &net.IPNet{ - IP: net.ParseIP("0.0.0.0"), - Mask: net.CIDRMask(0, 32), + expectedEntities := []libovsdbtest.TestData{ + &nbdb.LogicalRouter{ + Name: transitRouterName, + UUID: transitRouterName + "-UUID", + Ports: []string{rtosLRPUUID, rtorLRPUUID}, + StaticRoutes: []string{staticRouteUUID1, staticRouteUUID2}, + Policies: []string{routerPolicyUUID1}, + ExternalIDs: standardNonDefaultNetworkExtIDs(netInfo), + Nat: []string{masqSNATUUID1}, + }, + &nbdb.LogicalRouterPort{ + UUID: rtosLRPUUID, + Name: rtosLRPName, + Networks: []string{"100.200.0.1/16"}, + MAC: "0a:58:64:c8:00:01", + GatewayChassis: []string{gatewayChassisUUID}, + Options: map[string]string{libovsdbops.GatewayMTU: "1400"}, + }, + &nbdb.LogicalRouterPort{ + UUID: rtorLRPUUID, + Name: rtorLRPName, + 
Networks: []string{trInfo.transitRouterNets[0].String()}, + MAC: util.IPAddrToHWAddr(trInfo.transitRouterNets[0].IP).String(), + Options: map[string]string{libovsdbops.RequestedTnlKey: "4"}, + Peer: ptr.To(fmt.Sprintf("%s%s", ovntypes.RouterToTransitRouterPrefix, gwRouterName)), + ExternalIDs: standardNonDefaultNetworkExtIDs(netInfo), + }, + expectedGRStaticRoute(staticRouteUUID1, nodeSubnet.String(), lrsrNextHop, &nbdb.LogicalRouterStaticRoutePolicySrcIP, nil, netInfo), + expectedGRStaticRoute(staticRouteUUID2, gwRouterJoinIPAddress().IP.String(), trInfo.gatewayRouterNets[0].IP.String(), nil, nil, netInfo), + expectedLogicalRouterPolicy(routerPolicyUUID1, netInfo, nodeName, nodeIP, managementPortIP(nodeSubnet).String()), + masqSNAT, + &nbdb.GatewayChassis{UUID: gatewayChassisUUID, Name: gwChassisName, Priority: 1, ChassisName: gwConfig.ChassisID}, } + return expectedEntities } func dummyLayer2SecondaryUserDefinedNetwork(subnets string) userDefinedNetInfo { @@ -665,20 +672,6 @@ func dummyLayer2PrimaryUserDefinedNetwork(subnets string) userDefinedNetInfo { return secondaryNet } -func nodeIP() *net.IPNet { - return &net.IPNet{ - IP: net.ParseIP("192.168.126.202"), - Mask: net.CIDRMask(24, 32), - } -} - -func nodeCIDR() *net.IPNet { - return &net.IPNet{ - IP: net.ParseIP("192.168.126.0"), - Mask: net.CIDRMask(24, 32), - } -} - func setupFakeOvnForLayer2Topology(fakeOvn *FakeOVN, initialDB libovsdbtest.TestSetup, netInfo userDefinedNetInfo, testNode *corev1.Node, podInfo testPod, pod *corev1.Pod, extraObjects ...runtime.Object) error { By(fmt.Sprintf("creating a network attachment definition for network: %s", netInfo.netName)) nad, err := newNetworkAttachmentDefinition( @@ -687,7 +680,6 @@ func setupFakeOvnForLayer2Topology(fakeOvn *FakeOVN, initialDB libovsdbtest.Test *netInfo.netconf(), ) Expect(err).NotTo(HaveOccurred()) - nad.Annotations = map[string]string{ovntypes.OvnNetworkIDAnnotation: userDefinedNetworkID} By("setting up the OVN DB without any entities in it") Expect(netInfo.setupOVNDependencies(&initialDB)).To(Succeed()) @@ -703,6 +695,10 @@ func setupFakeOvnForLayer2Topology(fakeOvn *FakeOVN, initialDB libovsdbtest.Test Name: fmt.Sprintf("GR_%s_%s", networkConfig.GetNetworkName(), nodeName), ExternalIDs: standardNonDefaultNetworkExtIDs(networkConfig), }, + &nbdb.LogicalRouter{ + Name: fmt.Sprintf("%s_transit_router", netInfo.netName), + ExternalIDs: standardNonDefaultNetworkExtIDs(networkConfig), + }, newNetworkClusterPortGroup(networkConfig), ) } diff --git a/go-controller/pkg/ovn/layer3_user_defined_network_controller.go b/go-controller/pkg/ovn/layer3_user_defined_network_controller.go index 815a8b4c9c..ca9ca28f69 100644 --- a/go-controller/pkg/ovn/layer3_user_defined_network_controller.go +++ b/go-controller/pkg/ovn/layer3_user_defined_network_controller.go @@ -1059,7 +1059,7 @@ func (oc *Layer3UserDefinedNetworkController) nodeGatewayConfig(node *corev1.Nod return nil, fmt.Errorf("failed to get node %q subnet annotation for network %q: %v", node.Name, oc.GetNetworkName(), err) } - gwLRPJoinIPs, err := udn.GetGWRouterIPs(node, oc.GetNetInfo()) + gwRouterJoinCIDRs, err := udn.GetGWRouterIPs(node, oc.GetNetInfo()) if err != nil { return nil, fmt.Errorf("failed extracting node %q GW router join subnet IP for layer3 network %q: %w", node.Name, networkName, err) } @@ -1071,7 +1071,7 @@ func (oc *Layer3UserDefinedNetworkController) nodeGatewayConfig(node *corev1.Nod annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterSubnets, - gwLRPJoinIPs: gwLRPJoinIPs, + 
gwRouterJoinCIDRs: gwRouterJoinCIDRs, hostAddrs: hostAddrs, externalIPs: externalIPs, ovnClusterLRPToJoinIfAddrs: oc.ovnClusterLRPToJoinIfAddrs, diff --git a/go-controller/pkg/ovn/layer3_user_defined_network_controller_test.go b/go-controller/pkg/ovn/layer3_user_defined_network_controller_test.go index f9cec964ae..ed17e7b9cb 100644 --- a/go-controller/pkg/ovn/layer3_user_defined_network_controller_test.go +++ b/go-controller/pkg/ovn/layer3_user_defined_network_controller_test.go @@ -113,7 +113,6 @@ var _ = Describe("OVN Multi-Homed pod operations for layer 3 network", func() { *netInfo.netconf(), ) Expect(err).NotTo(HaveOccurred()) - nad.Annotations = map[string]string{types.OvnNetworkIDAnnotation: userDefinedNetworkID} Expect(netInfo.setupOVNDependencies(&initialDB)).To(Succeed()) n := newNamespace(ns) if netInfo.isPrimary { @@ -314,7 +313,6 @@ var _ = Describe("OVN Multi-Homed pod operations for layer 3 network", func() { *netConf, ) Expect(err).NotTo(HaveOccurred()) - nad.Annotations = map[string]string{types.OvnNetworkIDAnnotation: userDefinedNetworkID} mutableNetworkConfig := util.NewMutableNetInfo(networkConfig) mutableNetworkConfig.SetNADs(util.GetNADName(nad.Namespace, nad.Name)) @@ -540,9 +538,24 @@ func (sni *userDefinedNetInfo) netconf() *ovncnitypes.NetConf { const plugin = "ovn-k8s-cni-overlay" role := types.NetworkRoleSecondary + transitSubnet := "" if sni.isPrimary { role = types.NetworkRolePrimary + if sni.topology == types.Layer2Topology { + transitSubnets := []string{} + for _, clusterSubnet := range strings.Split(sni.clustersubnets, ",") { + _, cidr, err := net.ParseCIDR(clusterSubnet) + Expect(err).NotTo(HaveOccurred()) + if knet.IsIPv4CIDR(cidr) { + transitSubnets = append(transitSubnets, config.ClusterManager.V4TransitSubnet) + } else { + transitSubnets = append(transitSubnets, config.ClusterManager.V6TransitSubnet) + } + } + transitSubnet = strings.Join(transitSubnets, ",") + } } + return &ovncnitypes.NetConf{ NetConf: cnitypes.NetConf{ Name: sni.netName, @@ -553,6 +566,7 @@ func (sni *userDefinedNetInfo) netconf() *ovncnitypes.NetConf { Subnets: sni.clustersubnets, Role: role, AllowPersistentIPs: sni.allowPersistentIPs, + TransitSubnet: transitSubnet, } } @@ -703,12 +717,10 @@ func expectedGWRouterPlusNATAndStaticRoutes( gwConfig util.L3GatewayConfig, ) []libovsdbtest.TestData { gwRouterToExtLRPUUID := fmt.Sprintf("%s%s-UUID", types.GWRouterToExtSwitchPrefix, gwRouterName) - gwRouterToJoinLRPUUID := fmt.Sprintf("%s%s-UUID", types.GWRouterToJoinSwitchPrefix, gwRouterName) const ( nat1 = "abc-UUID" nat2 = "cba-UUID" - perPodSNAT = "pod-snat-UUID" staticRoute1 = "srA-UUID" staticRoute2 = "srB-UUID" staticRoute3 = "srC-UUID" @@ -716,6 +728,16 @@ func expectedGWRouterPlusNATAndStaticRoutes( ) staticRouteOutputPort := types.GWRouterToExtSwitchPrefix + netInfo.GetNetworkScopedGWRouterName(nodeName) + gwRouterLRPUUID := fmt.Sprintf("%s%s-UUID", types.GWRouterToJoinSwitchPrefix, gwRouterName) + grOptions := gwRouterOptions(gwConfig) + sr1 := expectedGRStaticRoute(staticRoute1, netInfo.Subnets()[0].CIDR.String(), dummyMasqueradeIP().IP.String(), nil, nil, netInfo) + if netInfo.TopologyType() == types.Layer2Topology { + gwRouterLRPUUID = fmt.Sprintf("%s%s-UUID", types.RouterToTransitRouterPrefix, gwRouterName) + grOptions["lb_force_snat_ip"] = gwRouterJoinIPAddress().IP.String() + transitRouteOutputPort := types.RouterToTransitRouterPrefix + netInfo.GetNetworkScopedGWRouterName(nodeName) + trInfo := getTestTransitRouterInfo(netInfo) + sr1 = expectedGRStaticRoute(staticRoute1, 
netInfo.Subnets()[0].CIDR.String(), trInfo.transitRouterNets[0].IP.String(), nil, &transitRouteOutputPort, netInfo) + } nextHopIP := gwConfig.NextHops[0].String() nextHopMasqIP := nextHopMasqueradeIP().String() masqSubnet := config.Gateway.V4MasqueradeSubnet @@ -726,12 +748,12 @@ func expectedGWRouterPlusNATAndStaticRoutes( Name: gwRouterName, UUID: gwRouterName + "-UUID", ExternalIDs: gwRouterExternalIDs(netInfo, gwConfig), - Options: gwRouterOptions(gwConfig), - Ports: []string{gwRouterToJoinLRPUUID, gwRouterToExtLRPUUID}, + Options: grOptions, + Ports: []string{gwRouterLRPUUID, gwRouterToExtLRPUUID}, Nat: nat, StaticRoutes: []string{staticRoute1, staticRoute2, staticRoute3}, }, - expectedGRStaticRoute(staticRoute1, netInfo.Subnets()[0].CIDR.String(), dummyMasqueradeIP().IP.String(), nil, nil, netInfo), + sr1, expectedGRStaticRoute(staticRoute2, ipv4DefaultRoute, nextHopIP, nil, &staticRouteOutputPort, netInfo), expectedGRStaticRoute(staticRoute3, masqSubnet, nextHopMasqIP, nil, &staticRouteOutputPort, netInfo), } @@ -800,7 +822,6 @@ func expectedLayer3EgressEntities(netInfo util.NetInfo, gwConfig util.L3GatewayC routerPolicyUUID2 = "lrpol2-UUID" staticRouteUUID1 = "sr1-UUID" staticRouteUUID2 = "sr2-UUID" - staticRouteUUID3 = "sr3-UUID" masqSNATUUID1 = "masq-snat1-UUID" ) masqIPAddr := dummyMasqueradeIP().IP.String() @@ -913,18 +934,6 @@ func udnGWSNATAddress() *net.IPNet { } } -func newMasqueradeManagementNATEntry(uuid string, netInfo util.NetInfo) *nbdb.NAT { - masqSNAT := newNATEntry( - uuid, - "169.254.169.14", - layer2Subnet().String(), - standardNonDefaultNetworkExtIDs(netInfo), - getMasqueradeManagementIPSNATMatch(util.IPAddrToHWAddr(managementPortIP(layer2Subnet())).String()), - ) - masqSNAT.LogicalPort = ptr.To(fmt.Sprintf("rtoj-GR_%s_%s", netInfo.GetNetworkName(), nodeName)) - return masqSNAT -} - func newNATEntry(uuid string, externalIP string, logicalIP string, extIDs map[string]string, match string) *nbdb.NAT { return &nbdb.NAT{ UUID: uuid, diff --git a/go-controller/pkg/ovn/master.go b/go-controller/pkg/ovn/master.go index 432aa784be..3fe803660e 100644 --- a/go-controller/pkg/ovn/master.go +++ b/go-controller/pkg/ovn/master.go @@ -38,7 +38,7 @@ type GatewayConfig struct { annoConfig *util.L3GatewayConfig hostSubnets []*net.IPNet clusterSubnets []*net.IPNet - gwLRPJoinIPs []*net.IPNet + gwRouterJoinCIDRs []*net.IPNet hostAddrs []string externalIPs []net.IP ovnClusterLRPToJoinIfAddrs []*net.IPNet @@ -139,7 +139,7 @@ func (oc *DefaultNetworkController) nodeGatewayConfig(node *corev1.Node) (*Gatew annoConfig: l3GatewayConfig, hostSubnets: hostSubnets, clusterSubnets: clusterSubnets, - gwLRPJoinIPs: gwLRPIPs, + gwRouterJoinCIDRs: gwLRPIPs, hostAddrs: hostAddrs, externalIPs: externalIPs, ovnClusterLRPToJoinIfAddrs: oc.ovnClusterLRPToJoinIfAddrs, diff --git a/go-controller/pkg/ovn/multihoming_test.go b/go-controller/pkg/ovn/multihoming_test.go index cd4f07137a..84643dec7b 100644 --- a/go-controller/pkg/ovn/multihoming_test.go +++ b/go-controller/pkg/ovn/multihoming_test.go @@ -207,23 +207,20 @@ func (em *userDefinedNetworkExpectationMachine) expectedLogicalSwitchesAndPortsW data = append(data, mgmtPort) nodeslsps[switchName] = append(nodeslsps[switchName], mgmtPortUUID) - networkSwitchToGWRouterLSPName := ovntypes.SwitchToRouterPrefix + switchName - networkSwitchToGWRouterLSPUUID := networkSwitchToGWRouterLSPName + "-UUID" + networkSwitchToTransitRouterLSPName := ovntypes.SwitchToTransitRouterPrefix + switchName + networkSwitchToGWRouterLSPUUID := 
networkSwitchToTransitRouterLSPName + "-UUID" lsp := &nbdb.LogicalSwitchPort{ UUID: networkSwitchToGWRouterLSPUUID, - Name: networkSwitchToGWRouterLSPName, + Name: networkSwitchToTransitRouterLSPName, Addresses: []string{"router"}, ExternalIDs: map[string]string{ "k8s.ovn.org/topology": ocInfo.bnc.TopologyType(), "k8s.ovn.org/network": ocInfo.bnc.GetNetworkName(), }, - Options: map[string]string{libovsdbops.RouterPort: ovntypes.RouterToSwitchPrefix + switchName}, + Options: map[string]string{libovsdbops.RouterPort: ovntypes.TransitRouterToSwitchPrefix + switchName}, Type: "router", } data = append(data, lsp) - if util.IsNetworkSegmentationSupportEnabled() && ocInfo.bnc.IsPrimaryNetwork() { - lsp.Options[libovsdbops.RequestedTnlKey] = "25" - } nodeslsps[switchName] = append(nodeslsps[switchName], networkSwitchToGWRouterLSPUUID) const aclUUID = "acl1-UUID" @@ -277,7 +274,8 @@ func (em *userDefinedNetworkExpectationMachine) expectedLogicalSwitchesAndPortsW data = append(data, expectedGWEntities(pod.nodeName, ocInfo.bnc, *em.gatewayConfig)...) data = append(data, expectedLayer3EgressEntities(ocInfo.bnc, *em.gatewayConfig, subnet)...) } else { - data = append(data, expectedLayer2EgressEntities(ocInfo.bnc, *em.gatewayConfig, pod.nodeName)...) + data = append(data, expectedGWEntitiesLayer2(pod.nodeName, ocInfo.bnc, *em.gatewayConfig)...) + data = append(data, expectedLayer2EgressEntities(ocInfo.bnc, *em.gatewayConfig, subnet)...) } } if _, alreadyAdded := alreadyAddedManagementElements[pod.nodeName]; !alreadyAdded && diff --git a/go-controller/pkg/ovn/ovn_test.go b/go-controller/pkg/ovn/ovn_test.go index 8c0fbcd54e..c4bedb48a8 100644 --- a/go-controller/pkg/ovn/ovn_test.go +++ b/go-controller/pkg/ovn/ovn_test.go @@ -489,8 +489,13 @@ func newNetworkAttachmentDefinition(namespace, name string, netconf ovncnitypes. 
if err != nil { return nil, fmt.Errorf("failed marshaling podNetworks map %v", netconf) } + meta := newObjectMeta(name, namespace) + meta.Annotations = map[string]string{types.OvnNetworkIDAnnotation: userDefinedNetworkID} + if netconf.Topology == types.Layer2Topology && netconf.Role == types.NetworkRolePrimary { + meta.Annotations[types.OvnNetworkTunnelKeysAnnotation] = "[16711685,16715780]" + } return &nettypes.NetworkAttachmentDefinition{ - ObjectMeta: newObjectMeta(name, namespace), + ObjectMeta: meta, Spec: nettypes.NetworkAttachmentDefinitionSpec{ Config: string(bytes), }, diff --git a/go-controller/pkg/ovn/topology/topologyfactory.go b/go-controller/pkg/ovn/topology/topologyfactory.go index 45738cf85f..ead14e05b2 100644 --- a/go-controller/pkg/ovn/topology/topologyfactory.go +++ b/go-controller/pkg/ovn/topology/topologyfactory.go @@ -40,6 +40,15 @@ func (gtf *GatewayTopologyFactory) NewClusterRouterWithMulticastSupport( return gtf.newClusterRouter(clusterRouterName, netInfo, coopUUID, routerOptions) } +func (gtf *GatewayTopologyFactory) NewTransitRouter( + netInfo util.NetInfo, + coopUUID string, + tunnelKey string, +) (*nbdb.LogicalRouter, error) { + routerOptions := map[string]string{libovsdbops.RequestedTnlKey: tunnelKey} + return gtf.newClusterRouter(netInfo.GetNetworkScopedClusterRouterName(), netInfo, coopUUID, routerOptions) +} + func (gtf *GatewayTopologyFactory) newClusterRouter( clusterRouterName string, netInfo util.NetInfo, diff --git a/go-controller/pkg/ovn/transit_router.go b/go-controller/pkg/ovn/transit_router.go new file mode 100644 index 0000000000..47bd806d9d --- /dev/null +++ b/go-controller/pkg/ovn/transit_router.go @@ -0,0 +1,52 @@ +package ovn + +import ( + "fmt" + "net" + + corev1 "k8s.io/api/core/v1" + + udn "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/generator/ip" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types" + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" +) + +type transitRouterInfo struct { + gatewayRouterNets, transitRouterNets []*net.IPNet + nodeID int +} + +// getTransitRouterInfo calculates the gateway and cluster router networks for every node based on the node ID. +// we use netInfo.TransitSubnets() to split it into smaller networks. 
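+// Each node consumes one /31 pair per transit subnet: addresses 2*nodeID and
+// 2*nodeID+1 (assuming GenerateIPPair hands out sequential pairs, as the
+// example below suggests).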
+// For transit-subnet: 100.88.0.0/16, and nodeID=2, we will get: +// - Transit Router IP: 100.88.0.4/31 +// - Gateway Router IP: 100.88.0.5/31 +func getTransitRouterInfo(netInfo util.NetInfo, node *corev1.Node) (*transitRouterInfo, error) { + if netInfo.TopologyType() != types.Layer2Topology || !netInfo.IsPrimaryNetwork() { + return nil, fmt.Errorf("transit router networks are only calculated for primary L2 user defined networks") + } + nodeID, _ := util.GetNodeID(node) + if nodeID == util.InvalidNodeID { + return nil, fmt.Errorf("invalid node id calculating transit router networks") + } + routerInfo := &transitRouterInfo{ + nodeID: nodeID, + } + for _, transitSubnet := range netInfo.TransitSubnets() { + ipGenerator, err := udn.NewIPGenerator(transitSubnet.String()) + if err != nil { + return nil, err + } + transitRouterIP, gatewayRouterIP, err := ipGenerator.GenerateIPPair(nodeID) + if err != nil { + return nil, err + } + + routerInfo.transitRouterNets = append(routerInfo.transitRouterNets, transitRouterIP) + routerInfo.gatewayRouterNets = append(routerInfo.gatewayRouterNets, gatewayRouterIP) + } + if len(routerInfo.transitRouterNets) == 0 || len(routerInfo.gatewayRouterNets) == 0 { + return nil, fmt.Errorf("network %s has no transit subnets defined", netInfo.GetNetworkName()) + } + return routerInfo, nil +} diff --git a/go-controller/pkg/ovn/zone_interconnect/zone_ic_handler.go b/go-controller/pkg/ovn/zone_interconnect/zone_ic_handler.go index 1549bf5481..23a310c9ab 100644 --- a/go-controller/pkg/ovn/zone_interconnect/zone_ic_handler.go +++ b/go-controller/pkg/ovn/zone_interconnect/zone_ic_handler.go @@ -163,7 +163,7 @@ func (zic *ZoneInterconnectHandler) createOrUpdateTransitSwitch(networkID int) e Name: zic.networkTransitSwitchName, ExternalIDs: externalIDs, } - zic.addTransitSwitchConfig(ts, networkID) + zic.addTransitSwitchConfig(ts, BaseTransitSwitchTunnelKey+networkID) // Create transit switch if it doesn't exist if err := libovsdbops.CreateOrUpdateLogicalSwitch(zic.nbClient, ts); err != nil { return fmt.Errorf("failed to create/update transit switch %s: %w", zic.networkTransitSwitchName, err) @@ -339,12 +339,13 @@ func (zic *ZoneInterconnectHandler) Cleanup() error { return libovsdbops.DeleteLogicalSwitch(zic.nbClient, zic.networkTransitSwitchName) } -func (zic *ZoneInterconnectHandler) AddTransitSwitchConfig(sw *nbdb.LogicalSwitch) error { +// AddTransitSwitchConfig is only used by the layer2 network controller +func (zic *ZoneInterconnectHandler) AddTransitSwitchConfig(sw *nbdb.LogicalSwitch, tunnelKey int) error { if zic.TopologyType() != types.Layer2Topology { return nil } - zic.addTransitSwitchConfig(sw, zic.GetNetworkID()) + zic.addTransitSwitchConfig(sw, tunnelKey) return nil } @@ -370,13 +371,13 @@ func (zic *ZoneInterconnectHandler) AddTransitPortConfig(remote bool, podAnnotat return nil } -func (zic *ZoneInterconnectHandler) addTransitSwitchConfig(sw *nbdb.LogicalSwitch, networkID int) { +func (zic *ZoneInterconnectHandler) addTransitSwitchConfig(sw *nbdb.LogicalSwitch, tunnelKey int) { if sw.OtherConfig == nil { sw.OtherConfig = map[string]string{} } sw.OtherConfig["interconn-ts"] = sw.Name - sw.OtherConfig[libovsdbops.RequestedTnlKey] = strconv.Itoa(BaseTransitSwitchTunnelKey + networkID) + sw.OtherConfig[libovsdbops.RequestedTnlKey] = strconv.Itoa(tunnelKey) sw.OtherConfig["mcast_snoop"] = "true" sw.OtherConfig["mcast_querier"] = "false" sw.OtherConfig["mcast_flood_unregistered"] = "true" diff --git a/go-controller/pkg/ovnwebhook/nodeadmission.go 
b/go-controller/pkg/ovnwebhook/nodeadmission.go index 15b98db2fc..e7dc733371 100644 --- a/go-controller/pkg/ovnwebhook/nodeadmission.go +++ b/go-controller/pkg/ovnwebhook/nodeadmission.go @@ -67,6 +67,13 @@ var interconnectNodeAnnotationChecks = map[string]checkNodeAnnot{ return fmt.Errorf("%s can only be set to %s, it cannot be removed", util.OvnNodeMigratedZoneName, nodeName) }, + util.Layer2TopologyVersion: func(v annotationChange, _ string) error { + // it is allowed for the annotation to be added or removed + if v.action == added || v.action == removed { + return nil + } + return fmt.Errorf("%s can only be added or removed, not updated", util.Layer2TopologyVersion) + }, } // hybridOverlayNodeAnnotationChecks holds annotations allowed for ovnkube-node: users hybrid overlay environments diff --git a/go-controller/pkg/testing/mocks/github.com/containernetworking/cni/pkg/types/Result.go b/go-controller/pkg/testing/mocks/github.com/containernetworking/cni/pkg/types/Result.go index 4fc1e8bd3a..a5ce63ec2a 100644 --- a/go-controller/pkg/testing/mocks/github.com/containernetworking/cni/pkg/types/Result.go +++ b/go-controller/pkg/testing/mocks/github.com/containernetworking/cni/pkg/types/Result.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks @@ -44,7 +44,7 @@ func (_m *Result) GetAsVersion(version string) (types.Result, error) { return r0, r1 } -// Print provides a mock function with given fields: +// Print provides a mock function with no fields func (_m *Result) Print() error { ret := _m.Called() @@ -80,7 +80,7 @@ func (_m *Result) PrintTo(writer io.Writer) error { return r0 } -// Version provides a mock function with given fields: +// Version provides a mock function with no fields func (_m *Result) Version() string { ret := _m.Called() diff --git a/go-controller/pkg/testing/mocks/github.com/containernetworking/plugins/pkg/ns/NetNS.go b/go-controller/pkg/testing/mocks/github.com/containernetworking/plugins/pkg/ns/NetNS.go index db75f12abc..479cde7e30 100644 --- a/go-controller/pkg/testing/mocks/github.com/containernetworking/plugins/pkg/ns/NetNS.go +++ b/go-controller/pkg/testing/mocks/github.com/containernetworking/plugins/pkg/ns/NetNS.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks @@ -12,7 +12,7 @@ type NetNS struct { mock.Mock } -// Close provides a mock function with given fields: +// Close provides a mock function with no fields func (_m *NetNS) Close() error { ret := _m.Called() @@ -48,7 +48,7 @@ func (_m *NetNS) Do(toRun func(ns.NetNS) error) error { return r0 } -// Fd provides a mock function with given fields: +// Fd provides a mock function with no fields func (_m *NetNS) Fd() uintptr { ret := _m.Called() @@ -66,7 +66,7 @@ func (_m *NetNS) Fd() uintptr { return r0 } -// Path provides a mock function with given fields: +// Path provides a mock function with no fields func (_m *NetNS) Path() string { ret := _m.Called() @@ -84,7 +84,7 @@ func (_m *NetNS) Path() string { return r0 } -// Set provides a mock function with given fields: +// Set provides a mock function with no fields func (_m *NetNS) Set() error { ret := _m.Called() diff --git a/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/informers/externalversions/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionInformer.go b/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/informers/externalversions/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionInformer.go index 17da81f3f7..0feed7ef73 100644 --- a/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/informers/externalversions/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionInformer.go +++ b/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/informers/externalversions/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionInformer.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks @@ -13,7 +13,7 @@ type NetworkAttachmentDefinitionInformer struct { mock.Mock } -// Informer provides a mock function with given fields: +// Informer provides a mock function with no fields func (_m *NetworkAttachmentDefinitionInformer) Informer() cache.SharedIndexInformer { ret := _m.Called() @@ -33,7 +33,7 @@ func (_m *NetworkAttachmentDefinitionInformer) Informer() cache.SharedIndexInfor return r0 } -// Lister provides a mock function with given fields: +// Lister provides a mock function with no fields func (_m *NetworkAttachmentDefinitionInformer) Lister() k8s_cni_cncf_iov1.NetworkAttachmentDefinitionLister { ret := _m.Called() diff --git a/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/listers/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionLister.go b/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/listers/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionLister.go index 1b03e0fc71..7092f9bc28 100644 --- a/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/listers/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionLister.go +++ b/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/listers/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionLister.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks diff --git a/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/listers/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionNamespaceLister.go b/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/listers/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionNamespaceLister.go index f725105c5f..9d94ac1233 100644 --- a/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/listers/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionNamespaceLister.go +++ b/go-controller/pkg/testing/mocks/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/client/listers/k8s.cni.cncf.io/v1/NetworkAttachmentDefinitionNamespaceLister.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/testing/mocks/github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/ManagementPort.go b/go-controller/pkg/testing/mocks/github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/managementport/Interface.go similarity index 54% rename from go-controller/pkg/testing/mocks/github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/ManagementPort.go rename to go-controller/pkg/testing/mocks/github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/managementport/Interface.go index 0c99fb1bf3..68fcc42a81 100644 --- a/go-controller/pkg/testing/mocks/github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/ManagementPort.go +++ b/go-controller/pkg/testing/mocks/github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/managementport/Interface.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks @@ -8,13 +8,13 @@ import ( mock "github.com/stretchr/testify/mock" ) -// ManagementPort is an autogenerated mock type for the ManagementPort type -type ManagementPort struct { +// Interface is an autogenerated mock type for the Interface type +type Interface struct { mock.Mock } -// GetAddresses provides a mock function with given fields: -func (_m *ManagementPort) GetAddresses() []*net.IPNet { +// GetAddresses provides a mock function with no fields +func (_m *Interface) GetAddresses() []*net.IPNet { ret := _m.Called() if len(ret) == 0 { @@ -33,8 +33,8 @@ func (_m *ManagementPort) GetAddresses() []*net.IPNet { return r0 } -// GetInterfaceName provides a mock function with given fields: -func (_m *ManagementPort) GetInterfaceName() string { +// GetInterfaceName provides a mock function with no fields +func (_m *Interface) GetInterfaceName() string { ret := _m.Called() if len(ret) == 0 { @@ -51,13 +51,13 @@ func (_m *ManagementPort) GetInterfaceName() string { return r0 } -// NewManagementPort creates a new instance of ManagementPort. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// NewInterface creates a new instance of Interface. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. // The first argument is typically a *testing.T value. 
-func NewManagementPort(t interface { +func NewInterface(t interface { mock.TestingT Cleanup(func()) -}) *ManagementPort { - mock := &ManagementPort{} +}) *Interface { + mock := &Interface{} mock.Mock.Test(t) t.Cleanup(func() { mock.AssertExpectations(t) }) diff --git a/go-controller/pkg/testing/mocks/github.com/vishvananda/netlink/Link.go b/go-controller/pkg/testing/mocks/github.com/vishvananda/netlink/Link.go index 2ffa594664..956976be77 100644 --- a/go-controller/pkg/testing/mocks/github.com/vishvananda/netlink/Link.go +++ b/go-controller/pkg/testing/mocks/github.com/vishvananda/netlink/Link.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks @@ -12,7 +12,7 @@ type Link struct { mock.Mock } -// Attrs provides a mock function with given fields: +// Attrs provides a mock function with no fields func (_m *Link) Attrs() *netlink.LinkAttrs { ret := _m.Called() @@ -32,7 +32,7 @@ func (_m *Link) Attrs() *netlink.LinkAttrs { return r0 } -// Type provides a mock function with given fields: +// Type provides a mock function with no fields func (_m *Link) Type() string { ret := _m.Called() diff --git a/go-controller/pkg/testing/mocks/k8s.io/client-go/informers/core/v1/NodeInformer.go b/go-controller/pkg/testing/mocks/k8s.io/client-go/informers/core/v1/NodeInformer.go index e1bbc30420..b23a629d3d 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/client-go/informers/core/v1/NodeInformer.go +++ b/go-controller/pkg/testing/mocks/k8s.io/client-go/informers/core/v1/NodeInformer.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks @@ -14,7 +14,7 @@ type NodeInformer struct { mock.Mock } -// Informer provides a mock function with given fields: +// Informer provides a mock function with no fields func (_m *NodeInformer) Informer() cache.SharedIndexInformer { ret := _m.Called() @@ -34,7 +34,7 @@ func (_m *NodeInformer) Informer() cache.SharedIndexInformer { return r0 } -// Lister provides a mock function with given fields: +// Lister provides a mock function with no fields func (_m *NodeInformer) Lister() corev1.NodeLister { ret := _m.Called() diff --git a/go-controller/pkg/testing/mocks/k8s.io/client-go/informers/core/v1/PodInformer.go b/go-controller/pkg/testing/mocks/k8s.io/client-go/informers/core/v1/PodInformer.go index 9117816221..1fc4268995 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/client-go/informers/core/v1/PodInformer.go +++ b/go-controller/pkg/testing/mocks/k8s.io/client-go/informers/core/v1/PodInformer.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks @@ -14,7 +14,7 @@ type PodInformer struct { mock.Mock } -// Informer provides a mock function with given fields: +// Informer provides a mock function with no fields func (_m *PodInformer) Informer() cache.SharedIndexInformer { ret := _m.Called() @@ -34,7 +34,7 @@ func (_m *PodInformer) Informer() cache.SharedIndexInformer { return r0 } -// Lister provides a mock function with given fields: +// Lister provides a mock function with no fields func (_m *PodInformer) Lister() corev1.PodLister { ret := _m.Called() diff --git a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/NodeLister.go b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/NodeLister.go index db2fe8bad2..062997c853 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/NodeLister.go +++ b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/NodeLister.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodLister.go b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodLister.go index 6406289383..614f3771f0 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodLister.go +++ b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodLister.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodListerExpansion.go b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodListerExpansion.go index b2b51db195..e9a83a363a 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodListerExpansion.go +++ b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodListerExpansion.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodNamespaceLister.go b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodNamespaceLister.go index 3787637eaf..02de951efd 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodNamespaceLister.go +++ b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodNamespaceLister.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodNamespaceListerExpansion.go b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodNamespaceListerExpansion.go index 5b44e905c6..a940e46524 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodNamespaceListerExpansion.go +++ b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodNamespaceListerExpansion.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks diff --git a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateLister.go b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateLister.go index 949bf133dc..b93a4d1809 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateLister.go +++ b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateLister.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateListerExpansion.go b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateListerExpansion.go index 0b831422d3..3d0f1ebb0a 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateListerExpansion.go +++ b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateListerExpansion.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateNamespaceLister.go b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateNamespaceLister.go index db55859afe..eb734298b7 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateNamespaceLister.go +++ b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateNamespaceLister.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateNamespaceListerExpansion.go b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateNamespaceListerExpansion.go index f94253da08..eb12bf0fce 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateNamespaceListerExpansion.go +++ b/go-controller/pkg/testing/mocks/k8s.io/client-go/listers/core/v1/PodTemplateNamespaceListerExpansion.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/testing/mocks/k8s.io/utils/exec/Cmd.go b/go-controller/pkg/testing/mocks/k8s.io/utils/exec/Cmd.go index 4c1db92025..4cce4a405e 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/utils/exec/Cmd.go +++ b/go-controller/pkg/testing/mocks/k8s.io/utils/exec/Cmd.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks @@ -13,7 +13,7 @@ type Cmd struct { mock.Mock } -// CombinedOutput provides a mock function with given fields: +// CombinedOutput provides a mock function with no fields func (_m *Cmd) CombinedOutput() ([]byte, error) { ret := _m.Called() @@ -43,7 +43,7 @@ func (_m *Cmd) CombinedOutput() ([]byte, error) { return r0, r1 } -// Output provides a mock function with given fields: +// Output provides a mock function with no fields func (_m *Cmd) Output() ([]byte, error) { ret := _m.Called() @@ -73,7 +73,7 @@ func (_m *Cmd) Output() ([]byte, error) { return r0, r1 } -// Run provides a mock function with given fields: +// Run provides a mock function with no fields func (_m *Cmd) Run() error { ret := _m.Called() @@ -116,7 +116,7 @@ func (_m *Cmd) SetStdout(out io.Writer) { _m.Called(out) } -// Start provides a mock function with given fields: +// Start provides a mock function with no fields func (_m *Cmd) Start() error { ret := _m.Called() @@ -134,7 +134,7 @@ func (_m *Cmd) Start() error { return r0 } -// StderrPipe provides a mock function with given fields: +// StderrPipe provides a mock function with no fields func (_m *Cmd) StderrPipe() (io.ReadCloser, error) { ret := _m.Called() @@ -164,7 +164,7 @@ func (_m *Cmd) StderrPipe() (io.ReadCloser, error) { return r0, r1 } -// StdoutPipe provides a mock function with given fields: +// StdoutPipe provides a mock function with no fields func (_m *Cmd) StdoutPipe() (io.ReadCloser, error) { ret := _m.Called() @@ -194,12 +194,12 @@ func (_m *Cmd) StdoutPipe() (io.ReadCloser, error) { return r0, r1 } -// Stop provides a mock function with given fields: +// Stop provides a mock function with no fields func (_m *Cmd) Stop() { _m.Called() } -// Wait provides a mock function with given fields: +// Wait provides a mock function with no fields func (_m *Cmd) Wait() error { ret := _m.Called() diff --git a/go-controller/pkg/testing/mocks/k8s.io/utils/exec/ExitError.go b/go-controller/pkg/testing/mocks/k8s.io/utils/exec/ExitError.go index 1acd93ed11..837411f2da 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/utils/exec/ExitError.go +++ b/go-controller/pkg/testing/mocks/k8s.io/utils/exec/ExitError.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks @@ -9,7 +9,7 @@ type ExitError struct { mock.Mock } -// Error provides a mock function with given fields: +// Error provides a mock function with no fields func (_m *ExitError) Error() string { ret := _m.Called() @@ -27,7 +27,7 @@ func (_m *ExitError) Error() string { return r0 } -// ExitStatus provides a mock function with given fields: +// ExitStatus provides a mock function with no fields func (_m *ExitError) ExitStatus() int { ret := _m.Called() @@ -45,7 +45,7 @@ func (_m *ExitError) ExitStatus() int { return r0 } -// Exited provides a mock function with given fields: +// Exited provides a mock function with no fields func (_m *ExitError) Exited() bool { ret := _m.Called() @@ -63,7 +63,7 @@ func (_m *ExitError) Exited() bool { return r0 } -// String provides a mock function with given fields: +// String provides a mock function with no fields func (_m *ExitError) String() string { ret := _m.Called() diff --git a/go-controller/pkg/testing/mocks/k8s.io/utils/exec/Interface.go b/go-controller/pkg/testing/mocks/k8s.io/utils/exec/Interface.go index 0b4b0a8f9c..a1523dc53b 100644 --- a/go-controller/pkg/testing/mocks/k8s.io/utils/exec/Interface.go +++ b/go-controller/pkg/testing/mocks/k8s.io/utils/exec/Interface.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/types/const.go b/go-controller/pkg/types/const.go index 20fdf23d31..2852cb624f 100644 --- a/go-controller/pkg/types/const.go +++ b/go-controller/pkg/types/const.go @@ -32,10 +32,6 @@ const ( // access to local service LocalNetworkName = "locnet" - // Local Bridge used for DGP access - LocalBridgeName = "br-local" - LocalnetGatewayNextHopPort = "ovn-k8s-gw0" - // OVS Bridge Datapath types DatapathUserspace = "netdev" @@ -43,25 +39,19 @@ const ( OVNClusterRouter = "ovn_cluster_router" OVNJoinSwitch = "join" - JoinSwitchPrefix = "join_" - ExternalSwitchPrefix = "ext_" - GWRouterPrefix = "GR_" - GWRouterLocalLBPostfix = "_local" - RouterToSwitchPrefix = "rtos-" - InterPrefix = "inter-" - HybridSubnetPrefix = "hybrid-subnet-" - SwitchToRouterPrefix = "stor-" - JoinSwitchToGWRouterPrefix = "jtor-" - GWRouterToJoinSwitchPrefix = "rtoj-" - DistRouterToJoinSwitchPrefix = "dtoj-" - JoinSwitchToDistRouterPrefix = "jtod-" - EXTSwitchToGWRouterPrefix = "etor-" - GWRouterToExtSwitchPrefix = "rtoe-" - EgressGWSwitchPrefix = "exgw-" - PatchPortPrefix = "patch-" - PatchPortSuffix = "-to-br-int" - - NodeLocalSwitch = "node_local_switch" + JoinSwitchPrefix = "join_" + ExternalSwitchPrefix = "ext_" + GWRouterPrefix = "GR_" + RouterToSwitchPrefix = "rtos-" + HybridSubnetPrefix = "hybrid-subnet-" + SwitchToRouterPrefix = "stor-" + JoinSwitchToGWRouterPrefix = "jtor-" + GWRouterToJoinSwitchPrefix = "rtoj-" + EXTSwitchToGWRouterPrefix = "etor-" + GWRouterToExtSwitchPrefix = "rtoe-" + EgressGWSwitchPrefix = "exgw-" + PatchPortPrefix = "patch-" + PatchPortSuffix = "-to-br-int" // types.OVNLayer2Switch is the name of layer2 topology switch OVNLayer2Switch = "ovn_layer2_switch" @@ -73,6 +63,11 @@ const ( TransitSwitch = "transit_switch" TransitSwitchToRouterPrefix = "tstor-" RouterToTransitSwitchPrefix = "rtots-" + TransitRouter = "transit_router" + TransitRouterToRouterPrefix = "trtor-" + RouterToTransitRouterPrefix = "rtotr-" + TransitRouterToSwitchPrefix = "trtos-" + SwitchToTransitRouterPrefix = "stotr-" // DefaultACLTier Priorities @@ -169,7 +164,9 @@ const ( // OvnNetworkIDAnnotation is a unique network identifier annotated on 
the // NAD by cluster manager nad controller OvnNetworkIDAnnotation = OvnK8sPrefix + "/network-id" - + // OvnNetworkTunnelKeysAnnotation is used to assign tunnel keys for the distributed switches and routers + // Assigned to the NADs for now + OvnNetworkTunnelKeysAnnotation = OvnK8sPrefix + "/tunnel-keys" // Deprecated: we used to set topology version as an annotation on the node. We don't do this anymore. OvnK8sTopoAnno = OvnK8sPrefix + "/" + "topology-version" OvnK8sSmallMTUTaintKey = OvnK8sPrefix + "/" + "mtu-too-small" diff --git a/go-controller/pkg/util/egressip/net.go b/go-controller/pkg/util/egressip/net.go new file mode 100644 index 0000000000..018e6d27f6 --- /dev/null +++ b/go-controller/pkg/util/egressip/net.go @@ -0,0 +1,39 @@ +package egressip + +import ( + "math" + "net" + + "github.com/vishvananda/netlink" + "golang.org/x/sys/unix" + + "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util" +) + +// GetNetlinkAddress returns a netlink address configured with specific +// egress ip parameters +func GetNetlinkAddress(ip net.IP, ifindex int) *netlink.Addr { + return &netlink.Addr{ + IPNet: &net.IPNet{IP: ip, Mask: util.GetIPFullMask(ip)}, + Flags: getNetlinkAddressFlag(ip), + Scope: int(netlink.SCOPE_UNIVERSE), + ValidLft: getNetlinkAddressValidLft(ip), + LinkIndex: ifindex, + } +} + +func getNetlinkAddressFlag(ip net.IP) int { + // isV6? + if ip != nil && ip.To4() == nil && ip.To16() != nil { + return unix.IFA_F_NODAD + } + return 0 +} + +func getNetlinkAddressValidLft(ip net.IP) int { + // isV6? + if ip != nil && ip.To4() == nil && ip.To16() != nil { + return math.MaxUint32 + } + return 0 +} diff --git a/go-controller/pkg/util/mocks/DNSOps.go b/go-controller/pkg/util/mocks/DNSOps.go index 1ca8487d09..b2e5d6cda6 100644 --- a/go-controller/pkg/util/mocks/DNSOps.go +++ b/go-controller/pkg/util/mocks/DNSOps.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/util/mocks/ExecRunner.go b/go-controller/pkg/util/mocks/ExecRunner.go index 88afca4180..a8fe015ea2 100644 --- a/go-controller/pkg/util/mocks/ExecRunner.go +++ b/go-controller/pkg/util/mocks/ExecRunner.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/util/mocks/FileSystemOps.go b/go-controller/pkg/util/mocks/FileSystemOps.go index c7c8bdbcb8..e075dbf0ed 100644 --- a/go-controller/pkg/util/mocks/FileSystemOps.go +++ b/go-controller/pkg/util/mocks/FileSystemOps.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/util/mocks/NetLinkOps.go b/go-controller/pkg/util/mocks/NetLinkOps.go index d9cd045b54..a72e496715 100644 --- a/go-controller/pkg/util/mocks/NetLinkOps.go +++ b/go-controller/pkg/util/mocks/NetLinkOps.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks @@ -230,7 +230,7 @@ func (_m *NetLinkOps) LinkDelete(link netlink.Link) error { return r0 } -// LinkList provides a mock function with given fields: +// LinkList provides a mock function with no fields func (_m *NetLinkOps) LinkList() ([]netlink.Link, error) { ret := _m.Called() diff --git a/go-controller/pkg/util/mocks/SriovnetOps.go b/go-controller/pkg/util/mocks/SriovnetOps.go index ea6f4560b3..ba798aa9b3 100644 --- a/go-controller/pkg/util/mocks/SriovnetOps.go +++ b/go-controller/pkg/util/mocks/SriovnetOps.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/util/mocks/VdpaDevice.go b/go-controller/pkg/util/mocks/VdpaDevice.go index f3b21787df..c18e8039c8 100644 --- a/go-controller/pkg/util/mocks/VdpaDevice.go +++ b/go-controller/pkg/util/mocks/VdpaDevice.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks @@ -12,7 +12,7 @@ type VdpaDevice struct { mock.Mock } -// Driver provides a mock function with given fields: +// Driver provides a mock function with no fields func (_m *VdpaDevice) Driver() string { ret := _m.Called() @@ -30,7 +30,7 @@ func (_m *VdpaDevice) Driver() string { return r0 } -// MgmtDev provides a mock function with given fields: +// MgmtDev provides a mock function with no fields func (_m *VdpaDevice) MgmtDev() kvdpa.MgmtDev { ret := _m.Called() @@ -50,7 +50,7 @@ func (_m *VdpaDevice) MgmtDev() kvdpa.MgmtDev { return r0 } -// Name provides a mock function with given fields: +// Name provides a mock function with no fields func (_m *VdpaDevice) Name() string { ret := _m.Called() @@ -68,7 +68,7 @@ func (_m *VdpaDevice) Name() string { return r0 } -// ParentDevicePath provides a mock function with given fields: +// ParentDevicePath provides a mock function with no fields func (_m *VdpaDevice) ParentDevicePath() (string, error) { ret := _m.Called() @@ -96,7 +96,7 @@ func (_m *VdpaDevice) ParentDevicePath() (string, error) { return r0, r1 } -// VhostVdpa provides a mock function with given fields: +// VhostVdpa provides a mock function with no fields func (_m *VdpaDevice) VhostVdpa() kvdpa.VhostVdpa { ret := _m.Called() @@ -116,7 +116,7 @@ func (_m *VdpaDevice) VhostVdpa() kvdpa.VhostVdpa { return r0 } -// VirtioNet provides a mock function with given fields: +// VirtioNet provides a mock function with no fields func (_m *VdpaDevice) VirtioNet() kvdpa.VirtioNet { ret := _m.Called() diff --git a/go-controller/pkg/util/mocks/VdpaOps.go b/go-controller/pkg/util/mocks/VdpaOps.go index 7f82bb5183..ac5b475c07 100644 --- a/go-controller/pkg/util/mocks/VdpaOps.go +++ b/go-controller/pkg/util/mocks/VdpaOps.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. package mocks diff --git a/go-controller/pkg/util/mocks/multinetwork/NetInfo.go b/go-controller/pkg/util/mocks/multinetwork/NetInfo.go index 42e5808356..50edd3ef0a 100644 --- a/go-controller/pkg/util/mocks/multinetwork/NetInfo.go +++ b/go-controller/pkg/util/mocks/multinetwork/NetInfo.go @@ -1,4 +1,4 @@ -// Code generated by mockery v2.43.2. DO NOT EDIT. +// Code generated by mockery v2.53.4. DO NOT EDIT. 
package mocks @@ -16,45 +16,7 @@ type NetInfo struct { mock.Mock } -func (_m *NetInfo) GetNodeGatewayIP(hostSubnet *net.IPNet) *net.IPNet { - ret := _m.Called(hostSubnet) - - if len(ret) == 0 { - panic("no return value specified for GetNodeGatewayIP") - } - - var r0 *net.IPNet - if rf, ok := ret.Get(0).(func(*net.IPNet) *net.IPNet); ok { - r0 = rf(hostSubnet) - } else { - if ret.Get(0) != nil { - r0 = ret.Get(0).(*net.IPNet) - } - } - - return r0 -} - -func (_m *NetInfo) GetNodeManagementIP(hostSubnet *net.IPNet) *net.IPNet { - ret := _m.Called(hostSubnet) - - if len(ret) == 0 { - panic("no return value specified for GetNodeManagementIP") - } - - var r0 *net.IPNet - if rf, ok := ret.Get(0).(func(*net.IPNet) *net.IPNet); ok { - r0 = rf(hostSubnet) - } else { - if ret.Get(0) != nil { - r0 = ret.Get(0).(*net.IPNet) - } - } - - return r0 -} - -// AllowsPersistentIPs provides a mock function with given fields: +// AllowsPersistentIPs provides a mock function with no fields func (_m *NetInfo) AllowsPersistentIPs() bool { ret := _m.Called() @@ -96,7 +58,7 @@ func (_m *NetInfo) EqualNADs(nads ...string) bool { return r0 } -// ExcludeSubnets provides a mock function with given fields: +// ExcludeSubnets provides a mock function with no fields func (_m *NetInfo) ExcludeSubnets() []*net.IPNet { ret := _m.Called() @@ -116,47 +78,7 @@ func (_m *NetInfo) ExcludeSubnets() []*net.IPNet { return r0 } -// InfrastructureSubnets provides a mock function with given fields: -func (_m *NetInfo) InfrastructureSubnets() []*net.IPNet { - ret := _m.Called() - - if len(ret) == 0 { - panic("no return value specified for InfrastructureSubnets") - } - - var r0 []*net.IPNet - if rf, ok := ret.Get(0).(func() []*net.IPNet); ok { - r0 = rf() - } else { - if ret.Get(0) != nil { - r0 = ret.Get(0).([]*net.IPNet) - } - } - - return r0 -} - -// ReservedSubnets provides a mock function with given fields: -func (_m *NetInfo) ReservedSubnets() []*net.IPNet { - ret := _m.Called() - - if len(ret) == 0 { - panic("no return value specified for ReservedSubnets") - } - - var r0 []*net.IPNet - if rf, ok := ret.Get(0).(func() []*net.IPNet); ok { - r0 = rf() - } else { - if ret.Get(0) != nil { - r0 = ret.Get(0).([]*net.IPNet) - } - } - - return r0 -} - -// GetEgressIPAdvertisedNodes provides a mock function with given fields: +// GetEgressIPAdvertisedNodes provides a mock function with no fields func (_m *NetInfo) GetEgressIPAdvertisedNodes() []string { ret := _m.Called() @@ -196,7 +118,7 @@ func (_m *NetInfo) GetEgressIPAdvertisedOnNodeVRFs(node string) []string { return r0 } -// GetEgressIPAdvertisedVRFs provides a mock function with given fields: +// GetEgressIPAdvertisedVRFs provides a mock function with no fields func (_m *NetInfo) GetEgressIPAdvertisedVRFs() map[string][]string { ret := _m.Called() @@ -216,7 +138,7 @@ func (_m *NetInfo) GetEgressIPAdvertisedVRFs() map[string][]string { return r0 } -// GetNADNamespaces provides a mock function with given fields: +// GetNADNamespaces provides a mock function with no fields func (_m *NetInfo) GetNADNamespaces() []string { ret := _m.Called() @@ -236,7 +158,7 @@ func (_m *NetInfo) GetNADNamespaces() []string { return r0 } -// GetNADs provides a mock function with given fields: +// GetNADs provides a mock function with no fields func (_m *NetInfo) GetNADs() []string { ret := _m.Called() @@ -256,7 +178,7 @@ func (_m *NetInfo) GetNADs() []string { return r0 } -// GetNetInfo provides a mock function with given fields: +// GetNetInfo provides a mock function with no fields func (_m *NetInfo) 
GetNetInfo() util.NetInfo { ret := _m.Called() @@ -276,7 +198,7 @@ func (_m *NetInfo) GetNetInfo() util.NetInfo { return r0 } -// GetNetworkID provides a mock function with given fields: +// GetNetworkID provides a mock function with no fields func (_m *NetInfo) GetNetworkID() int { ret := _m.Called() @@ -294,7 +216,7 @@ func (_m *NetInfo) GetNetworkID() int { return r0 } -// GetNetworkName provides a mock function with given fields: +// GetNetworkName provides a mock function with no fields func (_m *NetInfo) GetNetworkName() string { ret := _m.Called() @@ -312,7 +234,7 @@ func (_m *NetInfo) GetNetworkName() string { return r0 } -// GetNetworkScopedClusterRouterName provides a mock function with given fields: +// GetNetworkScopedClusterRouterName provides a mock function with no fields func (_m *NetInfo) GetNetworkScopedClusterRouterName() string { ret := _m.Called() @@ -330,24 +252,6 @@ func (_m *NetInfo) GetNetworkScopedClusterRouterName() string { return r0 } -// GetNetworkScopedClusterSubnetSNATMatch provides a mock function with given fields: nodeName -func (_m *NetInfo) GetNetworkScopedClusterSubnetSNATMatch(nodeName string) string { - ret := _m.Called(nodeName) - - if len(ret) == 0 { - panic("no return value specified for GetNetworkScopedClusterSubnetSNATMatch") - } - - var r0 string - if rf, ok := ret.Get(0).(func(string) string); ok { - r0 = rf(nodeName) - } else { - r0 = ret.Get(0).(string) - } - - return r0 -} - // GetNetworkScopedExtPortName provides a mock function with given fields: bridgeID, nodeName func (_m *NetInfo) GetNetworkScopedExtPortName(bridgeID string, nodeName string) string { ret := _m.Called(bridgeID, nodeName) @@ -402,7 +306,7 @@ func (_m *NetInfo) GetNetworkScopedGWRouterName(nodeName string) string { return r0 } -// GetNetworkScopedJoinSwitchName provides a mock function with given fields: +// GetNetworkScopedJoinSwitchName provides a mock function with no fields func (_m *NetInfo) GetNetworkScopedJoinSwitchName() string { ret := _m.Called() @@ -528,6 +432,46 @@ func (_m *NetInfo) GetNetworkScopedSwitchName(nodeName string) string { return r0 } +// GetNodeGatewayIP provides a mock function with given fields: hostSubnet +func (_m *NetInfo) GetNodeGatewayIP(hostSubnet *net.IPNet) *net.IPNet { + ret := _m.Called(hostSubnet) + + if len(ret) == 0 { + panic("no return value specified for GetNodeGatewayIP") + } + + var r0 *net.IPNet + if rf, ok := ret.Get(0).(func(*net.IPNet) *net.IPNet); ok { + r0 = rf(hostSubnet) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*net.IPNet) + } + } + + return r0 +} + +// GetNodeManagementIP provides a mock function with given fields: hostSubnet +func (_m *NetInfo) GetNodeManagementIP(hostSubnet *net.IPNet) *net.IPNet { + ret := _m.Called(hostSubnet) + + if len(ret) == 0 { + panic("no return value specified for GetNodeManagementIP") + } + + var r0 *net.IPNet + if rf, ok := ret.Get(0).(func(*net.IPNet) *net.IPNet); ok { + r0 = rf(hostSubnet) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*net.IPNet) + } + } + + return r0 +} + // GetPodNetworkAdvertisedOnNodeVRFs provides a mock function with given fields: node func (_m *NetInfo) GetPodNetworkAdvertisedOnNodeVRFs(node string) []string { ret := _m.Called(node) @@ -548,7 +492,7 @@ func (_m *NetInfo) GetPodNetworkAdvertisedOnNodeVRFs(node string) []string { return r0 } -// GetPodNetworkAdvertisedVRFs provides a mock function with given fields: +// GetPodNetworkAdvertisedVRFs provides a mock function with no fields func (_m *NetInfo) GetPodNetworkAdvertisedVRFs() 
map[string][]string { ret := _m.Called() @@ -568,6 +512,26 @@ func (_m *NetInfo) GetPodNetworkAdvertisedVRFs() map[string][]string { return r0 } +// GetTunnelKeys provides a mock function with no fields +func (_m *NetInfo) GetTunnelKeys() []int { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for GetTunnelKeys") + } + + var r0 []int + if rf, ok := ret.Get(0).(func() []int); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]int) + } + } + + return r0 +} + // HasNAD provides a mock function with given fields: nadName func (_m *NetInfo) HasNAD(nadName string) bool { ret := _m.Called(nadName) @@ -586,7 +550,7 @@ func (_m *NetInfo) HasNAD(nadName string) bool { return r0 } -// IPMode provides a mock function with given fields: +// IPMode provides a mock function with no fields func (_m *NetInfo) IPMode() (bool, bool) { ret := _m.Called() @@ -614,7 +578,27 @@ func (_m *NetInfo) IPMode() (bool, bool) { return r0, r1 } -// IsDefault provides a mock function with given fields: +// InfrastructureSubnets provides a mock function with no fields +func (_m *NetInfo) InfrastructureSubnets() []*net.IPNet { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for InfrastructureSubnets") + } + + var r0 []*net.IPNet + if rf, ok := ret.Get(0).(func() []*net.IPNet); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]*net.IPNet) + } + } + + return r0 +} + +// IsDefault provides a mock function with no fields func (_m *NetInfo) IsDefault() bool { ret := _m.Called() @@ -632,7 +616,7 @@ func (_m *NetInfo) IsDefault() bool { return r0 } -// IsPrimaryNetwork provides a mock function with given fields: +// IsPrimaryNetwork provides a mock function with no fields func (_m *NetInfo) IsPrimaryNetwork() bool { ret := _m.Called() @@ -650,7 +634,7 @@ func (_m *NetInfo) IsPrimaryNetwork() bool { return r0 } -// IsSecondary provides a mock function with given fields: +// IsUserDefinedNetwork provides a mock function with no fields func (_m *NetInfo) IsUserDefinedNetwork() bool { ret := _m.Called() @@ -668,7 +652,7 @@ func (_m *NetInfo) IsUserDefinedNetwork() bool { return r0 } -// JoinSubnetV4 provides a mock function with given fields: +// JoinSubnetV4 provides a mock function with no fields func (_m *NetInfo) JoinSubnetV4() *net.IPNet { ret := _m.Called() @@ -688,7 +672,7 @@ func (_m *NetInfo) JoinSubnetV4() *net.IPNet { return r0 } -// JoinSubnetV6 provides a mock function with given fields: +// JoinSubnetV6 provides a mock function with no fields func (_m *NetInfo) JoinSubnetV6() *net.IPNet { ret := _m.Called() @@ -708,7 +692,7 @@ func (_m *NetInfo) JoinSubnetV6() *net.IPNet { return r0 } -// JoinSubnets provides a mock function with given fields: +// JoinSubnets provides a mock function with no fields func (_m *NetInfo) JoinSubnets() []*net.IPNet { ret := _m.Called() @@ -728,7 +712,7 @@ func (_m *NetInfo) JoinSubnets() []*net.IPNet { return r0 } -// MTU provides a mock function with given fields: +// MTU provides a mock function with no fields func (_m *NetInfo) MTU() int { ret := _m.Called() @@ -746,7 +730,7 @@ func (_m *NetInfo) MTU() int { return r0 } -// PhysicalNetworkName provides a mock function with given fields: +// PhysicalNetworkName provides a mock function with no fields func (_m *NetInfo) PhysicalNetworkName() string { ret := _m.Called() @@ -782,7 +766,27 @@ func (_m *NetInfo) RemoveNetworkScopeFromName(name string) string { return r0 } -// Subnets provides a mock function with given fields: +// 
ReservedSubnets provides a mock function with no fields +func (_m *NetInfo) ReservedSubnets() []*net.IPNet { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for ReservedSubnets") + } + + var r0 []*net.IPNet + if rf, ok := ret.Get(0).(func() []*net.IPNet); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]*net.IPNet) + } + } + + return r0 +} + +// Subnets provides a mock function with no fields func (_m *NetInfo) Subnets() []config.CIDRNetworkEntry { ret := _m.Called() @@ -802,7 +806,7 @@ func (_m *NetInfo) Subnets() []config.CIDRNetworkEntry { return r0 } -// TopologyType provides a mock function with given fields: +// TopologyType provides a mock function with no fields func (_m *NetInfo) TopologyType() string { ret := _m.Called() @@ -820,7 +824,27 @@ func (_m *NetInfo) TopologyType() string { return r0 } -// Vlan provides a mock function with given fields: +// TransitSubnets provides a mock function with no fields +func (_m *NetInfo) TransitSubnets() []*net.IPNet { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for TransitSubnets") + } + + var r0 []*net.IPNet + if rf, ok := ret.Get(0).(func() []*net.IPNet); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]*net.IPNet) + } + } + + return r0 +} + +// Vlan provides a mock function with no fields func (_m *NetInfo) Vlan() uint { ret := _m.Called() diff --git a/go-controller/pkg/util/multi_network.go b/go-controller/pkg/util/multi_network.go index 6a8075f2b9..0c45070159 100644 --- a/go-controller/pkg/util/multi_network.go +++ b/go-controller/pkg/util/multi_network.go @@ -1,10 +1,12 @@ package util import ( + "encoding/json" "errors" "fmt" "net" "reflect" + "slices" "strconv" "strings" "sync" @@ -17,6 +19,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/klog/v2" knet "k8s.io/utils/net" ovncnitypes "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/cni/types" @@ -34,6 +37,7 @@ type NetInfo interface { // static information, not expected to change. GetNetworkName() string GetNetworkID() int + GetTunnelKeys() []int IsDefault() bool IsPrimaryNetwork() bool IsUserDefinedNetwork() bool @@ -47,6 +51,7 @@ type NetInfo interface { JoinSubnetV4() *net.IPNet JoinSubnetV6() *net.IPNet JoinSubnets() []*net.IPNet + TransitSubnets() []*net.IPNet Vlan() uint AllowsPersistentIPs() bool PhysicalNetworkName() string @@ -107,6 +112,7 @@ type MutableNetInfo interface { // SetNetworkID sets the network ID before any controller handles the // network SetNetworkID(id int) + SetTunnelKeys(keys []int) // NADs referencing a network SetNADs(nadName ...string) @@ -221,7 +227,8 @@ type mutableNetInfo struct { // id of the network. 
It's mutable because it is set on day-1 but it can't be // changed or reconciled on day-2 - id int + id int + tunnelKeys []int nads sets.Set[string] podNetworkAdvertisements map[string][]string @@ -264,6 +271,7 @@ func (l *mutableNetInfo) equals(r *mutableNetInfo) bool { r.RLock() defer r.RUnlock() return reflect.DeepEqual(l.id, r.id) && + reflect.DeepEqual(l.tunnelKeys, r.tunnelKeys) && reflect.DeepEqual(l.nads, r.nads) && reflect.DeepEqual(l.podNetworkAdvertisements, r.podNetworkAdvertisements) && reflect.DeepEqual(l.eipAdvertisements, r.eipAdvertisements) @@ -276,6 +284,7 @@ func (l *mutableNetInfo) copyFrom(r *mutableNetInfo) { aux := mutableNetInfo{} r.RLock() aux.id = r.id + aux.tunnelKeys = slices.Clone(r.tunnelKeys) aux.nads = r.nads.Clone() aux.setPodNetworkAdvertisedOnVRFs(r.podNetworkAdvertisements) aux.setEgressIPAdvertisedAtNodes(r.eipAdvertisements) @@ -284,6 +293,7 @@ func (l *mutableNetInfo) copyFrom(r *mutableNetInfo) { l.Lock() defer l.Unlock() l.id = aux.id + l.tunnelKeys = aux.tunnelKeys l.nads = aux.nads l.podNetworkAdvertisements = aux.podNetworkAdvertisements l.eipAdvertisements = aux.eipAdvertisements @@ -302,6 +312,18 @@ func (nInfo *mutableNetInfo) SetNetworkID(id int) { nInfo.id = id } +func (nInfo *mutableNetInfo) GetTunnelKeys() []int { + nInfo.RLock() + defer nInfo.RUnlock() + return nInfo.tunnelKeys +} + +func (nInfo *mutableNetInfo) SetTunnelKeys(tunnelKeys []int) { + nInfo.Lock() + defer nInfo.Unlock() + nInfo.tunnelKeys = tunnelKeys +} + func (nInfo *mutableNetInfo) SetPodNetworkAdvertisedVRFs(podAdvertisements map[string][]string) { nInfo.Lock() defer nInfo.Unlock() @@ -629,6 +651,12 @@ func (nInfo *DefaultNetInfo) JoinSubnets() []*net.IPNet { return defaultJoinSubnets } +// TransitSubnets should not be used for the default network. +// It will return an empty list since transit subnets are not set for this type of network. 
+func (nInfo *DefaultNetInfo) TransitSubnets() []*net.IPNet { + return []*net.IPNet{} +} + // Vlan returns the defaultNetConfInfo's Vlan value func (nInfo *DefaultNetInfo) Vlan() uint { return config.Gateway.VLANID @@ -671,6 +699,7 @@ type userDefinedNetInfo struct { reservedSubnets []*net.IPNet infrastructureSubnets []*net.IPNet joinSubnets []*net.IPNet + transitSubnets []*net.IPNet physicalNetworkName string defaultGatewayIPs []net.IP @@ -721,6 +750,9 @@ func (nInfo *userDefinedNetInfo) GetNetworkScopedK8sMgmtIntfName(nodeName string } func (nInfo *userDefinedNetInfo) GetNetworkScopedClusterRouterName() string { + if nInfo.TopologyType() == types.Layer2Topology { + return nInfo.GetNetworkScopedName(types.TransitRouter) + } return nInfo.GetNetworkScopedName(types.OVNClusterRouter) } @@ -864,6 +896,12 @@ func (nInfo *userDefinedNetInfo) JoinSubnets() []*net.IPNet { return nInfo.joinSubnets } +// TransitSubnets returns the userDefinedNetInfo's transit subnet values (both v4 and v6). +// For now it is only set for Primary Layer2 UDNs, otherwise it is empty. +func (nInfo *userDefinedNetInfo) TransitSubnets() []*net.IPNet { + return nInfo.transitSubnets +} + func (nInfo *userDefinedNetInfo) canReconcile(other NetInfo) bool { if (nInfo == nil) != (other == nil) { return false @@ -913,7 +951,10 @@ func (nInfo *userDefinedNetInfo) canReconcile(other NetInfo) bool { if !cmp.Equal(nInfo.infrastructureSubnets, other.InfrastructureSubnets(), cmpopts.SortSlices(lessIPNet)) { return false } - return cmp.Equal(nInfo.joinSubnets, other.JoinSubnets(), cmpopts.SortSlices(lessIPNet)) + if !cmp.Equal(nInfo.joinSubnets, other.JoinSubnets(), cmpopts.SortSlices(lessIPNet)) { + return false + } + return cmp.Equal(nInfo.transitSubnets, other.TransitSubnets(), cmpopts.SortSlices(lessIPNet)) } func (nInfo *userDefinedNetInfo) copy() *userDefinedNetInfo { @@ -932,6 +973,7 @@ func (nInfo *userDefinedNetInfo) copy() *userDefinedNetInfo { reservedSubnets: nInfo.reservedSubnets, infrastructureSubnets: nInfo.infrastructureSubnets, joinSubnets: nInfo.joinSubnets, + transitSubnets: nInfo.transitSubnets, physicalNetworkName: nInfo.physicalNetworkName, defaultGatewayIPs: nInfo.defaultGatewayIPs, managementIPs: nInfo.managementIPs, @@ -1005,6 +1047,11 @@ func newLayer2NetConfInfo(netconf *ovncnitypes.NetConf) (MutableNetInfo, error) return nil, err } + transitSubnets, err := parseTransitSubnet(netconf.Role, netconf.TransitSubnet) + if err != nil { + return nil, fmt.Errorf("invalid transit subnet for %s netconf %s: %v", netconf.Topology, netconf.Name, err) + } + // Allocate infrastructure IPs for primary networks var defaultGatewayIPs, managementIPs []net.IP if IsPreconfiguredUDNAddressesEnabled() && netconf.Role == types.NetworkRolePrimary { @@ -1020,6 +1067,7 @@ topology: types.Layer2Topology, subnets: subnets, joinSubnets: joinSubnets, + transitSubnets: transitSubnets, excludeSubnets: excludes, reservedSubnets: reserved, infrastructureSubnets: infra, @@ -1158,6 +1206,15 @@ func parseJoinSubnet(joinSubnet string) ([]*net.IPNet, error) { return joinSubnets, nil } +func parseTransitSubnet(netconfRole, transitSubnet string) ([]*net.IPNet, error) { + transitSubnets := []*net.IPNet{} + if netconfRole != types.NetworkRolePrimary { + // only primary networks can have a transit subnet + return transitSubnets, nil + } + return parseSubnetList(transitSubnet) +} + func getIPMode(subnets []config.CIDRNetworkEntry) (bool, bool) { var ipv6Mode, ipv4Mode bool for _, 
subnet := range subnets { @@ -1261,9 +1318,15 @@ func ParseNADInfo(nad *nettypes.NetworkAttachmentDefinition) (NetInfo, error) { return nil, fmt.Errorf("failed to parse annotated network ID: %w", err) } } n.SetNetworkID(id) - + if nad.Annotations[types.OvnNetworkTunnelKeysAnnotation] != "" { + tunnelKeys, err := ParseTunnelKeysAnnotation(nad.Annotations[types.OvnNetworkTunnelKeysAnnotation]) + if err != nil { + return nil, fmt.Errorf("failed to parse annotated tunnel keys: %w", err) + } + n.SetTunnelKeys(tunnelKeys) + } return n, nil } @@ -1333,8 +1396,15 @@ func ValidateNetConf(nadName string, netconf *ovncnitypes.NetConf) error { return fmt.Errorf("defaultGatewayIPs is only supported for layer2 topology") } + if netconf.TransitSubnet == "" && netconf.Role == types.NetworkRolePrimary && netconf.Topology == types.Layer2Topology { + klog.Warningf("transitSubnet is not specified for layer2 primary NAD %s, dynamic transit subnet will be used", netconf.Name) + if err := SetTransitSubnets(netconf); err != nil { + return fmt.Errorf("failed to set dynamic transit subnet for layer2 primary NAD %s: %v", netconf.Name, err) + } + } + if netconf.Topology != types.LocalnetTopology && netconf.Name != types.DefaultNetworkName { - if err := subnetOverlapCheck(netconf); err != nil { + if _, _, err := SubnetOverlapCheck(netconf); err != nil { return fmt.Errorf("invalid subnet configuration: %w", err) } } @@ -1342,10 +1412,11 @@ return nil } -// subnetOverlapCheck validates whether POD and join subnet mentioned in a net-attach-def with -// topology "layer2" and "layer3" does not overlap with ClusterSubnets, ServiceCIDRs, join subnet, -// and masquerade subnet. It also considers excluded subnets mentioned in a net-attach-def. -func subnetOverlapCheck(netconf *ovncnitypes.NetConf) error { +// SubnetOverlapCheck validates whether user-configured networks (e.g. POD and join subnet) mentioned in +// a net-attach-def with topology "layer2" and "layer3" overlap with internal and reserved networks +// (e.g. ClusterSubnets, ServiceCIDRs, join subnet, etc.). +// It also considers excluded subnets mentioned in a net-attach-def. 
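+// On overlap, SubnetOverlapCheck now also returns the two offending subnets so callers can react to
+// the specific conflict; on success (and on unrelated parse errors) both returned subnets are nil.
+// A minimal caller sketch (hypothetical; the log text is illustrative):
+//
+//	if s1, s2, err := SubnetOverlapCheck(netconf); err != nil {
+//		klog.Errorf("subnets %v and %v overlap: %v", s1, s2, err)
+//	}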
+func SubnetOverlapCheck(netconf *ovncnitypes.NetConf) (*net.IPNet, *net.IPNet, error) { allSubnets := config.NewConfigSubnets() for _, subnet := range config.Default.ClusterSubnets { allSubnets.Append(config.ConfigSubnetCluster, subnet.CIDR) } @@ -1365,9 +1436,10 @@ allSubnets.Append(config.ConfigSubnetMasquerade, v4MasqueradeCIDR) allSubnets.Append(config.ConfigSubnetMasquerade, v6MasqueradeCIDR) + // Layer3 networks only use pre-defined transit subnets if netconf.Topology == types.Layer3Topology { - _, v4TransitCIDR, _ := net.ParseCIDR(config.ClusterManager.V4TransitSwitchSubnet) - _, v6TransitCIDR, _ := net.ParseCIDR(config.ClusterManager.V6TransitSwitchSubnet) + _, v4TransitCIDR, _ := net.ParseCIDR(config.ClusterManager.V4TransitSubnet) + _, v6TransitCIDR, _ := net.ParseCIDR(config.ClusterManager.V6TransitSubnet) allSubnets.Append(config.ConfigSubnetTransit, v4TransitCIDR) allSubnets.Append(config.ConfigSubnetTransit, v6TransitCIDR) @@ -1375,15 +1447,18 @@ ni, err := NewNetInfo(netconf) if err != nil { - return fmt.Errorf("error while parsing subnets: %v", err) + return nil, nil, fmt.Errorf("error while parsing subnets: %v", err) } for _, subnet := range ni.Subnets() { allSubnets.Append(config.UserDefinedSubnets, subnet.CIDR) } - for _, subnet := range ni.JoinSubnets() { allSubnets.Append(config.UserDefinedJoinSubnet, subnet) } + // dynamic transit subnets are only set for primary Layer2 UDNs for now + for _, subnet := range ni.TransitSubnets() { + allSubnets.Append(config.ConfigSubnetTransit, subnet) + } if ni.ExcludeSubnets() != nil { for i, configSubnet := range allSubnets.Subnets { if IsContainedInAnyCIDR(configSubnet.Subnet, ni.ExcludeSubnets()...) { @@ -1391,12 +1466,12 @@ func subnetOverlapCheck(netconf *ovncnitypes.NetConf) error { } } } - err = allSubnets.CheckForOverlaps() + subnet1, subnet2, err := allSubnets.CheckForOverlaps() if err != nil { - return fmt.Errorf("pod or join subnet overlaps with already configured internal subnets: %w", err) + return subnet1, subnet2, fmt.Errorf("pod or join subnet overlaps with already configured internal subnets: %w", err) } - return nil + return nil, nil, nil } // GetPodNADToNetworkMapping sees if the given pod needs to plumb over this given network specified by netconf, @@ -1830,3 +1905,89 @@ func getFirstAvailableIP(subnets []*net.IPNet, excludeIPs sets.Set[string]) net. } return nil } + +func ParseTunnelKeysAnnotation(annotation string) ([]int, error) { + tunnelKeys := []int{} + if err := json.Unmarshal([]byte(annotation), &tunnelKeys); err != nil { + return nil, fmt.Errorf("failed to parse annotated network tunnel keys: %w", err) + } + return tunnelKeys, nil +} + +func FormatTunnelKeysAnnotation(tunnelKeys []int) (string, error) { + annotationBytes, err := json.Marshal(tunnelKeys) + if err != nil { + return "", fmt.Errorf("failed to format tunnel keys annotation: %w", err) + } + return string(annotationBytes), nil +} + +// SetTransitSubnets generates transit subnets for primary layer2 UDNs and sets them on the given netconf. +// It should be called with the final version of the netconf to make sure that util.SubnetOverlapCheck(netconf) passes. 
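+// Hypothetical example: a dual-stack layer2 primary netconf with
+// Subnets "10.100.0.0/16,fd10::/64" gets one transit subnet per IP family,
+// typically the defaults "100.88.0.0/16,fd97::/64" when those do not overlap
+// with anything else in the configuration.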
+func SetTransitSubnets(netconf *ovncnitypes.NetConf) error { + transitSubnets := []string{} + for _, subnetStr := range strings.Split(netconf.Subnets, ",") { + _, subnet, err := net.ParseCIDR(subnetStr) + if err != nil { + return fmt.Errorf("can't generate transit subnets: failed to parse CIDR %q: %w", subnetStr, err) + } + transitSubnet, err := getTransitSubnet(netconf, knet.IsIPv4CIDR(subnet)) + if err != nil { + return err + } + transitSubnets = append(transitSubnets, transitSubnet) + } + netconf.TransitSubnet = strings.Join(transitSubnets, ",") + return nil +} + +func getTransitSubnet(netconf *ovncnitypes.NetConf, isIPv4 bool) (string, error) { + var transitSubnet *net.IPNet + var err error + if isIPv4 { + _, transitSubnet, err = net.ParseCIDR(config.ClusterManager.V4TransitSubnet) + } else { + _, transitSubnet, err = net.ParseCIDR(config.ClusterManager.V6TransitSubnet) + } + if err != nil { + return "", fmt.Errorf("can't generate transit subnets: failed to parse default transit subnet: %w", err) + } + // repeat until we find a non-overlapping subnet, + // but limit the number of iterations to avoid an infinite loop + for i := 0; i < 10; i++ { + // only add the current transit subnet to the netconf for the overlap check (for the requested ipFamily), + // final assignment should be done for all ipFamilies outside this function. + netconf.TransitSubnet = transitSubnet.String() + // check if there is subnet overlap + subnet1, subnet2, err := SubnetOverlapCheck(netconf) + if err == nil { + return transitSubnet.String(), nil + } + if subnet1 == nil || subnet2 == nil || subnet1.String() != transitSubnet.String() && subnet2.String() != transitSubnet.String() { + // there is another problem with the config + // or the overlap is not with the transit subnet + return "", err + } + transitSubnet = getFirstNonOverlappingSubnet(subnet1, subnet2, transitSubnet.Mask) + } + // if the previous loop didn't return a result, we failed to find a non-overlapping subnet; + // note the loop-scoped err from SubnetOverlapCheck is out of scope here, so there is no error to wrap + return "", fmt.Errorf("can't generate transit subnets: failed to find a non-overlapping transit subnet") +} + +// getFirstNonOverlappingSubnet finds the first subnet with the same netmask as netMask that does not overlap with either subnet1 or subnet2. +// It expects that subnet1 and subnet2 overlap with each other, and that netMask is one of the two subnets' netmasks. +// If these conditions are not met, the result won't be correct. 
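+// Hypothetical worked example: with subnet1=100.88.0.0/16 (the transit candidate)
+// and subnet2=100.88.0.0/17 (a user subnet), the base network is 100.88.0.0/16
+// (the larger of the two), its last IP is 100.88.255.255, and the returned
+// subnet is 100.89.0.0/16.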
+func getFirstNonOverlappingSubnet(subnet1, subnet2 *net.IPNet, netMask net.IPMask) *net.IPNet { + // find the bigger network, and get the first subnet outside of it with the same netmask as default transit subnet + subnet1MaskSize, _ := subnet1.Mask.Size() + subnet2MaskSize, _ := subnet2.Mask.Size() + // bigger mask size means smaller network + baseSubnet := subnet1 + if subnet2MaskSize < subnet1MaskSize { + baseSubnet = subnet2 + } + // now find the first subnet outside the baseSubnet with the same mask + baseSubnetLastIP := GetLastIPOfSubnet(baseSubnet, 0) + nextIP := iputils.NextIP(baseSubnetLastIP.IP) + return &net.IPNet{IP: nextIP, Mask: netMask} +} diff --git a/go-controller/pkg/util/multi_network_test.go b/go-controller/pkg/util/multi_network_test.go index 5fee0f9c42..67947de39d 100644 --- a/go-controller/pkg/util/multi_network_test.go +++ b/go-controller/pkg/util/multi_network_test.go @@ -405,13 +405,14 @@ func TestParseNetconf(t *testing.T) { } `, expectedNetConf: &ovncnitypes.NetConf{ - Topology: "layer2", - NADName: "ns1/nad1", - MTU: 1400, - Role: "primary", - Subnets: "192.168.200.0/16", - NetConf: cnitypes.NetConf{Name: "tenant-red", Type: "ovn-k8s-cni-overlay"}, - JoinSubnet: "100.66.0.0/16,fd99::/64", + Topology: "layer2", + NADName: "ns1/nad1", + MTU: 1400, + Role: "primary", + Subnets: "192.168.200.0/16", + TransitSubnet: config.ClusterManager.V4TransitSubnet, + NetConf: cnitypes.NetConf{Name: "tenant-red", Type: "ovn-k8s-cni-overlay"}, + JoinSubnet: "100.66.0.0/16,fd99::/64", }, }, { @@ -1364,8 +1365,8 @@ func TestSubnetOverlapCheck(t *testing.T) { config.Gateway.V6MasqueradeSubnet = "fd69::/125" config.Gateway.V4JoinSubnet = "100.64.0.0/16" config.Gateway.V6JoinSubnet = "fd98::/64" - config.ClusterManager.V4TransitSwitchSubnet = "100.88.0.0/16" - config.ClusterManager.V6TransitSwitchSubnet = "fd97::/64" + config.ClusterManager.V4TransitSubnet = "100.88.0.0/16" + config.ClusterManager.V6TransitSubnet = "fd97::/64" type testConfig struct { desc string inputNetAttachDefConfigSpec string @@ -1388,7 +1389,7 @@ func TestSubnetOverlapCheck(t *testing.T) { `, expectedError: config.NewSubnetOverlapError( config.ConfigSubnet{SubnetType: config.UserDefinedSubnets, Subnet: MustParseCIDR("100.88.0.0/17")}, - config.ConfigSubnet{SubnetType: config.ConfigSubnetTransit, Subnet: MustParseCIDR(config.ClusterManager.V4TransitSwitchSubnet)}), + config.ConfigSubnet{SubnetType: config.ConfigSubnetTransit, Subnet: MustParseCIDR(config.ClusterManager.V4TransitSubnet)}), }, { desc: "return error when IPv4 POD subnet in net-attach-def overlaps other subnets", diff --git a/go-controller/pkg/util/ndp/ra.go b/go-controller/pkg/util/ndp/ra.go index 425d096335..66b76ec49d 100644 --- a/go-controller/pkg/util/ndp/ra.go +++ b/go-controller/pkg/util/ndp/ra.go @@ -1,6 +1,7 @@ package ndp import ( + "encoding/binary" "fmt" "net" "syscall" @@ -11,11 +12,21 @@ import ( "golang.org/x/sys/unix" ) +// PrefixInformation represents a Prefix Information Option for Router Advertisements +type PrefixInformation struct { + Prefix net.IPNet + ValidLifetime uint32 + PreferredLifetime uint32 + OnLink bool + Autonomous bool +} + // RouterAdvertisement with mac, ips and lifetime field to send type RouterAdvertisement struct { SourceMAC, DestinationMAC net.HardwareAddr SourceIP, DestinationIP net.IP Lifetime uint16 + PrefixInfos []PrefixInformation } // SendRouterAdvertisements sends one or more Router Advertisements (RAs) on the specified network interface. 
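A minimal caller sketch for the extended RA API (the interface name, MACs and IPs below are hypothetical placeholders; RouterAdvertisement, PrefixInformation and SendRouterAdvertisements are the types and function defined in this package):

	// Advertise fd99::/64 as on-link and usable for SLAAC (hypothetical values).
	_, prefix, _ := net.ParseCIDR("fd99::/64")
	ra := ndp.RouterAdvertisement{
		SourceMAC:      srcMAC,                 // router-side MAC, resolved by the caller
		DestinationMAC: dstMAC,                 // e.g. the all-nodes multicast MAC 33:33:00:00:00:01
		SourceIP:       net.ParseIP("fe80::1"), // link-local source address
		DestinationIP:  net.ParseIP("ff02::1"), // all-nodes multicast group
		Lifetime:       65535,
		PrefixInfos: []ndp.PrefixInformation{{
			Prefix:        *prefix,
			ValidLifetime: 65535,
			OnLink:        true,
			Autonomous:    true,
		}},
	}
	err := ndp.SendRouterAdvertisements("eth0", ra)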
@@ -45,6 +56,21 @@ func SendRouterAdvertisements(interfaceName string, ras ...RouterAdvertisement) } defer c.Close() + serializedRAs, err := generateRouterAdvertisements(ras...) + if err != nil { + return fmt.Errorf("failed to generate Router Advertisements: %w", err) + } + + // Send each serialized Router Advertisement using the raw socket. + for _, serializedRA := range serializedRAs { + if err := c.Sendto(serializedRA, &unix.SockaddrLinklayer{Ifindex: iface.Index}, 0); err != nil { + return err + } + } + return nil +} + +func generateRouterAdvertisements(ras ...RouterAdvertisement) ([][]byte, error) { serializedRAs := [][]byte{} for _, ra := range ras { serializeBuffer := gopacket.NewSerializeBuffer() @@ -70,7 +96,7 @@ func SendRouterAdvertisements(interfaceName string, ras ...RouterAdvertisement) TypeCode: layers.CreateICMPv6TypeCode(layers.ICMPv6TypeRouterAdvertisement, 0), } if err := icmp6Layer.SetNetworkLayerForChecksum(&ip6Layer); err != nil { - return err + return nil, err } // https://datatracker.ietf.org/doc/html/rfc4861#section-4.2 @@ -91,16 +117,27 @@ func SendRouterAdvertisements(interfaceName string, ras ...RouterAdvertisement) } // Create the ICMPv6 Router Advertisement layer. + options := layers.ICMPv6Options{{ + Type: layers.ICMPv6OptSourceAddress, + Data: ra.SourceMAC, + }} + + // Add Prefix Information Options if specified + for _, prefixInfo := range ra.PrefixInfos { + prefixData := createPrefixInfoData(&prefixInfo) + options = append(options, layers.ICMPv6Option{ + Type: layers.ICMPv6OptPrefixInfo, + Data: prefixData, + }) + } + raLayer := layers.ICMPv6RouterAdvertisement{ HopLimit: 255, Flags: managedAddressFlag | defaultRoutePreferenceFlag, RouterLifetime: ra.Lifetime, ReachableTime: 0, RetransTimer: 0, - Options: layers.ICMPv6Options{{ - Type: layers.ICMPv6OptSourceAddress, - Data: ra.SourceMAC, - }}, + Options: options, } // Serialize the layers into a byte slice. @@ -110,16 +147,42 @@ func SendRouterAdvertisements(interfaceName string, ras ...RouterAdvertisement) &icmp6Layer, &raLayer, ); err != nil { - return err + return nil, err } serializedRAs = append(serializedRAs, serializeBuffer.Bytes()) } + return serializedRAs, nil +} - // Send each serialized Router Advertisement using the raw socket. 
- for _, serializedRA := range serializedRAs { - if err := c.Sendto(serializedRA, &unix.SockaddrLinklayer{Ifindex: iface.Index}, 0); err != nil { - return err - } +// createPrefixInfoData creates the data payload for a Prefix Information Option +// according to RFC 4861 Section 4.6.2 +func createPrefixInfoData(prefixInfo *PrefixInformation) []byte { + data := make([]byte, 30) + + // Prefix Length (8 bits) + prefixLen, _ := prefixInfo.Prefix.Mask.Size() + data[0] = uint8(prefixLen) + + // Flags (8 bits) + var flags uint8 + if prefixInfo.OnLink { + flags |= 0x80 // L flag } - return nil + if prefixInfo.Autonomous { + flags |= 0x40 // A flag + } + data[1] = flags + + // Valid Lifetime (32 bits) + binary.BigEndian.PutUint32(data[2:6], prefixInfo.ValidLifetime) + + // Preferred Lifetime (32 bits) + binary.BigEndian.PutUint32(data[6:10], prefixInfo.PreferredLifetime) + + // Reserved (32 bits) - already zero from make + + // Prefix (128 bits) + copy(data[14:], prefixInfo.Prefix.IP.To16()) + + return data } diff --git a/go-controller/pkg/util/ndp/ra_test.go b/go-controller/pkg/util/ndp/ra_test.go new file mode 100644 index 0000000000..eba772e90a --- /dev/null +++ b/go-controller/pkg/util/ndp/ra_test.go @@ -0,0 +1,282 @@ +package ndp + +import ( + "encoding/binary" + "net" + "testing" + + "github.com/google/gopacket" + "github.com/google/gopacket/layers" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCreatePrefixInfoData(t *testing.T) { + tests := []struct { + name string + prefix string + expected struct { + prefixLen uint8 + flags uint8 + validLifetime uint32 + preferredLifetime uint32 + } + prefixInfo PrefixInformation + }{ + { + name: "fd99::/64 with both flags set", + prefix: "fd99::/64", + expected: struct { + prefixLen uint8 + flags uint8 + validLifetime uint32 + preferredLifetime uint32 + }{ + prefixLen: 64, + flags: 0xC0, // L=1, A=1 (0x80 | 0x40) + validLifetime: 65535, + preferredLifetime: 0, + }, + prefixInfo: PrefixInformation{ + ValidLifetime: 65535, + PreferredLifetime: 0, + OnLink: true, + Autonomous: true, + }, + }, + { + name: "2001:db8::/32 with only OnLink flag", + prefix: "2001:db8::/32", + expected: struct { + prefixLen uint8 + flags uint8 + validLifetime uint32 + preferredLifetime uint32 + }{ + prefixLen: 32, + flags: 0x80, // L=1, A=0 + validLifetime: 3600, + preferredLifetime: 1800, + }, + prefixInfo: PrefixInformation{ + ValidLifetime: 3600, + PreferredLifetime: 1800, + OnLink: true, + Autonomous: false, + }, + }, + { + name: "::1/128 with no flags", + prefix: "::1/128", + expected: struct { + prefixLen uint8 + flags uint8 + validLifetime uint32 + preferredLifetime uint32 + }{ + prefixLen: 128, + flags: 0x00, // L=0, A=0 + validLifetime: 0, + preferredLifetime: 0, + }, + prefixInfo: PrefixInformation{ + ValidLifetime: 0, + PreferredLifetime: 0, + OnLink: false, + Autonomous: false, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, prefixNet, err := net.ParseCIDR(tt.prefix) + require.NoError(t, err) + + tt.prefixInfo.Prefix = *prefixNet + data := createPrefixInfoData(&tt.prefixInfo) + + // Verify the data length (should be 30 bytes) + assert.Len(t, data, 30, "PrefixInfo data should be 30 bytes") + + // Verify prefix length + assert.Equal(t, tt.expected.prefixLen, data[0], "Prefix length mismatch") + + // Verify flags + assert.Equal(t, tt.expected.flags, data[1], "Flags mismatch") + + // Verify valid lifetime + actualValidLifetime := binary.BigEndian.Uint32(data[2:6]) + assert.Equal(t, 
tt.expected.validLifetime, actualValidLifetime, "Valid lifetime mismatch") + + // Verify preferred lifetime + actualPreferredLifetime := binary.BigEndian.Uint32(data[6:10]) + assert.Equal(t, tt.expected.preferredLifetime, actualPreferredLifetime, "Preferred lifetime mismatch") + + // Verify reserved field is zero + reserved := binary.BigEndian.Uint32(data[10:14]) + assert.Equal(t, uint32(0), reserved, "Reserved field should be zero") + + // Verify prefix IP + expectedPrefix := prefixNet.IP.To16() + actualPrefix := data[14:30] + assert.Equal(t, []byte(expectedPrefix), actualPrefix, "Prefix IP mismatch") + }) + } +} + +func TestRouterAdvertisementSerialization(t *testing.T) { + sourceMAC := net.HardwareAddr{0x00, 0x11, 0x22, 0x33, 0x44, 0x55} + destinationMAC := net.HardwareAddr{0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff} + sourceIP := net.ParseIP("fe80::211:22ff:fe33:4455") + destinationIP := net.ParseIP("fe80::aabb:ccff:fedd:eeff") + + // Create test prefix information + _, prefix1, err := net.ParseCIDR("fd99::/64") + require.NoError(t, err) + + prefixInfos := []PrefixInformation{ + { + Prefix: *prefix1, + ValidLifetime: 65535, + PreferredLifetime: 0, + OnLink: true, + Autonomous: true, + }, + } + + ra := RouterAdvertisement{ + SourceMAC: sourceMAC, + SourceIP: sourceIP, + DestinationMAC: destinationMAC, + DestinationIP: destinationIP, + Lifetime: 65535, + PrefixInfos: prefixInfos, + } + + serializedData, err := generateRouterAdvertisements(ra) + require.NoError(t, err) + + // Parse the serialized data to verify structure + packet := gopacket.NewPacket(serializedData[0], layers.LayerTypeEthernet, gopacket.Default) + + // Verify Ethernet layer + ethLayer := packet.Layer(layers.LayerTypeEthernet) + require.NotNil(t, ethLayer) + eth := ethLayer.(*layers.Ethernet) + assert.Equal(t, destinationMAC, eth.DstMAC) + assert.Equal(t, sourceMAC, eth.SrcMAC) + assert.Equal(t, layers.EthernetTypeIPv6, eth.EthernetType) + + // Verify IPv6 layer + ipv6Layer := packet.Layer(layers.LayerTypeIPv6) + require.NotNil(t, ipv6Layer) + ipv6 := ipv6Layer.(*layers.IPv6) + assert.Equal(t, sourceIP, ipv6.SrcIP) + assert.Equal(t, destinationIP, ipv6.DstIP) + assert.Equal(t, layers.IPProtocolICMPv6, ipv6.NextHeader) + + // Verify ICMPv6 layer + icmpv6Layer := packet.Layer(layers.LayerTypeICMPv6) + require.NotNil(t, icmpv6Layer) + icmpv6 := icmpv6Layer.(*layers.ICMPv6) + assert.Equal(t, uint8(layers.ICMPv6TypeRouterAdvertisement), uint8(icmpv6.TypeCode.Type())) + + // Verify Router Advertisement layer + raLayerParsed := packet.Layer(layers.LayerTypeICMPv6RouterAdvertisement) + require.NotNil(t, raLayerParsed) + raParsed := raLayerParsed.(*layers.ICMPv6RouterAdvertisement) + assert.Equal(t, uint16(65535), raParsed.RouterLifetime) + + // Verify we have the expected options (source address + prefix info) + assert.Len(t, raParsed.Options, 2, "Should have 2 options: source address and prefix info") + + // Check for source address option + foundSourceOpt := false + foundPrefixOpt := false + for _, opt := range raParsed.Options { + if opt.Type == layers.ICMPv6OptSourceAddress { + foundSourceOpt = true + assert.Equal(t, sourceMAC, net.HardwareAddr(opt.Data)) + } + if opt.Type == layers.ICMPv6OptPrefixInfo { + foundPrefixOpt = true + assert.Len(t, opt.Data, 30, "Prefix info data should be 30 bytes") + // Verify prefix length + assert.Equal(t, uint8(64), opt.Data[0]) + // Verify flags (OnLink=1, Autonomous=1) + assert.Equal(t, uint8(0xC0), opt.Data[1]) + } + } + assert.True(t, foundSourceOpt, "Should have source address option") + 
assert.True(t, foundPrefixOpt, "Should have prefix info option") +} + +func TestMultiplePrefixInfosSerialization(t *testing.T) { + sourceMAC := net.HardwareAddr{0x00, 0x11, 0x22, 0x33, 0x44, 0x55} + destinationMAC := net.HardwareAddr{0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff} + sourceIP := net.ParseIP("fe80::211:22ff:fe33:4455") + destinationIP := net.ParseIP("fe80::aabb:ccff:fedd:eeff") + + // Create multiple prefix information entries + _, prefix1, err := net.ParseCIDR("fd99::/64") + require.NoError(t, err) + _, prefix2, err := net.ParseCIDR("2001:db8::/32") + require.NoError(t, err) + _, prefix3, err := net.ParseCIDR("fc00::/7") + require.NoError(t, err) + + prefixInfos := []PrefixInformation{ + { + Prefix: *prefix1, + ValidLifetime: 65535, + PreferredLifetime: 0, + OnLink: true, + Autonomous: true, + }, + { + Prefix: *prefix2, + ValidLifetime: 3600, + PreferredLifetime: 1800, + OnLink: true, + Autonomous: false, + }, + { + Prefix: *prefix3, + ValidLifetime: 7200, + PreferredLifetime: 3600, + OnLink: false, + Autonomous: true, + }, + } + + ra := RouterAdvertisement{ + SourceMAC: sourceMAC, + SourceIP: sourceIP, + DestinationMAC: destinationMAC, + DestinationIP: destinationIP, + Lifetime: 65535, + PrefixInfos: prefixInfos, + } + + serializedData, err := generateRouterAdvertisements(ra) + require.NoError(t, err) + + // Parse and verify + packet := gopacket.NewPacket(serializedData[0], layers.LayerTypeEthernet, gopacket.Default) + raLayerParsed := packet.Layer(layers.LayerTypeICMPv6RouterAdvertisement) + require.NotNil(t, raLayerParsed) + raParsed := raLayerParsed.(*layers.ICMPv6RouterAdvertisement) + + // Should have 1 source address option + 3 prefix info options + assert.Len(t, raParsed.Options, 4, "Should have 4 options: 1 source address + 3 prefix infos") + + prefixOptCount := 0 + for _, opt := range raParsed.Options { + if opt.Type == layers.ICMPv6OptPrefixInfo { + prefixOptCount++ + assert.Len(t, opt.Data, 30, "Each prefix info should be 30 bytes") + } + } + assert.Equal(t, 3, prefixOptCount, "Should have exactly 3 prefix info options") +} diff --git a/go-controller/pkg/util/net.go b/go-controller/pkg/util/net.go index 94f7610cd0..6016a946b5 100644 --- a/go-controller/pkg/util/net.go +++ b/go-controller/pkg/util/net.go @@ -5,6 +5,7 @@ import ( "crypto/sha256" "errors" "fmt" + "math/big" "net" "strconv" "strings" @@ -354,6 +355,22 @@ func IPNetsIPToStringSlice(ips []*net.IPNet) []string { return ipAddrs } +func IPNetsToStringSlice(ipNets []*net.IPNet) []string { + ipNetStrings := make([]string, 0, len(ipNets)) + for _, ipNet := range ipNets { + ipNetStrings = append(ipNetStrings, ipNet.String()) + } + return ipNetStrings +} + +func IPNetsToIPs(ipNets []*net.IPNet) []net.IP { + ips := make([]net.IP, 0, len(ipNets)) + for _, ipNet := range ipNets { + ips = append(ips, ipNet.IP) + } + return ips +} + // CalculateRouteTableID will calculate route table ID based on the network // interface index func CalculateRouteTableID(ifIndex int) int { @@ -431,3 +448,18 @@ func ParseIPList(ipsStr string) ([]net.IP, error) { } return ips, nil } + +// GetLastIPOfSubnet returns the `indexFromLast`th IP address of a given subnet. +// For example, if indexFromLast is 0 and subnet is 10.0.0.0/24, it returns +// 10.0.0.255/24. 
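+// Similarly (hypothetical values), indexFromLast 1 on 10.0.0.0/24 returns
+// 10.0.0.254/24, and indexFromLast 0 on fd00::/64 returns
+// fd00::ffff:ffff:ffff:ffff/64.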
+func GetLastIPOfSubnet(subnet *net.IPNet, indexFromLast int) *net.IPNet { + mask, total := subnet.Mask.Size() + base := big.NewInt(1) + totalIPs := new(big.Int).Lsh(base, uint(total-mask)) + lastIPIndex := totalIPs.Sub(totalIPs, big.NewInt(int64(indexFromLast+1))) + // this is copied from utilnet.AddIPOffset but to allow big.Int offset + r := big.NewInt(0).Add(utilnet.BigForIP(subnet.IP), lastIPIndex).Bytes() + r = append(make([]byte, 16), r...) + lastIP := net.IP(r[len(r)-16:]) + return &net.IPNet{IP: lastIP, Mask: subnet.Mask} +} diff --git a/go-controller/pkg/util/node_annotations.go b/go-controller/pkg/util/node_annotations.go index 5e77a26acf..d8e8a1dbb7 100644 --- a/go-controller/pkg/util/node_annotations.go +++ b/go-controller/pkg/util/node_annotations.go @@ -155,6 +155,9 @@ const ( // }", ovnUDNLayer2NodeGRLRPTunnelIDs = "k8s.ovn.org/udn-layer2-node-gateway-router-lrp-tunnel-ids" + Layer2TopologyVersion = "k8s.ovn.org/layer2-topology-version" + TransitRouterTopoVersion = "2.0" + // ovnNodeEncapIPs is used to indicate encap IPs set on the node OVNNodeEncapIPs = "k8s.ovn.org/node-encap-ips" @@ -510,6 +513,10 @@ func UpdateUDNLayer2NodeGRLRPTunnelIDs(annotations map[string]string, netName st return annotations, nil } +func UDNLayer2NodeUsesTransitRouter(node *corev1.Node) bool { + return node.Annotations[Layer2TopologyVersion] == TransitRouterTopoVersion +} + // PrimaryIfAddrAnnotation represents IPv4 and/or IPv6 addresses stored in node annotations. // It is used for JSON marshalling/unmarshalling of node interface address information, // including primary interface addresses and other node IP configurations. diff --git a/helm/ovn-kubernetes/charts/ovnkube-control-plane/templates/ovnkube-control-plane.yaml b/helm/ovn-kubernetes/charts/ovnkube-control-plane/templates/ovnkube-control-plane.yaml index 48b9fd36f5..da8b42e48f 100644 --- a/helm/ovn-kubernetes/charts/ovnkube-control-plane/templates/ovnkube-control-plane.yaml +++ b/helm/ovn-kubernetes/charts/ovnkube-control-plane/templates/ovnkube-control-plane.yaml @@ -159,10 +159,10 @@ spec: value: {{ hasKey .Values.global "enableMultiExternalGateway" | ternary .Values.global.enableMultiExternalGateway false | quote }} - name: OVN_NETWORK_QOS_ENABLE value: {{ hasKey .Values.global "enableNetworkQos" | ternary .Values.global.enableNetworkQos false | quote }} - - name: OVN_V4_TRANSIT_SWITCH_SUBNET - value: {{ default "" .Values.global.v4TransitSwitchSubnet | quote }} - - name: OVN_V6_TRANSIT_SWITCH_SUBNET - value: {{ default "" .Values.global.v6TransitSwitchSubnet | quote }} + - name: OVN_V4_TRANSIT_SUBNET + value: {{ default "" .Values.global.v4TransitSubnet | quote }} + - name: OVN_V6_TRANSIT_SUBNET + value: {{ default "" .Values.global.v6TransitSubnet | quote }} - name: OVN_ENABLE_PERSISTENT_IPS value: {{ hasKey .Values.global "enablePersistentIPs" | ternary .Values.global.enablePersistentIPs false | quote }} - name: OVN_ENABLE_DNSNAMERESOLVER diff --git a/helm/ovn-kubernetes/values-multi-node-zone.yaml b/helm/ovn-kubernetes/values-multi-node-zone.yaml index 8056461256..65a2e8ed11 100644 --- a/helm/ovn-kubernetes/values-multi-node-zone.yaml +++ b/helm/ovn-kubernetes/values-multi-node-zone.yaml @@ -14,7 +14,7 @@ tags: # -- Endpoint of Kubernetes api server k8sAPIServer: https://172.25.0.2:6443 -# -- IP range for Kubernetes pods, /14 is the top level range, under which each /23 range will be assigned to a node +# -- IP range for Kubernetes pods, /16 is the top level range, under which each /24 range will be assigned to a node podNetwork: 
10.244.0.0/16/24 # -- A comma-separated set of CIDR notation IP ranges from which k8s assigns service cluster IPs. This should be the same as the value provided for kube-apiserver "--service-cluster-ip-range" option serviceNetwork: 10.96.0.0/16 @@ -40,14 +40,14 @@ global: v4JoinSubnet: "100.64.0.0/16" # -- The v4 masquerade subnet used for assigning masquerade IPv4 addresses v4MasqueradeSubnet: "169.254.0.0/17" - # -- The v4 subnet for transit switch - v4TransitSwitchSubnet: "100.88.0.0/16" + # -- The v4 subnet for transit switches and routers + v4TransitSubnet: "100.88.0.0/16" # -- The v6 join subnet used for assigning join switch IPv6 addresses v6JoinSubnet: "fd98::/64" # -- The v6 masquerade subnet used for assigning masquerade IPv6 addresses v6MasqueradeSubnet: "fd69::/112" - # -- The v6 subnet for transit switch - v6TransitSwitchSubnet: "fd97::/64" + # -- The v6 subnet for transit switches and routers + v6TransitSubnet: "fd97::/64" # -- Whether or not enable ovnkube identity webhook enableOvnKubeIdentity: false # -- Indicate if ovnkube run master and node in one process diff --git a/helm/ovn-kubernetes/values-no-ic.yaml b/helm/ovn-kubernetes/values-no-ic.yaml index f643f81133..a02d849cd9 100644 --- a/helm/ovn-kubernetes/values-no-ic.yaml +++ b/helm/ovn-kubernetes/values-no-ic.yaml @@ -12,7 +12,7 @@ tags: # -- Endpoint of Kubernetes api server k8sAPIServer: https://172.25.0.2:6443 -# -- IP range for Kubernetes pods, /14 is the top level range, under which each /23 range will be assigned to a node +# -- IP range for Kubernetes pods, /16 is the top level range, under which each /24 range will be assigned to a node podNetwork: 10.244.0.0/16/24 # -- A comma-separated set of CIDR notation IP ranges from which k8s assigns service cluster IPs. This should be the same as the value provided for kube-apiserver "--service-cluster-ip-range" option serviceNetwork: 10.96.0.0/16 diff --git a/helm/ovn-kubernetes/values-single-node-zone.yaml b/helm/ovn-kubernetes/values-single-node-zone.yaml index 516b77220b..221cf45247 100644 --- a/helm/ovn-kubernetes/values-single-node-zone.yaml +++ b/helm/ovn-kubernetes/values-single-node-zone.yaml @@ -14,7 +14,7 @@ tags: # -- Endpoint of Kubernetes api server k8sAPIServer: https://172.25.0.2:6443 -# -- IP range for Kubernetes pods, /14 is the top level range, under which each /23 range will be assigned to a node +# -- IP range for Kubernetes pods, /16 is the top level range, under which each /24 range will be assigned to a node podNetwork: 10.244.0.0/16/24 # -- A comma-separated set of CIDR notation IP ranges from which k8s assigns service cluster IPs. 
This should be the same as the value provided for kube-apiserver "--service-cluster-ip-range" option serviceNetwork: 10.96.0.0/16 @@ -40,14 +40,14 @@ global: v4JoinSubnet: "100.64.0.0/16" # -- The v4 masquerade subnet used for assigning masquerade IPv4 addresses v4MasqueradeSubnet: "169.254.0.0/17" - # -- The v4 subnet for transit switch - v4TransitSwitchSubnet: "100.88.0.0/16" + # -- The v4 subnet for transit switches and routers + v4TransitSubnet: "100.88.0.0/16" # -- The v6 join subnet used for assigning join switch IPv6 addresses v6JoinSubnet: "fd98::/64" # -- The v6 masquerade subnet used for assigning masquerade IPv6 addresses v6MasqueradeSubnet: "fd69::/112" - # -- The v6 subnet for transit switch - v6TransitSwitchSubnet: "fd97::/64" + # -- The v6 subnet for transit switches and routers + v6TransitSubnet: "fd97::/64" # -- Whether or not enable ovnkube identity webhook enableOvnKubeIdentity: true # -- Indicate if ovnkube run master and node in one process diff --git a/mkdocs.yml b/mkdocs.yml index 45f5a277e0..925f2f96d6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -155,5 +155,7 @@ nav: - Preconfigured UDN Addresses: okeps/okep-5233-preconfigured-udn-addresses.md - BGP: okeps/okep-5296-bgp.md - Layer2TransitRouter: okeps/okep-5094-layer2-transit-router.md + - MCP for Troubleshooting: okeps/okep-5494-ovn-kubernetes-mcp-server.md + - Dynamic UDN Node Allocation: okeps/okep-5552-dynamic-udn-node-allocation.md - Blog: - blog/index.md diff --git a/test/e2e/kubevirt.go b/test/e2e/kubevirt.go index 2dfaf5e2e2..553421b119 100644 --- a/test/e2e/kubevirt.go +++ b/test/e2e/kubevirt.go @@ -1332,13 +1332,13 @@ fi }, 30*time.Second, time.Second).Should(Equal("Accepted")) } - getJoinIPs = func(cudn *udnv1.ClusterUserDefinedNetwork) []string { + getCUDNSubnets = func(cudn *udnv1.ClusterUserDefinedNetwork) []string { nad, err := nadClient.NetworkAttachmentDefinitions(namespace).Get(context.TODO(), cudn.Name, metav1.GetOptions{}) Expect(err).NotTo(HaveOccurred()) var result map[string]interface{} err = json.Unmarshal([]byte(nad.Spec.Config), &result) Expect(err).NotTo(HaveOccurred()) - return strings.Split(result["joinSubnet"].(string), ",") + return strings.Split(result["subnets"].(string), ",") } ) BeforeEach(func() { @@ -1893,10 +1893,7 @@ ip route add %[3]s via %[4]s if isIPv6Supported(fr.ClientSet) && isInterconnectEnabled() { step = by(vmi.Name, fmt.Sprintf("Checking IPv6 gateway before %s %s", td.resource.description, td.test.description)) - nodeRunningVMI, err := fr.ClientSet.CoreV1().Nodes().Get(context.Background(), vmi.Status.NodeName, metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred(), step) - - expectedIPv6GatewayPath, err := kubevirt.GenerateGatewayIPv6RouterLLA(nodeRunningVMI, getJoinIPs(cudn)) + expectedIPv6GatewayPath, err := kubevirt.GenerateGatewayIPv6RouterLLA(getCUDNSubnets(cudn)) Expect(err).NotTo(HaveOccurred()) Eventually(kubevirt.RetrieveIPv6Gateways). WithArguments(virtClient, vmi). 
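The expected gateway values in these assertions are now derived purely from the CUDN subnets instead of per-node join IPs. A minimal sketch of that derivation (subnets are hypothetical; the helpers are the ones added in test/e2e/kubevirt/net.go further below, and config.IPv4Mode/IPv6Mode are assumed to be set):

	// The gateway IP is the .1 (NextIP) of each layer2 subnet.
	gwIPs, err := kubevirt.GetLayer2UDNDefaultGWIPs([]string{"10.100.0.0/16", "fd10::/64"})
	Expect(err).NotTo(HaveOccurred())
	// The gateway MAC is derived deterministically from the first gateway IP...
	mac := util.IPAddrToHWAddr(*gwIPs[0])
	// ...and the IPv6 gateway appears as that MAC's EUI-64 link-local address.
	lla := util.HWAddrToIPv6LLA(mac)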
diff --git a/test/e2e/kubevirt/net.go b/test/e2e/kubevirt/net.go
index c89f146f91..f28275a25d 100644
--- a/test/e2e/kubevirt/net.go
+++ b/test/e2e/kubevirt/net.go
@@ -9,8 +9,6 @@ import (
 
 	iputils "github.com/containernetworking/plugins/pkg/ip"
 
-	corev1 "k8s.io/api/core/v1"
-
 	kubevirtv1 "kubevirt.io/api/core/v1"
 	v1 "kubevirt.io/api/core/v1"
@@ -67,61 +65,51 @@ func RetrieveIPv6Gateways(cli *Client, vmi *v1.VirtualMachineInstance) ([]string
 	return paths, nil
 }
 
-func GenerateGatewayMAC(node *corev1.Node, joinSubnets []string) (string, error) {
+func GenerateGatewayMAC(subnets []string) (string, error) {
 	config.IPv4Mode = true
-	lrpJoinAddress, err := GetDefaultUDNGWRouterIPs(node, joinSubnets)
+	defaultGWIPs, err := GetLayer2UDNDefaultGWIPs(subnets)
 	if err != nil {
 		return "", err
 	}
-	if len(lrpJoinAddress) == 0 {
-		return "", fmt.Errorf("missing lrp join ip at node %q", node.Name)
+	if len(defaultGWIPs) == 0 {
+		return "", fmt.Errorf("can't find default GW IP for subnets %v", subnets)
 	}
-	return util.IPAddrToHWAddr(*lrpJoinAddress[0]).String(), nil
+	return util.IPAddrToHWAddr(*defaultGWIPs[0]).String(), nil
 }
 
-func GenerateGatewayIPv6RouterLLA(node *corev1.Node, joinSubnets []string) (string, error) {
+func GenerateGatewayIPv6RouterLLA(subnets []string) (string, error) {
 	config.IPv4Mode = true
-	joinAddresses, err := GetDefaultUDNGWRouterIPs(node, joinSubnets)
+	defaultGWIPs, err := GetLayer2UDNDefaultGWIPs(subnets)
 	if err != nil {
 		return "", err
 	}
-	if len(joinAddresses) == 0 {
-		return "", fmt.Errorf("missing join addresses at node %q", node.Name)
+	if len(defaultGWIPs) == 0 {
+		return "", fmt.Errorf("can't find default GW IP for subnets %v", subnets)
 	}
-	return util.HWAddrToIPv6LLA(util.IPAddrToHWAddr(*joinAddresses[0])).String(), nil
+	return util.HWAddrToIPv6LLA(util.IPAddrToHWAddr(*defaultGWIPs[0])).String(), nil
 }
 
-func GetDefaultUDNGWRouterIPs(node *corev1.Node, joinSubnets []string) ([]*net.IP, error) {
-	nodeID, err := util.GetNodeID(node)
-	if err != nil {
-		// Don't consider this node as cluster-manager has not allocated node id yet.
-		return nil, err
-	}
+// GetLayer2UDNDefaultGWIPs returns the default gateway IPs (.1) for a Layer2 UDN subnet
+func GetLayer2UDNDefaultGWIPs(subnets []string) ([]*net.IP, error) {
 	var udnJoinNetv4, udnJoinNetv6 net.IP
-	for _, subnet := range joinSubnets {
+	for _, subnet := range subnets {
 		ip, _, err := net.ParseCIDR(subnet)
 		if err != nil {
 			return nil, fmt.Errorf("failed to parse CIDR %q: %v", subnet, err)
 		}
 		if ip.To4() != nil {
-			udnJoinNetv4 = ip
+			udnJoinNetv4 = iputils.NextIP(ip)
 		} else {
-			udnJoinNetv6 = ip
+			udnJoinNetv6 = iputils.NextIP(ip)
 		}
 	}
 	res := []*net.IP{}
 	if config.IPv4Mode {
-		for range nodeID {
-			udnJoinNetv4 = iputils.NextIP(udnJoinNetv4)
-		}
 		res = append(res, &udnJoinNetv4)
 	}
 	if config.IPv6Mode {
-		for range nodeID {
-			udnJoinNetv6 = iputils.NextIP(udnJoinNetv6)
-		}
 		res = append(res, &udnJoinNetv6)
 	}
 	return res, nil
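The rewritten helper drops the per-node offset entirely: for a Layer2 UDN the default gateway is simply the first usable address of the network subnet, iputils.NextIP applied once to the network address. A standalone sketch of that rule (nextIP below is a standard-library stand-in for iputils.NextIP, so the example runs without extra dependencies):

    package main

    import (
        "fmt"
        "math/big"
        "net"
    )

    // nextIP returns ip + 1, for both IPv4 and IPv6.
    func nextIP(ip net.IP) net.IP {
        i := new(big.Int).SetBytes(ip)
        i.Add(i, big.NewInt(1))
        next := i.Bytes()
        out := make(net.IP, len(ip))
        copy(out[len(out)-len(next):], next)
        return out
    }

    func main() {
        for _, subnet := range []string{"10.100.200.0/24", "2014:100:200::/60"} {
            ip, _, err := net.ParseCIDR(subnet)
            if err != nil {
                panic(err)
            }
            // Network address + 1 is the Layer2 UDN default gateway.
            fmt.Printf("subnet %s -> gateway %s\n", subnet, nextIP(ip))
        }
    }

So 10.100.200.0/24 yields 10.100.200.1 and 2014:100:200::/60 yields 2014:100:200::1 on every node, which is what lets the kubevirt tests above stop fetching the node object and its allocated node ID.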
diff --git a/test/e2e/route_advertisements.go b/test/e2e/route_advertisements.go
index 95dfc5e7c3..e5ac64155b 100644
--- a/test/e2e/route_advertisements.go
+++ b/test/e2e/route_advertisements.go
@@ -1109,7 +1109,7 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks"
 		}),
 		ginkgo.Entry("UDN pod to the same node nodeport service in different UDN network should not work",
 			// FIXME: This test should work: https://github.com/ovn-kubernetes/ovn-kubernetes/issues/5419
-			// This traffic flow is expected to work eventually but doesn't work today on Layer3 (v4 and v6) and Layer2 (v4 only) networks.
+			// This traffic flow is expected to work eventually but doesn't work today on Layer3 (v4 and v6) and Layer2 (v4 and v6) networks.
 			// Reason it doesn't work today is because UDN networks don't have MAC bindings for masqueradeIPs of other networks.
 			// Traffic flow: UDN pod in network A -> samenode nodeIP:nodePort service of networkB
 			// UDN pod in networkA -> ovn-switch -> ovn-cluster-router (SNAT to masqueradeIP of networkA) -> mpX interface ->
@@ -1118,8 +1118,6 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks"
 			// On the GR we DNAT to backend pod and SNAT to joinIP.
 			// Reply: Pod replies and now OVN in networkB tries to ARP for the masqueradeIP of networkA which is the source and simply
 			// fails as it doesn't know how to reach this masqueradeIP.
-			// There is also inconsistency in behaviour within Layer2 networks for how IPv4 works and how IPv6 works where the traffic
-			// works on ipv6 because of the flows described below.
 			func(ipFamily utilnet.IPFamily) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) {
 				clientPod := podsNetA[0]
 				node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), nodes.Items[0].Name, metav1.GetOptions{})
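For context on the MAC-binding discussion above and the gateway checks in the kubevirt tests: the expected values come from two derivations, util.IPAddrToHWAddr mapping the gateway IP to a MAC (for IPv4 this is commonly described as a fixed 0a:58 prefix followed by the four address bytes; treat the exact scheme as an assumption here), and util.HWAddrToIPv6LLA expanding that MAC into an fe80::/64 address via standard EUI-64. A standard-library-only sketch of both steps:

    package main

    import (
        "fmt"
        "net"
    )

    // ipv4ToMAC returns 0a:58 followed by the four address bytes
    // (assumed scheme, mirroring util.IPAddrToHWAddr for IPv4).
    func ipv4ToMAC(ip net.IP) net.HardwareAddr {
        v4 := ip.To4()
        return net.HardwareAddr{0x0a, 0x58, v4[0], v4[1], v4[2], v4[3]}
    }

    // macToLLA builds fe80::/64 + EUI-64: flip the universal/local bit
    // of the first MAC byte and splice ff:fe into the middle.
    func macToLLA(mac net.HardwareAddr) net.IP {
        lla := make(net.IP, net.IPv6len)
        lla[0] = 0xfe
        lla[1] = 0x80
        lla[8] = mac[0] ^ 0x02
        lla[9] = mac[1]
        lla[10] = mac[2]
        lla[11] = 0xff
        lla[12] = 0xfe
        lla[13] = mac[3]
        lla[14] = mac[4]
        lla[15] = mac[5]
        return lla
    }

    func main() {
        gw := net.ParseIP("10.100.200.1")
        mac := ipv4ToMAC(gw)            // 0a:58:0a:64:c8:01
        fmt.Println(mac, macToLLA(mac)) // fe80::858:aff:fe64:c801
    }

Deterministic derivation only fixes what the gateway advertises; whether a peer can use the MAC still depends on neighbor resolution, which is exactly the masquerade-IP MAC-binding gap described in the comments above.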
@@ -1130,23 +1128,8 @@ var _ = ginkgo.DescribeTableSubtree("BGP: isolation between advertised networks"
 					nodeIP = nodeIPv6
 				}
 				nodePort := svcNetB.Spec.Ports[0].NodePort
-				out := curlConnectionTimeoutCode
-				errBool := true
-				if ipFamily == utilnet.IPv6 && cudnATemplate.Spec.Network.Topology == udnv1.NetworkTopologyLayer2 {
-					// For Layer2 networks, we have these flows we add on breth0:
-					// cookie=0xdeff105, duration=173.245s, table=1, n_packets=0, n_bytes=0, idle_age=173, priority=14,icmp6,icmp_type=134 actions=FLOOD
-					// cookie=0xdeff105, duration=173.245s, table=1, n_packets=8, n_bytes=640, idle_age=4, priority=14,icmp6,icmp_type=136 actions=FLOOD
-					// which floods the Router Advertisement (RA, type 134) and Neighbor Advertisement (NA, type 136)
-					// Given on Layer2 the GR has the SNATs for both masqueradeIPs this works perfectly well and
-					// the networks are able to NDP for the masqueradeIPs for the other networks.
-					// This doesn't work on Layer3 networks since masqueradeIP SNATs are present on the ovn-cluster-router in that case.
-					// See the tcpdump on the issue: https://github.com/ovn-kubernetes/ovn-kubernetes/issues/5410 for more details.
-					out = ""
-					errBool = false
-				}
-				// sourceIP will be joinSubnetIP for nodeports, so only using hostname endpoint
-				return clientPod.Name, clientPod.Namespace, net.JoinHostPort(nodeIP, fmt.Sprint(nodePort)) + "/hostname", out, errBool
+				return clientPod.Name, clientPod.Namespace, net.JoinHostPort(nodeIP, fmt.Sprint(nodePort)) + "/hostname", curlConnectionTimeoutCode, true
 			}),
 		ginkgo.Entry("UDN pod to a different node nodeport service in different UDN network should work",
 			func(ipFamily utilnet.IPFamily) (clientName string, clientNamespace string, dst string, expectedOutput string, expectErr bool) {