@@ -19,14 +19,7 @@ typedef struct pmtud_pkt {
1919 struct icmphdr icmp ;
2020}pmtud_pkt_t ;
2121
22- typedef struct __attribute__((packed )) pmtud_flow_hash {
23- __be32 saddr ;
24- __be32 daddr ;
25- __u8 proto ;
26- __be16 sport ;
27- __be16 dport ;
28- __u8 zero [3 ];
29- }pmtud_flow_hash_t ;
22+ typedef pkt_hdrs_t pmtud_flow_hash_t ;
3023
3124COMPILATION_ASSERT (sizeof (pmtud_flow_hash_t ) == 16 ,
3225 "Size of pmtud_flow_hash_t must be 16!" );
@@ -256,10 +249,11 @@ static __always_inline
256249int pmtud_proc_icmp (struct __sk_buff * skb , struct iphdr * ip ){
257250 int rc ;
258251 const sfunnel_ip4_rule_t * rule = NULL ;
252+ pmtud_flow_state_t * state = NULL ;
259253 struct udphdr * udp ;
260254 struct icmphdr * icmp ;
261255 struct iphdr * inner_ip ;
262- pkt_hdrs_t hdrs ;
256+ pkt_hdrs_t hdrs = { 0 } ;
263257 __u8 fhdr_size ;
264258
265259 icmp = (struct icmphdr * ) ((__u8 * )ip + (ip -> ihl * 4 ));
@@ -278,11 +272,15 @@ int pmtud_proc_icmp(struct __sk_buff* skb, struct iphdr* ip){
278272 hdrs .daddr = inner_ip -> daddr ;
279273 hdrs .proto = inner_ip -> protocol ;
280274
281- //Note RFC 792 only ensures the first 8 bytes of the original L4 hdr
275+ //Note: RFC 792 only ensures the first 8 bytes of the original L4 hdr
282276 //This got updated with RFC 4884 and 1812; in practice most systems
283- //will send _up_ to 64 bytes, which includes the inner L4 hdr, which
284- //allows us not to have to do "connection/flow tracking".
285- udp = (struct udphdr * ) ((__u8 * )inner_ip + (inner_ip -> ihl * 4 ));
277+ //will send at least 64 bytes, which includes the inner L4 hdr. This
278+ //allows us to look in the inner L4 hdr instead of having to do
279+ //flow tracking.
280+ //
281+ //Note2: we are only interested in the src/dst ports of the L4 hdr.
282+ //struct udphdr is used as TCP and UDP carry them at the same offset.
283+ udp = (struct udphdr * )((__u8 * )inner_ip + (inner_ip -> ihl * 4 ));
286284 CHECK_SKB_PTR (skb , ((__u8 * )udp ) + 8 );
287285
288286 hdrs .sport = udp -> source ;
@@ -293,6 +291,7 @@ int pmtud_proc_icmp(struct __sk_buff* skb, struct iphdr* ip){
293291 if (!rule || !rule -> actions .unfunnel .execute )
294292 return TC_ACT_UNSPEC ;
295293
294+
296295 if (rule -> actions .unfunnel .p .unfunnel .proto == IPPROTO_UDP ){
297296 fhdr_size = sizeof (struct udphdr );
298297 }else if (rule -> actions .unfunnel .p .unfunnel .proto == IPPROTO_TCP ){
@@ -301,6 +300,38 @@ int pmtud_proc_icmp(struct __sk_buff* skb, struct iphdr* ip){
301300 return TC_ACT_SHOT ;
302301 }
303302
303+ //Recover the max network MTU
304+ __u16 net_mtu = bpf_ntohs (icmp -> un .frag .mtu );
305+ __s64 icmp_diff = 0 ;
306+
307+ //Check whether we have to adjust the PMTUD map and adapt net_mtu
308+ //Note: if not present in the map, the end host effective MTU will be
309+ //lowered. We will further lower it once the first packet exceeding
310+ //net_mtu + fhdr_size is intercepted, so no need to do anything here.
311+ state = bpf_map_lookup_elem (& pmtud_map , & hdrs );
312+ if (state ){
313+ if (net_mtu < state -> last_seen_net_mtu ){
314+ state -> last_seen_net_mtu = net_mtu ;
315+ state -> adjusted_mtu = net_mtu - fhdr_size ;
316+
317+ rc = bpf_map_update_elem (& pmtud_map , & hdrs , & state ,
318+ BPF_ANY );
319+ if (rc < 0 ){
320+ PRINTK ("[%d:0x%p][pmtud][net] Unable to create flow state rc=%d" ,
321+ skb -> ifindex , skb , rc );
322+ }
323+ }
324+
325+ __be32 old_mtu = * (__be32 * )& icmp -> un .frag ;
326+
327+ //Adjust ICMP network MTU (-fhdr_size)
328+ icmp -> un .frag .mtu = bpf_htons (state -> adjusted_mtu );
329+
330+ //Adjust ICMP checksum
331+ icmp_diff = bpf_csum_diff (& old_mtu , 4 , (__be32 * )& icmp -> un .frag ,
332+ 4 , 0 );
333+ }
334+
304335 CHECK_SKB_PTR (skb , ((__u8 * )udp ) + fhdr_size + 8 );
305336
306337 //Now unfunnel
@@ -309,6 +340,9 @@ int pmtud_proc_icmp(struct __sk_buff* skb, struct iphdr* ip){
309340 union ttl_proto old_ttl = * (union ttl_proto * )& inner_ip -> ttl ;
310341 __s64 diff = bpf_csum_diff ((__be32 * )& old_ttl , 4 ,
311342 (__be32 * )& inner_ip -> ttl , 4 , 0 );
343+ icmp_diff = bpf_csum_diff ((__be32 * )& old_ttl , 4 ,
344+ (__be32 * )& inner_ip -> ttl , 4 ,
345+ icmp_diff );
312346
313347 __u32 l3_off = (__u8 * )inner_ip - (__u8 * )SKB_GET_ETH (skb );
314348 l3_off += offsetof(struct iphdr , check );
@@ -320,8 +354,23 @@ int pmtud_proc_icmp(struct __sk_buff* skb, struct iphdr* ip){
320354 }
321355 }
322356
323- //Now set ports from inner L4 (+fhdr_size). We are lazy here.
357+ //Now set ports from inner L4, which we recovered from the funneled
358+ //L4 hdr (+fhdr_size)
359+ __be32 old_ports = * (__be32 * )udp ;
324360 * (__be32 * )udp = * (__be32 * )(((__u8 * )udp ) + fhdr_size );
361+ icmp_diff = bpf_csum_diff ((__be32 * )& old_ports , 4 ,
362+ (__be32 * )udp , 4 ,
363+ icmp_diff );
364+
365+ __u32 l4_off = (__u8 * )udp - (__u8 * )SKB_GET_ETH (skb );
366+ l4_off += offsetof(struct icmphdr , checksum );
367+ rc = bpf_l4_csum_replace (skb , l4_off , 0 , icmp_diff , 0 );
368+ if (rc < 0 ){
369+ PRINTK ("[%d:0x%p][pmtud][net] Unable to set L4 csum. rc=%d" ,
370+ skb -> ifindex , skb ,
371+ rc );
372+ return TC_ACT_SHOT ;
373+ }
325374
326375 return TC_ACT_OK ;
327376}
0 commit comments