@@ -269,11 +269,11 @@ func tcpChecksumValid(pkt []byte, iphLen uint8, isV6 bool) bool {
269
269
// coalesceResult reports the outcome of an attempt to coalesce a packet with
// an item that is already tracked for GRO.
type coalesceResult int

const (
	// coalesceInsufficientCap: presumably the target buffer lacks capacity
	// for the additional payload (inferred from the name; confirm against
	// coalesceTCPPackets).
	coalesceInsufficientCap coalesceResult = iota
	// coalescePSHEnding: presumably the tracked item already ends with a
	// PSH-flagged segment and cannot be extended (inferred from the name;
	// confirm against coalesceTCPPackets).
	coalescePSHEnding
	// coalesceItemInvalidCSum means the tracked item has an invalid checksum;
	// tcpGRO deletes such an item from its table.
	coalesceItemInvalidCSum
	// coalescePktInvalidCSum means the packet under evaluation has an invalid
	// checksum; tcpGRO neither coalesces it nor inserts it into its table.
	coalescePktInvalidCSum
	// coalesceSuccess means the packet was merged into the tracked item.
	coalesceSuccess
)
278
278
279
279
// coalesceTCPPackets attempts to coalesce pkt with the packet described by
@@ -339,42 +339,6 @@ func coalesceTCPPackets(mode canCoalesce, pkt []byte, pktBuffsIndex int, gsoSize
339
339
if gsoSize > item .gsoSize {
340
340
item .gsoSize = gsoSize
341
341
}
342
- hdr := virtioNetHdr {
343
- flags : unix .VIRTIO_NET_HDR_F_NEEDS_CSUM , // this turns into CHECKSUM_PARTIAL in the skb
344
- hdrLen : uint16 (headersLen ),
345
- gsoSize : uint16 (item .gsoSize ),
346
- csumStart : uint16 (item .iphLen ),
347
- csumOffset : 16 ,
348
- }
349
-
350
- // Recalculate the total len (IPv4) or payload len (IPv6). Recalculate the
351
- // (IPv4) header checksum.
352
- if isV6 {
353
- hdr .gsoType = unix .VIRTIO_NET_HDR_GSO_TCPV6
354
- binary .BigEndian .PutUint16 (pktHead [4 :], uint16 (coalescedLen )- uint16 (item .iphLen )) // set new payload len
355
- } else {
356
- hdr .gsoType = unix .VIRTIO_NET_HDR_GSO_TCPV4
357
- pktHead [10 ], pktHead [11 ] = 0 , 0 // clear checksum field
358
- binary .BigEndian .PutUint16 (pktHead [2 :], uint16 (coalescedLen )) // set new total length
359
- iphCSum := ^ checksum (pktHead [:item .iphLen ], 0 ) // compute checksum
360
- binary .BigEndian .PutUint16 (pktHead [10 :], iphCSum ) // set checksum field
361
- }
362
- hdr .encode (bufs [item .bufsIndex ][bufsOffset - virtioNetHdrLen :])
363
-
364
- // Calculate the pseudo header checksum and place it at the TCP checksum
365
- // offset. Downstream checksum offloading will combine this with computation
366
- // of the tcp header and payload checksum.
367
- addrLen := 4
368
- addrOffset := ipv4SrcAddrOffset
369
- if isV6 {
370
- addrLen = 16
371
- addrOffset = ipv6SrcAddrOffset
372
- }
373
- srcAddrAt := bufsOffset + addrOffset
374
- srcAddr := bufs [item .bufsIndex ][srcAddrAt : srcAddrAt + addrLen ]
375
- dstAddr := bufs [item .bufsIndex ][srcAddrAt + addrLen : srcAddrAt + addrLen * 2 ]
376
- psum := pseudoHeaderChecksumNoFold (unix .IPPROTO_TCP , srcAddr , dstAddr , uint16 (coalescedLen - int (item .iphLen )))
377
- binary .BigEndian .PutUint16 (pktHead [hdr .csumStart + hdr .csumOffset :], checksum ([]byte {}, psum ))
378
342
379
343
item .numMerged ++
380
344
return coalesceSuccess
@@ -390,58 +354,67 @@ const (
390
354
maxUint16 = 1 << 16 - 1
391
355
)
392
356
357
// tcpGROResult tells the caller of tcpGRO what happened to the packet that
// was evaluated.
type tcpGROResult int

const (
	// tcpGROResultNoop means no action was taken for the packet. handleGRO
	// writes an empty virtio header for it and queues it for writing.
	tcpGROResultNoop tcpGROResult = iota
	// tcpGROResultTableInsert means the packet was inserted into the GRO
	// table. handleGRO queues it for writing; its virtio header is written
	// later by applyCoalesceAccounting.
	tcpGROResultTableInsert
	// tcpGROResultCoalesced means the packet was coalesced with another
	// packet in the table, so handleGRO does not queue it on its own.
	tcpGROResultCoalesced
)
364
+
393
365
// tcpGRO evaluates the TCP packet at pktI in bufs for coalescing with
394
- // existing packets tracked in table. It will return false when pktI is not
395
- // coalesced, otherwise true. This indicates to the caller if bufs[pktI]
396
- // should be written to the Device.
397
- func tcpGRO (bufs [][]byte , offset int , pktI int , table * tcpGROTable , isV6 bool ) (pktCoalesced bool ) {
366
+ // existing packets tracked in table. It returns a tcpGROResultNoop when no
367
+ // action was taken, tcpGROResultTableInsert when the evaluated packet was
368
+ // inserted into table, and tcpGROResultCoalesced when the evaluated packet was
369
+ // coalesced with another packet in table.
370
+ func tcpGRO (bufs [][]byte , offset int , pktI int , table * tcpGROTable , isV6 bool ) tcpGROResult {
398
371
pkt := bufs [pktI ][offset :]
399
372
if len (pkt ) > maxUint16 {
400
373
// A valid IPv4 or IPv6 packet will never exceed this.
401
- return false
374
+ return tcpGROResultNoop
402
375
}
403
376
iphLen := int ((pkt [0 ] & 0x0F ) * 4 )
404
377
if isV6 {
405
378
iphLen = 40
406
379
ipv6HPayloadLen := int (binary .BigEndian .Uint16 (pkt [4 :]))
407
380
if ipv6HPayloadLen != len (pkt )- iphLen {
408
- return false
381
+ return tcpGROResultNoop
409
382
}
410
383
} else {
411
384
totalLen := int (binary .BigEndian .Uint16 (pkt [2 :]))
412
385
if totalLen != len (pkt ) {
413
- return false
386
+ return tcpGROResultNoop
414
387
}
415
388
}
416
389
if len (pkt ) < iphLen {
417
- return false
390
+ return tcpGROResultNoop
418
391
}
419
392
tcphLen := int ((pkt [iphLen + 12 ] >> 4 ) * 4 )
420
393
if tcphLen < 20 || tcphLen > 60 {
421
- return false
394
+ return tcpGROResultNoop
422
395
}
423
396
if len (pkt ) < iphLen + tcphLen {
424
- return false
397
+ return tcpGROResultNoop
425
398
}
426
399
if ! isV6 {
427
400
if pkt [6 ]& ipv4FlagMoreFragments != 0 || pkt [6 ]<< 3 != 0 || pkt [7 ] != 0 {
428
401
// no GRO support for fragmented segments for now
429
- return false
402
+ return tcpGROResultNoop
430
403
}
431
404
}
432
405
tcpFlags := pkt [iphLen + tcpFlagsOffset ]
433
406
var pshSet bool
434
407
// not a candidate if any non-ACK flags (except PSH+ACK) are set
435
408
if tcpFlags != tcpFlagACK {
436
409
if pkt [iphLen + tcpFlagsOffset ] != tcpFlagACK | tcpFlagPSH {
437
- return false
410
+ return tcpGROResultNoop
438
411
}
439
412
pshSet = true
440
413
}
441
414
gsoSize := uint16 (len (pkt ) - tcphLen - iphLen )
442
415
// not a candidate if payload len is 0
443
416
if gsoSize < 1 {
444
- return false
417
+ return tcpGROResultNoop
445
418
}
446
419
seq := binary .BigEndian .Uint32 (pkt [iphLen + 4 :])
447
420
srcAddrOffset := ipv4SrcAddrOffset
@@ -452,7 +425,7 @@ func tcpGRO(bufs [][]byte, offset int, pktI int, table *tcpGROTable, isV6 bool)
452
425
}
453
426
items , existing := table .lookupOrInsert (pkt , srcAddrOffset , srcAddrOffset + addrLen , iphLen , tcphLen , pktI )
454
427
if ! existing {
455
- return false
428
+ return tcpGROResultNoop
456
429
}
457
430
for i := len (items ) - 1 ; i >= 0 ; i -- {
458
431
// In the best case of packets arriving in order iterating in reverse is
@@ -470,20 +443,20 @@ func tcpGRO(bufs [][]byte, offset int, pktI int, table *tcpGROTable, isV6 bool)
470
443
switch result {
471
444
case coalesceSuccess :
472
445
table .updateAt (item , i )
473
- return true
446
+ return tcpGROResultCoalesced
474
447
case coalesceItemInvalidCSum :
475
448
// delete the item with an invalid csum
476
449
table .deleteAt (item .key , i )
477
450
case coalescePktInvalidCSum :
478
451
// no point in inserting an item that we can't coalesce
479
- return false
452
+ return tcpGROResultNoop
480
453
default :
481
454
}
482
455
}
483
456
}
484
457
// failed to coalesce with any other packets; store the item in the flow
485
458
table .insert (pkt , srcAddrOffset , srcAddrOffset + addrLen , iphLen , tcphLen , pktI )
486
- return false
459
+ return tcpGROResultTableInsert
487
460
}
488
461
489
462
func isTCP4NoIPOptions (b []byte ) bool {
@@ -515,6 +488,64 @@ func isTCP6NoEH(b []byte) bool {
515
488
return true
516
489
}
517
490
491
+ // applyCoalesceAccounting updates bufs to account for coalescing based on the
492
+ // metadata found in table.
493
+ func applyCoalesceAccounting (bufs [][]byte , offset int , table * tcpGROTable , isV6 bool ) error {
494
+ for _ , items := range table .itemsByFlow {
495
+ for _ , item := range items {
496
+ if item .numMerged > 0 {
497
+ hdr := virtioNetHdr {
498
+ flags : unix .VIRTIO_NET_HDR_F_NEEDS_CSUM , // this turns into CHECKSUM_PARTIAL in the skb
499
+ hdrLen : uint16 (item .iphLen + item .tcphLen ),
500
+ gsoSize : item .gsoSize ,
501
+ csumStart : uint16 (item .iphLen ),
502
+ csumOffset : 16 ,
503
+ }
504
+ pkt := bufs [item .bufsIndex ][offset :]
505
+
506
+ // Recalculate the total len (IPv4) or payload len (IPv6).
507
+ // Recalculate the (IPv4) header checksum.
508
+ if isV6 {
509
+ hdr .gsoType = unix .VIRTIO_NET_HDR_GSO_TCPV6
510
+ binary .BigEndian .PutUint16 (pkt [4 :], uint16 (len (pkt ))- uint16 (item .iphLen )) // set new IPv6 header payload len
511
+ } else {
512
+ hdr .gsoType = unix .VIRTIO_NET_HDR_GSO_TCPV4
513
+ pkt [10 ], pkt [11 ] = 0 , 0
514
+ binary .BigEndian .PutUint16 (pkt [2 :], uint16 (len (pkt ))) // set new total length
515
+ iphCSum := ^ checksum (pkt [:item .iphLen ], 0 ) // compute IPv4 header checksum
516
+ binary .BigEndian .PutUint16 (pkt [10 :], iphCSum ) // set IPv4 header checksum field
517
+ }
518
+ err := hdr .encode (bufs [item .bufsIndex ][offset - virtioNetHdrLen :])
519
+ if err != nil {
520
+ return err
521
+ }
522
+
523
+ // Calculate the pseudo header checksum and place it at the TCP
524
+ // checksum offset. Downstream checksum offloading will combine
525
+ // this with computation of the tcp header and payload checksum.
526
+ addrLen := 4
527
+ addrOffset := ipv4SrcAddrOffset
528
+ if isV6 {
529
+ addrLen = 16
530
+ addrOffset = ipv6SrcAddrOffset
531
+ }
532
+ srcAddrAt := offset + addrOffset
533
+ srcAddr := bufs [item .bufsIndex ][srcAddrAt : srcAddrAt + addrLen ]
534
+ dstAddr := bufs [item .bufsIndex ][srcAddrAt + addrLen : srcAddrAt + addrLen * 2 ]
535
+ psum := pseudoHeaderChecksumNoFold (unix .IPPROTO_TCP , srcAddr , dstAddr , uint16 (len (pkt )- int (item .iphLen )))
536
+ binary .BigEndian .PutUint16 (pkt [hdr .csumStart + hdr .csumOffset :], checksum ([]byte {}, psum ))
537
+ } else {
538
+ hdr := virtioNetHdr {}
539
+ err := hdr .encode (bufs [item .bufsIndex ][offset - virtioNetHdrLen :])
540
+ if err != nil {
541
+ return err
542
+ }
543
+ }
544
+ }
545
+ }
546
+ return nil
547
+ }
548
+
518
549
// handleGRO evaluates bufs for GRO, and writes the indices of the resulting
519
550
// packets into toWrite. toWrite, tcp4Table, and tcp6Table should initially be
520
551
// empty (but non-nil), and are passed in to save allocs as the caller may reset
@@ -524,23 +555,28 @@ func handleGRO(bufs [][]byte, offset int, tcp4Table, tcp6Table *tcpGROTable, toW
524
555
if offset < virtioNetHdrLen || offset > len (bufs [i ])- 1 {
525
556
return errors .New ("invalid offset" )
526
557
}
527
- var coalesced bool
558
+ var result tcpGROResult
528
559
switch {
529
560
case isTCP4NoIPOptions (bufs [i ][offset :]): // ipv4 packets w/IP options do not coalesce
530
- coalesced = tcpGRO (bufs , offset , i , tcp4Table , false )
561
+ result = tcpGRO (bufs , offset , i , tcp4Table , false )
531
562
case isTCP6NoEH (bufs [i ][offset :]): // ipv6 packets w/extension headers do not coalesce
532
- coalesced = tcpGRO (bufs , offset , i , tcp6Table , true )
563
+ result = tcpGRO (bufs , offset , i , tcp6Table , true )
533
564
}
534
- if ! coalesced {
565
+ switch result {
566
+ case tcpGROResultNoop :
535
567
hdr := virtioNetHdr {}
536
568
err := hdr .encode (bufs [i ][offset - virtioNetHdrLen :])
537
569
if err != nil {
538
570
return err
539
571
}
572
+ fallthrough
573
+ case tcpGROResultTableInsert :
540
574
* toWrite = append (* toWrite , i )
541
575
}
542
576
}
543
- return nil
577
+ err4 := applyCoalesceAccounting (bufs , offset , tcp4Table , false )
578
+ err6 := applyCoalesceAccounting (bufs , offset , tcp6Table , true )
579
+ return errors .Join (err4 , err6 )
544
580
}
545
581
546
582
// tcpTSO splits packets from in into outBuffs, writing the size of each
0 commit comments