
Commit be30f56

Merge branch 'net-af_packet-optimize-retire-operation'
Xin Zhao says:

====================
net: af_packet: optimize retire operation

In a system with high real-time requirements, the timeout mechanism of ordinary timers, with its jiffies granularity, is insufficient to meet the demands for real-time performance. At the same time, the CPU-usage optimization in af_packet is quite significant. Use an hrtimer instead of an ordinary timer to help compensate for the shortcomings in real-time performance.

On an HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time enough, with fluctuations reaching over 8 ms (on a system with HZ=250). This is unacceptable in high real-time systems that require timely processing of network packets. With the hrtimer in place and a timeout of 2 ms, the update of TP_STATUS_USER can be stabilized to within 3 ms.
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
2 parents e663ad6 + f7460d2 commit be30f56
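
For readers new to hrtimers, the sketch below is a minimal, self-contained illustration (a hypothetical demo module, not the af_packet code) of the periodic soft-hrtimer pattern the series adopts: hrtimer_setup() and hrtimer_start() arm the timer, the expiry callback re-arms it with hrtimer_forward_now() and returns HRTIMER_RESTART, and hrtimer_cancel() tears it down. The demo_* names and the 2 ms interval are illustrative only.

/* Hypothetical demo module illustrating the periodic soft-hrtimer pattern;
 * none of these symbols come from the patch itself.
 */
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/module.h>

static struct hrtimer demo_timer;
static ktime_t demo_interval;

static enum hrtimer_restart demo_expired(struct hrtimer *t)
{
        /* Do the periodic work here, then re-arm relative to "now". */
        hrtimer_forward_now(t, demo_interval);
        return HRTIMER_RESTART;
}

static int __init demo_init(void)
{
        demo_interval = ms_to_ktime(2);         /* e.g. a 2 ms period */
        hrtimer_setup(&demo_timer, demo_expired, CLOCK_MONOTONIC,
                      HRTIMER_MODE_REL_SOFT);
        hrtimer_start(&demo_timer, demo_interval, HRTIMER_MODE_REL_SOFT);
        return 0;
}

static void __exit demo_exit(void)
{
        hrtimer_cancel(&demo_timer);            /* waits for a running callback */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");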

File tree: 3 files changed (+44, -104 lines)
net/packet/af_packet.c

Lines changed: 39 additions & 93 deletions
@@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
 static int prb_queue_frozen(struct tpacket_kbdq_core *);
 static void prb_open_block(struct tpacket_kbdq_core *,
                struct tpacket_block_desc *);
-static void prb_retire_rx_blk_timer_expired(struct timer_list *);
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
                struct tpacket3_hdr *);
@@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
        return proto;
 }
 
-static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
-{
-       timer_delete_sync(&pkc->retire_blk_timer);
-}
-
 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
                struct sk_buff_head *rb_queue)
 {
        struct tpacket_kbdq_core *pkc;
 
        pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
-
-       spin_lock_bh(&rb_queue->lock);
-       pkc->delete_blk_timer = 1;
-       spin_unlock_bh(&rb_queue->lock);
-
-       prb_del_retire_blk_timer(pkc);
-}
-
-static void prb_setup_retire_blk_timer(struct packet_sock *po)
-{
-       struct tpacket_kbdq_core *pkc;
-
-       pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
-       timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
-                   0);
-       pkc->retire_blk_timer.expires = jiffies;
+       hrtimer_cancel(&pkc->retire_blk_timer);
 }
 
 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
@@ -669,57 +648,36 @@ static void init_prb_bdqc(struct packet_sock *po,
        p1->knum_blocks = req_u->req3.tp_block_nr;
        p1->hdrlen = po->tp_hdrlen;
        p1->version = po->tp_version;
-       p1->last_kactive_blk_num = 0;
        po->stats.stats3.tp_freeze_q_cnt = 0;
        if (req_u->req3.tp_retire_blk_tov)
-               p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
+               p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
        else
-               p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
-                                               req_u->req3.tp_block_size);
-       p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
+               p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
+                                               req_u->req3.tp_block_size));
        p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
        rwlock_init(&p1->blk_fill_in_prog_lock);
 
        p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
        prb_init_ft_ops(p1, req_u);
-       prb_setup_retire_blk_timer(po);
+       hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
+                     CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+       hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
+                     HRTIMER_MODE_REL_SOFT);
        prb_open_block(p1, pbd);
 }
 
-/* Do NOT update the last_blk_num first.
- * Assumes sk_buff_head lock is held.
- */
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
-{
-       mod_timer(&pkc->retire_blk_timer,
-                       jiffies + pkc->tov_in_jiffies);
-       pkc->last_kactive_blk_num = pkc->kactive_blk_num;
-}
-
 /*
- * Timer logic:
- * 1) We refresh the timer only when we open a block.
- *    By doing this we don't waste cycles refreshing the timer
- *    on packet-by-packet basis.
- *
  * With a 1MB block-size, on a 1Gbps line, it will take
  * i) ~8 ms to fill a block + ii) memcpy etc.
  * In this cut we are not accounting for the memcpy time.
  *
- * So, if the user sets the 'tmo' to 10ms then the timer
- * will never fire while the block is still getting filled
- * (which is what we want). However, the user could choose
- * to close a block early and that's fine.
- *
- * But when the timer does fire, we check whether or not to refresh it.
  * Since the tmo granularity is in msecs, it is not too expensive
  * to refresh the timer, lets say every '8' msecs.
  * Either the user can set the 'tmo' or we can derive it based on
  * a) line-speed and b) block-size.
  * prb_calc_retire_blk_tmo() calculates the tmo.
- *
  */
-static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
 {
        struct packet_sock *po =
                timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
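
The retire interval now lives in interval_ktime: a user-supplied tp_retire_blk_tov (in milliseconds) is converted with ms_to_ktime(), otherwise prb_calc_retire_blk_tmo() supplies the default. As a rough userspace sketch (hypothetical sizes, minimal error handling), the 2 ms timeout mentioned in the cover letter would be requested through the existing TPACKET_V3 ring-setup interface:

/* Hypothetical sketch: request a 2 ms block-retire timeout on a TPACKET_V3
 * RX ring; fd is an AF_PACKET socket created by the caller, sizes are
 * illustrative.
 */
#include <linux/if_packet.h>
#include <string.h>
#include <sys/socket.h>

static int setup_v3_ring(int fd)
{
        int ver = TPACKET_V3;
        struct tpacket_req3 req;

        if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0)
                return -1;

        memset(&req, 0, sizeof(req));
        req.tp_block_size = 1 << 20;            /* 1 MiB blocks */
        req.tp_block_nr = 64;
        req.tp_frame_size = 2048;
        req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size) * req.tp_block_nr;
        req.tp_retire_blk_tov = 2;              /* ms; becomes interval_ktime */

        return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}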
@@ -732,9 +690,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
        frozen = prb_queue_frozen(pkc);
        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
 
-       if (unlikely(pkc->delete_blk_timer))
-               goto out;
-
        /* We only need to plug the race when the block is partially filled.
         * tpacket_rcv:
         *              lock(); increment BLOCK_NUM_PKTS; unlock()
@@ -750,46 +705,31 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
                write_unlock(&pkc->blk_fill_in_prog_lock);
        }
 
-       if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
-               if (!frozen) {
-                       if (!BLOCK_NUM_PKTS(pbd)) {
-                               /* An empty block. Just refresh the timer. */
-                               goto refresh_timer;
-                       }
+       if (!frozen) {
+               if (BLOCK_NUM_PKTS(pbd)) {
+                       /* Not an empty block. Need retire the block. */
                        prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
-                       if (!prb_dispatch_next_block(pkc, po))
-                               goto refresh_timer;
-                       else
-                               goto out;
-               } else {
-                       /* Case 1. Queue was frozen because user-space was
-                        * lagging behind.
+                       prb_dispatch_next_block(pkc, po);
+               }
+       } else {
+               /* Case 1. Queue was frozen because user-space was
+                * lagging behind.
+                */
+               if (!prb_curr_blk_in_use(pbd)) {
+                       /* Case 2. queue was frozen,user-space caught up,
+                        * now the link went idle && the timer fired.
+                        * We don't have a block to close.So we open this
+                        * block and restart the timer.
+                        * opening a block thaws the queue,restarts timer
+                        * Thawing/timer-refresh is a side effect.
                         */
-                       if (prb_curr_blk_in_use(pbd)) {
-                               /*
-                                * Ok, user-space is still behind.
-                                * So just refresh the timer.
-                                */
-                               goto refresh_timer;
-                       } else {
-                               /* Case 2. queue was frozen,user-space caught up,
-                                * now the link went idle && the timer fired.
-                                * We don't have a block to close.So we open this
-                                * block and restart the timer.
-                                * opening a block thaws the queue,restarts timer
-                                * Thawing/timer-refresh is a side effect.
-                                */
-                               prb_open_block(pkc, pbd);
-                               goto out;
-                       }
+                       prb_open_block(pkc, pbd);
                }
        }
 
-refresh_timer:
-       _prb_refresh_rx_retire_blk_timer(pkc);
-
-out:
+       hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
        spin_unlock(&po->sk.sk_receive_queue.lock);
+       return HRTIMER_RESTART;
 }
 
 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
@@ -883,11 +823,18 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
 }
 
 /*
- * Side effect of opening a block:
+ * prb_open_block is called by tpacket_rcv or timer callback.
  *
- * 1) prb_queue is thawed.
- * 2) retire_blk_timer is refreshed.
+ * Reasons why NOT update hrtimer in prb_open_block:
+ * 1) It will increase complexity to distinguish the two caller scenario.
+ * 2) hrtimer_cancel and hrtimer_start need to be called if you want to update
+ * TMO of an already enqueued hrtimer, leading to complex shutdown logic.
  *
+ * One side effect of NOT update hrtimer when called by tpacket_rcv is that
+ * a newly opened block triggered by tpacket_rcv may be retired earlier than
+ * expected. On the other hand, if timeout is updated in prb_open_block, the
+ * frequent reception of network packets that leads to prb_open_block being
+ * called may cause hrtimer to be removed and enqueued repeatedly.
  */
 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
                           struct tpacket_block_desc *pbd1)
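
To illustrate point 2) of the new comment above: an hrtimer that is already enqueued cannot simply be refreshed in place, so re-arming it from prb_open_block() would mean a cancel/start pair on every block open, and prb_open_block() is reached both from tpacket_rcv and from the timer callback. A hypothetical helper, not part of the patch, would look like the sketch below; the patch instead re-arms only from the expiry callback via hrtimer_forward_now().

/* Hypothetical helper (NOT in the patch): what refreshing the timeout from
 * prb_open_block() would require with an already-enqueued hrtimer.
 */
static void prb_rearm_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
        hrtimer_cancel(&pkc->retire_blk_timer);
        hrtimer_start(&pkc->retire_blk_timer, pkc->interval_ktime,
                      HRTIMER_MODE_REL_SOFT);
}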
@@ -921,7 +868,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
        pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
 
        prb_thaw_queue(pkc1);
-       _prb_refresh_rx_retire_blk_timer(pkc1);
 
        smp_wmb();
 }

net/packet/diag.c

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
        pdr.pdr_frame_nr = ring->frame_max + 1;
 
        if (ver > TPACKET_V2) {
-               pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
+               pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
                pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
                pdr.pdr_features = ring->prb_bdqc.feature_req_word;
        } else {

net/packet/internal.h

Lines changed: 4 additions & 10 deletions
@@ -20,15 +20,10 @@ struct tpacket_kbdq_core {
        unsigned int    feature_req_word;
        unsigned int    hdrlen;
        unsigned char   reset_pending_on_curr_blk;
-       unsigned char   delete_blk_timer;
        unsigned short  kactive_blk_num;
        unsigned short  blk_sizeof_priv;
 
-       /* last_kactive_blk_num:
-        * trick to see if user-space has caught up
-        * in order to avoid refreshing timer when every single pkt arrives.
-        */
-       unsigned short last_kactive_blk_num;
+       unsigned short  version;
 
        char    *pkblk_start;
        char    *pkblk_end;
@@ -38,19 +33,18 @@ struct tpacket_kbdq_core {
        uint64_t        knxt_seq_num;
        char            *prev;
        char            *nxt_offset;
+
        struct sk_buff  *skb;
 
        rwlock_t        blk_fill_in_prog_lock;
 
        /* Default is set to 8ms */
 #define DEFAULT_PRB_RETIRE_TOV (8)
 
-       unsigned short  retire_blk_tov;
-       unsigned short  version;
-       unsigned long   tov_in_jiffies;
+       ktime_t         interval_ktime;
 
        /* timer to retire an outstanding block */
-       struct timer_list retire_blk_timer;
+       struct hrtimer  retire_blk_timer;
 };
 
 struct pgv {
