Skip to content

Commit f7460d2

Browse files
jackzxcui1989kuba-moo
authored andcommitted
net: af_packet: Use hrtimer to do the retire operation
In a system with high real-time requirements, the timeout mechanism of ordinary timers with jiffies granularity is insufficient to meet the demands for real-time performance. Meanwhile, the optimization of CPU usage with af_packet is quite significant. Use hrtimer instead of timer to help compensate for the shortcomings in real-time performance. In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time enough, with fluctuations reaching over 8ms (on a system with HZ=250). This is unacceptable in some high real-time systems that require timely processing of network packets. By replacing it with hrtimer, if a timeout of 2ms is set, the update of TP_STATUS_USER can be stabilized to within 3 ms. Delete delete_blk_timer field, because hrtimer_cancel will check and wait until the timer callback return and ensure never enter callback again. Simplify the logic related to setting timeout, only update the hrtimer expire time within the hrtimer callback, no longer update the expire time in prb_open_block which is called by tpacket_rcv or timer callback. Reasons why NOT update hrtimer in prb_open_block: 1) It will increase complexity to distinguish the two caller scenario. 2) hrtimer_cancel and hrtimer_start need to be called if you want to update TMO of an already enqueued hrtimer, leading to complex shutdown logic. One side effect of NOT update hrtimer when called by tpacket_rcv is that a newly opened block triggered by tpacket_rcv may be retired earlier than expected. On the other hand, if timeout is updated in prb_open_block, the frequent reception of network packets that leads to prb_open_block being called may cause hrtimer to be removed and enqueued repeatedly. The retire hrtimer expiration is unconditional and periodic. If there are numerous packet sockets on the system, please set an appropriate timeout to avoid frequent enqueueing of hrtimers. Reviewed-by: Willem de Bruijn <[email protected]> Reviewed-by: Jason Xing <[email protected]> Link: https://lore.kernel.org/all/[email protected]/ Signed-off-by: Xin Zhao <[email protected]> Link: https://patch.msgid.link/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 28d2420 commit f7460d2

File tree

3 files changed

+33
-83
lines changed

3 files changed

+33
-83
lines changed

net/packet/af_packet.c

Lines changed: 27 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
203203
static int prb_queue_frozen(struct tpacket_kbdq_core *);
204204
static void prb_open_block(struct tpacket_kbdq_core *,
205205
struct tpacket_block_desc *);
206-
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
207-
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
206+
static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
208207
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
209208
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
210209
struct tpacket3_hdr *);
@@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
579578
return proto;
580579
}
581580

582-
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
583-
{
584-
timer_delete_sync(&pkc->retire_blk_timer);
585-
}
586-
587581
static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
588582
struct sk_buff_head *rb_queue)
589583
{
590584
struct tpacket_kbdq_core *pkc;
591585

592586
pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
593-
594-
spin_lock_bh(&rb_queue->lock);
595-
pkc->delete_blk_timer = 1;
596-
spin_unlock_bh(&rb_queue->lock);
597-
598-
prb_del_retire_blk_timer(pkc);
599-
}
600-
601-
static void prb_setup_retire_blk_timer(struct packet_sock *po)
602-
{
603-
struct tpacket_kbdq_core *pkc;
604-
605-
pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
606-
timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
607-
0);
608-
pkc->retire_blk_timer.expires = jiffies;
587+
hrtimer_cancel(&pkc->retire_blk_timer);
609588
}
610589

611590
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
@@ -671,53 +650,34 @@ static void init_prb_bdqc(struct packet_sock *po,
671650
p1->version = po->tp_version;
672651
po->stats.stats3.tp_freeze_q_cnt = 0;
673652
if (req_u->req3.tp_retire_blk_tov)
674-
p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
653+
p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
675654
else
676-
p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
677-
req_u->req3.tp_block_size);
678-
p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
655+
p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
656+
req_u->req3.tp_block_size));
679657
p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
680658
rwlock_init(&p1->blk_fill_in_prog_lock);
681659

682660
p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
683661
prb_init_ft_ops(p1, req_u);
684-
prb_setup_retire_blk_timer(po);
662+
hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
663+
CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
664+
hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
665+
HRTIMER_MODE_REL_SOFT);
685666
prb_open_block(p1, pbd);
686667
}
687668

688-
/* Do NOT update the last_blk_num first.
689-
* Assumes sk_buff_head lock is held.
690-
*/
691-
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
692-
{
693-
mod_timer(&pkc->retire_blk_timer,
694-
jiffies + pkc->tov_in_jiffies);
695-
}
696-
697669
/*
698-
* Timer logic:
699-
* 1) We refresh the timer only when we open a block.
700-
* By doing this we don't waste cycles refreshing the timer
701-
* on packet-by-packet basis.
702-
*
703670
* With a 1MB block-size, on a 1Gbps line, it will take
704671
* i) ~8 ms to fill a block + ii) memcpy etc.
705672
* In this cut we are not accounting for the memcpy time.
706673
*
707-
* So, if the user sets the 'tmo' to 10ms then the timer
708-
* will never fire while the block is still getting filled
709-
* (which is what we want). However, the user could choose
710-
* to close a block early and that's fine.
711-
*
712-
* But when the timer does fire, we check whether or not to refresh it.
713674
* Since the tmo granularity is in msecs, it is not too expensive
714675
* to refresh the timer, lets say every '8' msecs.
715676
* Either the user can set the 'tmo' or we can derive it based on
716677
* a) line-speed and b) block-size.
717678
* prb_calc_retire_blk_tmo() calculates the tmo.
718-
*
719679
*/
720-
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
680+
static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
721681
{
722682
struct packet_sock *po =
723683
timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
@@ -730,9 +690,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
730690
frozen = prb_queue_frozen(pkc);
731691
pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
732692

733-
if (unlikely(pkc->delete_blk_timer))
734-
goto out;
735-
736693
/* We only need to plug the race when the block is partially filled.
737694
* tpacket_rcv:
738695
* lock(); increment BLOCK_NUM_PKTS; unlock()
@@ -749,26 +706,16 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
749706
}
750707

751708
if (!frozen) {
752-
if (!BLOCK_NUM_PKTS(pbd)) {
753-
/* An empty block. Just refresh the timer. */
754-
goto refresh_timer;
709+
if (BLOCK_NUM_PKTS(pbd)) {
710+
/* Not an empty block. Need retire the block. */
711+
prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
712+
prb_dispatch_next_block(pkc, po);
755713
}
756-
prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
757-
if (!prb_dispatch_next_block(pkc, po))
758-
goto refresh_timer;
759-
else
760-
goto out;
761714
} else {
762715
/* Case 1. Queue was frozen because user-space was
763716
* lagging behind.
764717
*/
765-
if (prb_curr_blk_in_use(pbd)) {
766-
/*
767-
* Ok, user-space is still behind.
768-
* So just refresh the timer.
769-
*/
770-
goto refresh_timer;
771-
} else {
718+
if (!prb_curr_blk_in_use(pbd)) {
772719
/* Case 2. queue was frozen,user-space caught up,
773720
* now the link went idle && the timer fired.
774721
* We don't have a block to close.So we open this
@@ -777,15 +724,12 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
777724
* Thawing/timer-refresh is a side effect.
778725
*/
779726
prb_open_block(pkc, pbd);
780-
goto out;
781727
}
782728
}
783729

784-
refresh_timer:
785-
_prb_refresh_rx_retire_blk_timer(pkc);
786-
787-
out:
730+
hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
788731
spin_unlock(&po->sk.sk_receive_queue.lock);
732+
return HRTIMER_RESTART;
789733
}
790734

791735
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
@@ -879,11 +823,18 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
879823
}
880824

881825
/*
882-
* Side effect of opening a block:
826+
* prb_open_block is called by tpacket_rcv or timer callback.
883827
*
884-
* 1) prb_queue is thawed.
885-
* 2) retire_blk_timer is refreshed.
828+
* Reasons why NOT update hrtimer in prb_open_block:
829+
* 1) It will increase complexity to distinguish the two caller scenario.
830+
* 2) hrtimer_cancel and hrtimer_start need to be called if you want to update
831+
* TMO of an already enqueued hrtimer, leading to complex shutdown logic.
886832
*
833+
* One side effect of NOT update hrtimer when called by tpacket_rcv is that
834+
* a newly opened block triggered by tpacket_rcv may be retired earlier than
835+
* expected. On the other hand, if timeout is updated in prb_open_block, the
836+
* frequent reception of network packets that leads to prb_open_block being
837+
* called may cause hrtimer to be removed and enqueued repeatedly.
887838
*/
888839
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
889840
struct tpacket_block_desc *pbd1)
@@ -917,7 +868,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
917868
pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
918869

919870
prb_thaw_queue(pkc1);
920-
_prb_refresh_rx_retire_blk_timer(pkc1);
921871

922872
smp_wmb();
923873
}

net/packet/diag.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
8383
pdr.pdr_frame_nr = ring->frame_max + 1;
8484

8585
if (ver > TPACKET_V2) {
86-
pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
86+
pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
8787
pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
8888
pdr.pdr_features = ring->prb_bdqc.feature_req_word;
8989
} else {

net/packet/internal.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,11 @@ struct tpacket_kbdq_core {
2020
unsigned int feature_req_word;
2121
unsigned int hdrlen;
2222
unsigned char reset_pending_on_curr_blk;
23-
unsigned char delete_blk_timer;
2423
unsigned short kactive_blk_num;
2524
unsigned short blk_sizeof_priv;
2625

26+
unsigned short version;
27+
2728
char *pkblk_start;
2829
char *pkblk_end;
2930
int kblk_size;
@@ -32,19 +33,18 @@ struct tpacket_kbdq_core {
3233
uint64_t knxt_seq_num;
3334
char *prev;
3435
char *nxt_offset;
36+
3537
struct sk_buff *skb;
3638

3739
rwlock_t blk_fill_in_prog_lock;
3840

3941
/* Default is set to 8ms */
4042
#define DEFAULT_PRB_RETIRE_TOV (8)
4143

42-
unsigned short retire_blk_tov;
43-
unsigned short version;
44-
unsigned long tov_in_jiffies;
44+
ktime_t interval_ktime;
4545

4646
/* timer to retire an outstanding block */
47-
struct timer_list retire_blk_timer;
47+
struct hrtimer retire_blk_timer;
4848
};
4949

5050
struct pgv {

0 commit comments

Comments
 (0)