Name: Serialize Network Packets for PPPoE
Author: Rusty Russell
Status: Experimental

D: Paul Mackerras says PPPoE relies on receiving packets in wire order,
D: and he has bug reports caused by packet reordering.
D:
D: This is icky.  Example code below:
D:
D: 1) Extract the core queuing part of netif_rx into __netif_rx.
D:
D: 2) If the protocol requires serialization, packets are put on a
D:    global "serial" queue instead of the per-cpu queue.  (Which
D:    protocols need this is currently hardcoded; see the table sketch
D:    below.)
D:
D: 3) One cpu (the boot cpu, as it happens) drains this serial queue,
D:    so it stays ordered.
D:
D: 4) Fix a bug in cpu_raise_softirq: we need to wake ksoftirqd when
D:    raising a softirq for a different cpu, since that cpu won't
D:    otherwise notice the pending softirq until its next interrupt.
D:
D: Another option would simply be to stamp a serialization number into
D: the skb if the proto needs serialization, and drop packets if the
D: serial number goes backwards (as sketched below).  But since this
D: reordering is actually happening to people, dropping would suck, too.
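D:
D: That stamping alternative might look roughly like this (a sketch
D: only: the skb "serial" field and both helpers are hypothetical, not
D: part of the patch below):
D:
D:	/* Hypothetical: stamp each serialized packet at receive time. */
D:	static spinlock_t serial_seq_lock = SPIN_LOCK_UNLOCKED;
D:	static unsigned int serial_seq;
D:
D:	static inline void stamp_serial(struct sk_buff *skb)
D:	{
D:		spin_lock(&serial_seq_lock);
D:		skb->serial = ++serial_seq;	/* would need a new skb field */
D:		spin_unlock(&serial_seq_lock);
D:	}
D:
D:	/* Protocol handler drops anything that arrives out of order. */
D:	static inline int serial_in_order(struct sk_buff *skb,
D:					  unsigned int *last)
D:	{
D:		/* Signed difference copes with wraparound. */
D:		if ((int)(skb->serial - *last) < 0)
D:			return 0;	/* went backwards: caller drops it */
D:		*last = skb->serial;
D:		return 1;
D:	}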
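D:
D: And if the hardcoded set of serialized protocols ever needs to grow,
D: the test in net_proto_serialize() could become a small table.  Again,
D: a sketch only, not in the patch below:
D:
D:	/* Protocols whose packets must stay in wire order. */
D:	static const unsigned short serial_protos[] = {
D:		ETH_P_PPP_DISC,
D:		ETH_P_PPP_SES,
D:	};
D:
D:	static inline int proto_needs_serialization(unsigned short protocol)
D:	{
D:		unsigned int i;
D:
D:		/* skb->protocol is in network byte order. */
D:		for (i = 0; i < ARRAY_SIZE(serial_protos); i++)
D:			if (protocol == htons(serial_protos[i]))
D:				return 1;
D:		return 0;
D:	}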
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.72-bk2/kernel/softirq.c working-2.5.72-bk2-serial-protocols/kernel/softirq.c
--- linux-2.5.72-bk2/kernel/softirq.c	2003-06-25 17:17:19.000000000 +1000
+++ working-2.5.72-bk2-serial-protocols/kernel/softirq.c	2003-06-25 14:55:15.000000000 +1000
@@ -130,7 +130,7 @@ inline void cpu_raise_softirq(unsigned i
 	 * Otherwise we wake up ksoftirqd to make sure we
 	 * schedule the softirq soon.
 	 */
-	if (!in_interrupt())
+	if (!in_interrupt() || cpu != smp_processor_id())
 		wakeup_softirqd(cpu);
 }
 
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal linux-2.5.72-bk2/net/core/dev.c working-2.5.72-bk2-serial-protocols/net/core/dev.c
--- linux-2.5.72-bk2/net/core/dev.c	2003-06-20 11:53:36.000000000 +1000
+++ working-2.5.72-bk2-serial-protocols/net/core/dev.c	2003-06-25 17:11:36.000000000 +1000
@@ -1323,42 +1323,11 @@ static void sample_queue(unsigned long d
 }
 #endif
 
-
-/**
- * netif_rx	-	post buffer to the network code
- * @skb: buffer to post
- *
- * This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
- *
- * return values:
- * NET_RX_SUCCESS	(no congestion)
- * NET_RX_CN_LOW	(low congestion)
- * NET_RX_CN_MOD	(moderate congestion)
- * NET_RX_CN_HIGH	(high congestion)
- * NET_RX_DROP		(packet was dropped)
- *
- */
-
-int netif_rx(struct sk_buff *skb)
+/* Called with IRQs disabled. */
+static inline int __netif_rx(int this_cpu,
+			     struct softnet_data *queue,
+			     struct sk_buff *skb)
 {
-	int this_cpu;
-	struct softnet_data *queue;
-	unsigned long flags;
-
-	if (!skb->stamp.tv_sec)
-		do_gettimeofday(&skb->stamp);
-
-	/*
-	 * The code is rearranged so that the path is the most
-	 * short when CPU is congested, but is still operating.
-	 */
-	local_irq_save(flags);
-	this_cpu = smp_processor_id();
-	queue = &softnet_data[this_cpu];
-
 	netdev_rx_stat[this_cpu].total++;
 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
 		if (queue->input_pkt_queue.qlen) {
@@ -1371,7 +1340,6 @@ enqueue:
 #ifndef OFFLINE_SAMPLE
 			get_sample_stats(this_cpu);
 #endif
-			local_irq_restore(flags);
 			return queue->cng_level;
 		}
 
@@ -1397,12 +1365,116 @@ enqueue:
 
 drop:
 	netdev_rx_stat[this_cpu].dropped++;
-	local_irq_restore(flags);
 	kfree_skb(skb);
 	return NET_RX_DROP;
 }
 
+#ifdef CONFIG_SMP
+/* Queue for serial protocols (e.g. PPPoE).  All handled by one CPU. */
+static spinlock_t serial_queue_lock = SPIN_LOCK_UNLOCKED;
+static struct softnet_data serial_queue;
+
+/* Which cpu drains the serial queue. */
+static int serial_cpu;
+
+static inline int net_proto_serialize(struct sk_buff *skb,
+				      int this_cpu,
+				      int *ret)
+{
+	/* skb->protocol is in network byte order. */
+	if (likely(skb->protocol != htons(ETH_P_PPP_DISC)
+		   && skb->protocol != htons(ETH_P_PPP_SES)))
+		return 0;
+
+	spin_lock(&serial_queue_lock);
+	*ret = __netif_rx(this_cpu, &serial_queue, skb);
+	spin_unlock(&serial_queue_lock);
+	if (this_cpu != serial_cpu)
+		cpu_raise_softirq(serial_cpu, NET_RX_SOFTIRQ);
+	return 1;
+}
+
+static void init_queue(struct softnet_data *queue);
+
+static void init_serial(void)
+{
+	init_queue(&serial_queue);
+	serial_cpu = smp_processor_id();
+}
+
+static inline void drain_serial_queue(int this_cpu)
+{
+	if (this_cpu != serial_cpu)
+		return;
+
+	spin_lock(&serial_queue_lock);
+	while (!list_empty(&serial_queue.poll_list)) {
+		struct net_device *dev;
+
+		dev = list_entry(serial_queue.poll_list.next,
+				 struct net_device, poll_list);
+
+		/* Move onto our own poll list for processing. */
+		list_del(&dev->poll_list);
+		list_add_tail(&dev->poll_list,
+			      &softnet_data[this_cpu].poll_list);
+	}
+	spin_unlock(&serial_queue_lock);
+}
+#else
+static inline int net_proto_serialize(struct sk_buff *skb,
+				      int this_cpu,
+				      int *ret)
+{
+	return 0;
+}
+
+static void init_serial(void)
+{
+}
+
+static inline void drain_serial_queue(int this_cpu)
+{
+}
+#endif /* CONFIG_SMP */
+
+/**
+ * netif_rx	-	post buffer to the network code
+ * @skb: buffer to post
+ *
+ * This function receives a packet from a device driver and queues it for
+ * the upper (protocol) levels to process. It always succeeds. The buffer
+ * may be dropped during processing for congestion control or by the
+ * protocol layers.
+ *
+ * return values:
+ * NET_RX_SUCCESS	(no congestion)
+ * NET_RX_CN_LOW	(low congestion)
+ * NET_RX_CN_MOD	(moderate congestion)
+ * NET_RX_CN_HIGH	(high congestion)
+ * NET_RX_DROP		(packet was dropped)
+ *
+ */
+
+int netif_rx(struct sk_buff *skb)
+{
+	int ret, this_cpu;
+	unsigned long flags;
+
+	if (!skb->stamp.tv_sec)
+		do_gettimeofday(&skb->stamp);
+
+	/*
+	 * The code is rearranged so that the path is the most
+	 * short when CPU is congested, but is still operating.
+	 */
+	local_irq_save(flags);
+	this_cpu = smp_processor_id();
+
+	if (!net_proto_serialize(skb, this_cpu, &ret))
+		ret = __netif_rx(this_cpu, &softnet_data[this_cpu], skb);
+	local_irq_restore(flags);
+	return ret;
+}
+
 /* Deliver skb to an old protocol, which is not threaded well
    or which do not understand shared skbs.
  */
@@ -1705,6 +1777,8 @@ static void net_rx_action(struct softirq
 			local_irq_disable();
 		}
 	}
+
+	drain_serial_queue(this_cpu);
 out:
 	local_irq_enable();
 	preempt_enable();
@@ -2944,6 +3018,20 @@ int unregister_netdevice(struct net_devi
 }
 
+static void init_queue(struct softnet_data *queue)
+{
+	skb_queue_head_init(&queue->input_pkt_queue);
+	queue->throttle = 0;
+	queue->cng_level = 0;
+	queue->avg_blog = 10; /* arbitrary non-zero */
+	queue->completion_queue = NULL;
+	INIT_LIST_HEAD(&queue->poll_list);
+	set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
+	queue->backlog_dev.weight = weight_p;
+	queue->backlog_dev.poll = process_backlog;
+	atomic_set(&queue->backlog_dev.refcnt, 1);
+}
+
 /*
  * Initialize the DEV module. At boot time this walks the device list and
  * unhooks any devices that fail to initialise (normally hardware not
@@ -2976,23 +3064,10 @@ static int __init net_dev_init(void)
 	 * Initialise the packet receive queues.
	 */
 
-	for (i = 0; i < NR_CPUS; i++) {
-		struct softnet_data *queue;
-
-		queue = &softnet_data[i];
-		skb_queue_head_init(&queue->input_pkt_queue);
-		queue->throttle = 0;
-		queue->cng_level = 0;
-		queue->avg_blog = 10; /* arbitrary non-zero */
-		queue->completion_queue = NULL;
-		INIT_LIST_HEAD(&queue->poll_list);
-		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
-		queue->backlog_dev.weight = weight_p;
-		queue->backlog_dev.poll = process_backlog;
-		atomic_set(&queue->backlog_dev.refcnt, 1);
-	}
+	for (i = 0; i < NR_CPUS; i++)
+		init_queue(&softnet_data[i]);
 
+	init_serial();
 #ifdef CONFIG_NET_PROFILE
 	net_profile_init();

-- 
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.