Name: Fragment ID Wrap Workaround
Status: Untested
Depends: Netfilter/skb_iter.patch.gz

There's at least one old IBM Bugzilla bug, in which fragment IDs
wrapped, causing NFS data corruption on stress-testing.  Solution
presented here is twofold: 1) Move the offset of the fragments every
time the ID wraps (usually the packet doesn't fit exactly into the
MTU, so we have some slack), and 2) Check overlapping fragments that
the contents match: if not, drop the whole thing.  Also applies to
IPv6.

Simpler implementation would just drop all fragments on any overlap
as a "doesn't happen IRL" case (it needs someone to duplicate a
packet, then send each one by a different MTU path).

NOTE (review): in the ip_output.c hunk below, the divisor
"(inet->id >> 16) << 3" is zero until the ID counter has wrapped at
least once (inet->id < 0x10000), making "slack % ..." a modulo by
zero.  Guard that expression (skip the shift while the high half of
the counter is zero) before this patch is applied anywhere.

Index: linux-2.6.10-rc3-bk6-Netfilter/net/ipv4/ip_output.c
===================================================================
--- linux-2.6.10-rc3-bk6-Netfilter.orig/net/ipv4/ip_output.c	2004-12-13 12:01:57.522447912 +1100
+++ linux-2.6.10-rc3-bk6-Netfilter/net/ipv4/ip_output.c	2004-12-13 12:03:24.948157184 +1100
@@ -563,20 +563,33 @@
 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 	not_last_frag = iph->frag_off & htons(IP_MF);
 
+	len = left;
+	/* IF: it doesn't fit, use 'mtu' - the data space left */
+	if (len > mtu)
+		len = mtu;
+
+	/* IF: we are not sending upto and including the packet end
+	   then align the next start on an eight byte boundary */
+	if (len < left)
+		len &= ~7;
+
+	/* Try to shift initial fragment boundary if we can, to help
+	 * other end detect ID wrap. */
+	if (skb->sk) {
+		unsigned int slack;
+		struct inet_opt *inet = inet_sk(skb->sk);
+
+		slack = (left % mtu);
+		if (slack)
+			/* Shift by 8 bytes per id wrap. */
+			len = mtu - (slack % ((inet->id >> 16) << 3));
+	}
+
 	/*
 	 *	Keep copying data until we run out.
 	 */
 
 	while(left > 0)	{
-		len = left;
-		/* IF: it doesn't fit, use 'mtu' - the data space left */
-		if (len > mtu)
-			len = mtu;
-		/* IF: we are not sending upto and including the packet end
-		   then align the next start on an eight byte boundary */
-		if (len < left)	{
-			len &= ~7;
-		}
 		/*
 		 *	Allocate buffer.
 		 */
@@ -655,6 +668,16 @@
 		err = output(skb2);
 		if (err)
 			goto fail;
+
+		len = left;
+		/* IF: it doesn't fit, use 'mtu' - the data space left */
+		if (len > mtu)
+			len = mtu;
+		/* IF: we are not sending upto and including the packet end
+		   then align the next start on an eight byte boundary */
+		if (len < left) {
+			len &= ~7;
+		}
 	}
 	kfree_skb(skb);
 	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
Index: linux-2.6.10-rc3-bk6-Netfilter/net/ipv4/ip_fragment.c
===================================================================
--- linux-2.6.10-rc3-bk6-Netfilter.orig/net/ipv4/ip_fragment.c	2004-12-13 12:01:57.522447912 +1100
+++ linux-2.6.10-rc3-bk6-Netfilter/net/ipv4/ip_fragment.c	2004-12-13 12:03:24.950156880 +1100
@@ -411,8 +411,81 @@
 	return ip_frag_create(hash, iph);
 }
 
-/* Add new segment to existing queue. */
-static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+static int skb_data_equal(const struct sk_buff *new, int startnew,
+			  const struct sk_buff *old, int startold,
+			  int len)
+{
+	struct skb_iter newi, oldi;
+	int ret = 1;
+
+	/* Move to first chunk with this offset in both cases */
+	skb_iter_first(new, &newi);
+	while (newi.len < startnew) {
+		startnew -= newi.len;
+		skb_iter_next(new, &newi);
+	}
+
+	skb_iter_first(old, &oldi);
+	while (oldi.len < startold) {
+		startold -= oldi.len;
+		skb_iter_next(old, &oldi);
+	}
+
+	while (len > 0) {
+		int cmplen = len;
+
+		/* How much can we compare? */
+		if (cmplen > oldi.len - startold)
+			cmplen = oldi.len - startold;
+		if (cmplen > newi.len - startnew)
+			cmplen = newi.len - startnew;
+		if (memcmp(oldi.data+startold, newi.data+startnew, cmplen)) {
+			ret = 0;
+			break;
+		}
+		startnew += cmplen;
+		startold += cmplen;
+		if (startold == oldi.len) {
+			skb_iter_next(old, &oldi);
+			startold = 0;
+		}
+		if (startnew == newi.len) {
+			skb_iter_next(new, &newi);
+			startnew = 0;
+		}
+		len -= cmplen;
+	}
+
+	skb_iter_abort(new, &newi);
+	skb_iter_abort(old, &oldi);
+	return ret;
+}
+
+static int frag_overlap_mismatch(const struct sk_buff *new,
+				 int offset,
+				 const struct sk_buff *old)
+{
+	int old_offset = FRAG_CB(old)->offset;
+	int startnew, startold, len;
+
+	if (offset < old_offset) {
+		startnew = old_offset - offset;
+		startold = 0;
+	} else {
+		startnew = 0;
+		startold = offset - old_offset;
+	}
+
+	len = min(old->len - startold, new->len - startnew);
+	if (len < 0)
+		return 0;
+
+	return !skb_data_equal(new, startnew, old, startold, len);
+}
+
+/* Add new segment to existing queue.  Return false if whole queue
+ * must drop. */
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
 	struct sk_buff *prev, *next;
 	int flags, offset;
@@ -483,6 +556,8 @@
 		offset += i;
 		if (end <= offset)
 			goto err;
+		if (frag_overlap_mismatch(skb, offset, prev))
+			goto mismatch;
 		if (!pskb_pull(skb, i))
 			goto err;
 		if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -493,6 +568,9 @@
 	while (next && FRAG_CB(next)->offset < end) {
 		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
 
+		if (frag_overlap_mismatch(skb, offset, next))
+			goto mismatch;
+
 		if (i < next->len) {
 			/* Eat head of the next overlapped fragment
 			 * and leave the loop. The next ones cannot overlap.
@@ -544,10 +622,17 @@
 	list_move_tail(&qp->lru_list, &ipq_lru_list);
 	write_unlock(&ipfrag_lock);
 
-	return;
+	return 1;
 
 err:
 	kfree_skb(skb);
+	return 1;
+
+mismatch:
+	/* Roughly equiv. to checksum incorrect. */
+	ipq_kill(qp);
+	kfree_skb(skb);
+	return 0;
 }
 
 /* Build a new IP datagram from all its fragments. */
@@ -662,12 +747,13 @@
 
 	spin_lock(&qp->lock);
 
-	ip_frag_queue(qp, skb);
-
-	if (qp->last_in == (FIRST_IN|LAST_IN) &&
-	    qp->meat == qp->len)
-		ret = ip_frag_reasm(qp, dev);
-
+	if (!ip_frag_queue(qp, skb))
+		ipq_kill(qp);
+	else {
+		if (qp->last_in == (FIRST_IN|LAST_IN) &&
+		    qp->meat == qp->len)
+			ret = ip_frag_reasm(qp, dev);
+	}
 	spin_unlock(&qp->lock);
 	ipq_put(qp, NULL);
 	return ret;
Index: linux-2.6.10-rc3-bk6-Netfilter/include/linux/ip.h
===================================================================
--- linux-2.6.10-rc3-bk6-Netfilter.orig/include/linux/ip.h	2004-12-13 12:01:57.522447912 +1100
+++ linux-2.6.10-rc3-bk6-Netfilter/include/linux/ip.h	2004-12-13 12:03:24.951156728 +1100
@@ -118,12 +118,12 @@
 	int			tos;		/* TOS */
 	unsigned	   	cmsg_flags;
 	struct ip_options	*opt;
+	__u32			id;		/* ID counter for DF pkts */
 	__u16			sport;		/* Source port */
 	unsigned char		hdrincl;	/* Include headers ? */
 	__u8			mc_ttl;		/* Multicasting TTL */
 	__u8			mc_loop;	/* Loopback */
 	__u8			pmtudisc;
-	__u16			id;		/* ID counter for DF pkts */
 	unsigned		recverr : 1,
 				freebind : 1;
 	int			mc_index;	/* Multicast device index */