Name: Connection Tracking Expect Fix Author: Rusty Russell Status: Experimental Depends: Netfilter/conntrack_expect_lock_removal.patch.gz Depends: Netfilter/conntrack_expect_alloc.patch.gz Depends: Netfilter/compulsory_timers.patch.gz D: This fixes the race where the packet on which the expectfn is D: called may not be the one which actually passes: we need to call D: the expectfn at confirm time. D: D: This patch also formalizes the reference counts: an expect holds a D: reference count on its creator, and an expected connection holds a D: reference count on the expect. diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .13723-linux-2.5.33/include/linux/netfilter_ipv4/ip_conntrack.h .13723-linux-2.5.33.updated/include/linux/netfilter_ipv4/ip_conntrack.h --- .13723-linux-2.5.33/include/linux/netfilter_ipv4/ip_conntrack.h 2002-08-28 09:29:52.000000000 +1000 +++ .13723-linux-2.5.33.updated/include/linux/netfilter_ipv4/ip_conntrack.h 2002-09-05 21:45:30.000000000 +1000 @@ -110,7 +110,7 @@ do { \ struct ip_conntrack_expect { - /* Internal linked list (global expectation list) */ + /* Internal linked list (global expectation list while still pending) */ struct list_head list; /* reference count */ @@ -119,11 +119,11 @@ struct ip_conntrack_expect /* expectation list for this master */ struct list_head expected_list; - /* The conntrack of the master connection */ + /* The conntrack of the master connection: we hold a reference to it */ struct ip_conntrack *expectant; /* The conntrack of the sibling connection, set after - * expectation arrived */ + * expectation arrived: it holds a reference to us. */ struct ip_conntrack *sibling; /* Tuple saved for conntrack */ diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .13723-linux-2.5.33/net/ipv4/netfilter/ip_conntrack_core.c .13723-linux-2.5.33.updated/net/ipv4/netfilter/ip_conntrack_core.c --- .13723-linux-2.5.33/net/ipv4/netfilter/ip_conntrack_core.c 2002-09-05 21:45:28.000000000 +1000 +++ .13723-linux-2.5.33.updated/net/ipv4/netfilter/ip_conntrack_core.c 2002-09-05 21:45:58.000000000 +1000 @@ -177,13 +177,15 @@ static void destroy_expect(struct ip_conntrack_expect *exp) { DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(exp->use)); - IP_NF_ASSERT(atomic_read(exp->use)); + IP_NF_ASSERT(atomic_read(&exp->use) == 0); IP_NF_ASSERT(!timer_pending(&exp->timeout)); + MUST_BE_READ_WRITE_UNLOCKED(&ip_conntrack_lock); + /* Release refcnt we hold on our creator */ + ip_conntrack_put(exp->expectant); kfree(exp); } - inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp) { IP_NF_ASSERT(exp); @@ -222,18 +224,16 @@ ip_conntrack_expect_find_get(const struc static void __unexpect_related(struct ip_conntrack_expect *expect) { DEBUGP("unexpect_related(%p)\n", expect); - MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); /* we're not allowed to unexpect a confirmed expectation! */ IP_NF_ASSERT(!expect->sibling); - /* delete from global and local lists */ + /* delete from global list and local list */ + WRITE_LOCK(&ip_conntrack_lock); list_del(&expect->list); list_del(&expect->expected_list); - - /* decrement expect-count of master conntrack */ - if (expect->expectant) - expect->expectant->expecting--; + expect->expectant->expecting--; + WRITE_UNLOCK(&ip_conntrack_lock); ip_conntrack_expect_put(expect); } @@ -253,14 +253,15 @@ static void unexpect_related(struct ip_c /* delete all unconfirmed expectations for this conntrack */ static void remove_expectations(struct ip_conntrack *ct) { - struct list_head *exp_entry, *next; - struct ip_conntrack_expect *exp; + LIST_HEAD(unconfirmed); + struct list_head *exp_entry, *tmp; DEBUGP("remove_expectations(%p)\n", ct); - for (exp_entry = ct->sibling_list.next; - exp_entry != &ct->sibling_list; exp_entry = next) { - next = exp_entry->next; + /* First take them off our list and the expect list. */ + WRITE_LOCK(&ip_conntrack_lock); + list_for_each_safe(exp_entry, tmp, &ct->sibling_list) { + struct ip_conntrack_expect *exp; exp = list_entry(exp_entry, struct ip_conntrack_expect, expected_list); @@ -274,8 +275,24 @@ static void remove_expectations(struct i IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp)); IP_NF_ASSERT(exp->expectant == ct); - /* delete expectation from global and private lists */ - unexpect_related(exp); + /* Don't transfer if already dying via timer */ + if (del_timer(&exp->timeout)) { + list_del(&exp->list); + list_del(&exp->expected_list); + list_add(&exp->list, &unconfirmed); + ct->expecting--; + } + } + WRITE_UNLOCK(&ip_conntrack_lock); + + /* Now destroy them all, without lock. */ + while (!list_empty(&unconfirmed)) { + struct ip_conntrack_expect *exp; + + exp = list_entry(unconfirmed.next, struct ip_conntrack_expect, + expected_list); + list_del(unconfirmed.next); + ip_conntrack_expect_put(exp); } } @@ -308,8 +325,8 @@ destroy_conntrack(struct nf_conntrack *n IP_NF_ASSERT(atomic_read(&nfct->use) == 0); IP_NF_ASSERT(!timer_pending(&ct->timeout)); - if (ct->master && master_ct(ct)) - ip_conntrack_put(master_ct(ct)); + if (ct->master) + ip_conntrack_expect_put(ct->master); /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock @@ -321,15 +338,8 @@ destroy_conntrack(struct nf_conntrack *n if (ip_conntrack_destroyed) ip_conntrack_destroyed(ct); - WRITE_LOCK(&ip_conntrack_lock); - /* Delete our master expectation */ - if (ct->master) { - /* can't call __unexpect_related here, - * since it would screw up expect_list */ - list_del(&ct->master->expected_list); - kfree(ct->master); - } - WRITE_UNLOCK(&ip_conntrack_lock); + if (ct->master) + ip_conntrack_expect_put(ct->master); DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); kmem_cache_free(ip_conntrack_cachep, ct); @@ -449,6 +459,8 @@ __ip_conntrack_confirm(struct nf_ct_info conntrack_tuple_cmp, struct ip_conntrack_tuple_hash *, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { + struct ip_conntrack_expect *expected = NULL; + list_prepend(&ip_conntrack_hash[hash], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); list_prepend(&ip_conntrack_hash[repl_hash], @@ -459,10 +471,28 @@ __ip_conntrack_confirm(struct nf_ct_info ct->timeout.expires += jiffies; add_timer(&ct->timeout); atomic_inc(&ct->ct_general.use); + + /* If we were expected, delete expectation from list + and tell it we are here */ + if (ct->master) { + /* Kill expecation expiry timer: if it's + already gone off, we lost race. */ + if (!del_timer(&ct->master->timeout)) + goto lost_race; + + ct->master->sibling = conntrack; + LIST_DELETE(&ip_conntrack_expect_list, ct->master); + ct->master->expectant->expecting--; + } WRITE_UNLOCK(&ip_conntrack_lock); + + /* Finally, callback to say expectation is done. */ + if (ct->master && ct->master->expectfn) + return ct->master->expectfn(ct); return NF_ACCEPT; } + lost_race: WRITE_UNLOCK(&ip_conntrack_lock); return NF_DROP; } @@ -685,8 +715,9 @@ init_conntrack(const struct ip_conntrack /* Mark clearly that it's not in the hash table. */ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list.next = NULL; - WRITE_LOCK(&ip_conntrack_lock); - /* Need finding and deleting of expected ONLY if we win race */ + READ_LOCK(&ip_conntrack_lock); + /* We only call expect function and delete expectation if we + win race at confirmation time */ expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp, struct ip_conntrack_expect *, tuple); @@ -695,10 +726,7 @@ init_conntrack(const struct ip_conntrack conntrack->helper = ip_ct_find_helper(&repl_tuple); /* If master is not in hash table yet (ie. packet hasn't left - this machine yet), how can other end know about expected? - Hence these are not the droids you are looking for (if - master ct never got confirmed, we'd hold a reference to it - and weird things would happen to future packets). */ + this machine yet), how can other end know about expected? */ if (expected && is_confirmed(expected->expectant)) { DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", conntrack, expected); @@ -706,16 +734,11 @@ init_conntrack(const struct ip_conntrack IP_NF_ASSERT(master_ct(conntrack)); conntrack->status = IPS_EXPECTED; conntrack->master = expected; - expected->sibling = conntrack; - LIST_DELETE(&ip_conntrack_expect_list, expected); - expected->expectant->expecting--; - nf_conntrack_get(&master_ct(conntrack)->infos[0]); + atomic_inc(&expected->use); } atomic_inc(&ip_conntrack_count); WRITE_UNLOCK(&ip_conntrack_lock); - if (expected && expected->expectfn) - expected->expectfn(conntrack); return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; } @@ -874,21 +897,12 @@ static inline int resent_expect(const st && ip_ct_tuple_equal(&i->mask, mask)); } -inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect) -{ - WRITE_LOCK(&ip_conntrack_lock); - unexpect_related(expect); - WRITE_UNLOCK(&ip_conntrack_lock); -} - static void expectation_timed_out(unsigned long ul_expect) { struct ip_conntrack_expect *expect = (void *) ul_expect; DEBUGP("expectation %p timed out\n", expect); - WRITE_LOCK(&ip_conntrack_lock); __unexpect_related(expect); - WRITE_UNLOCK(&ip_conntrack_lock); } /* Add a related connection. */ @@ -898,6 +912,17 @@ int ip_conntrack_expect_related(struct i struct ip_conntrack_expect *old, *new; int ret = 0; + /* Initialize */ + atomic_set(&newexp->count, 1); + newexp->expectant = related_to; + newexp->sibling = NULL; + memset(&newexp->ct_tuple, 0, sizeof(newexp->ct_tuple)); + init_timer(&newexp->timeout); + newexp->timeout.data = (unsigned long)newexp; + newexp->timeout.function = expectation_timed_out; + newexp->timeout.expires = jiffies + related_to->helper->timeout * HZ; + + again: WRITE_LOCK(&ip_conntrack_lock); /* Because of the write lock, no reader can walk the lists, * so there is no need to use the tuple lock too */ @@ -908,12 +933,8 @@ int ip_conntrack_expect_related(struct i old = LIST_FIND(&ip_conntrack_expect_list, resent_expect, struct ip_conntrack_expect *, &newexp->tuple, &newexp->mask); - if (old) { - /* Remove the old one */ - unexpect_related(old); - } else if (related_to->helper->max_expected - && (related_to->expecting - >= related_to->helper->max_expected)) { + if (!old && related_to->helper->max_expected + && (related_to->expecting >= related_to->helper->max_expected)) { /* Over limit */ if (net_ratelimit()) printk(KERN_WARNING @@ -937,12 +958,26 @@ int ip_conntrack_expect_related(struct i /* choose the the oldest expectation to evict */ list_for_each_entry(old, &related_to->sibling_list, expected_list) { - /* We *will* find one. */ + /* We may not find one, if being destroyed now. */ if (old->sibling == NULL) - break; + goto found; } + old = NULL; + } - unexpect_related(old); + if (old) { + found: + /* Hold so it doesn't vanish */ + atomic_inc(&old->use); + + WRITE_UNLOCK(&ip_conntrack_lock); + /* Might already be dying... */ + if (del_timer(&expect->timeout)) + __unexpect_related(old); + + /* Drop reference again. */ + ip_conntrack_expect_put(old); + goto again; } /* Initialize */ @@ -956,14 +991,11 @@ int ip_conntrack_expect_related(struct i /* add to global list of expectations */ list_add(&newexp->list, &ip_conntrack_expect_list); - /* add and start timer */ - init_timer(&newexp->timeout); - newexp->timeout.data = (unsigned long)newexp; - newexp->timeout.function = expectation_timed_out; - newexp->timeout.expires = jiffies + related_to->helper->timeout * HZ; + /* start timer */ add_timer(&newexp->timeout); related_to->expecting++; + atomic_inc(&related_to->use); WRITE_UNLOCK(&ip_conntrack_lock); return 0;