Name: Remove Double NAT on LOCAL_OUT Status: Untested Depends: Netfilter/remove-multiple-ranges.patch.gz Signed-off-by: Rusty Russell Normally the NAT code maps source IP/port on the NF_IP_POST_ROUTING hook, and destination IP/port on the NF_IP_PRE_ROUTING or NF_IP_LOCAL_OUT hooks. There are two situations where we do source manipulations on the destination hooks: 1) On NF_IP_LOCAL_OUT, when destination NAT changes the destination interface, we also change the source address, so the packet is the same as if it were generated to go that way in the first place. This is not strictly necessary, I believe. 2) On NF_IP_LOCAL_OUT or NF_IP_PRE_ROUTING, if destination NAT is not sufficient to create a unique tuple, we try changing the source port as well. However, this is also not strictly necessary: if the tuple is not unique, we will also try to change the source on the NF_IP_POST_ROUTING hook. When we finally confirm the connection, if the tuple is still not unique the packet will be dropped (this is required anyway as we could race: the conntrack is not placed in the hash until the packet is about to leave the box anyway). This patch rips that code out to see what breaks. Index: linux-2.6.10-rc2-bk1-Netfilter/net/ipv4/netfilter/ip_nat_core.c =================================================================== --- linux-2.6.10-rc2-bk1-Netfilter.orig/net/ipv4/netfilter/ip_nat_core.c 2004-11-17 22:41:43.823432016 +1100 +++ linux-2.6.10-rc2-bk1-Netfilter/net/ipv4/netfilter/ip_nat_core.c 2004-11-17 22:45:42.261183968 +1100 @@ -180,28 +180,6 @@ return 0; } -#ifdef CONFIG_IP_NF_NAT_LOCAL -/* If it's really a local destination manip, it may need to do a - source manip too. 
*/ -static int -do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp) -{ - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } }; - struct rtable *rt; - - /* FIXME: IPTOS_TOS(iph->tos) --RR */ - if (ip_route_output_key(&rt, &fl) != 0) { - DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n", - NIPQUAD(var_ip)); - return 0; - } - - *other_ipp = rt->rt_src; - ip_rt_put(rt); - return 1; -} -#endif - /* Simple way to iterate through all. */ static inline int fake_cmp(const struct ip_conntrack *ct, u_int32_t src, u_int32_t dst, u_int16_t protonum, @@ -240,38 +218,40 @@ 1-65535, we don't do pro-rata allocation based on ports; we choose the ip with the lowest src-ip/dst-ip/proto usage. */ -static int -find_best_ips_proto_slow(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - const struct ip_conntrack *conntrack, - unsigned int hooknum) +static void +find_best_ips_proto(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + const struct ip_conntrack *conntrack, + unsigned int hooknum) { unsigned int best_score = 0xFFFFFFFF; struct ip_conntrack_tuple best_tuple; - u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip; + u_int32_t *var_ipp; static unsigned int randomness; /* Host order */ u_int32_t minip, maxip, j; - if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) { + /* No IP mapping? Do nothing. */ + if (!(range->flags & IP_NAT_RANGE_MAP_IPS)) + return; + + if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) var_ipp = &tuple->src.ip; - saved_ip = tuple->dst.ip; - other_ipp = &tuple->dst.ip; - } else { + else var_ipp = &tuple->dst.ip; - saved_ip = tuple->src.ip; - other_ipp = &tuple->src.ip; + + /* Fast path: only one choice. 
*/ + if (range->min_ip == range->max_ip) { + *var_ipp = range->min_ip; + return; } - /* Don't do do_extra_mangle unless necessary (overrides - explicit socket bindings, for example) */ - orig_dstip = tuple->dst.ip; - - if (range->flags & IP_NAT_RANGE_MAP_IPS) { - minip = ntohl(range->min_ip); - maxip = ntohl(range->max_ip); - } else - minip = maxip = ntohl(*var_ipp); + minip = ntohl(range->min_ip); + maxip = ntohl(range->max_ip); + + /* FIXME: use hash of ips like ipt_SAME, not randomness. + This way same pairs get same IP: think Internet Banking. + */ randomness++; for (j = 0; j < maxip - minip + 1; j++) { unsigned int score; @@ -279,77 +259,28 @@ *var_ipp = htonl(minip + (randomness + j) % (maxip - minip + 1)); - /* Reset the other ip in case it was mangled by - * do_extra_mangle last time. */ - *other_ipp = saved_ip; - -#ifdef CONFIG_IP_NF_NAT_LOCAL - if (hooknum == NF_IP_LOCAL_OUT - && *var_ipp != orig_dstip - && !do_extra_mangle(*var_ipp, other_ipp)) { - DEBUGP("Range %u %u.%u.%u.%u rt failed!\n", - i, NIPQUAD(*var_ipp)); - /* Can't route? This whole range part is - * probably screwed, but keep trying - * anyway. */ - continue; - } -#endif - /* Count how many others map onto this. */ score = count_maps(tuple->src.ip, tuple->dst.ip, tuple->dst.protonum, conntrack); if (score < best_score) { /* Optimization: doesn't get any better than this. 
*/ if (score == 0) - return 1; + return; best_score = score; best_tuple = *tuple; } } - - if (best_score == 0xFFFFFFFF) - return 0; - *tuple = best_tuple; - return 1; -} - -/* Fast version doesn't iterate through hash chains, but only handles - common case of single IP address (null NAT, masquerade) */ -static int -find_best_ips_proto(struct ip_conntrack_tuple *tuple, - const struct ip_nat_range *range, - const struct ip_conntrack *conntrack, - unsigned int hooknum) -{ - if ((range->flags & IP_NAT_RANGE_MAP_IPS) - && range->min_ip != range->max_ip) - return find_best_ips_proto_slow(tuple, range, conntrack, - hooknum); - - if (range->flags & IP_NAT_RANGE_MAP_IPS) { - if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) - tuple->src.ip = range->min_ip; - else { - /* Only do extra mangle when required (breaks - socket binding) */ -#ifdef CONFIG_IP_NF_NAT_LOCAL - if (tuple->dst.ip != range->min_ip - && hooknum == NF_IP_LOCAL_OUT - && !do_extra_mangle(range->min_ip, &tuple->src.ip)) - return 0; -#endif - tuple->dst.ip = range->min_ip; - } - } - - /* Discard const. */ - return 1; } -static int +/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING, + * we change the source to map into the range. For NF_IP_PRE_ROUTING + * and NF_IP_LOCAL_OUT, we change the destination to map into the + * range. It might not be possible to get a unique tuple, but we try. + * At worst (or if we race), we will end up with a final duplicate in + * __ip_conntrack_confirm and drop the packet. */ +static void get_unique_tuple(struct ip_conntrack_tuple *tuple, const struct ip_conntrack_tuple *orig_tuple, const struct ip_nat_range *range, @@ -370,15 +301,14 @@ if (find_appropriate_src(orig_tuple, tuple, range)) { DEBUGP("get_unique_tuple: Found current src map\n"); if (!ip_nat_used_tuple(tuple, conntrack)) - return 1; + return; } } /* 2) Select the least-used IP/proto combination in the given range. 
*/ *tuple = *orig_tuple; - if (!find_best_ips_proto(tuple, range, conntrack, hooknum)) - return 0; + find_best_ips_proto(tuple, range, conntrack, hooknum); /* 3) The per-protocol part of the manip is made to map into the range to make a unique tuple. */ @@ -388,30 +318,10 @@ || proto->in_range(tuple, HOOK2MANIP(hooknum), &range->min, &range->max)) && !ip_nat_used_tuple(tuple, conntrack)) - return 1; + return; - if (proto->unique_tuple(tuple, range, HOOK2MANIP(hooknum), conntrack)){ - /* Must be unique. */ - IP_NF_ASSERT(!ip_nat_used_tuple(tuple, conntrack)); - return 1; - } - - if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) { - /* Try implicit source NAT; protocol may be able to - play with ports to make it unique. */ - struct ip_nat_range r - = { IP_NAT_RANGE_MAP_IPS, - tuple->src.ip, tuple->src.ip, { 0 }, { 0 } }; - DEBUGP("Trying implicit mapping\n"); - if (proto->unique_tuple(tuple, &r, IP_NAT_MANIP_SRC, - conntrack)) { - /* Must be unique. */ - IP_NF_ASSERT(!ip_nat_used_tuple(tuple, conntrack)); - return 1; - } - DEBUGP("Protocol can't get unique tuple %u.\n", hooknum); - } - return 0; + /* Last change: get protocol to try to obtain unique tuple. */ + proto->unique_tuple(tuple, range, HOOK2MANIP(hooknum), conntrack); } /* Where to manip the reply packets (will be reverse manip). */ @@ -450,11 +360,7 @@ invert_tuplepr(&orig_tp, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple); - if (!get_unique_tuple(&new_tuple, &orig_tp, range,conntrack,hooknum)) { - DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n", - conntrack); - return NF_DROP; - } + get_unique_tuple(&new_tuple, &orig_tp, range, conntrack, hooknum); /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT): the original (A/B/C/D') and the mangled one (E/F/G/H'). @@ -477,6 +383,9 @@ /* Has source changed?. 
*/ if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) { + IP_NF_ASSERT(HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC); + IP_NF_ASSERT(ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)); + /* In this direction, a source manip. */ info->manips[info->num_manips++] = ((struct ip_nat_info_manip) @@ -495,6 +404,8 @@ /* Has destination changed? */ if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) { + IP_NF_ASSERT(HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST); + /* In this direction, a destination manip */ info->manips[info->num_manips++] = ((struct ip_nat_info_manip)