## Automatically generated incremental diff ## From: linux-2.4.22-bk1 ## To: linux-2.4.22-bk2 ## Robot: $Id: make-incremental-diff,v 1.11 2002/02/20 02:59:33 hpa Exp $ diff -urN linux-2.4.22-bk1/Documentation/Configure.help linux-2.4.22-bk2/Documentation/Configure.help --- linux-2.4.22-bk1/Documentation/Configure.help 2003-08-26 15:54:21.000000000 -0700 +++ linux-2.4.22-bk2/Documentation/Configure.help 2003-08-26 15:54:22.000000000 -0700 @@ -3226,6 +3226,190 @@ If you want to compile it as a module, say M here and read <file:Documentation/modules.txt>. If unsure, say `N'. +IP: virtual server support +CONFIG_IP_VS + IP Virtual Server support will let you build a high-performance + virtual server based on a cluster of two or more real servers. This + option must be enabled for at least one of the clustered computers + that will take care of intercepting incoming connections to a + single IP address and scheduling them to real servers. + + Three request dispatching techniques are implemented: virtual + server via NAT, virtual server via tunneling and virtual + server via direct routing. Several scheduling algorithms can + be used to choose which server a connection is directed to, + thus achieving load balancing among the servers. For more + information and its administration program, please visit the + following URL: + http://www.linuxvirtualserver.org/ + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N. + +IP virtual server debugging +CONFIG_IP_VS_DEBUG + Say Y here if you want to get additional messages useful in + debugging the IP virtual server code. You can change the debug + level in /proc/sys/net/ipv4/vs/debug_level + +IPVS connection hash table size (the Nth power of 2) +CONFIG_IP_VS_TAB_BITS + The IPVS connection hash table uses the chaining scheme to handle + hash collisions. Using a big IPVS connection hash table will greatly + reduce conflicts when there are hundreds of thousands of connections + in the hash table. + + Note that the table size must be a power of 2. The table size will + be the value of 2 raised to the power of the number you input. The + number to choose is from 8 to 20; the default is 12, which means a + table size of 4096. Don't choose a number that is too small, + otherwise you will lose performance. You can adapt the table size + yourself, according to your virtual server application. It is good + to set the table size not far below the number of connections per + second multiplied by the average lifetime of a connection in the + table. For example, if your virtual server gets 200 connections per + second and a connection lasts for 200 seconds on average in the + connection table, the table size should be not far below 200x200; a + good choice is a table size of 32768 (2**15). + + Note also that each connection effectively occupies 128 bytes and + each hash entry uses 8 bytes, so you can estimate how much memory is + needed for your box. + +IPVS: round-robin scheduling +CONFIG_IP_VS_RR + The round-robin scheduling algorithm simply directs network + connections to different real servers in a round-robin manner. + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N. + +IPVS: weighted round-robin scheduling +CONFIG_IP_VS_WRR + The weighted round-robin scheduling algorithm directs network + connections to different real servers based on server weights + in a round-robin manner.
Servers with higher weights receive + new connections before those with lower weights; servers with + higher weights get more connections than those with lower weights, + and servers with equal weights get an equal share of connections. + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N. + +IPVS: least-connection scheduling +CONFIG_IP_VS_LC + The least-connection scheduling algorithm directs network + connections to the server with the least number of active + connections. + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N. + +IPVS: weighted least-connection scheduling +CONFIG_IP_VS_WLC + The weighted least-connection scheduling algorithm directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N. + +IPVS: locality-based least-connection scheduling +CONFIG_IP_VS_LBLC + The locality-based least-connection scheduling algorithm is for + destination IP load balancing. It is usually used in cache clusters. + This algorithm usually directs packets destined for an IP address to + the server assigned to that address, if the server is alive and not + overloaded. If the server is overloaded (its number of active + connections is larger than its weight) and there is a server at half + of its load, then the weighted least-connection server is allocated + to this IP address. + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N. + +IPVS: locality-based least-connection with replication scheduling +CONFIG_IP_VS_LBLCR + The locality-based least-connection with replication scheduling + algorithm is also for destination IP load balancing. It is + usually used in cache clusters. It differs from the LBLC scheduling + as follows: the load balancer maintains mappings from a target + to a set of server nodes that can serve the target. Requests for + a target are assigned to the least-connection node in the target's + server set. If all the nodes in the server set are overloaded, + it picks a least-connection node in the cluster and adds it + to the server set for the target. If the server set has not been + modified for the specified time, the most loaded node is removed + from the server set, in order to avoid a high degree of replication. + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N. + +IPVS: destination hashing scheduling +CONFIG_IP_VS_DH + The destination hashing scheduling algorithm assigns network + connections to the servers by looking up a statically assigned + hash table keyed by their destination IP addresses. + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N. + +IPVS: source hashing scheduling +CONFIG_IP_VS_SH + The source hashing scheduling algorithm assigns network + connections to the servers by looking up a statically assigned + hash table keyed by their source IP addresses. + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N.
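The weighted least-connection rule above is easy to state in code. The sketch below is a minimal user-space illustration, not the kernel's ip_vs scheduler; the struct and function names are invented for the example. It picks the server minimizing active_conns/weight, comparing by cross-multiplication so the arithmetic stays in integers, the same constraint the in-kernel code faces.

    /* Weighted least-connection selection -- illustrative sketch only. */
    struct real_server {
            int active_conns;       /* current number of active connections */
            int weight;             /* administrator-assigned capacity weight */
    };

    struct real_server *wlc_pick(struct real_server *srv, int n)
    {
            struct real_server *best = NULL;
            int i;

            for (i = 0; i < n; i++) {
                    if (srv[i].weight <= 0)
                            continue;       /* weight 0: server is drained */
                    /* a/wa < b/wb  <=>  a*wb < b*wa, division-free */
                    if (best == NULL ||
                        (long)srv[i].active_conns * best->weight <
                        (long)best->active_conns * srv[i].weight)
                            best = &srv[i];
            }
            return best;            /* NULL if no server has weight > 0 */
    }

A server whose weight is set to 0 is never chosen, which is how an administrator can quiesce a real server before removing it from the cluster.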
+ +IPVS: shortest expected delay scheduling +CONFIG_IP_VS_SED + The shortest expected delay scheduling algorithm assigns network + connections to the server with the shortest expected delay. The + expected delay that the job will experience is (Ci + 1) / Ui if + sent to the ith server, where Ci is the number of connections + on the ith server and Ui is the fixed service rate (weight) + of the ith server. + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N. + +IPVS: never queue scheduling +CONFIG_IP_VS_NQ + The never queue scheduling algorithm adopts a two-speed model. + When there is an idle server available, the job will be sent to + the idle server, instead of waiting for a fast one. When there + is no idle server available, the job will be sent to the server + that minimizes its expected delay (the Shortest Expected Delay + scheduling algorithm). + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N. + +IPVS: FTP protocol helper +CONFIG_IP_VS_FTP + FTP is a protocol that transfers IP addresses and/or port numbers in + the payload. In the virtual server via Network Address Translation, + the IP address and port number of real servers cannot be sent to + clients in ftp connections directly, so an FTP protocol helper is + required to track the connection and mangle it back to that of the + virtual service. + + If you want to compile it into the kernel, say Y. If you want to + compile it as a module, say M here and read Documentation/modules.txt. + If unsure, say N. + SYN flood protection CONFIG_SYN_COOKIES Normal TCP/IP networking is open to an attack known as "SYN @@ -24918,6 +25102,19 @@ output to the second serial port on these devices. Saying N will cause the debug messages to appear on the first serial port. +Kernel log buffer length shift +CONFIG_LOG_BUF_SHIFT + The kernel log buffer has a fixed size of: + 64 kB (2^16) on MULTIQUAD and IA64, + 128 kB (2^17) on S390, + 32 kB (2^15) on SMP systems, + 16 kB (2^14) on UP systems. + + You can change this size with this parameter, which fixes the bit + shift used to get the buffer length (which must be a + power of 2). E.g. a value of 16 sets the buffer to 64 kB (2^16). + The default value of 0 uses the standard values above. + Disable pgtable cache CONFIG_NO_PGT_CACHE Normally the kernel maintains a `quicklist' of preallocated @@ -27502,6 +27699,12 @@ See http://csrc.nist.gov/encryption/aes/ for more information. +CONFIG_CRYPTO_CAST5 + CAST5 (CAST-128) cipher algorithm. + + The CAST5 encryption algorithm (synonymous with CAST-128) is + described in RFC2144. + CONFIG_CRYPTO_DEFLATE This is the Deflate algorithm (RFC1951), specified for use in IPSec with the IPCOMP protocol (RFC3173, RFC2394). diff -urN linux-2.4.22-bk1/Documentation/crypto/api-intro.txt linux-2.4.22-bk2/Documentation/crypto/api-intro.txt --- linux-2.4.22-bk1/Documentation/crypto/api-intro.txt 2003-08-25 04:44:39.000000000 -0700 +++ linux-2.4.22-bk2/Documentation/crypto/api-intro.txt 2003-08-26 15:54:22.000000000 -0700 @@ -186,7 +186,6 @@ Dag Arne Osvik (Serpent) Brian Gladman (AES) - SHA1 algorithm contributors: Jean-Francois Dive @@ -214,6 +213,9 @@ Kyle McMartin Adam J. Richter +CAST5 algorithm contributors: + Kartikey Mahendra Bhatt (original developers unknown, FSF copyright). + Generic scatterwalk code by Adam J.
Richter Please send any credits updates or corrections to: diff -urN linux-2.4.22-bk1/Documentation/sonypi.txt linux-2.4.22-bk2/Documentation/sonypi.txt --- linux-2.4.22-bk1/Documentation/sonypi.txt 2003-08-25 04:44:39.000000000 -0700 +++ linux-2.4.22-bk2/Documentation/sonypi.txt 2003-08-26 15:54:22.000000000 -0700 @@ -8,7 +8,9 @@ Copyright (C) 2000 Andrew Tridgell This driver enables access to the Sony Programmable I/O Control Device which -can be found in many (all ?) Sony Vaio laptops. +can be found in many Sony Vaio laptops. Some newer Sony laptops (apparently +limited to new FX series laptops, at least the FX501 and the FX702) lack a +sonypi device and are not supported at all by this driver. It will give access (through a user space utility) to some events those laptops generate, like: @@ -96,6 +98,7 @@ SONYPI_THUMBPHRASE_MASK 0x0200 SONYPI_MEYE_MASK 0x0400 SONYPI_MEMORYSTICK_MASK 0x0800 + SONYPI_BATTERY_MASK 0x1000 useinput: if set (which is the default) jogdial events are forwarded to the input subsystem as mouse wheel diff -urN linux-2.4.22-bk1/Documentation/video4linux/meye.txt linux-2.4.22-bk2/Documentation/video4linux/meye.txt --- linux-2.4.22-bk1/Documentation/video4linux/meye.txt 2003-06-13 07:51:29.000000000 -0700 +++ linux-2.4.22-bk2/Documentation/video4linux/meye.txt 2003-08-26 15:54:22.000000000 -0700 @@ -16,6 +16,23 @@ MJPEG hardware grabbing is supported via a private API (see below). +Hardware supported: +------------------- + +This driver supports the 'second' version of the MotionEye camera :) + +The first version was connected directly to the video bus of the Neomagic +video card and is unsupported. + +The second one, made by Kawasaki Steel, is fully supported by this +driver (PCI vendor/device is 0x136b/0xff01). + +The third one, present in recent (roughly the last year's) Picturebooks +(C1M* models), is not supported. The manufacturer has given the specs +to the developers under an NDA (which however allows the development of a +GPL driver), but things are not moving very fast (see +http://r-engine.sourceforge.net/) (PCI vendor/device is 0x10cf/0x2011).
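As background to the PCI IDs quoted above: a 2.4-era PCI driver declares the devices it binds to in a pci_device_id table. The fragment below is a hedged sketch keyed on the supported camera's vendor/device pair (0x136b/0xff01); the table name is invented and this is not the meye driver's actual source.

    #include <linux/module.h>
    #include <linux/pci.h>

    /* Match the Kawasaki Steel MotionEye (vendor 0x136b, device 0xff01).
     * Field order in 2.4: vendor, device, subvendor, subdevice, class,
     * class_mask, driver_data. */
    static struct pci_device_id meye_pci_tbl[] = {
            { 0x136b, 0xff01, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
            { 0, }
    };
    MODULE_DEVICE_TABLE(pci, meye_pci_tbl);

The unsupported third-generation camera (0x10cf/0x2011) would simply not appear in such a table, so the driver never probes it.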
+ Driver options: --------------- diff -urN linux-2.4.22-bk1/MAINTAINERS linux-2.4.22-bk2/MAINTAINERS --- linux-2.4.22-bk1/MAINTAINERS 2003-08-25 04:44:39.000000000 -0700 +++ linux-2.4.22-bk2/MAINTAINERS 2003-08-26 15:54:22.000000000 -0700 @@ -670,7 +670,7 @@ ETHERNET BRIDGE P: Stephen Hemminger M: shemminger@osdl.org -L: bridge@math.leidenuniv.nl +L: bridge@osdl.org W: http://bridge.sourceforge.net/ S: Maintained diff -urN linux-2.4.22-bk1/Makefile linux-2.4.22-bk2/Makefile --- linux-2.4.22-bk1/Makefile 2003-08-26 15:54:21.000000000 -0700 +++ linux-2.4.22-bk2/Makefile 2003-08-26 15:54:22.000000000 -0700 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 22 -EXTRAVERSION = -bk1 +EXTRAVERSION = -bk2 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -urN linux-2.4.22-bk1/arch/i386/config.in linux-2.4.22-bk2/arch/i386/config.in --- linux-2.4.22-bk1/arch/i386/config.in 2003-08-26 15:54:21.000000000 -0700 +++ linux-2.4.22-bk2/arch/i386/config.in 2003-08-26 15:54:22.000000000 -0700 @@ -477,6 +477,8 @@ bool ' Compile the kernel with frame pointers' CONFIG_FRAME_POINTER fi +int 'Kernel messages buffer length shift (0 = default)' CONFIG_LOG_BUF_SHIFT 0 + endmenu source crypto/Config.in diff -urN linux-2.4.22-bk1/arch/i386/kernel/io_apic.c linux-2.4.22-bk2/arch/i386/kernel/io_apic.c --- linux-2.4.22-bk1/arch/i386/kernel/io_apic.c 2003-08-25 04:44:39.000000000 -0700 +++ linux-2.4.22-bk2/arch/i386/kernel/io_apic.c 2003-08-26 15:54:22.000000000 -0700 @@ -169,6 +169,14 @@ { struct IO_APIC_route_entry entry; unsigned long flags; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + spin_lock_irqsave(&ioapic_lock, flags); + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (entry.delivery_mode == dest_SMI) + return; /* * Disable it in the IO-APIC irq-routing table: @@ -1365,6 +1373,13 @@ static void set_ioapic_affinity (unsigned int irq, unsigned long mask) { unsigned long flags; + + /* pick a single cpu for clustered xapics */ + if(clustered_apic_mode == CLUSTERED_APIC_XAPIC){ + int cpu = ffs(mask)-1; + mask = cpu_to_physical_apicid(cpu); + } + /* * Only the first 8 bits are valid. 
*/ diff -urN linux-2.4.22-bk1/arch/i386/kernel/pci-pc.c linux-2.4.22-bk2/arch/i386/kernel/pci-pc.c --- linux-2.4.22-bk1/arch/i386/kernel/pci-pc.c 2003-08-25 04:44:39.000000000 -0700 +++ linux-2.4.22-bk2/arch/i386/kernel/pci-pc.c 2003-08-26 15:54:22.000000000 -0700 @@ -1016,7 +1016,8 @@ "xor %%ah, %%ah\n" "1:" : "=a" (ret), - "=b" (map) + "=b" (map), + "+m" (opt) : "0" (PCIBIOS_GET_ROUTING_OPTIONS), "1" (0), "D" ((long) &opt), diff -urN linux-2.4.22-bk1/arch/x86_64/kernel/io_apic.c linux-2.4.22-bk2/arch/x86_64/kernel/io_apic.c --- linux-2.4.22-bk1/arch/x86_64/kernel/io_apic.c 2003-08-25 04:44:40.000000000 -0700 +++ linux-2.4.22-bk2/arch/x86_64/kernel/io_apic.c 2003-08-26 15:54:22.000000000 -0700 @@ -1762,7 +1762,7 @@ } -int io_apic_set_pci_routing (int ioapic, int pin, int irq) +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) { struct IO_APIC_route_entry entry; unsigned long flags; @@ -1785,18 +1785,21 @@ entry.dest_mode = INT_DELIVERY_MODE; entry.dest.logical.logical_dest = TARGET_CPUS; entry.mask = 1; /* Disabled (masked) */ - entry.trigger = 1; /* Level sensitive */ - entry.polarity = 1; /* Low active */ + entry.trigger = edge_level; + entry.polarity = active_high_low; add_pin_to_irq(irq, ioapic, pin); entry.vector = assign_irq_vector(irq); printk(KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " - "IRQ %d)\n", ioapic, - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq); + "IRQ %d) Mode:%i Active:%i\n", ioapic, + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, edge_level, active_high_low); - irq_desc[irq].handler = &ioapic_level_irq_type; + if (edge_level) + irq_desc[irq].handler = &ioapic_level_irq_type; + else + irq_desc[irq].handler = &ioapic_edge_irq_type; set_intr_gate(entry.vector, interrupt[irq]); diff -urN linux-2.4.22-bk1/arch/x86_64/kernel/mpparse.c linux-2.4.22-bk2/arch/x86_64/kernel/mpparse.c --- linux-2.4.22-bk1/arch/x86_64/kernel/mpparse.c 2003-08-25 04:44:40.000000000 -0700 +++ linux-2.4.22-bk2/arch/x86_64/kernel/mpparse.c 2003-08-26 15:54:22.000000000 -0700 @@ -923,7 +923,7 @@ ioapic_pin = irq - mp_ioapic_routing[ioapic].irq_start; - io_apic_set_pci_routing(ioapic, ioapic_pin, irq); + io_apic_set_pci_routing(ioapic, ioapic_pin, irq, 1, 1); } #endif /*CONFIG_ACPI_HT_ONLY*/ @@ -939,6 +939,8 @@ int ioapic_pin = 0; int irq = 0; int idx, bit = 0; + int edge_level = 0; + int active_high_low = 0; /* * Parsing through the PCI Interrupt Routing Table (PRT) and program @@ -949,11 +951,14 @@ /* Need to get irq for dynamic entry */ if (entry->link.handle) { - irq = acpi_pci_link_get_irq(entry->link.handle, entry->link.index); + irq = acpi_pci_link_get_irq(entry->link.handle, entry->link.index, &edge_level, &active_high_low); if (!irq) continue; - } else + } else { + edge_level = 1; + active_high_low = 1; irq = entry->link.index; + } irq = entry->link.index; ioapic = mp_find_ioapic(irq); @@ -983,7 +988,7 @@ mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); - if (!io_apic_set_pci_routing(ioapic, ioapic_pin, irq)) + if (!io_apic_set_pci_routing(ioapic, ioapic_pin, irq, edge_level, active_high_low)) entry->irq = irq; diff -urN linux-2.4.22-bk1/crypto/Config.in linux-2.4.22-bk2/crypto/Config.in --- linux-2.4.22-bk1/crypto/Config.in 2003-08-25 04:44:40.000000000 -0700 +++ linux-2.4.22-bk2/crypto/Config.in 2003-08-26 15:54:22.000000000 -0700 @@ -70,6 +70,7 @@ tristate ' Twofish cipher algorithm' CONFIG_CRYPTO_TWOFISH tristate ' Serpent cipher algorithm' CONFIG_CRYPTO_SERPENT tristate ' AES cipher algorithms' CONFIG_CRYPTO_AES + tristate ' CAST5 (CAST-128) cipher algorithm' CONFIG_CRYPTO_CAST5 if [ "$CONFIG_INET_IPCOMP" = "y" -o \ "$CONFIG_INET_IPCOMP" = "m" -o
\ "$CONFIG_INET6_IPCOMP" = "y" -o \ diff -urN linux-2.4.22-bk1/crypto/Makefile linux-2.4.22-bk2/crypto/Makefile --- linux-2.4.22-bk1/crypto/Makefile 2003-08-25 04:44:40.000000000 -0700 +++ linux-2.4.22-bk2/crypto/Makefile 2003-08-26 15:54:22.000000000 -0700 @@ -24,6 +24,7 @@ obj-$(CONFIG_CRYPTO_TWOFISH) += twofish.o obj-$(CONFIG_CRYPTO_SERPENT) += serpent.o obj-$(CONFIG_CRYPTO_AES) += aes.o +obj-$(CONFIG_CRYPTO_CAST5) += cast5.o obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o diff -urN linux-2.4.22-bk1/crypto/cast5.c linux-2.4.22-bk2/crypto/cast5.c --- linux-2.4.22-bk1/crypto/cast5.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/crypto/cast5.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,852 @@ +/* Kernel cryptographic api. +* cast5.c - Cast5 cipher algorithm (rfc2144). +* +* Derived from GnuPG implementation of cast5. +* +* Major Changes. +* Complete conformance to rfc2144. +* Supports key size from 40 to 128 bits. +* +* Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. +* Copyright (C) 2003 Kartikey Mahendra Bhatt . +* +* This program is free software; you can redistribute it and/or modify it +* under the terms of GNU General Public License as published by the Free +* Software Foundation; either version 2 of the License, or (at your option) +* any later version. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA +*/ + + +#include +#include +#include +#include +#include + +#define CAST5_BLOCK_SIZE 8 +#define CAST5_MIN_KEY_SIZE 5 +#define CAST5_MAX_KEY_SIZE 16 + +struct cast5_ctx { + u32 Km[16]; + u8 Kr[16]; + int rr; /* rr?number of rounds = 16:number of rounds = 12; (rfc 2144) */ +}; + + +static const u32 s1[256] = { + 0x30fb40d4, 0x9fa0ff0b, 0x6beccd2f, 0x3f258c7a, 0x1e213f2f, + 0x9c004dd3, 0x6003e540, 0xcf9fc949, + 0xbfd4af27, 0x88bbbdb5, 0xe2034090, 0x98d09675, 0x6e63a0e0, + 0x15c361d2, 0xc2e7661d, 0x22d4ff8e, + 0x28683b6f, 0xc07fd059, 0xff2379c8, 0x775f50e2, 0x43c340d3, + 0xdf2f8656, 0x887ca41a, 0xa2d2bd2d, + 0xa1c9e0d6, 0x346c4819, 0x61b76d87, 0x22540f2f, 0x2abe32e1, + 0xaa54166b, 0x22568e3a, 0xa2d341d0, + 0x66db40c8, 0xa784392f, 0x004dff2f, 0x2db9d2de, 0x97943fac, + 0x4a97c1d8, 0x527644b7, 0xb5f437a7, + 0xb82cbaef, 0xd751d159, 0x6ff7f0ed, 0x5a097a1f, 0x827b68d0, + 0x90ecf52e, 0x22b0c054, 0xbc8e5935, + 0x4b6d2f7f, 0x50bb64a2, 0xd2664910, 0xbee5812d, 0xb7332290, + 0xe93b159f, 0xb48ee411, 0x4bff345d, + 0xfd45c240, 0xad31973f, 0xc4f6d02e, 0x55fc8165, 0xd5b1caad, + 0xa1ac2dae, 0xa2d4b76d, 0xc19b0c50, + 0x882240f2, 0x0c6e4f38, 0xa4e4bfd7, 0x4f5ba272, 0x564c1d2f, + 0xc59c5319, 0xb949e354, 0xb04669fe, + 0xb1b6ab8a, 0xc71358dd, 0x6385c545, 0x110f935d, 0x57538ad5, + 0x6a390493, 0xe63d37e0, 0x2a54f6b3, + 0x3a787d5f, 0x6276a0b5, 0x19a6fcdf, 0x7a42206a, 0x29f9d4d5, + 0xf61b1891, 0xbb72275e, 0xaa508167, + 0x38901091, 0xc6b505eb, 0x84c7cb8c, 0x2ad75a0f, 0x874a1427, + 0xa2d1936b, 0x2ad286af, 0xaa56d291, + 0xd7894360, 0x425c750d, 0x93b39e26, 0x187184c9, 0x6c00b32d, + 0x73e2bb14, 0xa0bebc3c, 0x54623779, + 0x64459eab, 0x3f328b82, 0x7718cf82, 0x59a2cea6, 0x04ee002e, + 0x89fe78e6, 0x3fab0950, 0x325ff6c2, + 0x81383f05, 0x6963c5c8, 0x76cb5ad6, 0xd49974c9, 0xca180dcf, + 0x380782d5, 0xc7fa5cf6, 0x8ac31511, + 0x35e79e13, 0x47da91d0, 0xf40f9086, 0xa7e2419e, 0x31366241, + 0x051ef495, 0xaa573b04, 0x4a805d8d, + 0x548300d0, 0x00322a3c, 0xbf64cddf, 0xba57a68e, 0x75c6372b, + 0x50afd341, 
0xa7c13275, 0x915a0bf5, + 0x6b54bfab, 0x2b0b1426, 0xab4cc9d7, 0x449ccd82, 0xf7fbf265, + 0xab85c5f3, 0x1b55db94, 0xaad4e324, + 0xcfa4bd3f, 0x2deaa3e2, 0x9e204d02, 0xc8bd25ac, 0xeadf55b3, + 0xd5bd9e98, 0xe31231b2, 0x2ad5ad6c, + 0x954329de, 0xadbe4528, 0xd8710f69, 0xaa51c90f, 0xaa786bf6, + 0x22513f1e, 0xaa51a79b, 0x2ad344cc, + 0x7b5a41f0, 0xd37cfbad, 0x1b069505, 0x41ece491, 0xb4c332e6, + 0x032268d4, 0xc9600acc, 0xce387e6d, + 0xbf6bb16c, 0x6a70fb78, 0x0d03d9c9, 0xd4df39de, 0xe01063da, + 0x4736f464, 0x5ad328d8, 0xb347cc96, + 0x75bb0fc3, 0x98511bfb, 0x4ffbcc35, 0xb58bcf6a, 0xe11f0abc, + 0xbfc5fe4a, 0xa70aec10, 0xac39570a, + 0x3f04442f, 0x6188b153, 0xe0397a2e, 0x5727cb79, 0x9ceb418f, + 0x1cacd68d, 0x2ad37c96, 0x0175cb9d, + 0xc69dff09, 0xc75b65f0, 0xd9db40d8, 0xec0e7779, 0x4744ead4, + 0xb11c3274, 0xdd24cb9e, 0x7e1c54bd, + 0xf01144f9, 0xd2240eb1, 0x9675b3fd, 0xa3ac3755, 0xd47c27af, + 0x51c85f4d, 0x56907596, 0xa5bb15e6, + 0x580304f0, 0xca042cf1, 0x011a37ea, 0x8dbfaadb, 0x35ba3e4a, + 0x3526ffa0, 0xc37b4d09, 0xbc306ed9, + 0x98a52666, 0x5648f725, 0xff5e569d, 0x0ced63d0, 0x7c63b2cf, + 0x700b45e1, 0xd5ea50f1, 0x85a92872, + 0xaf1fbda7, 0xd4234870, 0xa7870bf3, 0x2d3b4d79, 0x42e04198, + 0x0cd0ede7, 0x26470db8, 0xf881814c, + 0x474d6ad7, 0x7c0c5e5c, 0xd1231959, 0x381b7298, 0xf5d2f4db, + 0xab838653, 0x6e2f1e23, 0x83719c9e, + 0xbd91e046, 0x9a56456e, 0xdc39200c, 0x20c8c571, 0x962bda1c, + 0xe1e696ff, 0xb141ab08, 0x7cca89b9, + 0x1a69e783, 0x02cc4843, 0xa2f7c579, 0x429ef47d, 0x427b169c, + 0x5ac9f049, 0xdd8f0f00, 0x5c8165bf +}; +static const u32 s2[256] = { + 0x1f201094, 0xef0ba75b, 0x69e3cf7e, 0x393f4380, 0xfe61cf7a, + 0xeec5207a, 0x55889c94, 0x72fc0651, + 0xada7ef79, 0x4e1d7235, 0xd55a63ce, 0xde0436ba, 0x99c430ef, + 0x5f0c0794, 0x18dcdb7d, 0xa1d6eff3, + 0xa0b52f7b, 0x59e83605, 0xee15b094, 0xe9ffd909, 0xdc440086, + 0xef944459, 0xba83ccb3, 0xe0c3cdfb, + 0xd1da4181, 0x3b092ab1, 0xf997f1c1, 0xa5e6cf7b, 0x01420ddb, + 0xe4e7ef5b, 0x25a1ff41, 0xe180f806, + 0x1fc41080, 0x179bee7a, 0xd37ac6a9, 0xfe5830a4, 0x98de8b7f, + 0x77e83f4e, 0x79929269, 0x24fa9f7b, + 0xe113c85b, 0xacc40083, 0xd7503525, 0xf7ea615f, 0x62143154, + 0x0d554b63, 0x5d681121, 0xc866c359, + 0x3d63cf73, 0xcee234c0, 0xd4d87e87, 0x5c672b21, 0x071f6181, + 0x39f7627f, 0x361e3084, 0xe4eb573b, + 0x602f64a4, 0xd63acd9c, 0x1bbc4635, 0x9e81032d, 0x2701f50c, + 0x99847ab4, 0xa0e3df79, 0xba6cf38c, + 0x10843094, 0x2537a95e, 0xf46f6ffe, 0xa1ff3b1f, 0x208cfb6a, + 0x8f458c74, 0xd9e0a227, 0x4ec73a34, + 0xfc884f69, 0x3e4de8df, 0xef0e0088, 0x3559648d, 0x8a45388c, + 0x1d804366, 0x721d9bfd, 0xa58684bb, + 0xe8256333, 0x844e8212, 0x128d8098, 0xfed33fb4, 0xce280ae1, + 0x27e19ba5, 0xd5a6c252, 0xe49754bd, + 0xc5d655dd, 0xeb667064, 0x77840b4d, 0xa1b6a801, 0x84db26a9, + 0xe0b56714, 0x21f043b7, 0xe5d05860, + 0x54f03084, 0x066ff472, 0xa31aa153, 0xdadc4755, 0xb5625dbf, + 0x68561be6, 0x83ca6b94, 0x2d6ed23b, + 0xeccf01db, 0xa6d3d0ba, 0xb6803d5c, 0xaf77a709, 0x33b4a34c, + 0x397bc8d6, 0x5ee22b95, 0x5f0e5304, + 0x81ed6f61, 0x20e74364, 0xb45e1378, 0xde18639b, 0x881ca122, + 0xb96726d1, 0x8049a7e8, 0x22b7da7b, + 0x5e552d25, 0x5272d237, 0x79d2951c, 0xc60d894c, 0x488cb402, + 0x1ba4fe5b, 0xa4b09f6b, 0x1ca815cf, + 0xa20c3005, 0x8871df63, 0xb9de2fcb, 0x0cc6c9e9, 0x0beeff53, + 0xe3214517, 0xb4542835, 0x9f63293c, + 0xee41e729, 0x6e1d2d7c, 0x50045286, 0x1e6685f3, 0xf33401c6, + 0x30a22c95, 0x31a70850, 0x60930f13, + 0x73f98417, 0xa1269859, 0xec645c44, 0x52c877a9, 0xcdff33a6, + 0xa02b1741, 0x7cbad9a2, 0x2180036f, + 0x50d99c08, 0xcb3f4861, 0xc26bd765, 0x64a3f6ab, 0x80342676, + 0x25a75e7b, 0xe4e6d1fc, 
0x20c710e6, + 0xcdf0b680, 0x17844d3b, 0x31eef84d, 0x7e0824e4, 0x2ccb49eb, + 0x846a3bae, 0x8ff77888, 0xee5d60f6, + 0x7af75673, 0x2fdd5cdb, 0xa11631c1, 0x30f66f43, 0xb3faec54, + 0x157fd7fa, 0xef8579cc, 0xd152de58, + 0xdb2ffd5e, 0x8f32ce19, 0x306af97a, 0x02f03ef8, 0x99319ad5, + 0xc242fa0f, 0xa7e3ebb0, 0xc68e4906, + 0xb8da230c, 0x80823028, 0xdcdef3c8, 0xd35fb171, 0x088a1bc8, + 0xbec0c560, 0x61a3c9e8, 0xbca8f54d, + 0xc72feffa, 0x22822e99, 0x82c570b4, 0xd8d94e89, 0x8b1c34bc, + 0x301e16e6, 0x273be979, 0xb0ffeaa6, + 0x61d9b8c6, 0x00b24869, 0xb7ffce3f, 0x08dc283b, 0x43daf65a, + 0xf7e19798, 0x7619b72f, 0x8f1c9ba4, + 0xdc8637a0, 0x16a7d3b1, 0x9fc393b7, 0xa7136eeb, 0xc6bcc63e, + 0x1a513742, 0xef6828bc, 0x520365d6, + 0x2d6a77ab, 0x3527ed4b, 0x821fd216, 0x095c6e2e, 0xdb92f2fb, + 0x5eea29cb, 0x145892f5, 0x91584f7f, + 0x5483697b, 0x2667a8cc, 0x85196048, 0x8c4bacea, 0x833860d4, + 0x0d23e0f9, 0x6c387e8a, 0x0ae6d249, + 0xb284600c, 0xd835731d, 0xdcb1c647, 0xac4c56ea, 0x3ebd81b3, + 0x230eabb0, 0x6438bc87, 0xf0b5b1fa, + 0x8f5ea2b3, 0xfc184642, 0x0a036b7a, 0x4fb089bd, 0x649da589, + 0xa345415e, 0x5c038323, 0x3e5d3bb9, + 0x43d79572, 0x7e6dd07c, 0x06dfdf1e, 0x6c6cc4ef, 0x7160a539, + 0x73bfbe70, 0x83877605, 0x4523ecf1 +}; +static const u32 s3[256] = { + 0x8defc240, 0x25fa5d9f, 0xeb903dbf, 0xe810c907, 0x47607fff, + 0x369fe44b, 0x8c1fc644, 0xaececa90, + 0xbeb1f9bf, 0xeefbcaea, 0xe8cf1950, 0x51df07ae, 0x920e8806, + 0xf0ad0548, 0xe13c8d83, 0x927010d5, + 0x11107d9f, 0x07647db9, 0xb2e3e4d4, 0x3d4f285e, 0xb9afa820, + 0xfade82e0, 0xa067268b, 0x8272792e, + 0x553fb2c0, 0x489ae22b, 0xd4ef9794, 0x125e3fbc, 0x21fffcee, + 0x825b1bfd, 0x9255c5ed, 0x1257a240, + 0x4e1a8302, 0xbae07fff, 0x528246e7, 0x8e57140e, 0x3373f7bf, + 0x8c9f8188, 0xa6fc4ee8, 0xc982b5a5, + 0xa8c01db7, 0x579fc264, 0x67094f31, 0xf2bd3f5f, 0x40fff7c1, + 0x1fb78dfc, 0x8e6bd2c1, 0x437be59b, + 0x99b03dbf, 0xb5dbc64b, 0x638dc0e6, 0x55819d99, 0xa197c81c, + 0x4a012d6e, 0xc5884a28, 0xccc36f71, + 0xb843c213, 0x6c0743f1, 0x8309893c, 0x0feddd5f, 0x2f7fe850, + 0xd7c07f7e, 0x02507fbf, 0x5afb9a04, + 0xa747d2d0, 0x1651192e, 0xaf70bf3e, 0x58c31380, 0x5f98302e, + 0x727cc3c4, 0x0a0fb402, 0x0f7fef82, + 0x8c96fdad, 0x5d2c2aae, 0x8ee99a49, 0x50da88b8, 0x8427f4a0, + 0x1eac5790, 0x796fb449, 0x8252dc15, + 0xefbd7d9b, 0xa672597d, 0xada840d8, 0x45f54504, 0xfa5d7403, + 0xe83ec305, 0x4f91751a, 0x925669c2, + 0x23efe941, 0xa903f12e, 0x60270df2, 0x0276e4b6, 0x94fd6574, + 0x927985b2, 0x8276dbcb, 0x02778176, + 0xf8af918d, 0x4e48f79e, 0x8f616ddf, 0xe29d840e, 0x842f7d83, + 0x340ce5c8, 0x96bbb682, 0x93b4b148, + 0xef303cab, 0x984faf28, 0x779faf9b, 0x92dc560d, 0x224d1e20, + 0x8437aa88, 0x7d29dc96, 0x2756d3dc, + 0x8b907cee, 0xb51fd240, 0xe7c07ce3, 0xe566b4a1, 0xc3e9615e, + 0x3cf8209d, 0x6094d1e3, 0xcd9ca341, + 0x5c76460e, 0x00ea983b, 0xd4d67881, 0xfd47572c, 0xf76cedd9, + 0xbda8229c, 0x127dadaa, 0x438a074e, + 0x1f97c090, 0x081bdb8a, 0x93a07ebe, 0xb938ca15, 0x97b03cff, + 0x3dc2c0f8, 0x8d1ab2ec, 0x64380e51, + 0x68cc7bfb, 0xd90f2788, 0x12490181, 0x5de5ffd4, 0xdd7ef86a, + 0x76a2e214, 0xb9a40368, 0x925d958f, + 0x4b39fffa, 0xba39aee9, 0xa4ffd30b, 0xfaf7933b, 0x6d498623, + 0x193cbcfa, 0x27627545, 0x825cf47a, + 0x61bd8ba0, 0xd11e42d1, 0xcead04f4, 0x127ea392, 0x10428db7, + 0x8272a972, 0x9270c4a8, 0x127de50b, + 0x285ba1c8, 0x3c62f44f, 0x35c0eaa5, 0xe805d231, 0x428929fb, + 0xb4fcdf82, 0x4fb66a53, 0x0e7dc15b, + 0x1f081fab, 0x108618ae, 0xfcfd086d, 0xf9ff2889, 0x694bcc11, + 0x236a5cae, 0x12deca4d, 0x2c3f8cc5, + 0xd2d02dfe, 0xf8ef5896, 0xe4cf52da, 0x95155b67, 0x494a488c, + 0xb9b6a80c, 0x5c8f82bc, 0x89d36b45, + 
0x3a609437, 0xec00c9a9, 0x44715253, 0x0a874b49, 0xd773bc40, + 0x7c34671c, 0x02717ef6, 0x4feb5536, + 0xa2d02fff, 0xd2bf60c4, 0xd43f03c0, 0x50b4ef6d, 0x07478cd1, + 0x006e1888, 0xa2e53f55, 0xb9e6d4bc, + 0xa2048016, 0x97573833, 0xd7207d67, 0xde0f8f3d, 0x72f87b33, + 0xabcc4f33, 0x7688c55d, 0x7b00a6b0, + 0x947b0001, 0x570075d2, 0xf9bb88f8, 0x8942019e, 0x4264a5ff, + 0x856302e0, 0x72dbd92b, 0xee971b69, + 0x6ea22fde, 0x5f08ae2b, 0xaf7a616d, 0xe5c98767, 0xcf1febd2, + 0x61efc8c2, 0xf1ac2571, 0xcc8239c2, + 0x67214cb8, 0xb1e583d1, 0xb7dc3e62, 0x7f10bdce, 0xf90a5c38, + 0x0ff0443d, 0x606e6dc6, 0x60543a49, + 0x5727c148, 0x2be98a1d, 0x8ab41738, 0x20e1be24, 0xaf96da0f, + 0x68458425, 0x99833be5, 0x600d457d, + 0x282f9350, 0x8334b362, 0xd91d1120, 0x2b6d8da0, 0x642b1e31, + 0x9c305a00, 0x52bce688, 0x1b03588a, + 0xf7baefd5, 0x4142ed9c, 0xa4315c11, 0x83323ec5, 0xdfef4636, + 0xa133c501, 0xe9d3531c, 0xee353783 +}; +static const u32 s4[256] = { + 0x9db30420, 0x1fb6e9de, 0xa7be7bef, 0xd273a298, 0x4a4f7bdb, + 0x64ad8c57, 0x85510443, 0xfa020ed1, + 0x7e287aff, 0xe60fb663, 0x095f35a1, 0x79ebf120, 0xfd059d43, + 0x6497b7b1, 0xf3641f63, 0x241e4adf, + 0x28147f5f, 0x4fa2b8cd, 0xc9430040, 0x0cc32220, 0xfdd30b30, + 0xc0a5374f, 0x1d2d00d9, 0x24147b15, + 0xee4d111a, 0x0fca5167, 0x71ff904c, 0x2d195ffe, 0x1a05645f, + 0x0c13fefe, 0x081b08ca, 0x05170121, + 0x80530100, 0xe83e5efe, 0xac9af4f8, 0x7fe72701, 0xd2b8ee5f, + 0x06df4261, 0xbb9e9b8a, 0x7293ea25, + 0xce84ffdf, 0xf5718801, 0x3dd64b04, 0xa26f263b, 0x7ed48400, + 0x547eebe6, 0x446d4ca0, 0x6cf3d6f5, + 0x2649abdf, 0xaea0c7f5, 0x36338cc1, 0x503f7e93, 0xd3772061, + 0x11b638e1, 0x72500e03, 0xf80eb2bb, + 0xabe0502e, 0xec8d77de, 0x57971e81, 0xe14f6746, 0xc9335400, + 0x6920318f, 0x081dbb99, 0xffc304a5, + 0x4d351805, 0x7f3d5ce3, 0xa6c866c6, 0x5d5bcca9, 0xdaec6fea, + 0x9f926f91, 0x9f46222f, 0x3991467d, + 0xa5bf6d8e, 0x1143c44f, 0x43958302, 0xd0214eeb, 0x022083b8, + 0x3fb6180c, 0x18f8931e, 0x281658e6, + 0x26486e3e, 0x8bd78a70, 0x7477e4c1, 0xb506e07c, 0xf32d0a25, + 0x79098b02, 0xe4eabb81, 0x28123b23, + 0x69dead38, 0x1574ca16, 0xdf871b62, 0x211c40b7, 0xa51a9ef9, + 0x0014377b, 0x041e8ac8, 0x09114003, + 0xbd59e4d2, 0xe3d156d5, 0x4fe876d5, 0x2f91a340, 0x557be8de, + 0x00eae4a7, 0x0ce5c2ec, 0x4db4bba6, + 0xe756bdff, 0xdd3369ac, 0xec17b035, 0x06572327, 0x99afc8b0, + 0x56c8c391, 0x6b65811c, 0x5e146119, + 0x6e85cb75, 0xbe07c002, 0xc2325577, 0x893ff4ec, 0x5bbfc92d, + 0xd0ec3b25, 0xb7801ab7, 0x8d6d3b24, + 0x20c763ef, 0xc366a5fc, 0x9c382880, 0x0ace3205, 0xaac9548a, + 0xeca1d7c7, 0x041afa32, 0x1d16625a, + 0x6701902c, 0x9b757a54, 0x31d477f7, 0x9126b031, 0x36cc6fdb, + 0xc70b8b46, 0xd9e66a48, 0x56e55a79, + 0x026a4ceb, 0x52437eff, 0x2f8f76b4, 0x0df980a5, 0x8674cde3, + 0xedda04eb, 0x17a9be04, 0x2c18f4df, + 0xb7747f9d, 0xab2af7b4, 0xefc34d20, 0x2e096b7c, 0x1741a254, + 0xe5b6a035, 0x213d42f6, 0x2c1c7c26, + 0x61c2f50f, 0x6552daf9, 0xd2c231f8, 0x25130f69, 0xd8167fa2, + 0x0418f2c8, 0x001a96a6, 0x0d1526ab, + 0x63315c21, 0x5e0a72ec, 0x49bafefd, 0x187908d9, 0x8d0dbd86, + 0x311170a7, 0x3e9b640c, 0xcc3e10d7, + 0xd5cad3b6, 0x0caec388, 0xf73001e1, 0x6c728aff, 0x71eae2a1, + 0x1f9af36e, 0xcfcbd12f, 0xc1de8417, + 0xac07be6b, 0xcb44a1d8, 0x8b9b0f56, 0x013988c3, 0xb1c52fca, + 0xb4be31cd, 0xd8782806, 0x12a3a4e2, + 0x6f7de532, 0x58fd7eb6, 0xd01ee900, 0x24adffc2, 0xf4990fc5, + 0x9711aac5, 0x001d7b95, 0x82e5e7d2, + 0x109873f6, 0x00613096, 0xc32d9521, 0xada121ff, 0x29908415, + 0x7fbb977f, 0xaf9eb3db, 0x29c9ed2a, + 0x5ce2a465, 0xa730f32c, 0xd0aa3fe8, 0x8a5cc091, 0xd49e2ce7, + 0x0ce454a9, 0xd60acd86, 0x015f1919, + 0x77079103, 
0xdea03af6, 0x78a8565e, 0xdee356df, 0x21f05cbe, + 0x8b75e387, 0xb3c50651, 0xb8a5c3ef, + 0xd8eeb6d2, 0xe523be77, 0xc2154529, 0x2f69efdf, 0xafe67afb, + 0xf470c4b2, 0xf3e0eb5b, 0xd6cc9876, + 0x39e4460c, 0x1fda8538, 0x1987832f, 0xca007367, 0xa99144f8, + 0x296b299e, 0x492fc295, 0x9266beab, + 0xb5676e69, 0x9bd3ddda, 0xdf7e052f, 0xdb25701c, 0x1b5e51ee, + 0xf65324e6, 0x6afce36c, 0x0316cc04, + 0x8644213e, 0xb7dc59d0, 0x7965291f, 0xccd6fd43, 0x41823979, + 0x932bcdf6, 0xb657c34d, 0x4edfd282, + 0x7ae5290c, 0x3cb9536b, 0x851e20fe, 0x9833557e, 0x13ecf0b0, + 0xd3ffb372, 0x3f85c5c1, 0x0aef7ed2 +}; +static const u32 s5[256] = { + 0x7ec90c04, 0x2c6e74b9, 0x9b0e66df, 0xa6337911, 0xb86a7fff, + 0x1dd358f5, 0x44dd9d44, 0x1731167f, + 0x08fbf1fa, 0xe7f511cc, 0xd2051b00, 0x735aba00, 0x2ab722d8, + 0x386381cb, 0xacf6243a, 0x69befd7a, + 0xe6a2e77f, 0xf0c720cd, 0xc4494816, 0xccf5c180, 0x38851640, + 0x15b0a848, 0xe68b18cb, 0x4caadeff, + 0x5f480a01, 0x0412b2aa, 0x259814fc, 0x41d0efe2, 0x4e40b48d, + 0x248eb6fb, 0x8dba1cfe, 0x41a99b02, + 0x1a550a04, 0xba8f65cb, 0x7251f4e7, 0x95a51725, 0xc106ecd7, + 0x97a5980a, 0xc539b9aa, 0x4d79fe6a, + 0xf2f3f763, 0x68af8040, 0xed0c9e56, 0x11b4958b, 0xe1eb5a88, + 0x8709e6b0, 0xd7e07156, 0x4e29fea7, + 0x6366e52d, 0x02d1c000, 0xc4ac8e05, 0x9377f571, 0x0c05372a, + 0x578535f2, 0x2261be02, 0xd642a0c9, + 0xdf13a280, 0x74b55bd2, 0x682199c0, 0xd421e5ec, 0x53fb3ce8, + 0xc8adedb3, 0x28a87fc9, 0x3d959981, + 0x5c1ff900, 0xfe38d399, 0x0c4eff0b, 0x062407ea, 0xaa2f4fb1, + 0x4fb96976, 0x90c79505, 0xb0a8a774, + 0xef55a1ff, 0xe59ca2c2, 0xa6b62d27, 0xe66a4263, 0xdf65001f, + 0x0ec50966, 0xdfdd55bc, 0x29de0655, + 0x911e739a, 0x17af8975, 0x32c7911c, 0x89f89468, 0x0d01e980, + 0x524755f4, 0x03b63cc9, 0x0cc844b2, + 0xbcf3f0aa, 0x87ac36e9, 0xe53a7426, 0x01b3d82b, 0x1a9e7449, + 0x64ee2d7e, 0xcddbb1da, 0x01c94910, + 0xb868bf80, 0x0d26f3fd, 0x9342ede7, 0x04a5c284, 0x636737b6, + 0x50f5b616, 0xf24766e3, 0x8eca36c1, + 0x136e05db, 0xfef18391, 0xfb887a37, 0xd6e7f7d4, 0xc7fb7dc9, + 0x3063fcdf, 0xb6f589de, 0xec2941da, + 0x26e46695, 0xb7566419, 0xf654efc5, 0xd08d58b7, 0x48925401, + 0xc1bacb7f, 0xe5ff550f, 0xb6083049, + 0x5bb5d0e8, 0x87d72e5a, 0xab6a6ee1, 0x223a66ce, 0xc62bf3cd, + 0x9e0885f9, 0x68cb3e47, 0x086c010f, + 0xa21de820, 0xd18b69de, 0xf3f65777, 0xfa02c3f6, 0x407edac3, + 0xcbb3d550, 0x1793084d, 0xb0d70eba, + 0x0ab378d5, 0xd951fb0c, 0xded7da56, 0x4124bbe4, 0x94ca0b56, + 0x0f5755d1, 0xe0e1e56e, 0x6184b5be, + 0x580a249f, 0x94f74bc0, 0xe327888e, 0x9f7b5561, 0xc3dc0280, + 0x05687715, 0x646c6bd7, 0x44904db3, + 0x66b4f0a3, 0xc0f1648a, 0x697ed5af, 0x49e92ff6, 0x309e374f, + 0x2cb6356a, 0x85808573, 0x4991f840, + 0x76f0ae02, 0x083be84d, 0x28421c9a, 0x44489406, 0x736e4cb8, + 0xc1092910, 0x8bc95fc6, 0x7d869cf4, + 0x134f616f, 0x2e77118d, 0xb31b2be1, 0xaa90b472, 0x3ca5d717, + 0x7d161bba, 0x9cad9010, 0xaf462ba2, + 0x9fe459d2, 0x45d34559, 0xd9f2da13, 0xdbc65487, 0xf3e4f94e, + 0x176d486f, 0x097c13ea, 0x631da5c7, + 0x445f7382, 0x175683f4, 0xcdc66a97, 0x70be0288, 0xb3cdcf72, + 0x6e5dd2f3, 0x20936079, 0x459b80a5, + 0xbe60e2db, 0xa9c23101, 0xeba5315c, 0x224e42f2, 0x1c5c1572, + 0xf6721b2c, 0x1ad2fff3, 0x8c25404e, + 0x324ed72f, 0x4067b7fd, 0x0523138e, 0x5ca3bc78, 0xdc0fd66e, + 0x75922283, 0x784d6b17, 0x58ebb16e, + 0x44094f85, 0x3f481d87, 0xfcfeae7b, 0x77b5ff76, 0x8c2302bf, + 0xaaf47556, 0x5f46b02a, 0x2b092801, + 0x3d38f5f7, 0x0ca81f36, 0x52af4a8a, 0x66d5e7c0, 0xdf3b0874, + 0x95055110, 0x1b5ad7a8, 0xf61ed5ad, + 0x6cf6e479, 0x20758184, 0xd0cefa65, 0x88f7be58, 0x4a046826, + 0x0ff6f8f3, 0xa09c7f70, 0x5346aba0, + 0x5ce96c28, 0xe176eda3, 
0x6bac307f, 0x376829d2, 0x85360fa9, + 0x17e3fe2a, 0x24b79767, 0xf5a96b20, + 0xd6cd2595, 0x68ff1ebf, 0x7555442c, 0xf19f06be, 0xf9e0659a, + 0xeeb9491d, 0x34010718, 0xbb30cab8, + 0xe822fe15, 0x88570983, 0x750e6249, 0xda627e55, 0x5e76ffa8, + 0xb1534546, 0x6d47de08, 0xefe9e7d4 +}; +static const u32 s6[256] = { + 0xf6fa8f9d, 0x2cac6ce1, 0x4ca34867, 0xe2337f7c, 0x95db08e7, + 0x016843b4, 0xeced5cbc, 0x325553ac, + 0xbf9f0960, 0xdfa1e2ed, 0x83f0579d, 0x63ed86b9, 0x1ab6a6b8, + 0xde5ebe39, 0xf38ff732, 0x8989b138, + 0x33f14961, 0xc01937bd, 0xf506c6da, 0xe4625e7e, 0xa308ea99, + 0x4e23e33c, 0x79cbd7cc, 0x48a14367, + 0xa3149619, 0xfec94bd5, 0xa114174a, 0xeaa01866, 0xa084db2d, + 0x09a8486f, 0xa888614a, 0x2900af98, + 0x01665991, 0xe1992863, 0xc8f30c60, 0x2e78ef3c, 0xd0d51932, + 0xcf0fec14, 0xf7ca07d2, 0xd0a82072, + 0xfd41197e, 0x9305a6b0, 0xe86be3da, 0x74bed3cd, 0x372da53c, + 0x4c7f4448, 0xdab5d440, 0x6dba0ec3, + 0x083919a7, 0x9fbaeed9, 0x49dbcfb0, 0x4e670c53, 0x5c3d9c01, + 0x64bdb941, 0x2c0e636a, 0xba7dd9cd, + 0xea6f7388, 0xe70bc762, 0x35f29adb, 0x5c4cdd8d, 0xf0d48d8c, + 0xb88153e2, 0x08a19866, 0x1ae2eac8, + 0x284caf89, 0xaa928223, 0x9334be53, 0x3b3a21bf, 0x16434be3, + 0x9aea3906, 0xefe8c36e, 0xf890cdd9, + 0x80226dae, 0xc340a4a3, 0xdf7e9c09, 0xa694a807, 0x5b7c5ecc, + 0x221db3a6, 0x9a69a02f, 0x68818a54, + 0xceb2296f, 0x53c0843a, 0xfe893655, 0x25bfe68a, 0xb4628abc, + 0xcf222ebf, 0x25ac6f48, 0xa9a99387, + 0x53bddb65, 0xe76ffbe7, 0xe967fd78, 0x0ba93563, 0x8e342bc1, + 0xe8a11be9, 0x4980740d, 0xc8087dfc, + 0x8de4bf99, 0xa11101a0, 0x7fd37975, 0xda5a26c0, 0xe81f994f, + 0x9528cd89, 0xfd339fed, 0xb87834bf, + 0x5f04456d, 0x22258698, 0xc9c4c83b, 0x2dc156be, 0x4f628daa, + 0x57f55ec5, 0xe2220abe, 0xd2916ebf, + 0x4ec75b95, 0x24f2c3c0, 0x42d15d99, 0xcd0d7fa0, 0x7b6e27ff, + 0xa8dc8af0, 0x7345c106, 0xf41e232f, + 0x35162386, 0xe6ea8926, 0x3333b094, 0x157ec6f2, 0x372b74af, + 0x692573e4, 0xe9a9d848, 0xf3160289, + 0x3a62ef1d, 0xa787e238, 0xf3a5f676, 0x74364853, 0x20951063, + 0x4576698d, 0xb6fad407, 0x592af950, + 0x36f73523, 0x4cfb6e87, 0x7da4cec0, 0x6c152daa, 0xcb0396a8, + 0xc50dfe5d, 0xfcd707ab, 0x0921c42f, + 0x89dff0bb, 0x5fe2be78, 0x448f4f33, 0x754613c9, 0x2b05d08d, + 0x48b9d585, 0xdc049441, 0xc8098f9b, + 0x7dede786, 0xc39a3373, 0x42410005, 0x6a091751, 0x0ef3c8a6, + 0x890072d6, 0x28207682, 0xa9a9f7be, + 0xbf32679d, 0xd45b5b75, 0xb353fd00, 0xcbb0e358, 0x830f220a, + 0x1f8fb214, 0xd372cf08, 0xcc3c4a13, + 0x8cf63166, 0x061c87be, 0x88c98f88, 0x6062e397, 0x47cf8e7a, + 0xb6c85283, 0x3cc2acfb, 0x3fc06976, + 0x4e8f0252, 0x64d8314d, 0xda3870e3, 0x1e665459, 0xc10908f0, + 0x513021a5, 0x6c5b68b7, 0x822f8aa0, + 0x3007cd3e, 0x74719eef, 0xdc872681, 0x073340d4, 0x7e432fd9, + 0x0c5ec241, 0x8809286c, 0xf592d891, + 0x08a930f6, 0x957ef305, 0xb7fbffbd, 0xc266e96f, 0x6fe4ac98, + 0xb173ecc0, 0xbc60b42a, 0x953498da, + 0xfba1ae12, 0x2d4bd736, 0x0f25faab, 0xa4f3fceb, 0xe2969123, + 0x257f0c3d, 0x9348af49, 0x361400bc, + 0xe8816f4a, 0x3814f200, 0xa3f94043, 0x9c7a54c2, 0xbc704f57, + 0xda41e7f9, 0xc25ad33a, 0x54f4a084, + 0xb17f5505, 0x59357cbe, 0xedbd15c8, 0x7f97c5ab, 0xba5ac7b5, + 0xb6f6deaf, 0x3a479c3a, 0x5302da25, + 0x653d7e6a, 0x54268d49, 0x51a477ea, 0x5017d55b, 0xd7d25d88, + 0x44136c76, 0x0404a8c8, 0xb8e5a121, + 0xb81a928a, 0x60ed5869, 0x97c55b96, 0xeaec991b, 0x29935913, + 0x01fdb7f1, 0x088e8dfa, 0x9ab6f6f5, + 0x3b4cbf9f, 0x4a5de3ab, 0xe6051d35, 0xa0e1d855, 0xd36b4cf1, + 0xf544edeb, 0xb0e93524, 0xbebb8fbd, + 0xa2d762cf, 0x49c92f54, 0x38b5f331, 0x7128a454, 0x48392905, + 0xa65b1db8, 0x851c97bd, 0xd675cf2f +}; +static const u32 s7[256] = { + 0x85e04019, 
0x332bf567, 0x662dbfff, 0xcfc65693, 0x2a8d7f6f, + 0xab9bc912, 0xde6008a1, 0x2028da1f, + 0x0227bce7, 0x4d642916, 0x18fac300, 0x50f18b82, 0x2cb2cb11, + 0xb232e75c, 0x4b3695f2, 0xb28707de, + 0xa05fbcf6, 0xcd4181e9, 0xe150210c, 0xe24ef1bd, 0xb168c381, + 0xfde4e789, 0x5c79b0d8, 0x1e8bfd43, + 0x4d495001, 0x38be4341, 0x913cee1d, 0x92a79c3f, 0x089766be, + 0xbaeeadf4, 0x1286becf, 0xb6eacb19, + 0x2660c200, 0x7565bde4, 0x64241f7a, 0x8248dca9, 0xc3b3ad66, + 0x28136086, 0x0bd8dfa8, 0x356d1cf2, + 0x107789be, 0xb3b2e9ce, 0x0502aa8f, 0x0bc0351e, 0x166bf52a, + 0xeb12ff82, 0xe3486911, 0xd34d7516, + 0x4e7b3aff, 0x5f43671b, 0x9cf6e037, 0x4981ac83, 0x334266ce, + 0x8c9341b7, 0xd0d854c0, 0xcb3a6c88, + 0x47bc2829, 0x4725ba37, 0xa66ad22b, 0x7ad61f1e, 0x0c5cbafa, + 0x4437f107, 0xb6e79962, 0x42d2d816, + 0x0a961288, 0xe1a5c06e, 0x13749e67, 0x72fc081a, 0xb1d139f7, + 0xf9583745, 0xcf19df58, 0xbec3f756, + 0xc06eba30, 0x07211b24, 0x45c28829, 0xc95e317f, 0xbc8ec511, + 0x38bc46e9, 0xc6e6fa14, 0xbae8584a, + 0xad4ebc46, 0x468f508b, 0x7829435f, 0xf124183b, 0x821dba9f, + 0xaff60ff4, 0xea2c4e6d, 0x16e39264, + 0x92544a8b, 0x009b4fc3, 0xaba68ced, 0x9ac96f78, 0x06a5b79a, + 0xb2856e6e, 0x1aec3ca9, 0xbe838688, + 0x0e0804e9, 0x55f1be56, 0xe7e5363b, 0xb3a1f25d, 0xf7debb85, + 0x61fe033c, 0x16746233, 0x3c034c28, + 0xda6d0c74, 0x79aac56c, 0x3ce4e1ad, 0x51f0c802, 0x98f8f35a, + 0x1626a49f, 0xeed82b29, 0x1d382fe3, + 0x0c4fb99a, 0xbb325778, 0x3ec6d97b, 0x6e77a6a9, 0xcb658b5c, + 0xd45230c7, 0x2bd1408b, 0x60c03eb7, + 0xb9068d78, 0xa33754f4, 0xf430c87d, 0xc8a71302, 0xb96d8c32, + 0xebd4e7be, 0xbe8b9d2d, 0x7979fb06, + 0xe7225308, 0x8b75cf77, 0x11ef8da4, 0xe083c858, 0x8d6b786f, + 0x5a6317a6, 0xfa5cf7a0, 0x5dda0033, + 0xf28ebfb0, 0xf5b9c310, 0xa0eac280, 0x08b9767a, 0xa3d9d2b0, + 0x79d34217, 0x021a718d, 0x9ac6336a, + 0x2711fd60, 0x438050e3, 0x069908a8, 0x3d7fedc4, 0x826d2bef, + 0x4eeb8476, 0x488dcf25, 0x36c9d566, + 0x28e74e41, 0xc2610aca, 0x3d49a9cf, 0xbae3b9df, 0xb65f8de6, + 0x92aeaf64, 0x3ac7d5e6, 0x9ea80509, + 0xf22b017d, 0xa4173f70, 0xdd1e16c3, 0x15e0d7f9, 0x50b1b887, + 0x2b9f4fd5, 0x625aba82, 0x6a017962, + 0x2ec01b9c, 0x15488aa9, 0xd716e740, 0x40055a2c, 0x93d29a22, + 0xe32dbf9a, 0x058745b9, 0x3453dc1e, + 0xd699296e, 0x496cff6f, 0x1c9f4986, 0xdfe2ed07, 0xb87242d1, + 0x19de7eae, 0x053e561a, 0x15ad6f8c, + 0x66626c1c, 0x7154c24c, 0xea082b2a, 0x93eb2939, 0x17dcb0f0, + 0x58d4f2ae, 0x9ea294fb, 0x52cf564c, + 0x9883fe66, 0x2ec40581, 0x763953c3, 0x01d6692e, 0xd3a0c108, + 0xa1e7160e, 0xe4f2dfa6, 0x693ed285, + 0x74904698, 0x4c2b0edd, 0x4f757656, 0x5d393378, 0xa132234f, + 0x3d321c5d, 0xc3f5e194, 0x4b269301, + 0xc79f022f, 0x3c997e7e, 0x5e4f9504, 0x3ffafbbd, 0x76f7ad0e, + 0x296693f4, 0x3d1fce6f, 0xc61e45be, + 0xd3b5ab34, 0xf72bf9b7, 0x1b0434c0, 0x4e72b567, 0x5592a33d, + 0xb5229301, 0xcfd2a87f, 0x60aeb767, + 0x1814386b, 0x30bcc33d, 0x38a0c07d, 0xfd1606f2, 0xc363519b, + 0x589dd390, 0x5479f8e6, 0x1cb8d647, + 0x97fd61a9, 0xea7759f4, 0x2d57539d, 0x569a58cf, 0xe84e63ad, + 0x462e1b78, 0x6580f87e, 0xf3817914, + 0x91da55f4, 0x40a230f3, 0xd1988f35, 0xb6e318d2, 0x3ffa50bc, + 0x3d40f021, 0xc3c0bdae, 0x4958c24c, + 0x518f36b2, 0x84b1d370, 0x0fedce83, 0x878ddada, 0xf2a279c7, + 0x94e01be8, 0x90716f4b, 0x954b8aa3 +}; +static const u32 sb8[256] = { + 0xe216300d, 0xbbddfffc, 0xa7ebdabd, 0x35648095, 0x7789f8b7, + 0xe6c1121b, 0x0e241600, 0x052ce8b5, + 0x11a9cfb0, 0xe5952f11, 0xece7990a, 0x9386d174, 0x2a42931c, + 0x76e38111, 0xb12def3a, 0x37ddddfc, + 0xde9adeb1, 0x0a0cc32c, 0xbe197029, 0x84a00940, 0xbb243a0f, + 0xb4d137cf, 0xb44e79f0, 0x049eedfd, + 0x0b15a15d, 0x480d3168, 
0x8bbbde5a, 0x669ded42, 0xc7ece831, + 0x3f8f95e7, 0x72df191b, 0x7580330d, + 0x94074251, 0x5c7dcdfa, 0xabbe6d63, 0xaa402164, 0xb301d40a, + 0x02e7d1ca, 0x53571dae, 0x7a3182a2, + 0x12a8ddec, 0xfdaa335d, 0x176f43e8, 0x71fb46d4, 0x38129022, + 0xce949ad4, 0xb84769ad, 0x965bd862, + 0x82f3d055, 0x66fb9767, 0x15b80b4e, 0x1d5b47a0, 0x4cfde06f, + 0xc28ec4b8, 0x57e8726e, 0x647a78fc, + 0x99865d44, 0x608bd593, 0x6c200e03, 0x39dc5ff6, 0x5d0b00a3, + 0xae63aff2, 0x7e8bd632, 0x70108c0c, + 0xbbd35049, 0x2998df04, 0x980cf42a, 0x9b6df491, 0x9e7edd53, + 0x06918548, 0x58cb7e07, 0x3b74ef2e, + 0x522fffb1, 0xd24708cc, 0x1c7e27cd, 0xa4eb215b, 0x3cf1d2e2, + 0x19b47a38, 0x424f7618, 0x35856039, + 0x9d17dee7, 0x27eb35e6, 0xc9aff67b, 0x36baf5b8, 0x09c467cd, + 0xc18910b1, 0xe11dbf7b, 0x06cd1af8, + 0x7170c608, 0x2d5e3354, 0xd4de495a, 0x64c6d006, 0xbcc0c62c, + 0x3dd00db3, 0x708f8f34, 0x77d51b42, + 0x264f620f, 0x24b8d2bf, 0x15c1b79e, 0x46a52564, 0xf8d7e54e, + 0x3e378160, 0x7895cda5, 0x859c15a5, + 0xe6459788, 0xc37bc75f, 0xdb07ba0c, 0x0676a3ab, 0x7f229b1e, + 0x31842e7b, 0x24259fd7, 0xf8bef472, + 0x835ffcb8, 0x6df4c1f2, 0x96f5b195, 0xfd0af0fc, 0xb0fe134c, + 0xe2506d3d, 0x4f9b12ea, 0xf215f225, + 0xa223736f, 0x9fb4c428, 0x25d04979, 0x34c713f8, 0xc4618187, + 0xea7a6e98, 0x7cd16efc, 0x1436876c, + 0xf1544107, 0xbedeee14, 0x56e9af27, 0xa04aa441, 0x3cf7c899, + 0x92ecbae6, 0xdd67016d, 0x151682eb, + 0xa842eedf, 0xfdba60b4, 0xf1907b75, 0x20e3030f, 0x24d8c29e, + 0xe139673b, 0xefa63fb8, 0x71873054, + 0xb6f2cf3b, 0x9f326442, 0xcb15a4cc, 0xb01a4504, 0xf1e47d8d, + 0x844a1be5, 0xbae7dfdc, 0x42cbda70, + 0xcd7dae0a, 0x57e85b7a, 0xd53f5af6, 0x20cf4d8c, 0xcea4d428, + 0x79d130a4, 0x3486ebfb, 0x33d3cddc, + 0x77853b53, 0x37effcb5, 0xc5068778, 0xe580b3e6, 0x4e68b8f4, + 0xc5c8b37e, 0x0d809ea2, 0x398feb7c, + 0x132a4f94, 0x43b7950e, 0x2fee7d1c, 0x223613bd, 0xdd06caa2, + 0x37df932b, 0xc4248289, 0xacf3ebc3, + 0x5715f6b7, 0xef3478dd, 0xf267616f, 0xc148cbe4, 0x9052815e, + 0x5e410fab, 0xb48a2465, 0x2eda7fa4, + 0xe87b40e4, 0xe98ea084, 0x5889e9e1, 0xefd390fc, 0xdd07d35b, + 0xdb485694, 0x38d7e5b2, 0x57720101, + 0x730edebc, 0x5b643113, 0x94917e4f, 0x503c2fba, 0x646f1282, + 0x7523d24a, 0xe0779695, 0xf9c17a8f, + 0x7a5b2121, 0xd187b896, 0x29263a4d, 0xba510cdf, 0x81f47c9f, + 0xad1163ed, 0xea7b5965, 0x1a00726e, + 0x11403092, 0x00da6d77, 0x4a0cdd61, 0xad1f4603, 0x605bdfb0, + 0x9eedc364, 0x22ebe6a8, 0xcee7d28a, + 0xa0e736a0, 0x5564a6b9, 0x10853209, 0xc7eb8f37, 0x2de705ca, + 0x8951570f, 0xdf09822b, 0xbd691a6c, + 0xaa12e4f2, 0x87451c0f, 0xe0f6a27a, 0x3ada4819, 0x4cf1764f, + 0x0d771c2b, 0x67cdb156, 0x350d8384, + 0x5938fa0f, 0x42399ef3, 0x36997b07, 0x0e84093d, 0x4aa93e61, + 0x8360d87b, 0x1fa98b0c, 0x1149382c, + 0xe97625a5, 0x0614d1b7, 0x0e25244b, 0x0c768347, 0x589e8d82, + 0x0d2059d1, 0xa466bb1e, 0xf8da0a82, + 0x04f19130, 0xba6e4ec0, 0x99265164, 0x1ee7230d, 0x50b2ad80, + 0xeaee6801, 0x8db2a283, 0xea8bf59e +}; + + +#define rol(n,x) ( ((x) << (n)) | ((x) >> (32-(n))) ) + +#define F1(D,m,r) ( (I = ((m) + (D))), (I=rol((r),I)), \ + (((s1[I >> 24] ^ s2[(I>>16)&0xff]) - s3[(I>>8)&0xff]) + s4[I&0xff]) ) +#define F2(D,m,r) ( (I = ((m) ^ (D))), (I=rol((r),I)), \ + (((s1[I >> 24] - s2[(I>>16)&0xff]) + s3[(I>>8)&0xff]) ^ s4[I&0xff]) ) +#define F3(D,m,r) ( (I = ((m) - (D))), (I=rol((r),I)), \ + (((s1[I >> 24] + s2[(I>>16)&0xff]) ^ s3[(I>>8)&0xff]) - s4[I&0xff]) ) + + +static void cast5_encrypt(void *ctx, u8 * outbuf, const u8 * inbuf) +{ + struct cast5_ctx *c = (struct cast5_ctx *) ctx; + u32 l, r, t; + u32 I; /* used by the Fx macros */ + u32 *Km; + u8 *Kr; + + Km = c->Km; + Kr = 
c->Kr; + + /* (L0,R0) <-- (m1...m64). (Split the plaintext into left and + * right 32-bit halves L0 = m1...m32 and R0 = m33...m64.) + */ + l = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3]; + r = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7]; + + /* (16 rounds) for i from 1 to 16, compute Li and Ri as follows: + * Li = Ri-1; + * Ri = Li-1 ^ f(Ri-1,Kmi,Kri), where f is defined in Section 2.2 + * Rounds 1, 4, 7, 10, 13, and 16 use f function Type 1. + * Rounds 2, 5, 8, 11, and 14 use f function Type 2. + * Rounds 3, 6, 9, 12, and 15 use f function Type 3. + */ + + if (!(c->rr)) { + t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]); + t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]); + t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]); + t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]); + t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]); + t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]); + t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]); + t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]); + t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]); + t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]); + t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]); + t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]); + t = l; l = r; r = t ^ F1(r, Km[12], Kr[12]); + t = l; l = r; r = t ^ F2(r, Km[13], Kr[13]); + t = l; l = r; r = t ^ F3(r, Km[14], Kr[14]); + t = l; l = r; r = t ^ F1(r, Km[15], Kr[15]); + } else { + t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]); + t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]); + t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]); + t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]); + t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]); + t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]); + t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]); + t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]); + t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]); + t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]); + t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]); + t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]); + } + + /* c1...c64 <-- (R16,L16). (Exchange final blocks L16, R16 and + * concatenate to form the ciphertext.) 
*/ + outbuf[0] = (r >> 24) & 0xff; + outbuf[1] = (r >> 16) & 0xff; + outbuf[2] = (r >> 8) & 0xff; + outbuf[3] = r & 0xff; + outbuf[4] = (l >> 24) & 0xff; + outbuf[5] = (l >> 16) & 0xff; + outbuf[6] = (l >> 8) & 0xff; + outbuf[7] = l & 0xff; +} + +static void cast5_decrypt(void *ctx, u8 * outbuf, const u8 * inbuf) +{ + struct cast5_ctx *c = (struct cast5_ctx *) ctx; + u32 l, r, t; + u32 I; + u32 *Km; + u8 *Kr; + + Km = c->Km; + Kr = c->Kr; + + l = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3]; + r = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7]; + + if (!(c->rr)) { + t = l; l = r; r = t ^ F1(r, Km[15], Kr[15]); + t = l; l = r; r = t ^ F3(r, Km[14], Kr[14]); + t = l; l = r; r = t ^ F2(r, Km[13], Kr[13]); + t = l; l = r; r = t ^ F1(r, Km[12], Kr[12]); + t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]); + t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]); + t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]); + t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]); + t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]); + t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]); + t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]); + t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]); + t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]); + t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]); + t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]); + t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]); + } else { + t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]); + t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]); + t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]); + t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]); + t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]); + t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]); + t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]); + t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]); + t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]); + t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]); + t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]); + t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]); + } + + outbuf[0] = (r >> 24) & 0xff; + outbuf[1] = (r >> 16) & 0xff; + outbuf[2] = (r >> 8) & 0xff; + outbuf[3] = r & 0xff; + outbuf[4] = (l >> 24) & 0xff; + outbuf[5] = (l >> 16) & 0xff; + outbuf[6] = (l >> 8) & 0xff; + outbuf[7] = l & 0xff; +} + +static void key_schedule(u32 * x, u32 * z, u32 * k) +{ + +#define xi(i) ((x[(i)/4] >> (8*(3-((i)%4)))) & 0xff) +#define zi(i) ((z[(i)/4] >> (8*(3-((i)%4)))) & 0xff) + + z[0] = x[0] ^ s5[xi(13)] ^ s6[xi(15)] ^ s7[xi(12)] ^ sb8[xi(14)] ^ + s7[xi(8)]; + z[1] = x[2] ^ s5[zi(0)] ^ s6[zi(2)] ^ s7[zi(1)] ^ sb8[zi(3)] ^ + sb8[xi(10)]; + z[2] = x[3] ^ s5[zi(7)] ^ s6[zi(6)] ^ s7[zi(5)] ^ sb8[zi(4)] ^ + s5[xi(9)]; + z[3] = x[1] ^ s5[zi(10)] ^ s6[zi(9)] ^ s7[zi(11)] ^ sb8[zi(8)] ^ + s6[xi(11)]; + k[0] = s5[zi(8)] ^ s6[zi(9)] ^ s7[zi(7)] ^ sb8[zi(6)] ^ s5[zi(2)]; + k[1] = s5[zi(10)] ^ s6[zi(11)] ^ s7[zi(5)] ^ sb8[zi(4)] ^ + s6[zi(6)]; + k[2] = s5[zi(12)] ^ s6[zi(13)] ^ s7[zi(3)] ^ sb8[zi(2)] ^ + s7[zi(9)]; + k[3] = s5[zi(14)] ^ s6[zi(15)] ^ s7[zi(1)] ^ sb8[zi(0)] ^ + sb8[zi(12)]; + + x[0] = z[2] ^ s5[zi(5)] ^ s6[zi(7)] ^ s7[zi(4)] ^ sb8[zi(6)] ^ + s7[zi(0)]; + x[1] = z[0] ^ s5[xi(0)] ^ s6[xi(2)] ^ s7[xi(1)] ^ sb8[xi(3)] ^ + sb8[zi(2)]; + x[2] = z[1] ^ s5[xi(7)] ^ s6[xi(6)] ^ s7[xi(5)] ^ sb8[xi(4)] ^ + s5[zi(1)]; + x[3] = z[3] ^ s5[xi(10)] ^ s6[xi(9)] ^ s7[xi(11)] ^ sb8[xi(8)] ^ + s6[zi(3)]; + k[4] = s5[xi(3)] ^ s6[xi(2)] ^ s7[xi(12)] ^ sb8[xi(13)] ^ + s5[xi(8)]; + k[5] = s5[xi(1)] ^ s6[xi(0)] ^ s7[xi(14)] ^ sb8[xi(15)] ^ + s6[xi(13)]; + k[6] = s5[xi(7)] ^ s6[xi(6)] ^ s7[xi(8)] ^ sb8[xi(9)] ^ s7[xi(3)]; + k[7] = s5[xi(5)] ^ s6[xi(4)] ^ s7[xi(10)] ^ sb8[xi(11)] ^ + sb8[xi(7)]; + + z[0] = x[0] ^ s5[xi(13)] ^ 
s6[xi(15)] ^ s7[xi(12)] ^ sb8[xi(14)] ^ + s7[xi(8)]; + z[1] = x[2] ^ s5[zi(0)] ^ s6[zi(2)] ^ s7[zi(1)] ^ sb8[zi(3)] ^ + sb8[xi(10)]; + z[2] = x[3] ^ s5[zi(7)] ^ s6[zi(6)] ^ s7[zi(5)] ^ sb8[zi(4)] ^ + s5[xi(9)]; + z[3] = x[1] ^ s5[zi(10)] ^ s6[zi(9)] ^ s7[zi(11)] ^ sb8[zi(8)] ^ + s6[xi(11)]; + k[8] = s5[zi(3)] ^ s6[zi(2)] ^ s7[zi(12)] ^ sb8[zi(13)] ^ + s5[zi(9)]; + k[9] = s5[zi(1)] ^ s6[zi(0)] ^ s7[zi(14)] ^ sb8[zi(15)] ^ + s6[zi(12)]; + k[10] = s5[zi(7)] ^ s6[zi(6)] ^ s7[zi(8)] ^ sb8[zi(9)] ^ s7[zi(2)]; + k[11] = s5[zi(5)] ^ s6[zi(4)] ^ s7[zi(10)] ^ sb8[zi(11)] ^ + sb8[zi(6)]; + + x[0] = z[2] ^ s5[zi(5)] ^ s6[zi(7)] ^ s7[zi(4)] ^ sb8[zi(6)] ^ + s7[zi(0)]; + x[1] = z[0] ^ s5[xi(0)] ^ s6[xi(2)] ^ s7[xi(1)] ^ sb8[xi(3)] ^ + sb8[zi(2)]; + x[2] = z[1] ^ s5[xi(7)] ^ s6[xi(6)] ^ s7[xi(5)] ^ sb8[xi(4)] ^ + s5[zi(1)]; + x[3] = z[3] ^ s5[xi(10)] ^ s6[xi(9)] ^ s7[xi(11)] ^ sb8[xi(8)] ^ + s6[zi(3)]; + k[12] = s5[xi(8)] ^ s6[xi(9)] ^ s7[xi(7)] ^ sb8[xi(6)] ^ s5[xi(3)]; + k[13] = s5[xi(10)] ^ s6[xi(11)] ^ s7[xi(5)] ^ sb8[xi(4)] ^ + s6[xi(7)]; + k[14] = s5[xi(12)] ^ s6[xi(13)] ^ s7[xi(3)] ^ sb8[xi(2)] ^ + s7[xi(8)]; + k[15] = s5[xi(14)] ^ s6[xi(15)] ^ s7[xi(1)] ^ sb8[xi(0)] ^ + sb8[xi(13)]; + +#undef xi +#undef zi +} + + +static int +cast5_setkey(void *ctx, const u8 * key, unsigned key_len, u32 * flags) +{ + int i; + u32 x[4]; + u32 z[4]; + u32 k[16]; + u8 p_key[16]; + struct cast5_ctx *c = (struct cast5_ctx *) ctx; + + if (key_len < 5 || key_len > 16) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + c->rr = key_len <= 10 ? 1 : 0; + + memset(p_key, 0, 16); + memcpy(p_key, key, key_len); + + + x[0] = p_key[0] << 24 | p_key[1] << 16 | p_key[2] << 8 | p_key[3]; + x[1] = p_key[4] << 24 | p_key[5] << 16 | p_key[6] << 8 | p_key[7]; + x[2] = + p_key[8] << 24 | p_key[9] << 16 | p_key[10] << 8 | p_key[11]; + x[3] = + p_key[12] << 24 | p_key[13] << 16 | p_key[14] << 8 | p_key[15]; + + key_schedule(x, z, k); + for (i = 0; i < 16; i++) + c->Km[i] = k[i]; + key_schedule(x, z, k); + for (i = 0; i < 16; i++) + c->Kr[i] = k[i] & 0x1f; + return 0; +} + +static struct crypto_alg alg = { + .cra_name = "cast5", + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = CAST5_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct cast5_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = CAST5_MIN_KEY_SIZE, + .cia_max_keysize = CAST5_MAX_KEY_SIZE, + .cia_ivsize = CAST5_BLOCK_SIZE, + .cia_setkey = cast5_setkey, + .cia_encrypt = cast5_encrypt, + .cia_decrypt = cast5_decrypt + } + } +}; + +static int __init init(void) +{ + return crypto_register_alg(&alg); +} + +static void __exit fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Cast5 Cipher Algorithm"); + diff -urN linux-2.4.22-bk1/crypto/tcrypt.c linux-2.4.22-bk2/crypto/tcrypt.c --- linux-2.4.22-bk1/crypto/tcrypt.c 2003-08-25 04:44:40.000000000 -0700 +++ linux-2.4.22-bk2/crypto/tcrypt.c 2003-08-26 15:54:22.000000000 -0700 @@ -2192,6 +2192,102 @@ crypto_free_tfm(tfm); } +void +test_cast5(void) +{ + unsigned int ret, i, tsize; + u8 *p, *q, *key; + struct crypto_tfm *tfm; + struct cast5_tv *c5_tv; + struct scatterlist sg[1]; + + printk("\ntesting cast5 encryption\n"); + + tfm = crypto_alloc_tfm("cast5", 0); + if (tfm == NULL) { + printk("failed to load transform for cast5 (default ecb)\n"); + return; + } + + tsize = sizeof (cast5_enc_tv_template); + if (tsize > TVMEMSIZE) { + printk("template (%u) too big for 
tvmem (%u)\n", tsize, + TVMEMSIZE); + return; + } + + memcpy(tvmem, cast5_enc_tv_template, tsize); + c5_tv = (void *) tvmem; + for (i = 0; i < CAST5_ENC_TEST_VECTORS; i++) { + printk("test %u (%d bit key):\n", i + 1, c5_tv[i].keylen * 8); + key = c5_tv[i].key; + + ret = crypto_cipher_setkey(tfm, key, c5_tv[i].keylen); + if (ret) { + printk("setkey() failed flags=%x\n", tfm->crt_flags); + + if (!c5_tv[i].fail) + goto out; + } + + p = c5_tv[i].plaintext; + sg[0].page = virt_to_page(p); + sg[0].offset = ((long) p & ~PAGE_MASK); + sg[0].length = sizeof(c5_tv[i].plaintext); + ret = crypto_cipher_encrypt(tfm, sg, sg, sg[0].length); + if (ret) { + printk("encrypt() failed flags=%x\n", tfm->crt_flags); + goto out; + } + + q = kmap(sg[0].page) + sg[0].offset; + hexdump(q, sizeof(c5_tv[i].ciphertext)); + + printk("%s\n", memcmp(q, c5_tv[i].ciphertext, + sizeof(c5_tv[i].ciphertext)) ? "fail" : "pass"); + } + + tsize = sizeof (cast5_dec_tv_template); + if (tsize > TVMEMSIZE) { + printk("template (%u) too big for tvmem (%u)\n", tsize, + TVMEMSIZE); + return; + } + + memcpy(tvmem, cast5_dec_tv_template, tsize); + c5_tv = (void *) tvmem; + for (i = 0; i < CAST5_DEC_TEST_VECTORS; i++) { + printk("test %u (%d bit key):\n", i + 1, c5_tv[i].keylen * 8); + key = c5_tv[i].key; + + ret = crypto_cipher_setkey(tfm, key, c5_tv[i].keylen); + if (ret) { + printk("setkey() failed flags=%x\n", tfm->crt_flags); + + if (!c5_tv[i].fail) + goto out; + } + + p = c5_tv[i].plaintext; + sg[0].page = virt_to_page(p); + sg[0].offset = ((long) p & ~PAGE_MASK); + sg[0].length = sizeof(c5_tv[i].plaintext); + ret = crypto_cipher_decrypt(tfm, sg, sg, sg[0].length); + if (ret) { + printk("decrypt() failed flags=%x\n", tfm->crt_flags); + goto out; + } + + q = kmap(sg[0].page) + sg[0].offset; + hexdump(q, sizeof(c5_tv[i].ciphertext)); + + printk("%s\n", memcmp(q, c5_tv[i].ciphertext, + sizeof(c5_tv[i].ciphertext)) ? 
"fail" : "pass"); + } +out: + crypto_free_tfm (tfm); +} + static void test_deflate(void) { @@ -2304,6 +2400,7 @@ test_sha384(); test_sha512(); test_deflate(); + test_cast5(); #ifdef CONFIG_CRYPTO_HMAC test_hmac_md5(); test_hmac_sha1(); @@ -2363,6 +2460,10 @@ test_deflate(); break; + case 14: + test_cast5(); + break; + #ifdef CONFIG_CRYPTO_HMAC case 100: test_hmac_md5(); diff -urN linux-2.4.22-bk1/crypto/tcrypt.h linux-2.4.22-bk2/crypto/tcrypt.h --- linux-2.4.22-bk1/crypto/tcrypt.h 2003-08-25 04:44:40.000000000 -0700 +++ linux-2.4.22-bk2/crypto/tcrypt.h 2003-08-26 15:54:22.000000000 -0700 @@ -1682,6 +1682,74 @@ }, }; +/* Cast5 test vectors from RFC 2144 */ +#define CAST5_ENC_TEST_VECTORS 3 +#define CAST5_DEC_TEST_VECTORS 3 + +struct cast5_tv { + unsigned keylen; + unsigned fail; + u8 key[16]; + u8 plaintext[8]; + u8 ciphertext[8]; +}; + +struct cast5_tv cast5_enc_tv_template[] = +{ + { + 16, + 0, + { 0x01, 0x23, 0x45, 0x67, 0x12, 0x34, 0x56, 0x78, + 0x23, 0x45, 0x67, 0x89, 0x34, 0x56, 0x78, 0x9A }, + { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, + { 0x23, 0x8b, 0x4f, 0xe5, 0x84, 0x7e, 0x44, 0xb2 }, + + }, + { + 10, + 0, + { 0x01, 0x23, 0x45, 0x67, 0x12, 0x34, 0x56, 0x78, + 0x23, 0x45 }, + { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, + { 0xeb, 0x6a, 0x71, 0x1a, 0x2c, 0x02, 0x27, 0x1b }, + }, + { + 5, + 0, + { 0x01, 0x23, 0x45, 0x67, 0x12 }, + { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, + { 0x7a, 0xc8, 0x16, 0xd1, 0x6e, 0x9b, 0x30, 0x2e }, + } +}; + +struct cast5_tv cast5_dec_tv_template[] = +{ + { + 16, + 0, + { 0x01, 0x23, 0x45, 0x67, 0x12, 0x34, 0x56, 0x78, + 0x23, 0x45, 0x67, 0x89, 0x34, 0x56, 0x78, 0x9A }, + { 0x23, 0x8b, 0x4f, 0xe5, 0x84, 0x7e, 0x44, 0xb2 }, + { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, + + }, + { + 10, + 0, + { 0x01, 0x23, 0x45, 0x67, 0x12, 0x34, 0x56, 0x78, + 0x23, 0x45 }, + { 0xeb, 0x6a, 0x71, 0x1a, 0x2c, 0x02, 0x27, 0x1b }, + { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, + }, + { + 5, + 0, + { 0x01, 0x23, 0x45, 0x67, 0x12 }, + { 0x7a, 0xc8, 0x16, 0xd1, 0x6e, 0x9b, 0x30, 0x2e }, + { 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef }, + } +}; + /* * Compression stuff. 
*/ diff -urN linux-2.4.22-bk1/drivers/atm/Makefile linux-2.4.22-bk2/drivers/atm/Makefile --- linux-2.4.22-bk1/drivers/atm/Makefile 2003-08-25 04:44:41.000000000 -0700 +++ linux-2.4.22-bk2/drivers/atm/Makefile 2003-08-26 15:54:22.000000000 -0700 @@ -59,8 +59,6 @@ obj-$(CONFIG_ATM_HE) += suni.o endif -EXTRA_CFLAGS=-g - list-multi := fore_200e.o fore_200e-objs := fore200e.o $(FORE200E_FW_OBJS) diff -urN linux-2.4.22-bk1/drivers/char/sonypi.c linux-2.4.22-bk2/drivers/char/sonypi.c --- linux-2.4.22-bk1/drivers/char/sonypi.c 2003-08-25 04:44:41.000000000 -0700 +++ linux-2.4.22-bk2/drivers/char/sonypi.c 2003-08-26 15:54:22.000000000 -0700 @@ -308,7 +308,7 @@ int i, j; v1 = inb_p(sonypi_device.ioport1); - v2 = inb_p(sonypi_device.ioport2); + v2 = inb_p(sonypi_device.ioport1 + sonypi_device.evtype_offset); for (i = 0; sonypi_eventtypes[i].model; i++) { if (sonypi_device.model != sonypi_eventtypes[i].model) @@ -665,11 +665,13 @@ if (sonypi_device.model == SONYPI_DEVICE_MODEL_TYPE2) { ioport_list = sonypi_type2_ioport_list; sonypi_device.region_size = SONYPI_TYPE2_REGION_SIZE; + sonypi_device.evtype_offset = SONYPI_TYPE2_EVTYPE_OFFSET; irq_list = sonypi_type2_irq_list; } else { ioport_list = sonypi_type1_ioport_list; sonypi_device.region_size = SONYPI_TYPE1_REGION_SIZE; + sonypi_device.evtype_offset = SONYPI_TYPE1_EVTYPE_OFFSET; irq_list = sonypi_type1_irq_list; } diff -urN linux-2.4.22-bk1/drivers/char/sonypi.h linux-2.4.22-bk2/drivers/char/sonypi.h --- linux-2.4.22-bk1/drivers/char/sonypi.h 2003-08-25 04:44:41.000000000 -0700 +++ linux-2.4.22-bk2/drivers/char/sonypi.h 2003-08-26 15:54:22.000000000 -0700 @@ -56,12 +56,14 @@ #define SONYPI_BASE 0x50 #define SONYPI_G10A (SONYPI_BASE+0x14) #define SONYPI_TYPE1_REGION_SIZE 0x08 +#define SONYPI_TYPE1_EVTYPE_OFFSET 0x04 /* type2 series specifics */ #define SONYPI_SIRQ 0x9b #define SONYPI_SLOB 0x9c #define SONYPI_SHIB 0x9d #define SONYPI_TYPE2_REGION_SIZE 0x20 +#define SONYPI_TYPE2_EVTYPE_OFFSET 0x12 /* battery / brightness addresses */ #define SONYPI_BAT_FLAGS 0x81 @@ -167,6 +169,7 @@ #define SONYPI_THUMBPHRASE_MASK 0x00000200 #define SONYPI_MEYE_MASK 0x00000400 #define SONYPI_MEMORYSTICK_MASK 0x00000800 +#define SONYPI_BATTERY_MASK 0x00001000 struct sonypi_event { u8 data; @@ -293,6 +296,13 @@ { 0, 0 } }; +/* The set of possible battery events */ +static struct sonypi_event sonypi_batteryev[] = { + { 0x20, SONYPI_EVENT_BATTERY_INSERT }, + { 0x30, SONYPI_EVENT_BATTERY_REMOVE }, + { 0, 0 } +}; + struct sonypi_eventtypes { int model; u8 data; @@ -307,19 +317,22 @@ { SONYPI_DEVICE_MODEL_TYPE1, 0x20, SONYPI_FNKEY_MASK, sonypi_fnkeyev }, { SONYPI_DEVICE_MODEL_TYPE1, 0x30, SONYPI_BLUETOOTH_MASK, sonypi_blueev }, { SONYPI_DEVICE_MODEL_TYPE1, 0x40, SONYPI_PKEY_MASK, sonypi_pkeyev }, + { SONYPI_DEVICE_MODEL_TYPE1, 0x30, SONYPI_MEMORYSTICK_MASK, sonypi_memorystickev }, + { SONYPI_DEVICE_MODEL_TYPE1, 0x40, SONYPI_BATTERY_MASK, sonypi_batteryev }, { SONYPI_DEVICE_MODEL_TYPE2, 0, 0xffffffff, sonypi_releaseev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x38, SONYPI_LID_MASK, sonypi_lidev }, - { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_JOGGER_MASK, sonypi_joggerev }, + { SONYPI_DEVICE_MODEL_TYPE2, 0x11, SONYPI_JOGGER_MASK, sonypi_joggerev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_CAPTURE_MASK, sonypi_captureev }, - { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_FNKEY_MASK, sonypi_fnkeyev }, - { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_BLUETOOTH_MASK, sonypi_blueev }, + { SONYPI_DEVICE_MODEL_TYPE2, 0x21, SONYPI_FNKEY_MASK, sonypi_fnkeyev }, + { SONYPI_DEVICE_MODEL_TYPE2, 0x31, 
SONYPI_BLUETOOTH_MASK, sonypi_blueev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_PKEY_MASK, sonypi_pkeyev }, - { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_BACK_MASK, sonypi_backev }, + { SONYPI_DEVICE_MODEL_TYPE2, 0x11, SONYPI_BACK_MASK, sonypi_backev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_HELP_MASK, sonypi_helpev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_ZOOM_MASK, sonypi_zoomev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_THUMBPHRASE_MASK, sonypi_thumbphraseev }, - { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_MEMORYSTICK_MASK, sonypi_memorystickev }, + { SONYPI_DEVICE_MODEL_TYPE2, 0x31, SONYPI_MEMORYSTICK_MASK, sonypi_memorystickev }, + { SONYPI_DEVICE_MODEL_TYPE2, 0x41, SONYPI_BATTERY_MASK, sonypi_batteryev }, { 0, 0, 0, 0 } }; @@ -354,6 +367,7 @@ u16 ioport1; u16 ioport2; u16 region_size; + u16 evtype_offset; int camera_power; int bluetooth_power; struct semaphore lock; @@ -380,30 +394,17 @@ } #ifdef CONFIG_ACPI -#include -#if (ACPI_CA_VERSION > 0x20021121) -#ifdef CONFIG_ACPI_EC -#define SONYPI_USE_ACPI -#endif -#endif -#endif /* CONFIG_ACPI */ - -#ifdef CONFIG_ACPI -#ifdef SONYPI_USE_ACPI extern int acpi_disabled; #define SONYPI_ACPI_ACTIVE (!acpi_disabled) #else -#define SONYPI_ACPI_ACTIVE 1 -#endif -#else /* CONFIG_ACPI */ #define SONYPI_ACPI_ACTIVE 0 #endif /* CONFIG_ACPI */ extern int verbose; static inline int sonypi_ec_write(u8 addr, u8 value) { -#ifdef SONYPI_USE_ACPI - if (!acpi_disabled) +#ifdef CONFIG_ACPI_EC + if (SONYPI_ACPI_ACTIVE) return ec_write(addr, value); #endif wait_on_command(1, inb_p(SONYPI_CST_IOPORT) & 3, ITERATIONS_LONG); @@ -417,8 +418,8 @@ } static inline int sonypi_ec_read(u8 addr, u8 *value) { -#ifdef SONYPI_USE_ACPI - if (!acpi_disabled) +#ifdef CONFIG_ACPI_EC + if (SONYPI_ACPI_ACTIVE) return ec_read(addr, value); #endif wait_on_command(1, inb_p(SONYPI_CST_IOPORT) & 3, ITERATIONS_LONG); diff -urN linux-2.4.22-bk1/drivers/media/video/meye.c linux-2.4.22-bk2/drivers/media/video/meye.c --- linux-2.4.22-bk1/drivers/media/video/meye.c 2003-08-25 04:44:42.000000000 -0700 +++ linux-2.4.22-bk2/drivers/media/video/meye.c 2003-08-26 15:54:22.000000000 -0700 @@ -35,7 +35,6 @@ #include #include #include -#include #include #include @@ -139,7 +138,7 @@ memset(mem, 0, size); /* Clear the ram out, no junk to the user */ adr = (unsigned long)mem; while (size > 0) { - mem_map_reserve(vmalloc_to_page((void *)adr)); + SetPageReserved(vmalloc_to_page((void *)adr)); adr += PAGE_SIZE; size -= PAGE_SIZE; } @@ -153,7 +152,7 @@ if (mem) { adr = (unsigned long) mem; while ((long) size > 0) { - mem_map_unreserve(vmalloc_to_page((void *)adr)); + ClearPageReserved(vmalloc_to_page((void *)adr)); adr += PAGE_SIZE; size -= PAGE_SIZE; } diff -urN linux-2.4.22-bk1/drivers/net/dummy.c linux-2.4.22-bk2/drivers/net/dummy.c --- linux-2.4.22-bk1/drivers/net/dummy.c 2001-09-30 12:26:06.000000000 -0700 +++ linux-2.4.22-bk2/drivers/net/dummy.c 2003-08-26 15:54:22.000000000 -0700 @@ -28,8 +28,6 @@ Alan Cox, 30th May 1994 */ -/* To have statistics (just packets sent) define this */ - #include #include #include diff -urN linux-2.4.22-bk1/drivers/net/sungem.c linux-2.4.22-bk2/drivers/net/sungem.c --- linux-2.4.22-bk1/drivers/net/sungem.c 2003-08-25 04:44:42.000000000 -0700 +++ linux-2.4.22-bk2/drivers/net/sungem.c 2003-08-26 15:54:22.000000000 -0700 @@ -2330,17 +2330,14 @@ gp->hw_running = 1; } - spin_lock_irq(&gp->lock); - /* We can now request the interrupt as we know it's masked * on the controller */ if (request_irq(gp->pdev->irq, gem_interrupt, SA_SHIRQ, dev->name, (void *)dev)) { - 
spin_unlock_irq(&gp->lock); - printk(KERN_ERR "%s: failed to request irq !\n", gp->dev->name); + spin_lock_irq(&gp->lock); #ifdef CONFIG_ALL_PPC if (!hw_was_up && gp->pdev->vendor == PCI_VENDOR_ID_APPLE) gem_apple_powerdown(gp); @@ -2349,10 +2346,13 @@ gp->pm_timer.expires = jiffies + 10*HZ; add_timer(&gp->pm_timer); up(&gp->pm_sem); + spin_unlock_irq(&gp->lock); return -EAGAIN; } + spin_lock_irq(&gp->lock); + /* Allocate & setup ring buffers */ gem_init_rings(gp); diff -urN linux-2.4.22-bk1/drivers/net/tg3.c linux-2.4.22-bk2/drivers/net/tg3.c --- linux-2.4.22-bk1/drivers/net/tg3.c 2003-08-26 15:54:21.000000000 -0700 +++ linux-2.4.22-bk2/drivers/net/tg3.c 2003-08-26 15:54:22.000000000 -0700 @@ -149,6 +149,8 @@ PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL }, { PCI_VENDOR_ID_ALTIMA, PCI_DEVICE_ID_ALTIMA_AC1000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL }, + { PCI_VENDOR_ID_ALTIMA, PCI_DEVICE_ID_ALTIMA_AC1001, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL }, { PCI_VENDOR_ID_ALTIMA, PCI_DEVICE_ID_ALTIMA_AC9100, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL }, { 0, } @@ -251,6 +253,7 @@ { if (!test_bit(__LINK_STATE_RX_SCHED, &dev->state)) BUG(); list_del(&dev->poll_list); + smp_mb__before_clear_bit(); clear_bit(__LINK_STATE_RX_SCHED, &dev->state); } diff -urN linux-2.4.22-bk1/drivers/pci/pci.ids linux-2.4.22-bk2/drivers/pci/pci.ids --- linux-2.4.22-bk1/drivers/pci/pci.ids 2003-08-26 15:54:21.000000000 -0700 +++ linux-2.4.22-bk2/drivers/pci/pci.ids 2003-08-26 15:54:22.000000000 -0700 @@ -6067,6 +6067,7 @@ 1737 Linksys 173b Altima (nee Broadcom) 03e8 AC1000 Gigabit Ethernet + 03e9 AC1001 Gigabit Ethernet 03ea AC9100 Gigabit Ethernet 173b 0001 AC1002 1743 Peppercon AG diff -urN linux-2.4.22-bk1/fs/buffer.c linux-2.4.22-bk2/fs/buffer.c --- linux-2.4.22-bk1/fs/buffer.c 2003-08-26 15:54:21.000000000 -0700 +++ linux-2.4.22-bk2/fs/buffer.c 2003-08-26 15:54:22.000000000 -0700 @@ -612,7 +612,7 @@ if (buffer_attached(bh)) list_del(&bh->b_inode_buffers); set_buffer_attached(bh); - list_add(&bh->b_inode_buffers, list); + list_add_tail(&bh->b_inode_buffers, list); spin_unlock(&lru_list_lock); } diff -urN linux-2.4.22-bk1/include/asm-x86_64/io_apic.h linux-2.4.22-bk2/include/asm-x86_64/io_apic.h --- linux-2.4.22-bk1/include/asm-x86_64/io_apic.h 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/include/asm-x86_64/io_apic.h 2003-08-26 15:54:22.000000000 -0700 @@ -148,6 +148,6 @@ extern int io_apic_get_unique_id (int ioapic, int apic_id); extern int io_apic_get_version (int ioapic); extern int io_apic_get_redir_entries (int ioapic); -extern int io_apic_set_pci_routing (int ioapic, int pin, int irq); +extern int io_apic_set_pci_routing (int ioapic, int pin, int irq, int, int); #endif diff -urN linux-2.4.22-bk1/include/linux/ethtool.h linux-2.4.22-bk2/include/linux/ethtool.h --- linux-2.4.22-bk1/include/linux/ethtool.h 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/include/linux/ethtool.h 2003-08-26 15:54:22.000000000 -0700 @@ -97,7 +97,7 @@ u32 rx_max_coalesced_frames; /* Same as above two parameters, except that these values - * apply while an IRQ is being services by the host. Not + * apply while an IRQ is being serviced by the host. Not * all cards support this feature and the values are ignored * in that case. */ @@ -119,7 +119,7 @@ u32 tx_max_coalesced_frames; /* Same as above two parameters, except that these values - * apply while an IRQ is being services by the host. Not + * apply while an IRQ is being serviced by the host. Not * all cards support this feature and the values are ignored * in that case. 
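 *
 * (Editor's note, not part of this patch: for example, setting
 * tx_coalesce_usecs = 50 and tx_max_coalesced_frames = 10 asks the NIC
 * to raise its TX-completion interrupt once 10 frames have completed,
 * or 50 usec after the first one, whichever happens first; the *_irq
 * variants of these fields take over while the host is still servicing
 * an earlier interrupt.)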
*/
@@ -250,6 +250,101 @@
 	u64 data[0];
 };
 
+struct net_device;
+
+/* Some generic methods drivers may use in their ethtool_ops */
+u32 ethtool_op_get_link(struct net_device *dev);
+u32 ethtool_op_get_tx_csum(struct net_device *dev);
+u32 ethtool_op_get_sg(struct net_device *dev);
+int ethtool_op_set_sg(struct net_device *dev, u32 data);
+
+/**
+ * &ethtool_ops - Alter and report network device settings
+ * get_settings: Get device-specific settings
+ * set_settings: Set device-specific settings
+ * get_drvinfo: Report driver information
+ * get_regs: Get device registers
+ * get_wol: Report whether Wake-on-Lan is enabled
+ * set_wol: Turn Wake-on-Lan on or off
+ * get_msglevel: Report driver message level
+ * set_msglevel: Set driver message level
+ * nway_reset: Restart autonegotiation
+ * get_link: Get link status
+ * get_eeprom: Read data from the device EEPROM
+ * set_eeprom: Write data to the device EEPROM
+ * get_coalesce: Get interrupt coalescing parameters
+ * set_coalesce: Set interrupt coalescing parameters
+ * get_ringparam: Report ring sizes
+ * set_ringparam: Set ring sizes
+ * get_pauseparam: Report pause parameters
+ * set_pauseparam: Set pause parameters
+ * get_rx_csum: Report whether receive checksums are turned on or off
+ * set_rx_csum: Turn receive checksum on or off
+ * get_tx_csum: Report whether transmit checksums are turned on or off
+ * set_tx_csum: Turn transmit checksums on or off
+ * get_sg: Report whether scatter-gather is enabled
+ * set_sg: Turn scatter-gather on or off
+ * self_test: Run specified self-tests
+ * get_strings: Return a set of strings that describe the requested objects
+ * phys_id: Identify the device
+ * get_stats: Return statistics about the device
+ *
+ * Description:
+ *
+ * get_settings:
+ *	@get_settings is passed an &ethtool_cmd to fill in.  It returns
+ *	a negative errno or zero.
+ *
+ * set_settings:
+ *	@set_settings is passed an &ethtool_cmd and should attempt to set
+ *	all the settings this device supports.  It may return an error value
+ *	if something goes wrong (otherwise 0).
+ *
+ * get_eeprom:
+ *	Should fill in the magic field.  Don't need to check len for zero
+ *	or wraparound but must check offset + len < size.  Fill in the data
+ *	argument with the eeprom values from offset to offset + len.  Update
+ *	len to the amount read.  Returns an error or zero.
+ *
+ * set_eeprom:
+ *	Should validate the magic field.  Don't need to check len for zero
+ *	or wraparound but must check offset + len < size.  Update len to
+ *	the amount written.  Returns an error or zero.
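+ *
+ * Editor's sketch, not part of this patch; the mydrv_* names are
+ * hypothetical.  A driver declares a static ethtool_ops, fills in only
+ * the operations it supports (the generic ethtool_op_* helpers above
+ * can serve for the simple ones) and hooks it up at probe time; any
+ * operation left NULL makes dev_ethtool() return -EOPNOTSUPP:
+ *
+ *	static struct ethtool_ops mydrv_ethtool_ops = {
+ *		.get_drvinfo = mydrv_get_drvinfo,
+ *		.get_link    = ethtool_op_get_link,
+ *		.get_sg      = ethtool_op_get_sg,
+ *		.set_sg      = ethtool_op_set_sg,
+ *	};
+ *	...
+ *	dev->ethtool_ops = &mydrv_ethtool_ops;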
+ */
+struct ethtool_ops {
+	int	(*get_settings)(struct net_device *, struct ethtool_cmd *);
+	int	(*set_settings)(struct net_device *, struct ethtool_cmd *);
+	void	(*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *);
+	int	(*get_regs_len)(struct net_device *);
+	void	(*get_regs)(struct net_device *, struct ethtool_regs *, void *);
+	void	(*get_wol)(struct net_device *, struct ethtool_wolinfo *);
+	int	(*set_wol)(struct net_device *, struct ethtool_wolinfo *);
+	u32	(*get_msglevel)(struct net_device *);
+	void	(*set_msglevel)(struct net_device *, u32);
+	int	(*nway_reset)(struct net_device *);
+	u32	(*get_link)(struct net_device *);
+	int	(*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *);
+	int	(*set_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *);
+	int	(*get_coalesce)(struct net_device *, struct ethtool_coalesce *);
+	int	(*set_coalesce)(struct net_device *, struct ethtool_coalesce *);
+	void	(*get_ringparam)(struct net_device *, struct ethtool_ringparam *);
+	int	(*set_ringparam)(struct net_device *, struct ethtool_ringparam *);
+	void	(*get_pauseparam)(struct net_device *, struct ethtool_pauseparam*);
+	int	(*set_pauseparam)(struct net_device *, struct ethtool_pauseparam*);
+	u32	(*get_rx_csum)(struct net_device *);
+	int	(*set_rx_csum)(struct net_device *, u32);
+	u32	(*get_tx_csum)(struct net_device *);
+	int	(*set_tx_csum)(struct net_device *, u32);
+	u32	(*get_sg)(struct net_device *);
+	int	(*set_sg)(struct net_device *, u32);
+	int	(*self_test_count)(struct net_device *);
+	void	(*self_test)(struct net_device *, struct ethtool_test *, u64 *);
+	void	(*get_strings)(struct net_device *, u32 stringset, u8 *);
+	int	(*phys_id)(struct net_device *, u32);
+	int	(*get_stats_count)(struct net_device *);
+	void	(*get_ethtool_stats)(struct net_device *, struct ethtool_stats *, u64 *);
+};
+
 /* CMDs currently supported */
 #define ETHTOOL_GSET		0x00000001 /* Get settings. */
 #define ETHTOOL_SSET		0x00000002 /* Set settings. */
diff -urN linux-2.4.22-bk1/include/linux/ipv6.h linux-2.4.22-bk2/include/linux/ipv6.h
--- linux-2.4.22-bk1/include/linux/ipv6.h	2001-11-22 11:47:11.000000000 -0800
+++ linux-2.4.22-bk2/include/linux/ipv6.h	2003-08-26 15:54:22.000000000 -0700
@@ -70,7 +70,7 @@
 	__u32		bitmap;		/* strict/loose bit map */
 	struct in6_addr	addr[0];
 
-#define rt0_type	rt_hdr.type;
+#define rt0_type	rt_hdr.type
 };
 
 /*
diff -urN linux-2.4.22-bk1/include/linux/ipv6_route.h linux-2.4.22-bk2/include/linux/ipv6_route.h
--- linux-2.4.22-bk1/include/linux/ipv6_route.h	1998-08-27 19:33:08.000000000 -0700
+++ linux-2.4.22-bk2/include/linux/ipv6_route.h	2003-08-26 15:54:22.000000000 -0700
@@ -25,6 +25,7 @@
 #define RTF_DEFAULT	0x00010000	/* default - learned via ND */
 #define RTF_ALLONLINK	0x00020000	/* fallback, no routers on link */
 #define RTF_ADDRCONF	0x00040000	/* addrconf route - RA */
+#define RTF_PREFIX_RT	0x00080000	/* A prefix only route - RA */
 #define RTF_NONEXTHOP	0x00200000	/* route with no nexthop */
 #define RTF_EXPIRES	0x00400000
diff -urN linux-2.4.22-bk1/include/linux/netdevice.h linux-2.4.22-bk2/include/linux/netdevice.h
--- linux-2.4.22-bk1/include/linux/netdevice.h	2003-08-26 15:54:21.000000000 -0700
+++ linux-2.4.22-bk2/include/linux/netdevice.h	2003-08-26 15:54:22.000000000 -0700
@@ -41,6 +41,7 @@
 struct divert_blk;
 struct vlan_group;
+struct ethtool_ops;
 
 #define HAVE_ALLOC_NETDEV	/* feature macro: alloc_xxxdev
 				   functions are available. */
@@ -290,6 +291,8 @@
 	 * See <net/iw_handler.h> for details. 
Jean II */ struct iw_handler_def * wireless_handlers; + struct ethtool_ops *ethtool_ops; + /* * This marks the end of the "visible" part of the structure. All * fields hereafter are internal to the system, and may change at @@ -601,6 +604,7 @@ #define HAVE_NETIF_RECEIVE_SKB 1 extern int netif_receive_skb(struct sk_buff *skb); extern int dev_ioctl(unsigned int cmd, void *); +extern int dev_ethtool(struct ifreq *); extern int dev_change_flags(struct net_device *, unsigned); extern void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); @@ -789,6 +793,7 @@ local_irq_save(flags); if (!test_bit(__LINK_STATE_RX_SCHED, &dev->state)) BUG(); list_del(&dev->poll_list); + smp_mb__before_clear_bit(); clear_bit(__LINK_STATE_RX_SCHED, &dev->state); local_irq_restore(flags); } diff -urN linux-2.4.22-bk1/include/linux/pci_ids.h linux-2.4.22-bk2/include/linux/pci_ids.h --- linux-2.4.22-bk1/include/linux/pci_ids.h 2003-08-26 15:54:21.000000000 -0700 +++ linux-2.4.22-bk2/include/linux/pci_ids.h 2003-08-26 15:54:22.000000000 -0700 @@ -1673,6 +1673,7 @@ #define PCI_VENDOR_ID_ALTIMA 0x173b #define PCI_DEVICE_ID_ALTIMA_AC1000 0x03e8 +#define PCI_DEVICE_ID_ALTIMA_AC1001 0x03e9 #define PCI_DEVICE_ID_ALTIMA_AC9100 0x03ea #define PCI_VENDOR_ID_SYMPHONY 0x1c1c diff -urN linux-2.4.22-bk1/include/linux/pkt_sched.h linux-2.4.22-bk2/include/linux/pkt_sched.h --- linux-2.4.22-bk1/include/linux/pkt_sched.h 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.22-bk2/include/linux/pkt_sched.h 2003-08-26 15:54:22.000000000 -0700 @@ -45,7 +45,7 @@ struct tc_estimator { - char interval; + signed char interval; unsigned char ewma_log; }; diff -urN linux-2.4.22-bk1/include/linux/rtnetlink.h linux-2.4.22-bk2/include/linux/rtnetlink.h --- linux-2.4.22-bk1/include/linux/rtnetlink.h 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/include/linux/rtnetlink.h 2003-08-26 15:54:22.000000000 -0700 @@ -167,6 +167,7 @@ #define RTM_F_NOTIFY 0x100 /* Notify user of route change */ #define RTM_F_CLONED 0x200 /* This route is cloned */ #define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ +#define RTM_F_PREFIX 0x800 /* Prefix addresses */ /* Reserved table identifiers */ diff -urN linux-2.4.22-bk1/include/linux/sonypi.h linux-2.4.22-bk2/include/linux/sonypi.h --- linux-2.4.22-bk1/include/linux/sonypi.h 2003-06-13 07:51:39.000000000 -0700 +++ linux-2.4.22-bk2/include/linux/sonypi.h 2003-08-26 15:54:22.000000000 -0700 @@ -94,6 +94,8 @@ #define SONYPI_EVENT_MEMORYSTICK_INSERT 54 #define SONYPI_EVENT_MEMORYSTICK_EJECT 55 #define SONYPI_EVENT_ANYBUTTON_RELEASED 56 +#define SONYPI_EVENT_BATTERY_INSERT 57 +#define SONYPI_EVENT_BATTERY_REMOVE 58 /* get/set brightness */ #define SONYPI_IOCGBRT _IOR('v', 0, __u8) diff -urN linux-2.4.22-bk1/include/linux/sysctl.h linux-2.4.22-bk2/include/linux/sysctl.h --- linux-2.4.22-bk1/include/linux/sysctl.h 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/include/linux/sysctl.h 2003-08-26 15:54:22.000000000 -0700 @@ -146,6 +146,7 @@ VM_MAX_MAP_COUNT=11, /* int: Maximum number of active map areas */ VM_MIN_READAHEAD=12, /* Min file readahead */ VM_MAX_READAHEAD=13, /* Max file readahead */ + VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ }; diff -urN linux-2.4.22-bk1/include/net/ip_vs.h linux-2.4.22-bk2/include/net/ip_vs.h --- linux-2.4.22-bk1/include/net/ip_vs.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/include/net/ip_vs.h 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,932 @@ +/* + * IP Virtual Server + * data structure and functionality definitions + */ + 
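+/*
+ * Editor's note, not part of this patch: user space drives this
+ * interface through [gs]etsockopt() on a raw socket, as ipvsadm does.
+ * A minimal sketch against the constants defined below (assumes IPVS
+ * is loaded and the caller holds CAP_NET_ADMIN):
+ *
+ *	struct ip_vs_getinfo info;
+ *	socklen_t len = sizeof(info);
+ *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+ *
+ *	if (getsockopt(s, IPPROTO_IP, IP_VS_SO_GET_INFO, &info, &len) == 0)
+ *		printf("IPVS v%d.%d.%d, %u hash slots, %u services\n",
+ *		       NVERSION(info.version), info.size, info.num_services);
+ */
+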
+#ifndef _IP_VS_H
+#define _IP_VS_H
+
+#include <asm/types.h>		/* For __uXX types */
+
+#define IP_VS_VERSION_CODE	0x01000A
+#define NVERSION(version)	\
+	(version >> 16) & 0xFF,	\
+	(version >> 8) & 0xFF,	\
+	version & 0xFF
+
+/*
+ * Virtual Service Flags
+ */
+#define IP_VS_SVC_F_PERSISTENT	0x0001	/* persistent port */
+#define IP_VS_SVC_F_HASHED	0x0002	/* hashed entry */
+
+/*
+ * Destination Server Flags
+ */
+#define IP_VS_DEST_F_AVAILABLE	0x0001	/* Available tag */
+
+/*
+ * IPVS sync daemon states
+ */
+#define IP_VS_STATE_NONE	0	/* daemon is stopped */
+#define IP_VS_STATE_MASTER	1	/* started as master */
+#define IP_VS_STATE_BACKUP	2	/* started as backup */
+
+/*
+ * IPVS socket options
+ */
+#define IP_VS_BASE_CTL		(64+1024+64)	/* base */
+
+#define IP_VS_SO_SET_NONE	IP_VS_BASE_CTL	/* just peek */
+#define IP_VS_SO_SET_INSERT	(IP_VS_BASE_CTL+1)
+#define IP_VS_SO_SET_ADD	(IP_VS_BASE_CTL+2)
+#define IP_VS_SO_SET_EDIT	(IP_VS_BASE_CTL+3)
+#define IP_VS_SO_SET_DEL	(IP_VS_BASE_CTL+4)
+#define IP_VS_SO_SET_FLUSH	(IP_VS_BASE_CTL+5)
+#define IP_VS_SO_SET_LIST	(IP_VS_BASE_CTL+6)
+#define IP_VS_SO_SET_ADDDEST	(IP_VS_BASE_CTL+7)
+#define IP_VS_SO_SET_DELDEST	(IP_VS_BASE_CTL+8)
+#define IP_VS_SO_SET_EDITDEST	(IP_VS_BASE_CTL+9)
+#define IP_VS_SO_SET_TIMEOUTS	(IP_VS_BASE_CTL+10)
+#define IP_VS_SO_SET_STARTDAEMON (IP_VS_BASE_CTL+11)
+#define IP_VS_SO_SET_STOPDAEMON (IP_VS_BASE_CTL+12)
+#define IP_VS_SO_SET_RESTORE	(IP_VS_BASE_CTL+13)
+#define IP_VS_SO_SET_SAVE	(IP_VS_BASE_CTL+14)
+#define IP_VS_SO_SET_ZERO	(IP_VS_BASE_CTL+15)
+#define IP_VS_SO_SET_MAX	IP_VS_SO_SET_ZERO
+
+#define IP_VS_SO_GET_VERSION	IP_VS_BASE_CTL
+#define IP_VS_SO_GET_INFO	(IP_VS_BASE_CTL+1)
+#define IP_VS_SO_GET_SERVICES	(IP_VS_BASE_CTL+2)
+#define IP_VS_SO_GET_SERVICE	(IP_VS_BASE_CTL+3)
+#define IP_VS_SO_GET_DESTS	(IP_VS_BASE_CTL+4)
+#define IP_VS_SO_GET_DEST	(IP_VS_BASE_CTL+5)	/* not used now */
+#define IP_VS_SO_GET_TIMEOUTS	(IP_VS_BASE_CTL+6)
+#define IP_VS_SO_GET_DAEMON	(IP_VS_BASE_CTL+7)
+#define IP_VS_SO_GET_MAX	IP_VS_SO_GET_DAEMON
+
+
+/*
+ * IPVS Connection Flags
+ */
+#define IP_VS_CONN_F_FWD_MASK	0x0007	/* mask for the fwd methods */
+#define IP_VS_CONN_F_MASQ	0x0000	/* masquerading */
+#define IP_VS_CONN_F_LOCALNODE	0x0001	/* local node */
+#define IP_VS_CONN_F_TUNNEL	0x0002	/* tunneling */
+#define IP_VS_CONN_F_DROUTE	0x0003	/* direct routing */
+#define IP_VS_CONN_F_BYPASS	0x0004	/* cache bypass */
+#define IP_VS_CONN_F_HASHED	0x0040	/* hashed entry */
+#define IP_VS_CONN_F_NOOUTPUT	0x0080	/* no output packets */
+#define IP_VS_CONN_F_INACTIVE	0x0100	/* not established */
+#define IP_VS_CONN_F_OUT_SEQ	0x0200	/* must do output seq adjust */
+#define IP_VS_CONN_F_IN_SEQ	0x0400	/* must do input seq adjust */
+#define IP_VS_CONN_F_SEQ_MASK	0x0600	/* in/out sequence mask */
+#define IP_VS_CONN_F_NO_CPORT	0x0800	/* no client port set yet */
+
+/* Move it to better place one day, for now keep it unique */
+#define NFC_IPVS_PROPERTY	0x10000
+
+#define IP_VS_SCHEDNAME_MAXLEN	16
+#define IP_VS_IFNAME_MAXLEN	16
+
+struct ip_vs_rule_user {
+	/* global options */
+	int		tcp_timeout;	/* timeout values */
+	int		tcp_fin_timeout;
+	int		udp_timeout;
+	int		state;		/* sync daemon state */
+	char		mcast_ifn[IP_VS_IFNAME_MAXLEN];
+					/* multicast interface name */
+
+	/* virtual service options */
+	u_int16_t	protocol;
+	u_int32_t	vaddr;		/* virtual address */
+	u_int16_t	vport;
+	u_int32_t	vfwmark;	/* firewall mark of virtual service*/
+	char		sched_name[IP_VS_SCHEDNAME_MAXLEN];
+	unsigned	vs_flags;	/* virtual service flags */
+	unsigned	timeout;	/* persistent timeout in ticks */
+	u_int32_t	netmask;	/* persistent netmask */
+
+	/* destination specific options */
+	u_int32_t	daddr;		/* destination address */
+	u_int16_t	dport;
+	unsigned	conn_flags;	/* destination flags */
+	int		weight;		/* destination weight */
+};
+
+
+/*
+ * IPVS statistics object (for user space)
+ */
+struct ip_vs_stats_user
+{
+	__u32		conns;		/* connections scheduled */
+	__u32		inpkts;		/* incoming packets */
+	__u32		outpkts;	/* outgoing packets */
+	__u64		inbytes;	/* incoming bytes */
+	__u64		outbytes;	/* outgoing bytes */
+
+	__u32		cps;		/* current connection rate */
+	__u32		inpps;		/* current in packet rate */
+	__u32		outpps;		/* current out packet rate */
+	__u32		inbps;		/* current in byte rate */
+	__u32		outbps;		/* current out byte rate */
+};
+
+
+/* The argument to IP_VS_SO_GET_INFO */
+struct ip_vs_getinfo {
+	/* version number */
+	unsigned int	version;
+
+	/* size of connection hash table */
+	unsigned int	size;
+
+	/* number of virtual services */
+	unsigned int	num_services;
+};
+
+/* The argument to IP_VS_SO_GET_SERVICE */
+struct ip_vs_service_user {
+	/* which service: user fills this in */
+	u_int16_t	protocol;
+	u_int32_t	addr;		/* virtual address */
+	u_int16_t	port;
+	u_int32_t	fwmark;		/* firewall mark of virtual service */
+
+	/* service options */
+	char		sched_name[IP_VS_SCHEDNAME_MAXLEN];
+	unsigned	flags;		/* virtual service flags */
+	unsigned	timeout;	/* persistent timeout in ticks */
+	u_int32_t	netmask;	/* persistent netmask */
+
+	/* number of real servers */
+	unsigned int	num_dests;
+
+	/* statistics */
+	struct ip_vs_stats_user stats;
+};
+
+struct ip_vs_dest_user {
+	u_int32_t	addr;		/* destination address */
+	u_int16_t	port;
+	unsigned	flags;		/* destination flags */
+	int		weight;		/* destination weight */
+	u_int32_t	activeconns;	/* active connections */
+	u_int32_t	inactconns;	/* inactive connections */
+
+	/* statistics */
+	struct ip_vs_stats_user stats;
+};
+
+/* The argument to IP_VS_SO_GET_DESTS */
+struct ip_vs_get_dests {
+	/* which service: user fills this in */
+	u_int16_t	protocol;
+	u_int32_t	addr;		/* virtual address */
+	u_int16_t	port;
+	u_int32_t	fwmark;		/* firewall mark of virtual service */
+
+	/* number of real servers */
+	unsigned int	num_dests;
+
+	/* the real servers */
+	struct ip_vs_dest_user	entrytable[0];
+};
+
+/* The argument to IP_VS_SO_GET_SERVICES */
+struct ip_vs_get_services {
+	/* number of virtual services */
+	unsigned int	num_services;
+
+	/* service table */
+	struct ip_vs_service_user entrytable[0];
+};
+
+/* The argument to IP_VS_SO_GET_TIMEOUTS */
+struct ip_vs_timeout_user {
+	int		tcp_timeout;
+	int		tcp_fin_timeout;
+	int		udp_timeout;
+};
+
+/* The argument to IP_VS_SO_GET_DAEMON */
+struct ip_vs_daemon_user {
+	int		state;		/* sync daemon state */
+	char		mcast_ifn[IP_VS_IFNAME_MAXLEN];	/* multicast interface name */
+};
+
+
+#ifdef __KERNEL__
+
+#include
+#include <linux/list.h>		/* for struct list_head */
+#include <linux/spinlock.h>	/* for struct rwlock_t */
+#include <linux/skbuff.h>	/* for struct sk_buff */
+#include <linux/ip.h>		/* for struct iphdr */
+#include <asm/atomic.h>		/* for struct atomic_t */
+#include <net/neighbour.h>	/* for struct neighbour; */
+#include <net/dst.h>		/* for struct dst_entry */
+#include <net/route.h>		/* for ip_route_output */
+#include
+#include
+
+
+#ifdef CONFIG_IP_VS_DEBUG
+extern int ip_vs_get_debug_level(void);
+#define IP_VS_DBG(level, msg...)			\
+	do {						\
+		if (level <= ip_vs_get_debug_level())	\
+			printk(KERN_DEBUG "IPVS: " msg);	\
+	} while (0)
+#define IP_VS_DBG_RL(msg...) 
\ + do { \ + if (net_ratelimit()) \ + printk(KERN_DEBUG "IPVS: " msg); \ + } while (0) +#else /* NO DEBUGGING at ALL */ +#define IP_VS_DBG(level, msg...) do {} while (0) +#define IP_VS_DBG_RL(msg...) do {} while (0) +#endif + +#define IP_VS_BUG() BUG() +#define IP_VS_ERR(msg...) printk(KERN_ERR "IPVS: " msg) +#define IP_VS_INFO(msg...) printk(KERN_INFO "IPVS: " msg) +#define IP_VS_WARNING(msg...) \ + printk(KERN_WARNING "IPVS: " msg) +#define IP_VS_ERR_RL(msg...) \ + do { \ + if (net_ratelimit()) \ + printk(KERN_ERR "IPVS: " msg); \ + } while (0) + +#ifdef CONFIG_IP_VS_DEBUG +#define EnterFunction(level) \ + do { \ + if (level <= ip_vs_get_debug_level()) \ + printk(KERN_DEBUG "Enter: %s, %s line %i\n", \ + __FUNCTION__, __FILE__, __LINE__); \ + } while (0) +#define LeaveFunction(level) \ + do { \ + if (level <= ip_vs_get_debug_level()) \ + printk(KERN_DEBUG "Leave: %s, %s line %i\n", \ + __FUNCTION__, __FILE__, __LINE__); \ + } while (0) +#else +#define EnterFunction(level) do {} while (0) +#define LeaveFunction(level) do {} while (0) +#endif + + +/* + * The port number of FTP service (in network order). + */ +#define FTPPORT __constant_htons(21) +#define FTPDATA __constant_htons(20) + + +/* + * IPVS sysctl variables under the /proc/sys/net/ipv4/vs/ + */ +#define NET_IPV4_VS 21 + +enum { + NET_IPV4_VS_DEBUG_LEVEL=1, + NET_IPV4_VS_AMEMTHRESH=2, + NET_IPV4_VS_AMDROPRATE=3, + NET_IPV4_VS_DROP_ENTRY=4, + NET_IPV4_VS_DROP_PACKET=5, + NET_IPV4_VS_SECURE_TCP=6, + NET_IPV4_VS_TO_ES=7, + NET_IPV4_VS_TO_SS=8, + NET_IPV4_VS_TO_SR=9, + NET_IPV4_VS_TO_FW=10, + NET_IPV4_VS_TO_TW=11, + NET_IPV4_VS_TO_CL=12, + NET_IPV4_VS_TO_CW=13, + NET_IPV4_VS_TO_LA=14, + NET_IPV4_VS_TO_LI=15, + NET_IPV4_VS_TO_SA=16, + NET_IPV4_VS_TO_UDP=17, + NET_IPV4_VS_TO_ICMP=18, + NET_IPV4_VS_LBLC_EXPIRE=19, + NET_IPV4_VS_LBLCR_EXPIRE=20, + NET_IPV4_VS_CACHE_BYPASS=22, + NET_IPV4_VS_EXPIRE_NODEST_CONN=23, + NET_IPV4_VS_SYNC_THRESHOLD=24, + NET_IPV4_VS_NAT_ICMP_SEND=25, + NET_IPV4_VS_LAST +}; + + +/* + * IPVS State Values + */ +enum { + IP_VS_S_NONE = 0, + IP_VS_S_ESTABLISHED, + IP_VS_S_SYN_SENT, + IP_VS_S_SYN_RECV, + IP_VS_S_FIN_WAIT, + IP_VS_S_TIME_WAIT, + IP_VS_S_CLOSE, + IP_VS_S_CLOSE_WAIT, + IP_VS_S_LAST_ACK, + IP_VS_S_LISTEN, + IP_VS_S_SYNACK, + IP_VS_S_UDP, + IP_VS_S_ICMP, + IP_VS_S_LAST +}; + + +struct ip_vs_timeout_table { + atomic_t refcnt; + int scale; + int timeout[IP_VS_S_LAST+1]; +}; + + +/* + * Transport protocol header + */ +union ip_vs_tphdr { + unsigned char *raw; + struct udphdr *uh; + struct tcphdr *th; + struct icmphdr *icmph; + __u16 *portp; +}; + + +/* + * Delta sequence info structure + * Each ip_vs_conn has 2 (output AND input seq. changes). + * Only used in the VS/NAT. 
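+ *
+ * Editor's worked example, not part of this patch: if an application
+ * helper (e.g. the FTP module) rewrites a payload and makes it 3 bytes
+ * longer, delta becomes 3; sequence numbers beyond init_seq are then
+ * shifted by +3 in one direction and the matching ACK numbers by -3 in
+ * the other, while previous_delta still applies to segments that
+ * predate the latest resize.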
+ */ +struct ip_vs_seq { + __u32 init_seq; /* Add delta from this seq */ + __u32 delta; /* Delta in sequence numbers */ + __u32 previous_delta; /* Delta in sequence numbers + before last resized pkt */ +}; + + +/* + * IPVS statistics object + */ +struct ip_vs_stats +{ + __u32 conns; /* connections scheduled */ + __u32 inpkts; /* incoming packets */ + __u32 outpkts; /* outgoing packets */ + __u64 inbytes; /* incoming bytes */ + __u64 outbytes; /* outgoing bytes */ + + __u32 cps; /* current connection rate */ + __u32 inpps; /* current in packet rate */ + __u32 outpps; /* current out packet rate */ + __u32 inbps; /* current in byte rate */ + __u32 outbps; /* current out byte rate */ + + spinlock_t lock; /* spin lock */ +}; + + +/* + * IP_VS structure allocated for each dynamically scheduled connection + */ +struct ip_vs_conn { + struct list_head c_list; /* hashed list heads */ + + /* Protocol, addresses and port numbers */ + __u32 caddr; /* client address */ + __u32 vaddr; /* virtual address */ + __u32 daddr; /* destination address */ + __u16 cport; + __u16 vport; + __u16 dport; + __u16 protocol; /* Which protocol (TCP/UDP) */ + + /* counter and timer */ + atomic_t refcnt; /* reference count */ + struct timer_list timer; /* Expiration timer */ + volatile unsigned long timeout; /* timeout */ + struct ip_vs_timeout_table *timeout_table; + + /* Flags and state transition */ + spinlock_t lock; /* lock for state transition */ + volatile __u16 flags; /* status flags */ + volatile __u16 state; /* state info */ + + /* Control members */ + struct ip_vs_conn *control; /* Master control connection */ + atomic_t n_control; /* Number of controlled ones */ + struct ip_vs_dest *dest; /* real server */ + atomic_t in_pkts; /* incoming packet counter */ + + /* packet transmitter for different forwarding methods */ + int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp); + + /* Note: we can group the following members into a structure, + in order to save more space, and the following members are + only used in VS/NAT anyway */ + struct ip_vs_app *app; /* bound ip_vs_app object */ + void *app_data; /* Application private data */ + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + + +/* + * The information about the virtual service offered to the net + * and the forwarding entries + */ +struct ip_vs_service { + struct list_head s_list; /* for normal service table */ + struct list_head f_list; /* for fwmark-based service table */ + atomic_t refcnt; /* reference counter */ + atomic_t usecnt; /* use counter */ + + __u16 protocol; /* which protocol (TCP/UDP) */ + __u32 addr; /* IP address for virtual service */ + __u16 port; /* port number for the service */ + __u32 fwmark; /* firewall mark of the service */ + unsigned flags; /* service status flags */ + unsigned timeout; /* persistent timeout in ticks */ + __u32 netmask; /* grouping granularity */ + + struct list_head destinations; /* real server d-linked list */ + __u32 num_dests; /* number of servers */ + struct ip_vs_stats stats; /* statistics for the service */ + + /* for scheduling */ + struct ip_vs_scheduler *scheduler; /* bound scheduler object */ + rwlock_t sched_lock; /* lock sched_data */ + void *sched_data; /* scheduler application data */ +}; + + +/* + * The real server destination forwarding entry + * with ip address, port number, and so on. 
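+ *
+ * Editor's note, not part of this patch: activeconns, inactconns and
+ * weight are atomic_t because they are touched from packet-processing
+ * context without a service-wide lock; schedulers read them racily,
+ * which is acceptable for a load estimate.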
+ */ +struct ip_vs_dest { + struct list_head n_list; /* for the dests in the service */ + struct list_head d_list; /* for table with all the dests */ + + __u32 addr; /* IP address of real server */ + __u16 port; /* port number of the service */ + unsigned flags; /* dest status flags */ + atomic_t weight; /* server weight */ + atomic_t conn_flags; /* flags to copy to conn */ + atomic_t activeconns; /* active connections */ + atomic_t inactconns; /* inactive connections */ + atomic_t refcnt; /* reference counter */ + struct ip_vs_stats stats; /* statistics */ + + /* for destination cache */ + spinlock_t dst_lock; /* lock dst_cache */ + struct dst_entry *dst_cache; /* destination cache entry */ + u32 dst_rtos; /* RT_TOS(tos) for dst */ + + /* for virtual service */ + struct ip_vs_service *svc; /* service that it belongs to */ + __u16 protocol; /* which protocol (TCP/UDP) */ + __u32 vaddr; /* IP address for virtual service */ + __u16 vport; /* port number for the service */ + __u32 vfwmark; /* firewall mark of the service */ +}; + + +/* + * The scheduler object + */ +struct ip_vs_scheduler { + struct list_head n_list; /* d-linked list head */ + char *name; /* scheduler name */ + atomic_t refcnt; /* reference counter */ + struct module *module; /* THIS_MODULE/NULL */ + + /* scheduler initializing service */ + int (*init_service)(struct ip_vs_service *svc); + /* scheduling service finish */ + int (*done_service)(struct ip_vs_service *svc); + /* scheduler updating service */ + int (*update_service)(struct ip_vs_service *svc); + + /* selecting a server from the given service */ + struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, + struct iphdr *iph); +}; + + +/* + * The application module object + */ +struct ip_vs_app +{ + struct list_head n_list; /* d-linked list head */ + char *name; /* name of application module */ + unsigned type; /* type = proto<<16 | port + (host byte order)*/ + struct module *module; /* THIS_MODULE/NULL */ + + /* ip_vs_app initializer */ + int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *); + /* ip_vs_app finish */ + int (*done_conn)(struct ip_vs_app *, struct ip_vs_conn *); + /* output hook */ + int (*pkt_out)(struct ip_vs_app *, + struct ip_vs_conn *, struct sk_buff *); + /* input hook */ + int (*pkt_in)(struct ip_vs_app *, + struct ip_vs_conn *, struct sk_buff *); +}; + + +/* + * IPVS core functions + * (from ip_vs_core.c) + */ +extern const char *ip_vs_proto_name(unsigned proto); +extern unsigned int check_for_ip_vs_out(struct sk_buff **skb_p, + int (*okfn)(struct sk_buff *)); + + +/* + * ip_vs_conn handling functions + * (from ip_vs_conn.c) + */ + +/* + * IPVS connection entry hash table + */ +#ifndef CONFIG_IP_VS_TAB_BITS +#define CONFIG_IP_VS_TAB_BITS 12 +#endif +/* make sure that IP_VS_CONN_TAB_BITS is located in [8, 20] */ +#if CONFIG_IP_VS_TAB_BITS < 8 +#define IP_VS_CONN_TAB_BITS 8 +#endif +#if CONFIG_IP_VS_TAB_BITS > 20 +#define IP_VS_CONN_TAB_BITS 20 +#endif +#if 8 <= CONFIG_IP_VS_TAB_BITS && CONFIG_IP_VS_TAB_BITS <= 20 +#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS +#endif +#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS) +#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1) + +#define VS_STATE_INPUT 0 +#define VS_STATE_OUTPUT 4 +#define VS_STATE_INPUT_ONLY 8 + +extern struct ip_vs_timeout_table vs_timeout_table; +extern struct ip_vs_timeout_table vs_timeout_table_dos; + +extern struct ip_vs_conn *ip_vs_conn_in_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port); +extern struct ip_vs_conn *ip_vs_conn_out_get 
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port); + +/* put back the conn without restarting its timer */ +static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) +{ + atomic_dec(&cp->refcnt); +} +extern void ip_vs_conn_put(struct ip_vs_conn *cp); + +extern struct ip_vs_conn * +ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport, + __u32 daddr, __u16 dport, unsigned flags, + struct ip_vs_dest *dest); +extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp); + +extern const char * ip_vs_state_name(int state); +extern int ip_vs_set_state(struct ip_vs_conn *cp, int state_off, + struct iphdr *iph, void *tp); +extern int ip_vs_conn_listen(struct ip_vs_conn *cp); +extern int ip_vs_check_template(struct ip_vs_conn *ct); +extern void ip_vs_secure_tcp_set(int on); +extern void ip_vs_random_dropentry(void); +extern int ip_vs_conn_init(void); +extern void ip_vs_conn_cleanup(void); + +static inline void ip_vs_control_del(struct ip_vs_conn *cp) +{ + struct ip_vs_conn *ctl_cp = cp->control; + if (!ctl_cp) { + IP_VS_ERR("request control DEL for uncontrolled: " + "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", + NIPQUAD(cp->caddr),ntohs(cp->cport), + NIPQUAD(cp->vaddr),ntohs(cp->vport)); + return; + } + + IP_VS_DBG(7, "DELeting control for: " + "cp.dst=%d.%d.%d.%d:%d ctl_cp.dst=%d.%d.%d.%d:%d\n", + NIPQUAD(cp->caddr),ntohs(cp->cport), + NIPQUAD(ctl_cp->caddr),ntohs(ctl_cp->cport)); + + cp->control = NULL; + if (atomic_read(&ctl_cp->n_control) == 0) { + IP_VS_ERR("BUG control DEL with n=0 : " + "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", + NIPQUAD(cp->caddr),ntohs(cp->cport), + NIPQUAD(cp->vaddr),ntohs(cp->vport)); + return; + } + atomic_dec(&ctl_cp->n_control); +} + +static inline void +ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) +{ + if (cp->control) { + IP_VS_ERR("request control ADD for already controlled: " + "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n", + NIPQUAD(cp->caddr),ntohs(cp->cport), + NIPQUAD(cp->vaddr),ntohs(cp->vport)); + ip_vs_control_del(cp); + } + + IP_VS_DBG(7, "ADDing control for: " + "cp.dst=%d.%d.%d.%d:%d ctl_cp.dst=%d.%d.%d.%d:%d\n", + NIPQUAD(cp->caddr),ntohs(cp->cport), + NIPQUAD(ctl_cp->caddr),ntohs(ctl_cp->cport)); + + cp->control = ctl_cp; + atomic_inc(&ctl_cp->n_control); +} + + +/* + * IPVS application functions + * (from ip_vs_app.c) + */ +#define IP_VS_APP_MAX_PORTS 8 +extern int register_ip_vs_app(struct ip_vs_app *mapp, + unsigned short proto, __u16 port); +extern int unregister_ip_vs_app(struct ip_vs_app *mapp); +extern struct ip_vs_app * ip_vs_bind_app(struct ip_vs_conn *cp); +extern int ip_vs_unbind_app(struct ip_vs_conn *cp); +extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb); +extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb); +extern int ip_vs_skb_replace(struct sk_buff *skb, int pri, + char *o_buf, int o_len, char *n_buf, int n_len); +extern int ip_vs_app_init(void); +extern void ip_vs_app_cleanup(void); + + +/* + * Registering/unregistering scheduler functions + * (from ip_vs_sched.c) + */ +extern int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); +extern int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); +extern int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler); +extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc); +extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); +extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); + + +/* + * IPVS control data and 
functions + * (from ip_vs_ctl.c) + */ +extern int sysctl_ip_vs_cache_bypass; +extern int sysctl_ip_vs_expire_nodest_conn; +extern int sysctl_ip_vs_sync_threshold; +extern int sysctl_ip_vs_nat_icmp_send; +extern struct ip_vs_stats ip_vs_stats; + +extern struct ip_vs_service *ip_vs_service_get(__u32 fwmark, + __u16 protocol, + __u32 vaddr, __u16 vport); +static inline void ip_vs_service_put(struct ip_vs_service *svc) +{ + atomic_dec(&svc->usecnt); +} + +extern struct ip_vs_dest * +ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport); +extern void ip_vs_random_dropentry(void); +extern int ip_vs_control_init(void); +extern void ip_vs_control_cleanup(void); + + +/* + * IPVS sync daemon data and function prototypes + * (from ip_vs_sync.c) + */ +extern volatile int ip_vs_sync_state; +extern char ip_vs_mcast_ifn[IP_VS_IFNAME_MAXLEN]; +extern int start_sync_thread(int state, char *mcast_ifn); +extern int stop_sync_thread(void); +extern void ip_vs_sync_conn(struct ip_vs_conn *cp); + + +/* + * IPVS rate estimator prototypes (from ip_vs_est.c) + */ +extern int ip_vs_new_estimator(struct ip_vs_stats *stats); +extern void ip_vs_kill_estimator(struct ip_vs_stats *stats); +extern void ip_vs_zero_estimator(struct ip_vs_stats *stats); + + +/* + * This is a simple mechanism to ignore packets when + * we are loaded. Just set ip_vs_drop_rate to 'n' and + * we start to drop 1/rate of the packets + */ +extern int ip_vs_drop_rate; +extern int ip_vs_drop_counter; + +static __inline__ int ip_vs_todrop(void) +{ + if (!ip_vs_drop_rate) return 0; + if (--ip_vs_drop_counter > 0) return 0; + ip_vs_drop_counter = ip_vs_drop_rate; + return 1; +} + + +/* + * ip_vs_fwd_tag returns the forwarding tag of the connection + */ +#define IP_VS_FWD_METHOD(cp) (cp->flags & IP_VS_CONN_F_FWD_MASK) + +extern __inline__ char ip_vs_fwd_tag(struct ip_vs_conn *cp) +{ + char fwd; + + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + fwd = 'M'; break; + case IP_VS_CONN_F_LOCALNODE: + fwd = 'L'; break; + case IP_VS_CONN_F_TUNNEL: + fwd = 'T'; break; + case IP_VS_CONN_F_DROUTE: + fwd = 'R'; break; + case IP_VS_CONN_F_BYPASS: + fwd = 'B'; break; + default: + fwd = '?'; break; + } + return fwd; +} + + +/* + * transport layer header checking + */ +extern inline int ip_vs_header_check(struct sk_buff *skb, int proto, int ihl) +{ + int len; + + switch (proto) { + case IPPROTO_TCP: + len = ihl + sizeof(struct tcphdr); + /* we don't care about TCP options */ + break; + case IPPROTO_UDP: + len = ihl + sizeof(struct udphdr); + break; + default: + len = 0; + } + + /* guarantee protocol header available in skb data area */ + if (!pskb_may_pull(skb, len)) + return -1; + else + return 0; +} + + +/* + * Destination cache + */ +static inline void +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = dst; + dest->dst_rtos = rtos; + dst_release(old_dst); +} + +static inline void +__ip_vs_dst_reset(struct ip_vs_dest *dest) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = NULL; + dst_release(old_dst); +} + +static inline struct dst_entry * +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) +{ + struct dst_entry *dst = dest->dst_cache; + + if (!dst) + return NULL; + if ((dst->obsolete || rtos != dest->dst_rtos) && + dst->ops->check(dst, cookie) == NULL) { + dest->dst_cache = 0; + return NULL; + } + dst_hold(dst); + return dst; +} + +static inline struct rtable * +__ip_vs_get_out_rt(struct 
ip_vs_conn *cp, u32 rtos) +{ + struct rtable *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + if (!(rt = (struct rtable *) + __ip_vs_dst_check(dest, rtos, 0))) { + if (ip_route_output(&rt, dest->addr, 0, rtos, 0)) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip_route_output error, " + "dest: %u.%u.%u.%u\n", + NIPQUAD(dest->addr)); + return NULL; + } + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", + NIPQUAD(dest->addr), + atomic_read(&rt->u.dst.__refcnt), rtos); + } + spin_unlock(&dest->dst_lock); + } else { + if (ip_route_output(&rt, cp->daddr, 0, rtos, 0)) { + IP_VS_DBG_RL("ip_route_output error, dest: " + "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); + return NULL; + } + } + + return rt; +} + +static inline u16 ip_vs_check_diff(u32 old, u32 new, u16 oldsum) +{ + u32 diff[2] = { old, new }; + + return csum_fold(csum_partial((char *) diff, sizeof(diff), + oldsum ^ 0xFFFF)); +} + +static inline void ip_vs_fast_check_update(union ip_vs_tphdr *h, + u32 oldip, u32 newip, u16 oldport, u16 newport, u8 protocol) +{ + u16 *checkp; + + if (protocol == IPPROTO_TCP) + checkp = &h->th->check; + else + checkp = &h->uh->check; + *checkp = ip_vs_check_diff(~oldip, newip, + ip_vs_check_diff(oldport ^ 0xFFFF, + newport, *checkp)); + if (!*checkp && protocol == IPPROTO_UDP) + *checkp = 0xFFFF; +} + +static inline int +ip_vs_skb_cow(struct sk_buff *skb, unsigned int headroom, + struct iphdr **iph_p, unsigned char **t_p) +{ + int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb); + + if (delta < 0) + delta = 0; + + if (delta || skb_cloned(skb)) { + if (pskb_expand_head(skb, (delta+15)&~15, 0, GFP_ATOMIC)) + return -ENOMEM; + + /* skb data changed, update pointers */ + *iph_p = skb->nh.iph; + *t_p = (char*) (*iph_p) + (*iph_p)->ihl * 4; + } + return 0; +} + +#endif /* __KERNEL__ */ + +#endif /* _IP_VS_H */ diff -urN linux-2.4.22-bk1/include/net/neighbour.h linux-2.4.22-bk2/include/net/neighbour.h --- linux-2.4.22-bk1/include/net/neighbour.h 2001-11-22 11:47:11.000000000 -0800 +++ linux-2.4.22-bk2/include/net/neighbour.h 2003-08-26 15:54:22.000000000 -0700 @@ -180,6 +180,7 @@ extern void neigh_destroy(struct neighbour *neigh); extern int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); extern int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, int override, int arp); +extern void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); extern int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); extern int neigh_resolve_output(struct sk_buff *skb); extern int neigh_connected_output(struct sk_buff *skb); diff -urN linux-2.4.22-bk1/include/net/pkt_sched.h linux-2.4.22-bk2/include/net/pkt_sched.h --- linux-2.4.22-bk1/include/net/pkt_sched.h 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/include/net/pkt_sched.h 2003-08-26 15:54:22.000000000 -0700 @@ -212,12 +212,16 @@ #if PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES -#if HZ == 100 +#if HZ < 96 +#define PSCHED_JSCALE 14 +#elif HZ >= 96 && HZ < 192 #define PSCHED_JSCALE 13 -#elif HZ == 1024 +#elif HZ >= 192 && HZ < 384 +#define PSCHED_JSCALE 12 +#elif HZ >= 384 && HZ < 768 +#define PSCHED_JSCALE 11 +#elif HZ >= 768 #define PSCHED_JSCALE 10 -#else -#define PSCHED_JSCALE 0 #endif #define PSCHED_EXPORTLIST_2 diff -urN linux-2.4.22-bk1/kernel/printk.c linux-2.4.22-bk2/kernel/printk.c --- linux-2.4.22-bk1/kernel/printk.c 2003-08-25 04:44:44.000000000 -0700 +++ 
linux-2.4.22-bk2/kernel/printk.c 2003-08-26 15:54:22.000000000 -0700 @@ -29,6 +29,7 @@ #include +#if !defined(CONFIG_LOG_BUF_SHIFT) || (CONFIG_LOG_BUF_SHIFT - 0 == 0) #if defined(CONFIG_MULTIQUAD) || defined(CONFIG_IA64) #define LOG_BUF_LEN (65536) #elif defined(CONFIG_ARCH_S390) @@ -38,6 +39,9 @@ #else #define LOG_BUF_LEN (16384) /* This must be a power of two */ #endif +#else /* CONFIG_LOG_BUF_SHIFT */ +#define LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#endif #define LOG_BUF_MASK (LOG_BUF_LEN-1) diff -urN linux-2.4.22-bk1/net/8021q/vlan.c linux-2.4.22-bk2/net/8021q/vlan.c --- linux-2.4.22-bk1/net/8021q/vlan.c 2003-06-13 07:51:39.000000000 -0700 +++ linux-2.4.22-bk2/net/8021q/vlan.c 2003-08-26 15:54:22.000000000 -0700 @@ -533,7 +533,9 @@ grp->vlan_devices[VLAN_ID] = new_dev; - vlan_proc_add_dev(new_dev); /* create it's proc entry */ + if (vlan_proc_add_dev(new_dev)<0)/* create it's proc entry */ + printk(KERN_WARNING "VLAN: failed to add proc entry for %s\n", + new_dev->name); if (real_dev->features & NETIF_F_HW_VLAN_FILTER) real_dev->vlan_rx_add_vid(real_dev, VLAN_ID); diff -urN linux-2.4.22-bk1/net/8021q/vlan_dev.c linux-2.4.22-bk2/net/8021q/vlan_dev.c --- linux-2.4.22-bk1/net/8021q/vlan_dev.c 2003-06-13 07:51:39.000000000 -0700 +++ linux-2.4.22-bk2/net/8021q/vlan_dev.c 2003-08-26 15:54:22.000000000 -0700 @@ -171,7 +171,7 @@ #ifdef VLAN_DEBUG printk(VLAN_DBG "%s: dropping skb: %p because came in on wrong device, dev: %s real_dev: %s, skb_dev: %s\n", - __FUNCTION__ skb, dev->name, + __FUNCTION__, skb, dev->name, VLAN_DEV_INFO(skb->dev)->real_dev->name, skb->dev->name); #endif diff -urN linux-2.4.22-bk1/net/8021q/vlanproc.c linux-2.4.22-bk2/net/8021q/vlanproc.c --- linux-2.4.22-bk1/net/8021q/vlanproc.c 2003-06-13 07:51:39.000000000 -0700 +++ linux-2.4.22-bk2/net/8021q/vlanproc.c 2003-08-26 15:54:22.000000000 -0700 @@ -204,8 +204,10 @@ #endif /** NOTE: This will consume the memory pointed to by dent, it seems. 
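 *
 * Editor's note, not part of this patch: the hunk below adds a NULL
 * check because vlan_proc_add_dev() failures are now survivable (see
 * the vlan.c hunk above), so a vlan device can reach this point with
 * dent never set, and remove_proc_entry() would oops on it.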
*/ - remove_proc_entry(VLAN_DEV_INFO(vlandev)->dent->name, proc_vlan_dir); - VLAN_DEV_INFO(vlandev)->dent = NULL; + if (VLAN_DEV_INFO(vlandev)->dent) { + remove_proc_entry(VLAN_DEV_INFO(vlandev)->dent->name, proc_vlan_dir); + VLAN_DEV_INFO(vlandev)->dent = NULL; + } return 0; } diff -urN linux-2.4.22-bk1/net/Makefile linux-2.4.22-bk2/net/Makefile --- linux-2.4.22-bk1/net/Makefile 2002-08-02 17:39:46.000000000 -0700 +++ linux-2.4.22-bk2/net/Makefile 2003-08-26 15:54:22.000000000 -0700 @@ -46,6 +46,10 @@ subdir-$(CONFIG_ECONET) += econet subdir-$(CONFIG_VLAN_8021Q) += 8021q +ifeq ($(CONFIG_NETFILTER),y) + mod-subdirs += ipv4/ipvs + subdir-$(CONFIG_IP_VS) += ipv4/ipvs +endif obj-y := socket.o $(join $(subdir-y), $(patsubst %,/%.o,$(notdir $(subdir-y)))) ifeq ($(CONFIG_NET),y) diff -urN linux-2.4.22-bk1/net/atm/common.c linux-2.4.22-bk2/net/atm/common.c --- linux-2.4.22-bk1/net/atm/common.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/atm/common.c 2003-08-26 15:54:22.000000000 -0700 @@ -84,7 +84,7 @@ #ifdef CONFIG_ATM_CLIP_MODULE EXPORT_SYMBOL(atm_clip_ops); -EXPORT_SYMBOL(atm_clip_ops_mutex); +EXPORT_SYMBOL(try_atm_clip_ops); EXPORT_SYMBOL(atm_clip_ops_set); #endif #endif diff -urN linux-2.4.22-bk1/net/core/Makefile linux-2.4.22-bk2/net/core/Makefile --- linux-2.4.22-bk1/net/core/Makefile 2002-08-02 17:39:46.000000000 -0700 +++ linux-2.4.22-bk2/net/core/Makefile 2003-08-26 15:54:22.000000000 -0700 @@ -21,7 +21,8 @@ obj-$(CONFIG_FILTER) += filter.o -obj-$(CONFIG_NET) += dev.o dev_mcast.o dst.o neighbour.o rtnetlink.o utils.o +obj-$(CONFIG_NET) += dev.o ethtool.o dev_mcast.o dst.o neighbour.o \ + rtnetlink.o utils.o obj-$(CONFIG_NETFILTER) += netfilter.o obj-$(CONFIG_NET_DIVERT) += dv.o diff -urN linux-2.4.22-bk1/net/core/dev.c linux-2.4.22-bk2/net/core/dev.c --- linux-2.4.22-bk1/net/core/dev.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/core/dev.c 2003-08-26 15:54:22.000000000 -0700 @@ -1588,6 +1588,7 @@ *budget -= work; list_del(&blog_dev->poll_list); + smp_mb__before_clear_bit(); clear_bit(__LINK_STATE_RX_SCHED, &blog_dev->state); if (queue->throttle) { @@ -2198,7 +2199,6 @@ cmd == SIOCBONDSLAVEINFOQUERY || cmd == SIOCBONDINFOQUERY || cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCETHTOOL || cmd == SIOCGMIIPHY || cmd == SIOCGMIIREG || cmd == SIOCSMIIREG || @@ -2294,6 +2294,20 @@ } return ret; + case SIOCETHTOOL: + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ethtool(&ifr); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + /* * These ioctl calls: * - require superuser power. @@ -2301,7 +2315,6 @@ * - return a value */ - case SIOCETHTOOL: case SIOCGMIIPHY: case SIOCGMIIREG: if (!capable(CAP_NET_ADMIN)) diff -urN linux-2.4.22-bk1/net/core/ethtool.c linux-2.4.22-bk2/net/core/ethtool.c --- linux-2.4.22-bk1/net/core/ethtool.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/core/ethtool.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,673 @@ +/* + * net/core/ethtool.c - Ethtool ioctl handler + * Copyright (c) 2003 Matthew Wilcox + * + * This file is where we call all the ethtool_ops commands to get + * the information ethtool needs. We fall back to calling do_ioctl() + * for drivers which haven't been converted to ethtool_ops yet. + * + * It's GPL, stupid. + */ + +#include +#include +#include +#include +#include + +/* + * Some useful ethtool_ops methods that're device independent. 
+ * If we find that all drivers want to do the same thing here, + * we can turn these into dev_() function calls. + */ + +u32 ethtool_op_get_link(struct net_device *dev) +{ + return netif_carrier_ok(dev) ? 1 : 0; +} + +u32 ethtool_op_get_tx_csum(struct net_device *dev) +{ + return (dev->features & NETIF_F_IP_CSUM) != 0; +} + +u32 ethtool_op_get_sg(struct net_device *dev) +{ + return (dev->features & NETIF_F_SG) != 0; +} + +int ethtool_op_set_sg(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_SG; + else + dev->features &= ~NETIF_F_SG; + + return 0; +} + +/* Handlers for each ethtool command */ + +static int ethtool_get_settings(struct net_device *dev, void *useraddr) +{ + struct ethtool_cmd cmd = { ETHTOOL_GSET }; + int err; + + if (!dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) + return -EFAULT; + return 0; +} + +static int ethtool_set_settings(struct net_device *dev, void *useraddr) +{ + struct ethtool_cmd cmd; + + if (!dev->ethtool_ops->set_settings) + return -EOPNOTSUPP; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + return dev->ethtool_ops->set_settings(dev, &cmd); +} + +static int ethtool_get_drvinfo(struct net_device *dev, void *useraddr) +{ + struct ethtool_drvinfo info; + struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops->get_drvinfo) + return -EOPNOTSUPP; + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GDRVINFO; + ops->get_drvinfo(dev, &info); + + if (ops->self_test_count) + info.testinfo_len = ops->self_test_count(dev); + if (ops->get_stats_count) + info.n_stats = ops->get_stats_count(dev); + if (ops->get_regs_len) + info.regdump_len = ops->get_regs_len(dev); + /* XXX: eeprom? 
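+	 * (Editor's note, not part of this patch: i.e. info.eedump_len is
+	 * left unfilled; this version of ethtool_ops has no get_eeprom_len
+	 * operation to query it from.)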
*/ + + if (copy_to_user(useraddr, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +static int ethtool_get_regs(struct net_device *dev, char *useraddr) +{ + struct ethtool_regs regs; + struct ethtool_ops *ops = dev->ethtool_ops; + void *regbuf; + int reglen, ret; + + if (!ops->get_regs || !ops->get_regs_len) + return -EOPNOTSUPP; + + if (copy_from_user(&regs, useraddr, sizeof(regs))) + return -EFAULT; + + reglen = ops->get_regs_len(dev); + if (regs.len > reglen) + regs.len = reglen; + + regbuf = kmalloc(reglen, GFP_USER); + if (!regbuf) + return -ENOMEM; + + ops->get_regs(dev, &regs, regbuf); + + ret = -EFAULT; + if (copy_to_user(useraddr, &regs, sizeof(regs))) + goto out; + useraddr += offsetof(struct ethtool_regs, data); + if (copy_to_user(useraddr, regbuf, reglen)) + goto out; + ret = 0; + + out: + kfree(regbuf); + return ret; +} + +static int ethtool_get_wol(struct net_device *dev, char *useraddr) +{ + struct ethtool_wolinfo wol = { ETHTOOL_GWOL }; + + if (!dev->ethtool_ops->get_wol) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_wol(dev, &wol); + + if (copy_to_user(useraddr, &wol, sizeof(wol))) + return -EFAULT; + return 0; +} + +static int ethtool_set_wol(struct net_device *dev, char *useraddr) +{ + struct ethtool_wolinfo wol; + + if (!dev->ethtool_ops->set_wol) + return -EOPNOTSUPP; + + if (copy_from_user(&wol, useraddr, sizeof(wol))) + return -EFAULT; + + return dev->ethtool_ops->set_wol(dev, &wol); +} + +static int ethtool_get_msglevel(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GMSGLVL }; + + if (!dev->ethtool_ops->get_msglevel) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_msglevel(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_msglevel(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_msglevel) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + dev->ethtool_ops->set_msglevel(dev, edata.data); + return 0; +} + +static int ethtool_nway_reset(struct net_device *dev) +{ + if (!dev->ethtool_ops->nway_reset) + return -EOPNOTSUPP; + + return dev->ethtool_ops->nway_reset(dev); +} + +static int ethtool_get_link(struct net_device *dev, void *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GLINK }; + + if (!dev->ethtool_ops->get_link) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_link(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_get_eeprom(struct net_device *dev, void *useraddr) +{ + struct ethtool_eeprom eeprom; + u8 *data; + int len, ret; + + if (!dev->ethtool_ops->get_eeprom) + return -EOPNOTSUPP; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + len = eeprom.len; + /* Check for wrap and zero */ + if (eeprom.offset + len <= eeprom.offset) + return -EINVAL; + + data = kmalloc(len, GFP_USER); + if (!data) + return -ENOMEM; + + if (copy_from_user(data, useraddr + sizeof(eeprom), len)) + return -EFAULT; + + ret = dev->ethtool_ops->get_eeprom(dev, &eeprom, data); + if (ret) + goto out; + + ret = -EFAULT; + if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) + goto out; + if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len)) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_set_eeprom(struct net_device *dev, void *useraddr) +{ + struct ethtool_eeprom eeprom; + u8 *data; + int len, ret; + + if
(!dev->ethtool_ops->set_eeprom) + return -EOPNOTSUPP; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + len = eeprom.len; + /* Check for wrap and zero */ + if (eeprom.offset + len <= eeprom.offset) + return -EINVAL; + + data = kmalloc(len, GFP_USER); + if (!data) + return -ENOMEM; + + if (copy_from_user(data, useraddr + sizeof(eeprom), len)) + return -EFAULT; + + ret = dev->ethtool_ops->set_eeprom(dev, &eeprom, data); + if (ret) + goto out; + + if (copy_to_user(useraddr + sizeof(eeprom), data, len)) + ret = -EFAULT; + + out: + kfree(data); + return ret; +} + +static int ethtool_get_coalesce(struct net_device *dev, void *useraddr) +{ + struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE }; + + if (!dev->ethtool_ops->get_coalesce) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_coalesce(dev, &coalesce); + + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + return 0; +} + +static int ethtool_set_coalesce(struct net_device *dev, void *useraddr) +{ + struct ethtool_coalesce coalesce; + + if (!dev->ethtool_ops->set_coalesce) + return -EOPNOTSUPP; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) + return -EFAULT; + + return dev->ethtool_ops->set_coalesce(dev, &coalesce); +} + +static int ethtool_get_ringparam(struct net_device *dev, void *useraddr) +{ + struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM }; + + if (!dev->ethtool_ops->get_ringparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_ringparam(dev, &ringparam); + + if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_ringparam(struct net_device *dev, void *useraddr) +{ + struct ethtool_ringparam ringparam; + + if (!dev->ethtool_ops->set_ringparam) + return -EOPNOTSUPP; + + if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) + return -EFAULT; + + return dev->ethtool_ops->set_ringparam(dev, &ringparam); +} + +static int ethtool_get_pauseparam(struct net_device *dev, void *useraddr) +{ + struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; + + if (!dev->ethtool_ops->get_pauseparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_pauseparam(dev, &pauseparam); + + if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_pauseparam(struct net_device *dev, void *useraddr) +{ + struct ethtool_pauseparam pauseparam; + + if (!dev->ethtool_ops->set_pauseparam) + return -EOPNOTSUPP; + + if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) + return -EFAULT; + + return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); +} + +static int ethtool_get_rx_csum(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GRXCSUM }; + + if (!dev->ethtool_ops->get_rx_csum) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_rx_csum(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_rx_csum(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_rx_csum) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + dev->ethtool_ops->set_rx_csum(dev, edata.data); + return 0; +} + +static int ethtool_get_tx_csum(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GTXCSUM }; + + if (!dev->ethtool_ops->get_tx_csum) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_tx_csum(dev); + + if
(copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_tx_csum(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_tx_csum) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + return dev->ethtool_ops->set_tx_csum(dev, edata.data); +} + +static int ethtool_get_sg(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GSG }; + + if (!dev->ethtool_ops->get_sg) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_sg(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_sg(struct net_device *dev, char *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_sg) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + return dev->ethtool_ops->set_sg(dev, edata.data); +} + +static int ethtool_self_test(struct net_device *dev, char *useraddr) +{ + struct ethtool_test test; + struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret; + + if (!ops->self_test || !ops->self_test_count) + return -EOPNOTSUPP; + + if (copy_from_user(&test, useraddr, sizeof(test))) + return -EFAULT; + + test.len = ops->self_test_count(dev); + data = kmalloc(test.len * sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + ops->self_test(dev, &test, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &test, sizeof(test))) + goto out; + useraddr += sizeof(test); + if (copy_to_user(useraddr, data, test.len * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_get_strings(struct net_device *dev, void *useraddr) +{ + struct ethtool_gstrings gstrings; + struct ethtool_ops *ops = dev->ethtool_ops; + u8 *data; + int ret; + + if (!ops->get_strings) + return -EOPNOTSUPP; + + if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) + return -EFAULT; + + switch (gstrings.string_set) { + case ETH_SS_TEST: + if (ops->self_test_count) + gstrings.len = ops->self_test_count(dev); + else + return -EOPNOTSUPP; + break; + case ETH_SS_STATS: + if (ops->get_stats_count) + gstrings.len = ops->get_stats_count(dev); + else + return -EOPNOTSUPP; + break; + default: + return -EINVAL; + } + + data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + if (!data) + return -ENOMEM; + + ops->get_strings(dev, gstrings.string_set, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) + goto out; + useraddr += sizeof(gstrings); + if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_phys_id(struct net_device *dev, void *useraddr) +{ + struct ethtool_value id; + + if (!dev->ethtool_ops->phys_id) + return -EOPNOTSUPP; + + if (copy_from_user(&id, useraddr, sizeof(id))) + return -EFAULT; + + return dev->ethtool_ops->phys_id(dev, id.data); +} + +static int ethtool_get_stats(struct net_device *dev, void *useraddr) +{ + struct ethtool_stats stats; + struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret; + + if (!ops->get_ethtool_stats || !ops->get_stats_count) + return -EOPNOTSUPP; + + if (copy_from_user(&stats, useraddr, sizeof(stats))) + return -EFAULT; + + stats.n_stats = ops->get_stats_count(dev); + data = kmalloc(stats.n_stats * sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + ops->get_ethtool_stats(dev, &stats, data); + + ret = -EFAULT; + if
(copy_to_user(useraddr, &stats, sizeof(stats))) + goto out; + useraddr += sizeof(stats); + if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +/* The main entry point in this file. Called from net/core/dev.c */ + +int dev_ethtool(struct ifreq *ifr) +{ + struct net_device *dev = __dev_get_by_name(ifr->ifr_name); + void *useraddr = (void *) ifr->ifr_data; + u32 ethcmd; + + /* + * XXX: This can be pushed down into the ethtool_* handlers that + * need it. Keep existing behaviour for the moment. + */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!dev || !netif_device_present(dev)) + return -ENODEV; + + if (!dev->ethtool_ops) + goto ioctl; + + if (copy_from_user(&ethcmd, useraddr, sizeof (ethcmd))) + return -EFAULT; + + switch (ethcmd) { + case ETHTOOL_GSET: + return ethtool_get_settings(dev, useraddr); + case ETHTOOL_SSET: + return ethtool_set_settings(dev, useraddr); + case ETHTOOL_GDRVINFO: + return ethtool_get_drvinfo(dev, useraddr); + case ETHTOOL_GREGS: + return ethtool_get_regs(dev, useraddr); + case ETHTOOL_GWOL: + return ethtool_get_wol(dev, useraddr); + case ETHTOOL_SWOL: + return ethtool_set_wol(dev, useraddr); + case ETHTOOL_GMSGLVL: + return ethtool_get_msglevel(dev, useraddr); + case ETHTOOL_SMSGLVL: + return ethtool_set_msglevel(dev, useraddr); + case ETHTOOL_NWAY_RST: + return ethtool_nway_reset(dev); + case ETHTOOL_GLINK: + return ethtool_get_link(dev, useraddr); + case ETHTOOL_GEEPROM: + return ethtool_get_eeprom(dev, useraddr); + case ETHTOOL_SEEPROM: + return ethtool_set_eeprom(dev, useraddr); + case ETHTOOL_GCOALESCE: + return ethtool_get_coalesce(dev, useraddr); + case ETHTOOL_SCOALESCE: + return ethtool_set_coalesce(dev, useraddr); + case ETHTOOL_GRINGPARAM: + return ethtool_get_ringparam(dev, useraddr); + case ETHTOOL_SRINGPARAM: + return ethtool_set_ringparam(dev, useraddr); + case ETHTOOL_GPAUSEPARAM: + return ethtool_get_pauseparam(dev, useraddr); + case ETHTOOL_SPAUSEPARAM: + return ethtool_set_pauseparam(dev, useraddr); + case ETHTOOL_GRXCSUM: + return ethtool_get_rx_csum(dev, useraddr); + case ETHTOOL_SRXCSUM: + return ethtool_set_rx_csum(dev, useraddr); + case ETHTOOL_GTXCSUM: + return ethtool_get_tx_csum(dev, useraddr); + case ETHTOOL_STXCSUM: + return ethtool_set_tx_csum(dev, useraddr); + case ETHTOOL_GSG: + return ethtool_get_sg(dev, useraddr); + case ETHTOOL_SSG: + return ethtool_set_sg(dev, useraddr); + case ETHTOOL_TEST: + return ethtool_self_test(dev, useraddr); + case ETHTOOL_GSTRINGS: + return ethtool_get_strings(dev, useraddr); + case ETHTOOL_PHYS_ID: + return ethtool_phys_id(dev, useraddr); + case ETHTOOL_GSTATS: + return ethtool_get_stats(dev, useraddr); + default: + return -EOPNOTSUPP; + } + + ioctl: + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, SIOCETHTOOL); + return -EOPNOTSUPP; +} diff -urN linux-2.4.22-bk1/net/core/neighbour.c linux-2.4.22-bk2/net/core/neighbour.c --- linux-2.4.22-bk1/net/core/neighbour.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/core/neighbour.c 2003-08-26 15:54:22.000000000 -0700 @@ -50,6 +50,7 @@ static void neigh_app_notify(struct neighbour *n); #endif static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); static int neigh_glbl_allocs; static struct neigh_table *neigh_tables; @@ -169,6 +170,33 @@ } } +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) +{ + int i; + + write_lock_bh(&tbl->lock); + + for (i=0;
i <= NEIGH_HASHMASK; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + *np = n->next; + write_lock_bh(&n->lock); + n->dead = 1; + neigh_del_timer(n); + write_unlock_bh(&n->lock); + neigh_release(n); + } + } + + write_unlock_bh(&tbl->lock); +} + int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) { int i; diff -urN linux-2.4.22-bk1/net/core/pktgen.c linux-2.4.22-bk2/net/core/pktgen.c --- linux-2.4.22-bk1/net/core/pktgen.c 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.22-bk2/net/core/pktgen.c 2003-08-26 15:54:22.000000000 -0700 @@ -46,6 +46,9 @@ * Also moved to /proc/net/pktgen/ * --ro * + * Fix refcount off by one if first packet fails, potential null deref, + * memleak 030710- KJP + * * See Documentation/networking/pktgen.txt for how to use this. */ @@ -84,9 +87,9 @@ #define cycles() ((u32)get_cycles()) -#define VERSION "pktgen version 1.2" +#define VERSION "pktgen version 1.3" static char version[] __initdata = - "pktgen.c: v1.2: Packet Generator for packet performance testing.\n"; + "pktgen.c: v1.3: Packet Generator for packet performance testing.\n"; /* Used to help with determining the pkts on receive */ @@ -613,12 +616,11 @@ kfree_skb(skb); skb = fill_packet(odev, info); if (skb == NULL) { - break; + goto out_reldev; } fp++; fp_tmp = 0; /* reset counter */ } - atomic_inc(&skb->users); } nr_frags = skb_shinfo(skb)->nr_frags; @@ -626,7 +628,11 @@ spin_lock_bh(&odev->xmit_lock); if (!netif_queue_stopped(odev)) { + atomic_inc(&skb->users); + if (odev->hard_start_xmit(skb, odev)) { + + atomic_dec(&skb->users); if (net_ratelimit()) { printk(KERN_INFO "Hard xmit error\n"); } @@ -731,15 +737,15 @@ (unsigned long long) info->errors ); } - + + kfree_skb(skb); + out_reldev: if (odev) { dev_put(odev); odev = NULL; } - /* TODO: Is this worth printing out (other than for debug?) 
*/ - printk("fp = %llu\n", (unsigned long long) fp); return; } @@ -955,7 +961,8 @@ if (len < 0) return len; memset(name, 0, sizeof(name)); - copy_from_user(name, &user_buffer[i], len); + if (copy_from_user(name, &user_buffer[i], len)) + return -EFAULT; i += len; max = count -i; @@ -1085,18 +1092,20 @@ if (len < 0) return len; memset(info->outdev, 0, sizeof(info->outdev)); - copy_from_user(info->outdev, &user_buffer[i], len); + if (copy_from_user(info->outdev, &user_buffer[i], len)) + return -EFAULT; i += len; sprintf(result, "OK: odev=%s", info->outdev); return count; } if (!strcmp(name, "flag")) { char f[32]; - memset(f, 0, 32); len = strn_len(&user_buffer[i], sizeof(f) - 1); if (len < 0) return len; - copy_from_user(f, &user_buffer[i], len); + memset(f, 0, 32); + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; i += len; if (strcmp(f, "IPSRC_RND") == 0) { info->flags |= F_IPSRC_RND; @@ -1148,7 +1157,8 @@ if (len < 0) return len; memset(info->dst_min, 0, sizeof(info->dst_min)); - copy_from_user(info->dst_min, &user_buffer[i], len); + if (copy_from_user(info->dst_min, &user_buffer[i], len)) + return -EFAULT; if(debug) printk("pg: dst_min set to: %s\n", info->dst_min); i += len; @@ -1160,7 +1170,8 @@ if (len < 0) return len; memset(info->dst_max, 0, sizeof(info->dst_max)); - copy_from_user(info->dst_max, &user_buffer[i], len); + if (copy_from_user(info->dst_max, &user_buffer[i], len)) + return -EFAULT; if(debug) printk("pg: dst_max set to: %s\n", info->dst_max); i += len; @@ -1172,7 +1183,8 @@ if (len < 0) return len; memset(info->src_min, 0, sizeof(info->src_min)); - copy_from_user(info->src_min, &user_buffer[i], len); + if (copy_from_user(info->src_min, &user_buffer[i], len)) + return -EFAULT; if(debug) printk("pg: src_min set to: %s\n", info->src_min); i += len; @@ -1184,7 +1196,8 @@ if (len < 0) return len; memset(info->src_max, 0, sizeof(info->src_max)); - copy_from_user(info->src_max, &user_buffer[i], len); + if (copy_from_user(info->src_max, &user_buffer[i], len)) + return -EFAULT; if(debug) printk("pg: src_max set to: %s\n", info->src_max); i += len; @@ -1199,7 +1212,8 @@ if (len < 0) return len; memset(valstr, 0, sizeof(valstr)); - copy_from_user(valstr, &user_buffer[i], len); + if (copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; i += len; for(*m = 0;*v && m < info->dst_mac + 6; v++) { @@ -1231,7 +1245,8 @@ if (len < 0) return len; memset(valstr, 0, sizeof(valstr)); - copy_from_user(valstr, &user_buffer[i], len); + if (copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; i += len; for(*m = 0;*v && m < info->src_mac + 6; v++) { diff -urN linux-2.4.22-bk1/net/core/sysctl_net_core.c linux-2.4.22-bk2/net/core/sysctl_net_core.c --- linux-2.4.22-bk1/net/core/sysctl_net_core.c 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.22-bk2/net/core/sysctl_net_core.c 2003-08-26 15:54:22.000000000 -0700 @@ -55,7 +55,7 @@ &netdev_max_backlog, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_CORE_NO_CONG_THRESH, "no_cong_thresh", - &no_cong, sizeof(int), 0644, NULL, + &no_cong_thresh, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_CORE_NO_CONG, "no_cong", &no_cong, sizeof(int), 0644, NULL, diff -urN linux-2.4.22-bk1/net/ipv4/Config.in linux-2.4.22-bk2/net/ipv4/Config.in --- linux-2.4.22-bk1/net/ipv4/Config.in 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv4/Config.in 2003-08-26 15:54:22.000000000 -0700 @@ -43,3 +43,6 @@ if [ "$CONFIG_NETFILTER" != "n" ]; then source net/ipv4/netfilter/Config.in fi +if [ "$CONFIG_NETFILTER" != "n" ]; then + 
source net/ipv4/ipvs/Config.in +fi diff -urN linux-2.4.22-bk1/net/ipv4/arp.c linux-2.4.22-bk2/net/ipv4/arp.c --- linux-2.4.22-bk1/net/ipv4/arp.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv4/arp.c 2003-08-26 15:54:22.000000000 -0700 @@ -1212,6 +1212,26 @@ } #endif +static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + switch (event) { + case NETDEV_CHANGEADDR: + neigh_changeaddr(&arp_tbl, dev); + rt_cache_flush(0); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +struct notifier_block arp_netdev_notifier = { + .notifier_call = arp_netdev_event, +}; + /* Note, that it is not on notifier chain. It is necessary, that this routine was called after route cache will be flushed. @@ -1243,6 +1263,7 @@ #ifdef CONFIG_SYSCTL neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, NET_IPV4_NEIGH, "ipv4"); #endif + register_netdevice_notifier(&arp_netdev_notifier); } diff -urN linux-2.4.22-bk1/net/ipv4/devinet.c linux-2.4.22-bk2/net/ipv4/devinet.c --- linux-2.4.22-bk1/net/ipv4/devinet.c 2003-06-13 07:51:39.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv4/devinet.c 2003-08-26 15:54:22.000000000 -0700 @@ -883,6 +883,7 @@ unsigned char *b = skb->tail; nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; ifm = NLMSG_DATA(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; diff -urN linux-2.4.22-bk1/net/ipv4/ipconfig.c linux-2.4.22-bk2/net/ipv4/ipconfig.c --- linux-2.4.22-bk1/net/ipv4/ipconfig.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv4/ipconfig.c 2003-08-26 15:54:22.000000000 -0700 @@ -124,14 +124,14 @@ int ic_host_name_set __initdata = 0; /* Host name set by us? */ -u32 ic_myaddr __initdata = INADDR_NONE; /* My IP address */ -u32 ic_netmask __initdata = INADDR_NONE; /* Netmask for local subnet */ -u32 ic_gateway __initdata = INADDR_NONE; /* Gateway IP address */ +u32 ic_myaddr = INADDR_NONE; /* My IP address */ +u32 ic_netmask = INADDR_NONE; /* Netmask for local subnet */ +u32 ic_gateway = INADDR_NONE; /* Gateway IP address */ -u32 ic_servaddr __initdata = INADDR_NONE; /* Boot server IP address */ +u32 ic_servaddr = INADDR_NONE; /* Boot server IP address */ -u32 root_server_addr __initdata = INADDR_NONE; /* Address of NFS server */ -u8 root_server_path[256] __initdata = { 0, }; /* Path to mount as root */ +u32 root_server_addr = INADDR_NONE; /* Address of NFS server */ +u8 root_server_path[256] = { 0, }; /* Path to mount as root */ /* Persistent data: */ diff -urN linux-2.4.22-bk1/net/ipv4/ipmr.c linux-2.4.22-bk2/net/ipv4/ipmr.c --- linux-2.4.22-bk1/net/ipv4/ipmr.c 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipmr.c 2003-08-26 15:54:22.000000000 -0700 @@ -1096,6 +1096,7 @@ skb->h.ipiph = skb->nh.iph; skb->nh.iph = iph; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); #ifdef CONFIG_NETFILTER nf_conntrack_put(skb->nfct); skb->nfct = NULL; @@ -1104,8 +1105,12 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) { + struct ip_options *opt = &(IPCB(skb)->opt); struct dst_entry *dst = skb->dst; + if (unlikely(opt->optlen)) + ip_forward_options(skb); + if (skb->len <= dst->pmtu) return dst->output(skb); else diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/Config.in linux-2.4.22-bk2/net/ipv4/ipvs/Config.in --- linux-2.4.22-bk1/net/ipv4/ipvs/Config.in 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/Config.in 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,26 @@ +# +# IP VS 
configuration +# +mainmenu_option next_comment +comment ' IP: Virtual Server Configuration' + +tristate 'virtual server support (EXPERIMENTAL)' CONFIG_IP_VS +if [ "$CONFIG_IP_VS" != "n" ]; then + bool ' IP virtual server debugging' CONFIG_IP_VS_DEBUG + int ' IPVS connection table size (the Nth power of 2)' CONFIG_IP_VS_TAB_BITS 12 + comment 'IPVS scheduler' + dep_tristate ' round-robin scheduling' CONFIG_IP_VS_RR $CONFIG_IP_VS + dep_tristate ' weighted round-robin scheduling' CONFIG_IP_VS_WRR $CONFIG_IP_VS + dep_tristate ' least-connection scheduling' CONFIG_IP_VS_LC $CONFIG_IP_VS + dep_tristate ' weighted least-connection scheduling' CONFIG_IP_VS_WLC $CONFIG_IP_VS + dep_tristate ' locality-based least-connection scheduling' CONFIG_IP_VS_LBLC $CONFIG_IP_VS + dep_tristate ' locality-based least-connection with replication scheduling' CONFIG_IP_VS_LBLCR $CONFIG_IP_VS + dep_tristate ' destination hashing scheduling' CONFIG_IP_VS_DH $CONFIG_IP_VS + dep_tristate ' source hashing scheduling' CONFIG_IP_VS_SH $CONFIG_IP_VS + dep_tristate ' shortest expected delay scheduling' CONFIG_IP_VS_SED $CONFIG_IP_VS + dep_tristate ' never queue scheduling' CONFIG_IP_VS_NQ $CONFIG_IP_VS + comment 'IPVS application helper' + dep_tristate ' FTP protocol helper' CONFIG_IP_VS_FTP $CONFIG_IP_VS +fi + +endmenu diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/Makefile linux-2.4.22-bk2/net/ipv4/ipvs/Makefile --- linux-2.4.22-bk1/net/ipv4/ipvs/Makefile 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/Makefile 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,43 @@ +# +# Makefile for the IPVS modules on top of IPv4. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definition is now in the main makefile... + +O_TARGET := ipvs.o + +export-objs := ip_vs_core.o ip_vs_app.o + +ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ + ip_vs_app.o ip_vs_sync.o ip_vs_est.o + +ifeq ($(CONFIG_IP_VS),y) + obj-y := $(ip_vs-objs) +else + ifeq ($(CONFIG_IP_VS),m) + obj-m := ip_vs.o + endif +endif + +# IPVS schedulers +obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o +obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o +obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o +obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o +obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o +obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o +obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o +obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o + +# IPVS application helpers +obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o + +include $(TOPDIR)/Rules.make + +ip_vs.o: $(ip_vs-objs) + $(LD) $(LD_RFLAG) -r -o $@ $(ip_vs-objs) diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_app.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_app.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_app.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_app.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,508 @@ +/* + * IPVS Application module + * + * Version: $Id: ip_vs_app.c,v 1.14 2001/11/23 14:34:10 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_app.c in kernel 2.2.
The difference + * is that ip_vs_app module handles the reverse direction (incoming requests + * and outgoing responses). The ip_vs_app modules are only used for VS/NAT. + * + * IP_MASQ_APP application masquerading module + * + * Author: Juan Jose Ciarlante, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define IP_VS_APP_TAB_SIZE 16 /* must be power of 2 */ + +#define IP_VS_APP_HASH(proto, port) ((port^proto) & (IP_VS_APP_TAB_SIZE-1)) +#define IP_VS_APP_TYPE(proto, port) (proto<<16 | port) +#define IP_VS_APP_PORT(type) (type & 0xffff) +#define IP_VS_APP_PROTO(type) ((type>>16) & 0x00ff) + + +EXPORT_SYMBOL(register_ip_vs_app); +EXPORT_SYMBOL(unregister_ip_vs_app); + + +/* + * will hold ipvs app. hashed list heads + */ +static struct list_head ip_vs_app_base[IP_VS_APP_TAB_SIZE]; + +/* lock for ip_vs_app table */ +static rwlock_t __ip_vs_app_lock = RW_LOCK_UNLOCKED; + + +/* + * ip_vs_app registration routine + * port: host byte order. + */ +int register_ip_vs_app(struct ip_vs_app *vapp, + unsigned short proto, __u16 port) +{ + unsigned hash; + + if (!vapp) { + IP_VS_ERR("register_ip_vs_app(): NULL arg\n"); + return -EINVAL; + } + + MOD_INC_USE_COUNT; + + vapp->type = IP_VS_APP_TYPE(proto, port); + hash = IP_VS_APP_HASH(proto, port); + + write_lock_bh(&__ip_vs_app_lock); + list_add(&vapp->n_list, &ip_vs_app_base[hash]); + write_unlock_bh(&__ip_vs_app_lock); + + return 0; +} + + +/* + * ip_vs_app unregistration routine. + */ +int unregister_ip_vs_app(struct ip_vs_app *vapp) +{ + if (!vapp) { + IP_VS_ERR("unregister_ip_vs_app(): NULL arg\n"); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_app_lock); + list_del(&vapp->n_list); + write_unlock_bh(&__ip_vs_app_lock); + + MOD_DEC_USE_COUNT; + + return 0; +} + + +/* + * get ip_vs_app object by its proto and port (net byte order). + */ +static struct ip_vs_app * ip_vs_app_get(unsigned short proto, __u16 port) +{ + struct list_head *e; + struct ip_vs_app *vapp; + unsigned hash; + unsigned type; + + port = ntohs(port); + type = IP_VS_APP_TYPE(proto, port); + hash = IP_VS_APP_HASH(proto, port); + + read_lock_bh(&__ip_vs_app_lock); + + list_for_each(e, &ip_vs_app_base[hash]) { + vapp = list_entry(e, struct ip_vs_app, n_list); + + /* + * Test and MOD_INC_USE_COUNT atomically + */ + if (vapp->module && !try_inc_mod_count(vapp->module)) { + /* + * This application module is just deleted + */ + continue; + } + if (type == vapp->type) { + read_unlock_bh(&__ip_vs_app_lock); + return vapp; + } + + if (vapp->module) + __MOD_DEC_USE_COUNT(vapp->module); + } + + read_unlock_bh(&__ip_vs_app_lock); + return NULL; +} + + +/* + * Bind ip_vs_conn to its ip_vs_app based on proto and dport, + * and call the ip_vs_app constructor. 
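Taken together with the MOD_INC_USE_COUNT/try_inc_mod_count() pairing above, registration from an application helper's side looks roughly as follows. A hedged sketch with an invented helper name (ip_vs_ftp, added elsewhere in this patch, is the real user); a real helper would also fill in the pkt_in/pkt_out and init_conn/done_conn hooks:

#include <linux/module.h>
#include <linux/in.h>
#include <net/ip_vs.h>

/* Hedged sketch of an IPVS application helper registering itself. */
static struct ip_vs_app my_app = {
        .name   = "my_app",     /* illustrative */
        .module = THIS_MODULE,  /* lets ip_vs_app_get() pin us via try_inc_mod_count() */
};

static int __init my_app_init(void)
{
        /* port is host byte order, per the comment on register_ip_vs_app() */
        return register_ip_vs_app(&my_app, IPPROTO_TCP, 21);
}

static void __exit my_app_exit(void)
{
        unregister_ip_vs_app(&my_app);
}

module_init(my_app_init);
module_exit(my_app_exit);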
+ */ +struct ip_vs_app * ip_vs_bind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *vapp; + + /* no need to bind app if its forwarding method is not NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return NULL; + + if (cp->protocol != IPPROTO_TCP && cp->protocol != IPPROTO_UDP) + return NULL; + + /* + * don't allow binding if already bound + */ + if (cp->app != NULL) { + IP_VS_ERR("ip_vs_bind_app(): " + "called for already bound object.\n"); + return cp->app; + } + + vapp = ip_vs_app_get(cp->protocol, cp->vport); + + if (vapp != NULL) { + cp->app = vapp; + + if (vapp->init_conn) + vapp->init_conn(vapp, cp); + } + return vapp; +} + + +/* + * Unbind cp from type object and call cp destructor (does not kfree()). + */ +int ip_vs_unbind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *vapp = cp->app; + + if (cp->protocol != IPPROTO_TCP && cp->protocol != IPPROTO_UDP) + return 0; + + if (vapp != NULL) { + if (vapp->done_conn) + vapp->done_conn(vapp, cp); + cp->app = NULL; + if (vapp->module) + __MOD_DEC_USE_COUNT(vapp->module); + } + return (vapp != NULL); +} + + +/* + * Fixes th->seq based on ip_vs_seq info. + */ +static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + if (vseq->delta || vseq->previous_delta) { + if(after(seq, vseq->init_seq)) { + th->seq = htonl(seq + vseq->delta); + IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", + vseq->delta); + } else { + th->seq = htonl(seq + vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " + "(%d) to seq\n", vseq->previous_delta); + } + } +} + + +/* + * Fixes th->ack_seq based on ip_vs_seq info. + */ +static inline void +vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 ack_seq = ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + if (vseq->delta || vseq->previous_delta) { + /* since ack_seq is the number of octet that is expected + to receive next, so compare it with init_seq+delta */ + if(after(ack_seq, vseq->init_seq+vseq->delta)) { + th->ack_seq = htonl(ack_seq - vseq->delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " + "(%d) from ack_seq\n", vseq->delta); + + } else { + th->ack_seq = htonl(ack_seq - vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " + "previous_delta (%d) from ack_seq\n", + vseq->previous_delta); + } + } +} + + +/* + * Updates ip_vs_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. + */ +static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, + unsigned flag, __u32 seq, int diff) +{ + /* spinlock is to keep updating cp->flags atomic */ + spin_lock(&cp->lock); + if ( !(cp->flags & flag) || after(seq, vseq->init_seq)) { + vseq->previous_delta = vseq->delta; + vseq->delta += diff; + vseq->init_seq = seq; + cp->flags |= flag; + } + spin_unlock(&cp->lock); +} + + +/* + * Output pkt hook. Will call bound ip_vs_app specific function + * called by ip_vs_out(), assumes previously checked cp!=NULL + * returns (new - old) skb->len diff. 
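Before the hook wrappers below, it helps to pin the delta bookkeeping down with concrete numbers. A self-contained sketch (user-space, invented sequence numbers) of the same adjustment vs_fix_seq() performs: one resize of +3 bytes at init_seq shifts every later segment by delta, while earlier segments use previous_delta:

#include <stdio.h>

struct seq_fix { unsigned int init_seq, delta, previous_delta; };

/* Mirrors the vs_fix_seq() decision above with plain integers. */
static unsigned int fix_seq(const struct seq_fix *v, unsigned int seq)
{
        if (seq > v->init_seq)          /* stand-in for after() */
                return seq + v->delta;
        return seq + v->previous_delta;
}

int main(void)
{
        struct seq_fix v = { 1000, 3, 0 };  /* one resize: +3 bytes at seq 1000 */

        printf("%u\n", fix_seq(&v, 900));   /* 900: before the resize, unchanged */
        printf("%u\n", fix_seq(&v, 1500));  /* 1503: shifted by delta */
        return 0;
}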
+ */ +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *vapp; + int diff; + struct iphdr *iph; + struct tcphdr *th; + __u32 seq; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((vapp = cp->app) == NULL) + return 0; + + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->protocol == IPPROTO_TCP) { + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_seq(&cp->out_seq, th); + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_ack_seq(&cp->in_seq, th); + } + + /* + * Call private output hook function + */ + if (vapp->pkt_out == NULL) + return 0; + + diff = vapp->pkt_out(vapp, cp, skb); + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0 && cp->protocol == IPPROTO_TCP) + vs_seq_update(cp, &cp->out_seq, + IP_VS_CONN_F_OUT_SEQ, seq, diff); + + return diff; +} + + +/* + * Input pkt hook. Will call bound ip_vs_app specific function + * called by ip_fw_demasquerade(), assumes previously checked cp!=NULL. + * returns (new - old) skb->len diff. + */ +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *vapp; + int diff; + struct iphdr *iph; + struct tcphdr *th; + __u32 seq; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((vapp = cp->app) == NULL) + return 0; + + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->protocol == IPPROTO_TCP) { + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_seq(&cp->in_seq, th); + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_ack_seq(&cp->out_seq, th); + } + + /* + * Call private input hook function + */ + if (vapp->pkt_in == NULL) + return 0; + + diff = vapp->pkt_in(vapp, cp, skb); + + /* + * Update ip_vs seq stuff if len has changed. 
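The contract for these hooks is simply that the helper returns how much it grew or shrank the payload, so the wrappers can feed vs_seq_update(). A hedged sketch of a pkt_out implementation; my_find_token() is a hypothetical payload scanner, and ip_vs_skb_replace() is the helper defined later in this file:

/* Hedged sketch of an application helper's pkt_out hook. Assumes
 * <net/ip_vs.h> and <linux/skbuff.h>. */
static char *my_find_token(struct sk_buff *skb);       /* hypothetical */

static int my_app_pkt_out(struct ip_vs_app *vapp,
                          struct ip_vs_conn *cp, struct sk_buff *skb)
{
        char *old = my_find_token(skb);
        int old_len = 4, new_len = 7;   /* e.g. a 4-byte token -> "newtext" */

        if (!old)
                return 0;               /* payload untouched */
        /* ip_vs_skb_replace() resizes the skb and fixes iph->tot_len */
        if (ip_vs_skb_replace(skb, GFP_ATOMIC, old, old_len,
                              "newtext", new_len))
                return 0;               /* allocation failed; report no change */
        return new_len - old_len;       /* the (new - old) length diff */
}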
+ */ + if (diff != 0 && cp->protocol == IPPROTO_TCP) + vs_seq_update(cp, &cp->in_seq, + IP_VS_CONN_F_IN_SEQ, seq, diff); + + return diff; +} + + +/* + * /proc/net/ip_vs_app entry function + */ +static int ip_vs_app_getinfo(char *buffer, char **start, off_t offset, + int length) +{ + off_t pos=0; + int len=0; + char temp[64]; + int idx; + struct ip_vs_app *vapp; + struct list_head *e; + + pos = 64; + if (pos > offset) { + len += sprintf(buffer+len, "%-63s\n", + "prot port usecnt name"); + } + + read_lock_bh(&__ip_vs_app_lock); + for (idx=0 ; idx < IP_VS_APP_TAB_SIZE; idx++) { + list_for_each (e, &ip_vs_app_base[idx]) { + vapp = list_entry(e, struct ip_vs_app, n_list); + + pos += 64; + if (pos <= offset) + continue; + sprintf(temp, "%-3s %-7u %-6d %-17s", + ip_vs_proto_name(IP_VS_APP_PROTO(vapp->type)), + IP_VS_APP_PORT(vapp->type), + vapp->module?GET_USE_COUNT(vapp->module):0, + vapp->name); + len += sprintf(buffer+len, "%-63s\n", temp); + if (pos >= offset+length) + goto done; + } + } + done: + read_unlock_bh(&__ip_vs_app_lock); + + *start = buffer+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} + + +/* + * Replace a segment of data with a new segment + */ +int ip_vs_skb_replace(struct sk_buff *skb, int pri, + char *o_buf, int o_len, char *n_buf, int n_len) +{ + struct iphdr *iph; + int diff; + int o_offset; + int o_left; + + EnterFunction(9); + + diff = n_len - o_len; + o_offset = o_buf - (char *)skb->data; + /* The length of left data after o_buf+o_len in the skb data */ + o_left = skb->len - (o_offset + o_len); + + if (diff <= 0) { + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + skb_trim(skb, skb->len + diff); + } else if (diff <= skb_tailroom(skb)) { + skb_put(skb, diff); + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + } else { + if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) + return -ENOMEM; + skb_put(skb, diff); + memmove(skb->data + o_offset + n_len, + skb->data + o_offset + o_len, o_left); + memcpy(skb->data + o_offset, n_buf, n_len); + } + + /* must update the iph total length here */ + iph = skb->nh.iph; + iph->tot_len = htons(skb->len); + + LeaveFunction(9); + return 0; +} + + +int ip_vs_app_init(void) +{ + int idx; + + for (idx=0 ; idx < IP_VS_APP_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_app_base[idx]); + } + + /* we will replace it with proc_net_ipvs_create() soon */ + proc_net_create("ip_vs_app", 0, ip_vs_app_getinfo); + return 0; +} + +void ip_vs_app_cleanup(void) +{ + proc_net_remove("ip_vs_app"); +} diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_conn.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_conn.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_conn.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_conn.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,1563 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. 
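The connection table this file goes on to build is reference counted: every successful lookup takes a reference, and ip_vs_conn_put() re-arms the expiry timer when it is dropped. A hedged sketch of that discipline, using ip_vs_conn_in_get()/ip_vs_conn_put() from further down in this file with made-up addresses:

/* Hedged sketch: lookup/put pairing around the IPVS connection table.
 * Addresses and ports are illustrative. */
static void example_lookup(void)
{
        struct ip_vs_conn *cp;

        cp = ip_vs_conn_in_get(IPPROTO_TCP,
                               htonl(0xc0a80001), htons(12345), /* client 192.168.0.1 */
                               htonl(0x0a000001), htons(80));   /* VIP 10.0.0.1:80 */
        if (!cp)
                return;         /* no entry; the caller would schedule one */
        /* ... forward the packet, e.g. via cp->packet_xmit(skb, cp) ... */
        ip_vs_conn_put(cp);     /* re-arms timer with cp->timeout, drops ref */
}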
+ * + * Version: $Id: ip_vs_conn.c,v 1.28.2.5 2003/08/09 13:27:08 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. Many code here is taken from IP MASQ code of kernel 2.2. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for tcphdr */ +#include +#include /* for proc_net_* */ +#include /* for local_bh_* */ +#include +#include /* for csum_tcpudp_magic */ +#include +#include /* for icmp_send */ +#include /* for ip_route_output */ +#include +#include +#include +#include + +#include + + +/* + * Connection hash table: for input and output packets lookups of IPVS + */ +static struct list_head *ip_vs_conn_tab; + +/* SLAB cache for IPVS connections */ +static kmem_cache_t *ip_vs_conn_cachep; + +/* counter for current IPVS connections */ +static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); + +/* counter for no-client-port connections */ +static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); + +/* random value for IPVS connection hash */ +static unsigned int ip_vs_conn_rnd; + +/* + * Fine locking granularity for big connection hash table + */ +#define CT_LOCKARRAY_BITS 4 +#define CT_LOCKARRAY_SIZE (1<flags & IP_VS_CONN_F_HASHED) { + IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* Hash by protocol, client address and port */ + hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); + + ct_write_lock(hash); + + list_add(&cp->c_list, &ip_vs_conn_tab[hash]); + cp->flags |= IP_VS_CONN_F_HASHED; + atomic_inc(&cp->refcnt); + + ct_write_unlock(hash); + + return 1; +} + + +/* + * UNhashes ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static int ip_vs_conn_unhash(struct ip_vs_conn *cp) +{ + unsigned hash; + + if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + IP_VS_ERR("ip_vs_conn_unhash(): request for unhash flagged, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* unhash it and decrease its reference counter */ + hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); + ct_write_lock(hash); + + list_del(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + atomic_dec(&cp->refcnt); + + ct_write_unlock(hash); + + return 1; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from OUTside-to-INside. 
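The hash-key helper defined between the CT_LOCKARRAY constants and ip_vs_conn_hash() did not survive formatting in this hunk. Given the ip_vs_conn_rnd seed above and the (protocol, client address, client port) key used by hash/unhash, a plausible shape is the following; this is a hedged reconstruction, not the original text, and IP_VS_CONN_TAB_SIZE is assumed to be the 2^IP_VS_TAB_BITS table size:

/* Hedged reconstruction: mix (addr, port, proto) with the random seed
 * and mask down to the connection table size. */
static inline unsigned
ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
{
        return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
                & (IP_VS_CONN_TAB_SIZE - 1);
}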
+ * s_addr, s_port: pkt source address (foreign host) + * d_addr, d_port: pkt dest address (load balancer) + */ +static inline struct ip_vs_conn *__ip_vs_conn_in_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp; + struct list_head *l,*e; + + hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); + l = &ip_vs_conn_tab[hash]; + + ct_read_lock(hash); + + for (e=l->next; e!=l; e=e->next) { + cp = list_entry(e, struct ip_vs_conn, c_list); + if (s_addr==cp->caddr && s_port==cp->cport && + d_port==cp->vport && d_addr==cp->vaddr && + protocol==cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ct_read_unlock(hash); + return cp; + } + } + + ct_read_unlock(hash); + + return NULL; +} + +struct ip_vs_conn *ip_vs_conn_in_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + struct ip_vs_conn *cp; + + cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) + cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); + + IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + ip_vs_proto_name(protocol), + NIPQUAD(s_addr), ntohs(s_port), + NIPQUAD(d_addr), ntohs(d_port), + cp?"hit":"not hit"); + + return cp; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * s_addr, s_port: pkt source address (inside host) + * d_addr, d_port: pkt dest address (foreign host) + */ +struct ip_vs_conn *ip_vs_conn_out_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp, *ret=NULL; + struct list_head *l,*e; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); + l = &ip_vs_conn_tab[hash]; + + ct_read_lock(hash); + + for (e=l->next; e!=l; e=e->next) { + cp = list_entry(e, struct ip_vs_conn, c_list); + if (d_addr == cp->caddr && d_port == cp->cport && + s_port == cp->dport && s_addr == cp->daddr && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ret = cp; + break; + } + } + + ct_read_unlock(hash); + + IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + ip_vs_proto_name(protocol), + NIPQUAD(s_addr), ntohs(s_port), + NIPQUAD(d_addr), ntohs(d_port), + ret?"hit":"not hit"); + + return ret; +} + + +/* + * Put back the conn and restart its timer with its timeout + */ +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + /* reset it expire in its timeout */ + mod_timer(&cp->timer, jiffies+cp->timeout); + + __ip_vs_conn_put(cp); +} + + +/* + * Timeout table[state] + */ +struct ip_vs_timeout_table vs_timeout_table = { + ATOMIC_INIT(0), /* refcnt */ + 0, /* scale */ + { + [IP_VS_S_NONE] = 30*60*HZ, + [IP_VS_S_ESTABLISHED] = 15*60*HZ, + [IP_VS_S_SYN_SENT] = 2*60*HZ, + [IP_VS_S_SYN_RECV] = 1*60*HZ, + [IP_VS_S_FIN_WAIT] = 2*60*HZ, + [IP_VS_S_TIME_WAIT] = 2*60*HZ, + [IP_VS_S_CLOSE] = 10*HZ, + [IP_VS_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_S_LAST_ACK] = 30*HZ, + [IP_VS_S_LISTEN] = 2*60*HZ, + [IP_VS_S_SYNACK] = 120*HZ, + [IP_VS_S_UDP] = 5*60*HZ, + [IP_VS_S_ICMP] = 1*60*HZ, + [IP_VS_S_LAST] = 2*HZ, + }, /* timeout */ +}; + + +struct ip_vs_timeout_table vs_timeout_table_dos = { + ATOMIC_INIT(0), /* refcnt */ + 0, /* scale */ + { + [IP_VS_S_NONE] = 15*60*HZ, + [IP_VS_S_ESTABLISHED] = 8*60*HZ, + [IP_VS_S_SYN_SENT] = 60*HZ, + [IP_VS_S_SYN_RECV] = 10*HZ, + [IP_VS_S_FIN_WAIT] = 60*HZ, + [IP_VS_S_TIME_WAIT] = 60*HZ, + [IP_VS_S_CLOSE] = 
10*HZ, + [IP_VS_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_S_LAST_ACK] = 30*HZ, + [IP_VS_S_LISTEN] = 2*60*HZ, + [IP_VS_S_SYNACK] = 100*HZ, + [IP_VS_S_UDP] = 3*60*HZ, + [IP_VS_S_ICMP] = 1*60*HZ, + [IP_VS_S_LAST] = 2*HZ, + }, /* timeout */ +}; + + +/* + * Timeout table to use for the VS entries + * If NULL we use the default table (vs_timeout_table). + * Under flood attack we switch to vs_timeout_table_dos + */ + +static struct ip_vs_timeout_table *ip_vs_timeout_table = &vs_timeout_table; + +static const char * state_name_table[IP_VS_S_LAST+1] = { + [IP_VS_S_NONE] = "NONE", + [IP_VS_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_S_SYN_SENT] = "SYN_SENT", + [IP_VS_S_SYN_RECV] = "SYN_RECV", + [IP_VS_S_FIN_WAIT] = "FIN_WAIT", + [IP_VS_S_TIME_WAIT] = "TIME_WAIT", + [IP_VS_S_CLOSE] = "CLOSE", + [IP_VS_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_VS_S_LAST_ACK] = "LAST_ACK", + [IP_VS_S_LISTEN] = "LISTEN", + [IP_VS_S_SYNACK] = "SYNACK", + [IP_VS_S_UDP] = "UDP", + [IP_VS_S_ICMP] = "ICMP", + [IP_VS_S_LAST] = "BUG!", +}; + +#define sNO IP_VS_S_NONE +#define sES IP_VS_S_ESTABLISHED +#define sSS IP_VS_S_SYN_SENT +#define sSR IP_VS_S_SYN_RECV +#define sFW IP_VS_S_FIN_WAIT +#define sTW IP_VS_S_TIME_WAIT +#define sCL IP_VS_S_CLOSE +#define sCW IP_VS_S_CLOSE_WAIT +#define sLA IP_VS_S_LAST_ACK +#define sLI IP_VS_S_LISTEN +#define sSA IP_VS_S_SYNACK + +struct vs_tcp_states_t { + int next_state[IP_VS_S_LAST]; /* should be _LAST_TCP */ +}; + +const char * ip_vs_state_name(int state) +{ + if (state >= IP_VS_S_LAST) + return "ERR!"; + return state_name_table[state] ? state_name_table[state] : "?"; +} + +static struct vs_tcp_states_t vs_tcp_states [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct vs_tcp_states_t vs_tcp_states_dos [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, +/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSA, 
sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct vs_tcp_states_t *ip_vs_state_table = vs_tcp_states; + +void ip_vs_secure_tcp_set(int on) +{ + if (on) { + ip_vs_state_table = vs_tcp_states_dos; + ip_vs_timeout_table = &vs_timeout_table_dos; + } else { + ip_vs_state_table = vs_tcp_states; + ip_vs_timeout_table = &vs_timeout_table; + } +} + + +static inline int vs_tcp_state_idx(struct tcphdr *th, int state_off) +{ + /* + * [0-3]: input states, [4-7]: output, [8-11] input only states. + */ + if (th->rst) + return state_off+3; + if (th->syn) + return state_off+0; + if (th->fin) + return state_off+1; + if (th->ack) + return state_off+2; + return -1; +} + + +static inline int vs_set_state_timeout(struct ip_vs_conn *cp, int state) +{ + struct ip_vs_timeout_table *vstim = cp->timeout_table; + + /* + * Use default timeout table if no specific for this entry + */ + if (!vstim) + vstim = &vs_timeout_table; + + cp->timeout = vstim->timeout[cp->state=state]; + + if (vstim->scale) { + int scale = vstim->scale; + + if (scale<0) + cp->timeout >>= -scale; + else if (scale > 0) + cp->timeout <<= scale; + } + + return state; +} + + +static inline int +vs_tcp_state(struct ip_vs_conn *cp, int state_off, struct tcphdr *th) +{ + int state_idx; + int new_state = IP_VS_S_CLOSE; + + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (state_off == VS_STATE_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + state_off = VS_STATE_INPUT_ONLY; + } + + if ((state_idx = vs_tcp_state_idx(th, state_off)) < 0) { + IP_VS_DBG(8, "vs_tcp_state_idx(%d)=%d!!!\n", + state_off, state_idx); + goto tcp_state_out; + } + + new_state = ip_vs_state_table[state_idx].next_state[cp->state]; + + tcp_state_out: + if (new_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" + "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n", + ip_vs_proto_name(cp->protocol), + (state_off==VS_STATE_OUTPUT)?"output ":"input ", + th->syn? 'S' : '.', + th->fin? 'F' : '.', + th->ack? 'A' : '.', + th->rst? 'R' : '.', + NIPQUAD(cp->daddr), ntohs(cp->dport), + NIPQUAD(cp->caddr), ntohs(cp->cport), + ip_vs_state_name(cp->state), + ip_vs_state_name(new_state), + atomic_read(&cp->refcnt)); + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state != IP_VS_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state == IP_VS_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + + return vs_set_state_timeout(cp, new_state); +} + + +/* + * Handle state transitions + */ +int ip_vs_set_state(struct ip_vs_conn *cp, + int state_off, struct iphdr *iph, void *tp) +{ + int ret; + + spin_lock(&cp->lock); + switch (iph->protocol) { + case IPPROTO_TCP: + ret = vs_tcp_state(cp, state_off, tp); + break; + case IPPROTO_UDP: + ret = vs_set_state_timeout(cp, IP_VS_S_UDP); + break; + case IPPROTO_ICMP: + ret = vs_set_state_timeout(cp, IP_VS_S_ICMP); + break; + default: + ret = -1; + } + spin_unlock(&cp->lock); + + return ret; +} + + +/* + * Set LISTEN timeout. 
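Concretely: an incoming SYN against an entry in NONE indexes the input-direction SYN row at column sNO and yields sSR, after which vs_set_state_timeout() loads the SYN_RECV timeout (1 minute in vs_timeout_table). A self-contained miniature (user-space, two states, one event row) of that row-and-column indexing:

#include <stdio.h>

enum { sNO, sSR, N_STATES };    /* tiny subset of the real state set */

/* one direction block, one event row (input SYN), indexed by state */
static const int input_syn_row[N_STATES] = { [sNO] = sSR, [sSR] = sSR };

int main(void)
{
        int state = sNO;

        state = input_syn_row[state];   /* incoming SYN */
        printf("NONE + input SYN -> %s\n", state == sSR ? "SYN_RECV" : "?");
        return 0;
}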
(ip_vs_conn_put will setup timer) + */ +int ip_vs_conn_listen(struct ip_vs_conn *cp) +{ + vs_set_state_timeout(cp, IP_VS_S_LISTEN); + return cp->timeout; +} + + +/* + * Bypass transmitter + * Let packets bypass the destination when the destination is not + * available, it may be only used in transparent cache cluster. + */ +static int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = skb->nh.iph; + u8 tos = iph->tos; + int mtu; + + EnterFunction(10); + + if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(tos), 0)) { + IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " + "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = rt->u.dst.pmtu; + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); + goto tx_error; + } + + /* update checksum because skb might be defragmented */ + ip_send_check(iph); + + if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) { + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) { + ip_rt_put(rt); + IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n"); + goto tx_error; + } + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 1 << NF_IP_LOCAL_OUT; +#endif /* CONFIG_NETFILTER_DEBUG */ + skb->nfcache |= NFC_IPVS_PROPERTY; + ip_send(skb); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + return NF_STOLEN; +} + + +/* + * NULL transmitter (do nothing except return NF_ACCEPT) + */ +static int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + return NF_ACCEPT; +} + + +/* + * NAT transmitter (only for outside-to-inside nat forwarding) + */ +static int ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph; + union ip_vs_tphdr h; + int ihl; + unsigned short size; + int mtu; + + EnterFunction(10); + + /* + * If it has ip_vs_app helper, the helper may change the payload, + * so it needs full checksum checking and checksum calculation. + * If not, only the header (such as IP address and port number) + * will be changed, so it is fast to do incremental checksum update, + * and let the destination host do final checksum checking. + */ + + if (cp->app && skb_is_nonlinear(skb) + && skb_linearize(skb, GFP_ATOMIC) != 0) + return NF_DROP; + + iph = skb->nh.iph; + ihl = iph->ihl << 2; + h.raw = (char*) iph + ihl; + size = ntohs(iph->tot_len) - ihl; + + /* do TCP/UDP checksum checking if it has application helper */ + if (cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = csum_partial(h.raw, size, 0); + + case CHECKSUM_HW: + if (csum_tcpudp_magic(iph->saddr, iph->daddr, size, + iph->protocol, skb->csum)) { + IP_VS_DBG_RL("Incoming failed %s checksum " + "from %d.%d.%d.%d (size=%d)!\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(iph->saddr), + size); + goto tx_error; + } + break; + default: + /* CHECKSUM_UNNECESSARY */ + break; + } + } + + /* + * Check if it is no_cport connection ... 
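The fast path described in the ip_vs_nat_xmit() comment above relies on incremental one's-complement checksum updates: when only a 16-bit quantity m changes to m', the new checksum is ~(~HC + ~m + m') (RFC 1624), with no pass over the payload. A self-contained sketch of the folding arithmetic; it illustrates the idea behind ip_vs_fast_check_update() as called below, not the kernel helper itself, and byte order is ignored for brevity:

#include <stdio.h>
#include <stdint.h>

/* RFC 1624 incremental update: HC' = ~(~HC + ~m + m') */
static uint16_t csum_update(uint16_t check, uint16_t old_val, uint16_t new_val)
{
        uint32_t sum = (uint16_t)~check + (uint16_t)~old_val + new_val;

        sum = (sum & 0xffff) + (sum >> 16);     /* fold the carries */
        sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int main(void)
{
        /* e.g. a port field changing 80 -> 8080 under checksum 0x1234 */
        printf("0x%04x\n", csum_update(0x1234, 80, 8080));
        return 0;
}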
+ */ + if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + atomic_dec(&ip_vs_conn_no_cport_cnt); + ip_vs_conn_unhash(cp); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = h.portp[0]; + /* hash on new dport */ + ip_vs_conn_hash(cp); + + IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->dport)); + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = rt->u.dst.pmtu; + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_nat_xmit(): frag needed\n"); + goto tx_error; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* copy-on-write the packet before mangling it */ + if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw)) + return NF_DROP; + + /* mangle the packet */ + iph->daddr = cp->daddr; + h.portp[1] = cp->dport; + + /* + * Attempt ip_vs_app call. + * will fix ip_vs_conn and iph ack_seq stuff + */ + if (ip_vs_app_pkt_in(cp, skb) != 0) { + /* skb data has probably changed, update pointers */ + iph = skb->nh.iph; + h.raw = (char*) iph + ihl; + size = skb->len - ihl; + } + + /* + * Adjust TCP/UDP checksums + */ + if (!cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + ip_vs_fast_check_update(&h, cp->vaddr, cp->daddr, + cp->vport, cp->dport, iph->protocol); + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + switch (iph->protocol) { + case IPPROTO_TCP: + h.th->check = 0; + h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw, size, 0)); + break; + case IPPROTO_UDP: + h.uh->check = 0; + h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h.raw, size, 0)); + if (h.uh->check == 0) + h.uh->check = 0xFFFF; + break; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + ip_send_check(iph); + + IP_VS_DBG(10, "NAT to %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), ntohs(h.portp[1])); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 1 << NF_IP_LOCAL_OUT; +#endif /* CONFIG_NETFILTER_DEBUG */ + skb->nfcache |= NFC_IPVS_PROPERTY; + ip_send(skb); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + return NF_STOLEN; +} + + +/* + * IP Tunneling transmitter + * + * This function encapsulates the packet in a new IP packet, its + * destination will be set to cp->daddr. Most code of this function + * is taken from ipip.c. + * + * It is used in VS/TUN cluster. The load balancer selects a real + * server from a cluster based on a scheduling algorithm, + * encapsulates the request packet and forwards it to the selected + * server. For example, all real servers are configured with + * "ifconfig tunl0 up". When the server receives + * the encapsulated packet, it will decapsulate the packet, processe + * the request and return the response packets directly to the client + * without passing the load balancer. This can greatly increase the + * scalability of virtual server. 
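The cost model behind the VS/TUN transmitter is simple: the outer header added is a plain 20 byte struct iphdr, so the usable tunnel MTU is the route MTU minus 20, and DF packets larger than that are bounced with ICMP_FRAG_NEEDED, exactly as the code below does. A self-contained sketch of that arithmetic (the numbers are illustrative):

#include <stdio.h>

#define IPHDR_LEN 20    /* sizeof(struct iphdr), no options */

int main(void)
{
        int route_mtu = 1500;                   /* illustrative path MTU */
        int tunnel_mtu = route_mtu - IPHDR_LEN; /* 1480 left for the inner packet */
        int pkt_len = 1492, df = 1;

        if (df && pkt_len > tunnel_mtu)
                printf("would send ICMP_FRAG_NEEDED, mtu=%d\n", tunnel_mtu);
        return 0;
}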
+ */ +static int ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *old_iph = skb->nh.iph; + u8 tos = old_iph->tos; + u16 df = old_iph->frag_off; + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != __constant_htons(ETH_P_IP)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " + "ETH_P_IP: %d, skb protocol: %d\n", + __constant_htons(ETH_P_IP), skb->protocol); + goto tx_error; + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = rt->u.dst.pmtu - sizeof(struct iphdr); + if (mtu < 68) { + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); + goto tx_error; + } + if (skb->dst && mtu < skb->dst->pmtu) + skb->dst->pmtu = mtu; + + df |= (old_iph->frag_off&__constant_htons(IP_DF)); + + if ((old_iph->frag_off&__constant_htons(IP_DF)) + && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); + goto tx_error; + } + + /* update checksum because skb might be defragmented */ + ip_send_check(old_iph); + + skb->h.raw = skb->nh.raw; + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); + return -EINVAL; + } + kfree_skb(skb); + skb = new_skb; + old_iph = skb->nh.iph; + } + + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. 
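+ *
+ * After skb_push the buffer looks like (sketch):
+ *
+ * [ new iph: saddr=rt->rt_src, daddr=rt->rt_dst, proto=IPIP ]
+ * [ original iph and payload, unchanged ]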
+ */ + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->ttl = old_iph->ttl; + iph->tot_len = htons(skb->len); + ip_select_ident(iph, &rt->u.dst, NULL); + ip_send_check(iph); + + skb->ip_summed = CHECKSUM_NONE; +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 1 << NF_IP_LOCAL_OUT; +#endif /* CONFIG_NETFILTER_DEBUG */ + skb->nfcache |= NFC_IPVS_PROPERTY; + ip_send(skb); + + LeaveFunction(10); + + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + return NF_STOLEN; +} + + +/* + * Direct Routing transmitter + */ +static int ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = skb->nh.iph; + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = rt->u.dst.pmtu; + if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); + goto tx_error; + } + + /* update checksum because skb might be defragmented */ + ip_send_check(iph); + + if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) { + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) { + ip_rt_put(rt); + IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n"); + goto tx_error; + } + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 1 << NF_IP_LOCAL_OUT; +#endif /* CONFIG_NETFILTER_DEBUG */ + skb->nfcache |= NFC_IPVS_PROPERTY; + ip_send(skb); + +#if 0000 + NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + do_ip_send); +#endif + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + return NF_STOLEN; +} + + +/* + * Bind a connection entry with the corresponding packet_xmit. + * Called by ip_vs_conn_new. + */ +static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit; + break; + } +} + + +/* + * Bind a connection entry with a virtual service destination + * Called just after a new connection entry is created. 
+ */ +static inline void +ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) +{ + /* if dest is NULL, then return directly */ + if (!dest) + return; + + /* Increase the refcnt counter of the dest */ + atomic_inc(&dest->refcnt); + + /* Bind with the destination and its corresponding transmitter */ + cp->flags |= atomic_read(&dest->conn_flags); + cp->dest = dest; + + IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n", + ip_vs_proto_name(cp->protocol), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state), + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); +} + + +/* + * Unbind a connection entry with its VS destination + * Called by the ip_vs_conn_expire function. + */ +static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest = cp->dest; + + /* if dest is NULL, then return directly */ + if (!dest) + return; + + IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d " + "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d fwd:%c " + "s:%s flg:%X cnt:%d destcnt:%d", + ip_vs_proto_name(cp->protocol), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state), + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* + * Decrease the inactconns or activeconns counter + * if it is not a connection template ((cp->cport!=0) + * || (cp->flags & IP_VS_CONN_F_NO_CPORT)). + */ + if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) { + if (cp->flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } + + /* + * Simply decrease the refcnt of the dest, because the + * dest will be either in service's destination list + * or in the trash. + */ + atomic_dec(&dest->refcnt); +} + + +/* + * Checking if the destination of a connection template is available. + * If available, return 1, otherwise invalidate this connection + * template and return 0. + */ +int ip_vs_check_template(struct ip_vs_conn *ct) +{ + struct ip_vs_dest *dest = ct->dest; + + /* + * Checking the dest server status. + */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE)) { + IP_VS_DBG(9, "check_template: dest not available for " + "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "-> d:%u.%u.%u.%u:%d\n", + ip_vs_proto_name(ct->protocol), + NIPQUAD(ct->caddr), ntohs(ct->cport), + NIPQUAD(ct->vaddr), ntohs(ct->vport), + NIPQUAD(ct->daddr), ntohs(ct->dport)); + + /* + * Invalidate the connection template + */ + ip_vs_conn_unhash(ct); + ct->dport = 65535; + ct->vport = 65535; + ct->cport = 0; + ip_vs_conn_hash(ct); + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. 
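+ *
+ * (The rewrite above, dport=vport=65535 with cport=0, keeps
+ * the template hashed but unmatchable: a later lookup such as
+ * <TCP, snet, 0, VIP, 21> can no longer hit it, so the dead
+ * template just times out instead of steering new connections
+ * to an unavailable server.)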
+ */ + atomic_dec(&ct->refcnt); + return 0; + } + return 1; +} + + +static inline void +ip_vs_timeout_attach(struct ip_vs_conn *cp, struct ip_vs_timeout_table *vstim) +{ + atomic_inc(&vstim->refcnt); + cp->timeout_table = vstim; +} + +static inline void ip_vs_timeout_detach(struct ip_vs_conn *cp) +{ + struct ip_vs_timeout_table *vstim = cp->timeout_table; + + if (!vstim) + return; + cp->timeout_table = NULL; + atomic_dec(&vstim->refcnt); +} + + +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + + if (cp->timeout_table) + cp->timeout = cp->timeout_table->timeout[IP_VS_S_TIME_WAIT]; + else + cp->timeout = vs_timeout_table.timeout[IP_VS_S_TIME_WAIT]; + + /* + * hey, I'm using it + */ + atomic_inc(&cp->refcnt); + + /* + * do I control anybody? + */ + if (atomic_read(&cp->n_control)) + goto expire_later; + + /* + * unhash it if it is hashed in the conn table + */ + ip_vs_conn_unhash(cp); + + /* + * refcnt==1 implies I'm the only one referrer + */ + if (likely(atomic_read(&cp->refcnt) == 1)) { + /* make sure that there is no timer on it now */ + if (timer_pending(&cp->timer)) + del_timer(&cp->timer); + + /* does anybody control me? */ + if (cp->control) + ip_vs_control_del(cp); + + ip_vs_unbind_dest(cp); + ip_vs_unbind_app(cp); + ip_vs_timeout_detach(cp); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) + atomic_dec(&ip_vs_conn_no_cport_cnt); + atomic_dec(&ip_vs_conn_count); + + kmem_cache_free(ip_vs_conn_cachep, cp); + return; + } + + /* hash it back to the table */ + ip_vs_conn_hash(cp); + + expire_later: + IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", + atomic_read(&cp->refcnt)-1, + atomic_read(&cp->n_control)); + + ip_vs_conn_put(cp); +} + + +void ip_vs_conn_expire_now(struct ip_vs_conn *cp) +{ + cp->timeout = 0; + mod_timer(&cp->timer, jiffies); + __ip_vs_conn_put(cp); +} + +/* + * Create a new connection entry and hash it into the ip_vs_conn_tab. + */ +struct ip_vs_conn * +ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport, + __u32 daddr, __u16 dport, unsigned flags, + struct ip_vs_dest *dest) +{ + struct ip_vs_conn *cp; + + cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); + if (cp == NULL) { + IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); + return NULL; + } + + memset(cp, 0, sizeof(*cp)); + INIT_LIST_HEAD(&cp->c_list); + init_timer(&cp->timer); + cp->timer.data = (unsigned long)cp; + cp->timer.function = ip_vs_conn_expire; + ip_vs_timeout_attach(cp, ip_vs_timeout_table); + cp->protocol = proto; + cp->caddr = caddr; + cp->cport = cport; + cp->vaddr = vaddr; + cp->vport = vport; + cp->daddr = daddr; + cp->dport = dport; + cp->flags = flags; + cp->app_data = NULL; + cp->control = NULL; + cp->lock = SPIN_LOCK_UNLOCKED; + + atomic_set(&cp->n_control, 0); + atomic_set(&cp->in_pkts, 0); + + atomic_inc(&ip_vs_conn_count); + if (flags & IP_VS_CONN_F_NO_CPORT) + atomic_inc(&ip_vs_conn_no_cport_cnt); + + /* Bind its application helper (only for VS/NAT) if any */ + ip_vs_bind_app(cp); + + /* Bind the connection with a destination server */ + ip_vs_bind_dest(cp, dest); + + /* Set its state and timeout */ + vs_set_state_timeout(cp, IP_VS_S_NONE); + + /* Bind its packet transmitter */ + ip_vs_bind_xmit(cp); + + /* + * Set the entry is referenced by the current thread before hashing + * it in the table, so that other thread run ip_vs_random_dropentry + * but cannot drop this entry. 
+ */ + atomic_set(&cp->refcnt, 1); + + /* Hash it in the ip_vs_conn_tab finally */ + ip_vs_conn_hash(cp); + + return cp; +} + + +/* + * /proc/net/ip_vs_conn entries + */ +static int +ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length) +{ + off_t pos=0; + int idx, len=0; + char temp[70]; + struct ip_vs_conn *cp; + struct list_head *l, *e; + + pos = 128; + if (pos > offset) { + len += sprintf(buffer+len, "%-127s\n", + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires"); + } + + for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + /* + * Lock is actually only need in next loop + * we are called from uspace: must stop bh. + */ + ct_read_lock_bh(idx); + + l = &ip_vs_conn_tab[idx]; + for (e=l->next; e!=l; e=e->next) { + cp = list_entry(e, struct ip_vs_conn, c_list); + pos += 128; + if (pos <= offset) + continue; + sprintf(temp, + "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr), ntohs(cp->cport), + ntohl(cp->vaddr), ntohs(cp->vport), + ntohl(cp->daddr), ntohs(cp->dport), + ip_vs_state_name(cp->state), + (cp->timer.expires-jiffies)/HZ); + len += sprintf(buffer+len, "%-127s\n", temp); + if (pos >= offset+length) { + ct_read_unlock_bh(idx); + goto done; + } + } + ct_read_unlock_bh(idx); + } + + done: + *start = buffer+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} + + +/* + * Randomly drop connection entries before running out of memory + */ +static inline int todrop_entry(struct ip_vs_conn *cp) +{ + /* + * The drop rate array needs tuning for real environments. + * Called from timer bh only => no locking + */ + static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static char todrop_counter[9] = {0}; + int i; + + /* if the conn entry hasn't lasted for 60 seconds, don't drop it. + This will leave enough time for normal connection to get + through. */ + if (cp->timeout+jiffies-cp->timer.expires < 60*HZ) + return 0; + + /* Don't drop the entry if its number of incoming packets is not + located in [0, 8] */ + i = atomic_read(&cp->in_pkts); + if (i > 8 || i < 0) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + + +void ip_vs_random_dropentry(void) +{ + int idx; + struct ip_vs_conn *cp; + struct list_head *l,*e; + struct ip_vs_conn *ct; + + /* + * Randomly scan 1/32 of the whole table every second + */ + for (idx=0; idx<(IP_VS_CONN_TAB_SIZE>>5); idx++) { + unsigned hash = net_random()&IP_VS_CONN_TAB_MASK; + + /* + * Lock is actually needed in this loop. 
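+ *
+ * How aggressively entries are dropped depends on how many
+ * packets they have seen: with todrop_rate[] = {0,1,...,8},
+ * an entry with i incoming packets (1 <= i <= 8) is dropped
+ * on roughly every i-th scan that reaches it, while entries
+ * with 0 or more than 8 packets are never dropped here.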
+ */ + ct_write_lock(hash); + + l = &ip_vs_conn_tab[hash]; + for (e=l->next; e!=l; e=e->next) { + cp = list_entry(e, struct ip_vs_conn, c_list); + if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) + /* connection template */ + continue; + switch(cp->state) { + case IP_VS_S_SYN_RECV: + case IP_VS_S_SYNACK: + break; + + case IP_VS_S_ESTABLISHED: + case IP_VS_S_UDP: + if (todrop_entry(cp)) + break; + continue; + + default: + continue; + } + + /* + * Drop the entry, and drop its ct if not referenced + */ + atomic_inc(&cp->refcnt); + ct_write_unlock(hash); + + if ((ct = cp->control)) + atomic_inc(&ct->refcnt); + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (ct) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(ct); + } + ct_write_lock(hash); + } + ct_write_unlock(hash); + } +} + + +/* + * Flush all the connection entries in the ip_vs_conn_tab + */ +static void ip_vs_conn_flush(void) +{ + int idx; + struct ip_vs_conn *cp; + struct list_head *l,*e; + struct ip_vs_conn *ct; + + flush_again: + for (idx=0; idxnext; e!=l; e=e->next) { + cp = list_entry(e, struct ip_vs_conn, c_list); + atomic_inc(&cp->refcnt); + ct_write_unlock(idx); + + if ((ct = cp->control)) + atomic_inc(&ct->refcnt); + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (ct) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(ct); + } + ct_write_lock(idx); + } + ct_write_unlock_bh(idx); + } + + /* the counter may be not NULL, because maybe some conn entries + are run by slow timer handler or unhashed but still referred */ + if (atomic_read(&ip_vs_conn_count) != 0) { + schedule(); + goto flush_again; + } +} + + +int ip_vs_conn_init(void) +{ + int idx; + + /* + * Allocate the connection hash table and initialize its list heads + */ + ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); + if (!ip_vs_conn_tab) + return -ENOMEM; + + IP_VS_INFO("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + IP_VS_CONN_TAB_SIZE, + (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); + IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n", + sizeof(struct ip_vs_conn)); + + for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); + } + + for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { + __ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED; + } + + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; + } + + proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo); + + /* calculate the random value for connection hash */ + get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); + + return 0; +} + +void ip_vs_conn_cleanup(void) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(); + + /* Release the empty cache */ + kmem_cache_destroy(ip_vs_conn_cachep); + proc_net_remove("ip_vs_conn"); + vfree(ip_vs_conn_tab); +} diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_core.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_core.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_core.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_core.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,1284 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. 
IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_core.c,v 1.31.2.5 2003/07/29 14:37:12 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include /* for icmp_send */ +#include + +#include +#include + +#include + + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_skb_replace); +EXPORT_SYMBOL(ip_vs_proto_name); +EXPORT_SYMBOL(ip_vs_conn_new); +EXPORT_SYMBOL(ip_vs_conn_in_get); +EXPORT_SYMBOL(ip_vs_conn_out_get); +EXPORT_SYMBOL(ip_vs_conn_listen); +EXPORT_SYMBOL(ip_vs_conn_put); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif +EXPORT_SYMBOL(check_for_ip_vs_out); + + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) ((icmph->un).echo.id) + +const char *ip_vs_proto_name(unsigned proto) +{ + static char buf[20]; + + switch (proto) { + case IPPROTO_IP: + return "IP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_ICMP: + return "ICMP"; + default: + sprintf(buf, "IP_%d", proto); + return buf; + } +} + + +static inline void +ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.inpkts++; + dest->stats.inbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.inpkts++; + dest->svc->stats.inbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.inpkts++; + ip_vs_stats.inbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.outpkts++; + dest->stats.outbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.outpkts++; + dest->svc->stats.outbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.outpkts++; + ip_vs_stats.outbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) +{ + spin_lock(&cp->dest->stats.lock); + cp->dest->stats.conns++; + spin_unlock(&cp->dest->stats.lock); + + spin_lock(&svc->stats.lock); + svc->stats.conns++; + spin_unlock(&svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.conns++; + spin_unlock(&ip_vs_stats.lock); +} + +/* + * IPVS persistent scheduling function + * It creates a connection entry according to its template if exists, + * or selects a server and creates a connection entry plus a template. 
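+ *
+ * For example (illustrative): with the default /32 netmask and
+ * a persistent service VIP:80, the first connection from a
+ * client creates a template <TCP, client, 0, VIP, 80, RS1, 80>
+ * plus the connection itself; further connections from that
+ * client find the template and are directed to RS1 as well.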
+ * Locking: we are svc user (svc->refcnt), so we hold all dests too
+ */
+static struct ip_vs_conn *
+ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ struct ip_vs_conn *cp = NULL;
+ struct ip_vs_dest *dest;
+ const __u16 *portp;
+ struct ip_vs_conn *ct;
+ __u16 dport; /* destination port to forward */
+ __u32 snet; /* source network of the client, after masking */
+
+ portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
+
+ /* Mask saddr with the netmask to adjust template granularity */
+ snet = iph->saddr & svc->netmask;
+
+ IP_VS_DBG(6, "P-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
+ "mnet %u.%u.%u.%u\n",
+ NIPQUAD(iph->saddr), ntohs(portp[0]),
+ NIPQUAD(iph->daddr), ntohs(portp[1]),
+ NIPQUAD(snet));
+
+ /*
+ * As far as we know, FTP is a very complicated network protocol, and
+ * it uses a control connection and data connections. For active FTP,
+ * the FTP server initiates the data connection to the client, usually
+ * from source port 20. For passive FTP, the FTP server tells the
+ * client the port that it passively listens to, and the client issues
+ * the data connection. In the tunneling or direct routing mode, the
+ * load balancer is on the client-to-server half of the connection, so
+ * the data port number is unknown to the load balancer. Therefore, a
+ * conn template like <caddr, 0, vaddr, 0, daddr, 0> is created for a
+ * persistent FTP service, and a template like
+ * <caddr, 0, vaddr, vport, daddr, dport> is created for other
+ * persistent services.
+ */
+ if (portp[1] == svc->port) {
+ /* Check if a template already exists */
+ if (svc->port != FTPPORT)
+ ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ iph->daddr, portp[1]);
+ else
+ ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ iph->daddr, 0);
+
+ if (!ct || !ip_vs_check_template(ct)) {
+ /*
+ * No template found, or the dest of the connection
+ * template is not available.
+ */
+ dest = svc->scheduler->schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "P-schedule: no dest found.\n");
+ return NULL;
+ }
+
+ /*
+ * Create a template like
+ * <protocol, snet, 0, vaddr, vport, daddr, dport>
+ * for a non-ftp service, and
+ * <protocol, snet, 0, vaddr, 0, daddr, 0>
+ * for an ftp service.
+ */
+ if (svc->port != FTPPORT)
+ ct = ip_vs_conn_new(iph->protocol,
+ snet, 0,
+ iph->daddr, portp[1],
+ dest->addr, dest->port,
+ 0,
+ dest);
+ else
+ ct = ip_vs_conn_new(iph->protocol,
+ snet, 0,
+ iph->daddr, 0,
+ dest->addr, 0,
+ 0,
+ dest);
+ if (ct == NULL)
+ return NULL;
+
+ ct->timeout = svc->timeout;
+ } else {
+ /* set destination with the found template */
+ dest = ct->dest;
+ }
+ dport = dest->port;
+ } else {
+ /*
+ * Note: persistent fwmark-based services and persistent
+ * port zero services are handled here.
+ * fwmark template: <IPPROTO_IP, snet, 0, fwmark, 0, daddr, 0>
+ * port zero template: <protocol, snet, 0, vaddr, 0, daddr, 0>
+ */
+ if (svc->fwmark)
+ ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
+ htonl(svc->fwmark), 0);
+ else
+ ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ iph->daddr, 0);
+
+ if (!ct || !ip_vs_check_template(ct)) {
+ /*
+ * If it is not persistent port zero, return NULL,
+ * otherwise create a connection template.
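+ *
+ * For example (illustrative): a transparent cache cluster
+ * marks all port-80 traffic with fwmark 1; the template
+ * created below is then <IPPROTO_IP, snet, 0, fwmark 1, 0,
+ * RS, 0>, so every destination this client visits keeps
+ * being served by the same cache server while the template
+ * lasts.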
+ */ + if (svc->port) + return NULL; + + dest = svc->scheduler->schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "P-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template according to the service + */ + if (svc->fwmark) + ct = ip_vs_conn_new(IPPROTO_IP, + snet, 0, + htonl(svc->fwmark), 0, + dest->addr, 0, + 0, + dest); + else + ct = ip_vs_conn_new(iph->protocol, + snet, 0, + iph->daddr, 0, + dest->addr, 0, + 0, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = portp[1]; + } + + /* + * Create a new connection according to the template + */ + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, portp[0], + iph->daddr, portp[1], + dest->addr, dport, + 0, + dest); + if (cp == NULL) { + ip_vs_conn_put(ct); + return NULL; + } + + /* + * Increase the inactive connection counter + * because it is in Syn-Received + * state (inactive) when the connection is created. + */ + atomic_inc(&dest->inactconns); + + /* + * Add its control + */ + ip_vs_control_add(cp, ct); + + ip_vs_conn_put(ct); + return cp; +} + + +/* + * IPVS main scheduling function + * It selects a server according to the virtual service, and + * creates a connection entry. + */ +static struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_dest *dest; + const __u16 *portp; + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, iph); + + /* + * Non-persistent service + */ + portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + if (!svc->fwmark && portp[1] != svc->port) { + if (!svc->port) + IP_VS_ERR("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); + return NULL; + } + + dest = svc->scheduler->schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a connection entry. + */ + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, portp[0], + iph->daddr, portp[1], + dest->addr, dest->port?dest->port:portp[1], + 0, + dest); + if (cp == NULL) + return NULL; + + /* + * Increase the inactive connection counter because it is in + * Syn-Received state (inactive) when the connection is created. + */ + atomic_inc(&dest->inactconns); + + IP_VS_DBG(6, "Schedule fwd:%c s:%s c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " + "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n", + ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + + return cp; +} + + +/* + * Pass or drop the packet. + * Called by ip_vs_in, when the virtual service is available but + * no destination is available for a new connection. 
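+ *
+ * Three outcomes are possible below: a cache_bypass connection
+ * that forwards the packet to its original destination, a plain
+ * NF_ACCEPT for stray packets arriving on an FTP VIP, or an
+ * ICMP port-unreachable sent back to the client.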
+ */ +static int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); + + /* if it is fwmark-based service, the cache_bypass sysctl is up + and the destination is RTN_UNICAST (and not local), then create + a cache_bypass connection entry */ + if (sysctl_ip_vs_cache_bypass && svc->fwmark + && (inet_addr_type(iph->daddr) == RTN_UNICAST)) { + int ret; + struct ip_vs_conn *cp; + + ip_vs_service_put(svc); + + /* create a new connection entry */ + IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, portp[0], + iph->daddr, portp[1], + 0, 0, + IP_VS_CONN_F_BYPASS, + NULL); + if (cp == NULL) { + kfree_skb(skb); + return NF_STOLEN; + } + + /* statistics */ + ip_vs_in_stats(cp, skb); + + /* set state */ + ip_vs_set_state(cp, VS_STATE_INPUT, iph, portp); + + /* transmit the first SYN packet */ + ret = cp->packet_xmit(skb, cp); + + atomic_inc(&cp->in_pkts); + ip_vs_conn_put(cp); + return ret; + } + + /* + * When the virtual ftp service is presented, packets destined + * for other services on the VIP may get here (except services + * listed in the ipvs table), pass the packets, because it is + * not ipvs job to decide to drop the packets. + */ + if ((svc->port == FTPPORT) && (portp[1] != FTPPORT)) { + ip_vs_service_put(svc); + return NF_ACCEPT; + } + + ip_vs_service_put(svc); + + /* + * Notify the client that the destination is unreachable, and + * release the socket buffer. + * Since it is in IP layer, the TCP socket is not actually + * created, the TCP RST packet cannot be sent, instead that + * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + kfree_skb(skb); + return NF_STOLEN; +} + + +/* + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING + * chain, and is used for VS/NAT. + * It detects packets for VS/NAT connections and sends the packets + * immediately. This can avoid that iptable_nat mangles the packets + * for VS/NAT. + */ +static unsigned int ip_vs_post_routing(unsigned int hooknum, + struct sk_buff **skb_p, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *skb_p; + + if (!(skb->nfcache & NFC_IPVS_PROPERTY)) + return NF_ACCEPT; + + /* The packet was sent from IPVS, exit this chain */ + (*okfn)(skb); + + return NF_STOLEN; +} + + +/* + * Handle ICMP messages in the inside-to-outside direction (outgoing). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + * (Only used in VS/NAT) + */ +static int ip_vs_out_icmp(struct sk_buff **skb_p) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph; + struct icmphdr *icmph; + struct iphdr *ciph; /* The ip header contained within the ICMP */ + __u16 *pptr; /* port numbers from TCP/UDP contained header */ + unsigned short ihl; + unsigned short len; + unsigned short clen, csize; + struct ip_vs_conn *cp; + + /* reassemble IP fragments, but will it happen in ICMP packets?? 
*/ + if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb); + if (!skb) + return NF_STOLEN; + *skb_p = skb; + } + + if (skb_is_nonlinear(skb)) { + if (skb_linearize(skb, GFP_ATOMIC) != 0) + return NF_DROP; + ip_send_check(skb->nh.iph); + } + + iph = skb->nh.iph; + ihl = iph->ihl << 2; + icmph = (struct icmphdr *)((char *)iph + ihl); + len = ntohs(iph->tot_len) - ihl; + if (len < sizeof(struct icmphdr)) + return NF_DROP; + + IP_VS_DBG(12, "outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + icmph->type, ntohs(icmp_id(icmph)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_SOURCE_QUENCH) && + (icmph->type != ICMP_TIME_EXCEEDED)) + return NF_ACCEPT; + + /* Now find the contained IP header */ + clen = len - sizeof(struct icmphdr); + if (clen < sizeof(struct iphdr)) + return NF_DROP; + ciph = (struct iphdr *) (icmph + 1); + csize = ciph->ihl << 2; + if (clen < csize) + return NF_DROP; + + /* We are only interested ICMPs generated from TCP or UDP packets */ + if (ciph->protocol != IPPROTO_UDP && ciph->protocol != IPPROTO_TCP) + return NF_ACCEPT; + + /* Skip non-first embedded TCP/UDP fragments */ + if (ciph->frag_off & __constant_htons(IP_OFFSET)) + return NF_ACCEPT; + + /* We need at least TCP/UDP ports here */ + if (clen < csize + sizeof(struct udphdr)) + return NF_DROP; + + /* + * Find the ports involved - this packet was + * incoming so the ports are right way round + * (but reversed relative to outer IP header!) + */ + pptr = (__u16 *)&(((char *)ciph)[csize]); + + /* Ensure the checksum is correct */ + if (ip_compute_csum((unsigned char *) icmph, len)) { + /* Failed checksum! */ + IP_VS_DBG(1, "forward ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + return NF_DROP; + } + + IP_VS_DBG(11, "Handling outgoing ICMP for " + "%u.%u.%u.%u:%d -> %u.%u.%u.%u:%d\n", + NIPQUAD(ciph->saddr), ntohs(pptr[0]), + NIPQUAD(ciph->daddr), ntohs(pptr[1])); + + /* ciph content is actually */ + cp = ip_vs_conn_out_get(ciph->protocol, ciph->daddr, pptr[1], + ciph->saddr, pptr[0]); + if (!cp) + return NF_ACCEPT; + + if (IP_VS_FWD_METHOD(cp) != 0) { + IP_VS_ERR("shouldn't reach here, because the box is on the" + "half connection in the tun/dr module.\n"); + } + + /* Now we do real damage to this packet...! */ + /* First change the source IP address, and recalc checksum */ + iph->saddr = cp->vaddr; + ip_send_check(iph); + + /* Now change the *dest* address in the contained IP */ + ciph->daddr = cp->vaddr; + ip_send_check(ciph); + + /* the TCP/UDP dest port - cannot redo check */ + pptr[1] = cp->vport; + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_compute_csum((unsigned char *) icmph, len); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + ip_vs_conn_put(cp); + + IP_VS_DBG(11, "Forwarding correct outgoing ICMP to " + "%u.%u.%u.%u:%d -> %u.%u.%u.%u:%d\n", + NIPQUAD(ciph->saddr), ntohs(pptr[0]), + NIPQUAD(ciph->daddr), ntohs(pptr[1])); + + skb->nfcache |= NFC_IPVS_PROPERTY; + + return NF_ACCEPT; +} + + +/* + * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT. 
+ * Check if outgoing packet belongs to the established ip_vs_conn, + * rewrite addresses of the packet and send it on its way... + */ +static unsigned int ip_vs_out(unsigned int hooknum, + struct sk_buff **skb_p, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph; + union ip_vs_tphdr h; + struct ip_vs_conn *cp; + int size; + int ihl; + + EnterFunction(11); + + if (skb->nfcache & NFC_IPVS_PROPERTY) + return NF_ACCEPT; + + iph = skb->nh.iph; + if (iph->protocol == IPPROTO_ICMP) + return ip_vs_out_icmp(skb_p); + + /* let it go if other IP protocols */ + if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP) + return NF_ACCEPT; + + /* reassemble IP fragments */ + if (iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb); + if (!skb) + return NF_STOLEN; + iph = skb->nh.iph; + *skb_p = skb; + } + + /* make sure that protocol header available in skb data area, + note that skb data area may be reallocated. */ + ihl = iph->ihl << 2; + if (ip_vs_header_check(skb, iph->protocol, ihl) == -1) + return NF_DROP; + + iph = skb->nh.iph; + h.raw = (char*) iph + ihl; + + /* + * Check if the packet belongs to an old entry + */ + cp = ip_vs_conn_out_get(iph->protocol, iph->saddr, h.portp[0], + iph->daddr, h.portp[1]); + if (!cp) { + if (sysctl_ip_vs_nat_icmp_send && + ip_vs_lookup_real_service(iph->protocol, + iph->saddr, h.portp[0])) { + /* + * Notify the real server: there is no existing + * entry if it is not RST packet or not TCP packet. + */ + if (!h.th->rst || iph->protocol != IPPROTO_TCP) { + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + kfree_skb(skb); + return NF_STOLEN; + } + } + IP_VS_DBG(12, "packet for %s %d.%d.%d.%d:%d " + "continue traversal as normal.\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(iph->daddr), + ntohs(h.portp[1])); + if (skb_is_nonlinear(skb)) + ip_send_check(iph); + return NF_ACCEPT; + } + + /* + * If it has ip_vs_app helper, the helper may change the payload, + * so it needs full checksum checking and checksum calculation. + * If not, only the header (addr/port) is changed, so it is fast + * to do incremental checksum update, and let the destination host + * do final checksum checking. 
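+ *
+ * The incremental update follows the usual one's-complement
+ * rule (cf. RFC 1624): when a 16-bit word changes from m to
+ * m', the stored checksum HC becomes
+ *
+ * HC' = ~(~HC + ~m + m')
+ *
+ * so only the address and port words rewritten here need to
+ * be folded in, never the payload.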
+ */ + + if (cp->app && skb_is_nonlinear(skb)) { + if (skb_linearize(skb, GFP_ATOMIC) != 0) { + ip_vs_conn_put(cp); + return NF_DROP; + } + iph = skb->nh.iph; + h.raw = (char*) iph + ihl; + } + + size = skb->len - ihl; + IP_VS_DBG(11, "O-pkt: %s size=%d\n", + ip_vs_proto_name(iph->protocol), size); + + /* do TCP/UDP checksum checking if it has application helper */ + if (cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = csum_partial(h.raw, size, 0); + case CHECKSUM_HW: + if (csum_tcpudp_magic(iph->saddr, iph->daddr, size, + iph->protocol, skb->csum)) { + ip_vs_conn_put(cp); + IP_VS_DBG_RL("Outgoing failed %s checksum " + "from %d.%d.%d.%d (size=%d)!\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(iph->saddr), + size); + return NF_DROP; + } + break; + default: + /* CHECKSUM_UNNECESSARY */ + break; + } + } + + IP_VS_DBG(11, "Outgoing %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(iph->saddr), ntohs(h.portp[0]), + NIPQUAD(iph->daddr), ntohs(h.portp[1])); + + /* mangle the packet */ + iph->saddr = cp->vaddr; + h.portp[0] = cp->vport; + + /* + * Call application helper if needed + */ + if (ip_vs_app_pkt_out(cp, skb) != 0) { + /* skb data has probably changed, update pointers */ + iph = skb->nh.iph; + h.raw = (char*)iph + ihl; + size = skb->len - ihl; + } + + /* + * Adjust TCP/UDP checksums + */ + if (!cp->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + ip_vs_fast_check_update(&h, cp->daddr, cp->vaddr, + cp->dport, cp->vport, iph->protocol); + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + switch (iph->protocol) { + case IPPROTO_TCP: + h.th->check = 0; + skb->csum = csum_partial(h.raw, size, 0); + h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + skb->csum); + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n", + ip_vs_proto_name(iph->protocol), h.th->check, + (char*)&(h.th->check) - (char*)h.raw); + break; + case IPPROTO_UDP: + h.uh->check = 0; + skb->csum = csum_partial(h.raw, size, 0); + h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + skb->csum); + if (h.uh->check == 0) + h.uh->check = 0xFFFF; + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n", + ip_vs_proto_name(iph->protocol), h.uh->check, + (char*)&(h.uh->check) - (char*)h.raw); + break; + } + } + ip_send_check(iph); + + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, VS_STATE_OUTPUT, iph, h.portp); + ip_vs_conn_put(cp); + + skb->nfcache |= NFC_IPVS_PROPERTY; + + LeaveFunction(11); + return NF_ACCEPT; +} + + +/* + * Check if the packet is for VS/NAT connections, then send it + * immediately. + * Called by ip_fw_compact to detect packets for VS/NAT before + * they are changed by ipchains masquerading code. + */ +unsigned int check_for_ip_vs_out(struct sk_buff **skb_p, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + + ret = ip_vs_out(NF_IP_FORWARD, skb_p, NULL, NULL, NULL); + if (ret != NF_ACCEPT) { + return ret; + } else { + /* send the packet immediately if it is already mangled + by ip_vs_out */ + if ((*skb_p)->nfcache & NFC_IPVS_PROPERTY) { + (*okfn)(*skb_p); + return NF_STOLEN; + } + } + return NF_ACCEPT; +} + + +/* + * Handle ICMP messages in the outside-to-inside direction (incoming) + * and sometimes in outgoing direction from ip_vs_forward_icmp. 
+ * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int ip_vs_in_icmp(struct sk_buff **skb_p) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph; + struct icmphdr *icmph; + struct iphdr *ciph; /* The ip header contained within the ICMP */ + __u16 *pptr; /* port numbers from TCP/UDP contained header */ + unsigned short len; + unsigned short clen, csize; + struct ip_vs_conn *cp; + struct rtable *rt; /* Route to the other host */ + int mtu; + + if (skb_is_nonlinear(skb)) { + if (skb_linearize(skb, GFP_ATOMIC) != 0) + return NF_DROP; + } + + iph = skb->nh.iph; + ip_send_check(iph); + icmph = (struct icmphdr *)((char *)iph + (iph->ihl << 2)); + len = ntohs(iph->tot_len) - (iph->ihl<<2); + if (len < sizeof(struct icmphdr)) + return NF_DROP; + + IP_VS_DBG(12, "icmp in (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n", + icmph->type, ntohs(icmp_id(icmph)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + if ((icmph->type != ICMP_DEST_UNREACH) && + (icmph->type != ICMP_SOURCE_QUENCH) && + (icmph->type != ICMP_TIME_EXCEEDED)) + return NF_ACCEPT; + + /* + * If we get here we have an ICMP error of one of the above 3 types + * Now find the contained IP header + */ + clen = len - sizeof(struct icmphdr); + if (clen < sizeof(struct iphdr)) + return NF_DROP; + ciph = (struct iphdr *) (icmph + 1); + csize = ciph->ihl << 2; + if (clen < csize) + return NF_DROP; + + /* We are only interested ICMPs generated from TCP or UDP packets */ + if (ciph->protocol != IPPROTO_UDP && ciph->protocol != IPPROTO_TCP) + return NF_ACCEPT; + + /* Skip non-first embedded TCP/UDP fragments */ + if (ciph->frag_off & __constant_htons(IP_OFFSET)) + return NF_ACCEPT; + + /* We need at least TCP/UDP ports here */ + if (clen < csize + sizeof(struct udphdr)) + return NF_DROP; + + /* Ensure the checksum is correct */ + if (ip_compute_csum((unsigned char *) icmph, len)) { + /* Failed checksum! 
*/ + IP_VS_ERR_RL("incoming ICMP: failed checksum from " + "%d.%d.%d.%d!\n", NIPQUAD(iph->saddr)); + return NF_DROP; + } + + pptr = (__u16 *)&(((char *)ciph)[csize]); + + IP_VS_DBG(11, "Handling incoming ICMP for " + "%u.%u.%u.%u:%d -> %u.%u.%u.%u:%d\n", + NIPQUAD(ciph->saddr), ntohs(pptr[0]), + NIPQUAD(ciph->daddr), ntohs(pptr[1])); + + /* This is pretty much what ip_vs_conn_in_get() does, + except parameters are in the reverse order */ + cp = ip_vs_conn_in_get(ciph->protocol, + ciph->daddr, pptr[1], + ciph->saddr, pptr[0]); + if (cp == NULL) + return NF_ACCEPT; + + ip_vs_in_stats(cp, skb); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + int ret; + if (cp->packet_xmit) + ret = cp->packet_xmit(skb, cp); + else + ret = NF_ACCEPT; + atomic_inc(&cp->in_pkts); + ip_vs_conn_put(cp); + return ret; + } + + /* + * mangle and send the packet here + */ + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = rt->u.dst.pmtu; + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* copy-on-write the packet before mangling it */ + if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, + &iph, (unsigned char**)&icmph)) { + ip_vs_conn_put(cp); + return NF_DROP; + } + ciph = (struct iphdr *) (icmph + 1); + pptr = (__u16 *)&(((char *)ciph)[csize]); + + /* The ICMP packet for VS/NAT must be written to correct addresses + before being forwarded to the right server */ + + /* First change the dest IP address, and recalc checksum */ + iph->daddr = cp->daddr; + ip_send_check(iph); + + /* Now change the *source* address in the contained IP */ + ciph->saddr = cp->daddr; + ip_send_check(ciph); + + /* the TCP/UDP source port - cannot redo check */ + pptr[0] = cp->dport; + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_compute_csum((unsigned char *) icmph, len); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + IP_VS_DBG(11, "Forwarding incoming ICMP to " + "%u.%u.%u.%u:%d -> %u.%u.%u.%u:%d\n", + NIPQUAD(ciph->saddr), ntohs(pptr[0]), + NIPQUAD(ciph->daddr), ntohs(pptr[1])); + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 1 << NF_IP_LOCAL_OUT; +#endif /* CONFIG_NETFILTER_DEBUG */ + ip_send(skb); + ip_vs_conn_put(cp); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + dev_kfree_skb(skb); + ip_vs_conn_put(cp); + return NF_STOLEN; +} + + +/* + * Check if it's for virtual services, look it up, + * and send it on its way... + */ +static unsigned int ip_vs_in(unsigned int hooknum, + struct sk_buff **skb_p, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->nh.iph; + union ip_vs_tphdr h; + struct ip_vs_conn *cp; + struct ip_vs_service *svc; + int ihl; + int ret; + + /* + * Big tappo: only PACKET_HOST (nor loopback neither mcasts) + * ... don't know why 1st test DOES NOT include 2nd (?) 
+ */ + if (skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev) { + IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", + skb->pkt_type, + iph->protocol, + NIPQUAD(iph->daddr)); + return NF_ACCEPT; + } + + if (iph->protocol == IPPROTO_ICMP) + return ip_vs_in_icmp(skb_p); + + /* let it go if other IP protocols */ + if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP) + return NF_ACCEPT; + + /* make sure that protocol header available in skb data area, + note that skb data area may be reallocated. */ + ihl = iph->ihl << 2; + if (ip_vs_header_check(skb, iph->protocol, ihl) == -1) + return NF_DROP; + iph = skb->nh.iph; + h.raw = (char*) iph + ihl; + + /* + * Check if the packet belongs to an existing connection entry + */ + cp = ip_vs_conn_in_get(iph->protocol, iph->saddr, h.portp[0], + iph->daddr, h.portp[1]); + + if (!cp && + (h.th->syn || (iph->protocol!=IPPROTO_TCP)) && + (svc = ip_vs_service_get(skb->nfmark, iph->protocol, + iph->daddr, h.portp[1]))) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + return NF_DROP; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + cp = ip_vs_schedule(svc, iph); + if (!cp) + return ip_vs_leave(svc, skb); + ip_vs_conn_stats(cp, svc); + ip_vs_service_put(svc); + } + + if (!cp) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG(12, "packet for %s %d.%d.%d.%d:%d continue " + "traversal as normal.\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(iph->daddr), + ntohs(h.portp[1])); + return NF_ACCEPT; + } + + IP_VS_DBG(11, "Incoming %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(iph->saddr), ntohs(h.portp[0]), + NIPQUAD(iph->daddr), ntohs(h.portp[1])); + + /* Check the server status */ + if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + /* the destination server is not availabe */ + + if (sysctl_ip_vs_expire_nodest_conn) { + /* try to expire the connection immediately */ + ip_vs_conn_expire_now(cp); + } else { + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); + } + return NF_DROP; + } + + ip_vs_in_stats(cp, skb); + ip_vs_set_state(cp, VS_STATE_INPUT, iph, h.portp); + if (cp->packet_xmit) + ret = cp->packet_xmit(skb, cp); + else { + IP_VS_DBG_RL("warning: packet_xmit is null"); + ret = NF_ACCEPT; + } + + /* increase its packet counter and check if it is needed + to be synchronized */ + atomic_inc(&cp->in_pkts); + if (ip_vs_sync_state == IP_VS_STATE_MASTER && + (cp->protocol != IPPROTO_TCP || + cp->state == IP_VS_S_ESTABLISHED) && + (atomic_read(&cp->in_pkts) % 50 == sysctl_ip_vs_sync_threshold)) + ip_vs_sync_conn(cp); + + ip_vs_conn_put(cp); + return ret; +} + + +/* + * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP + * packets destined for 0.0.0.0/0. + * When fwmark-based virtual service is used, such as transparent + * cache cluster, TCP packets can be marked and routed to ip_vs_in, + * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and + * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain + * and send them to ip_vs_in_icmp. 
+ */ +static unsigned int ip_vs_forward_icmp(unsigned int hooknum, + struct sk_buff **skb_p, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *skb_p; + struct iphdr *iph = skb->nh.iph; + + if (iph->protocol != IPPROTO_ICMP) + return NF_ACCEPT; + + if (iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb); + if (!skb) + return NF_STOLEN; + *skb_p = skb; + } + + return ip_vs_in_icmp(skb_p); +} + + +/* After packet filtering, forward packet through VS/DR, VS/TUN, + or VS/NAT(change destination), so that filtering rules can be + applied to IPVS. */ +static struct nf_hook_ops ip_vs_in_ops = { + { NULL, NULL }, + ip_vs_in, PF_INET, NF_IP_LOCAL_IN, 100 +}; + +/* After packet filtering, change source only for VS/NAT */ +static struct nf_hook_ops ip_vs_out_ops = { + { NULL, NULL }, + ip_vs_out, PF_INET, NF_IP_FORWARD, 100 +}; + +/* After packet filtering (but before ip_vs_out_icmp), catch icmp + destined for 0.0.0.0/0, which is for incoming IPVS connections */ +static struct nf_hook_ops ip_vs_forward_icmp_ops = { + { NULL, NULL }, + ip_vs_forward_icmp, PF_INET, NF_IP_FORWARD, 99 +}; + +/* Before the netfilter connection tracking, exit from POST_ROUTING */ +static struct nf_hook_ops ip_vs_post_routing_ops = { + { NULL, NULL }, + ip_vs_post_routing, PF_INET, NF_IP_POST_ROUTING, NF_IP_PRI_NAT_SRC-1 +}; + + +/* + * Initialize IP Virtual Server + */ +static int __init ip_vs_init(void) +{ + int ret; + + ret = ip_vs_control_init(); + if (ret < 0) { + IP_VS_ERR("can't setup control.\n"); + goto cleanup_nothing; + } + + ret = ip_vs_conn_init(); + if (ret < 0) { + IP_VS_ERR("can't setup connection table.\n"); + goto cleanup_control; + } + + ret = ip_vs_app_init(); + if (ret < 0) { + IP_VS_ERR("can't setup application helper.\n"); + goto cleanup_conn; + } + + ret = nf_register_hook(&ip_vs_in_ops); + if (ret < 0) { + IP_VS_ERR("can't register in hook.\n"); + goto cleanup_app; + } + ret = nf_register_hook(&ip_vs_out_ops); + if (ret < 0) { + IP_VS_ERR("can't register out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_vs_post_routing_ops); + if (ret < 0) { + IP_VS_ERR("can't register post_routing hook.\n"); + goto cleanup_outops; + } + ret = nf_register_hook(&ip_vs_forward_icmp_ops); + if (ret < 0) { + IP_VS_ERR("can't register forward_icmp hook.\n"); + goto cleanup_postroutingops; + } + + IP_VS_INFO("ipvs loaded.\n"); + return ret; + + cleanup_postroutingops: + nf_unregister_hook(&ip_vs_post_routing_ops); + cleanup_outops: + nf_unregister_hook(&ip_vs_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_vs_in_ops); + cleanup_app: + ip_vs_app_cleanup(); + cleanup_conn: + ip_vs_conn_cleanup(); + cleanup_control: + ip_vs_control_cleanup(); + cleanup_nothing: + return ret; +} + +static void __exit ip_vs_cleanup(void) +{ + nf_unregister_hook(&ip_vs_forward_icmp_ops); + nf_unregister_hook(&ip_vs_post_routing_ops); + nf_unregister_hook(&ip_vs_out_ops); + nf_unregister_hook(&ip_vs_in_ops); + ip_vs_app_cleanup(); + ip_vs_conn_cleanup(); + ip_vs_control_cleanup(); + IP_VS_INFO("ipvs unloaded.\n"); +} + +module_init(ip_vs_init); +module_exit(ip_vs_cleanup); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_ctl.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_ctl.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_ctl.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_ctl.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,2154 @@ +/* + * IPVS An implementation of the IP virtual server 
support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_ctl.c,v 1.30.2.3 2003/07/29 14:37:12 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +#include + +/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ +static DECLARE_MUTEX(__ip_vs_mutex); + +/* lock for service table */ +rwlock_t __ip_vs_svc_lock = RW_LOCK_UNLOCKED; + +/* lock for table with the real services */ +static rwlock_t __ip_vs_rs_lock = RW_LOCK_UNLOCKED; + +/* lock for state and timeout tables */ +static rwlock_t __ip_vs_securetcp_lock = RW_LOCK_UNLOCKED; + +/* lock for drop entry handling */ +static spinlock_t __ip_vs_dropentry_lock = SPIN_LOCK_UNLOCKED; + +/* lock for drop packet handling */ +static spinlock_t __ip_vs_droppacket_lock = SPIN_LOCK_UNLOCKED; + +/* 1/rate drop and drop-entry variables */ +int ip_vs_drop_rate = 0; +int ip_vs_drop_counter = 0; +atomic_t ip_vs_dropentry = ATOMIC_INIT(0); + +/* number of virtual services */ +static int ip_vs_num_services = 0; + +/* sysctl variables */ +static int sysctl_ip_vs_drop_entry = 0; +static int sysctl_ip_vs_drop_packet = 0; +static int sysctl_ip_vs_secure_tcp = 0; +static int sysctl_ip_vs_amemthresh = 2048; +static int sysctl_ip_vs_am_droprate = 10; +int sysctl_ip_vs_cache_bypass = 0; +int sysctl_ip_vs_expire_nodest_conn = 0; +int sysctl_ip_vs_sync_threshold = 3; +int sysctl_ip_vs_nat_icmp_send = 0; + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + +/* + * update_defense_level is called from timer bh and from sysctl. 
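+ *
+ * A worked example for drop_packet modes 1 and 2: with
+ * amemthresh at its default of 2048 pages and only 1536
+ * pages free+buffered, rate = 2048/(2048-1536) = 4, i.e.
+ * one of every 4 new connection requests is dropped until
+ * memory pressure eases.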
+ */ +static void update_defense_level(void) +{ + struct sysinfo i; + int availmem; + int nomem; + + /* we only count free and buffered memory (in pages) */ + si_meminfo(&i); + availmem = i.freeram + i.bufferram; + + nomem = (availmem < sysctl_ip_vs_amemthresh); + + /* drop_entry */ + spin_lock(&__ip_vs_dropentry_lock); + switch (sysctl_ip_vs_drop_entry) { + case 0: + atomic_set(&ip_vs_dropentry, 0); + break; + case 1: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + sysctl_ip_vs_drop_entry = 2; + } else { + atomic_set(&ip_vs_dropentry, 0); + } + break; + case 2: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + } else { + atomic_set(&ip_vs_dropentry, 0); + sysctl_ip_vs_drop_entry = 1; + }; + break; + case 3: + atomic_set(&ip_vs_dropentry, 1); + break; + } + spin_unlock(&__ip_vs_dropentry_lock); + + /* drop_packet */ + spin_lock(&__ip_vs_droppacket_lock); + switch (sysctl_ip_vs_drop_packet) { + case 0: + ip_vs_drop_rate = 0; + break; + case 1: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh - availmem); + sysctl_ip_vs_drop_packet = 2; + } else { + ip_vs_drop_rate = 0; + } + break; + case 2: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh - availmem); + } else { + ip_vs_drop_rate = 0; + sysctl_ip_vs_drop_packet = 1; + } + break; + case 3: + ip_vs_drop_rate = sysctl_ip_vs_am_droprate; + break; + } + spin_unlock(&__ip_vs_droppacket_lock); + + /* secure_tcp */ + write_lock(&__ip_vs_securetcp_lock); + switch (sysctl_ip_vs_secure_tcp) { + case 0: + ip_vs_secure_tcp_set(0); + break; + case 1: + if (nomem) { + ip_vs_secure_tcp_set(1); + sysctl_ip_vs_secure_tcp = 2; + } else { + ip_vs_secure_tcp_set(0); + } + break; + case 2: + if (nomem) { + ip_vs_secure_tcp_set(1); + } else { + ip_vs_secure_tcp_set(0); + sysctl_ip_vs_secure_tcp = 1; + } + break; + case 3: + ip_vs_secure_tcp_set(1); + break; + } + write_unlock(&__ip_vs_securetcp_lock); +} + + +/* + * Timer for checking the defense + */ +static struct timer_list defense_timer; +#define DEFENSE_TIMER_PERIOD 1*HZ + +static void defense_timer_handler(unsigned long data) +{ + update_defense_level(); + if (atomic_read(&ip_vs_dropentry)) + ip_vs_random_dropentry(); + + mod_timer(&defense_timer, jiffies + DEFENSE_TIMER_PERIOD); +} + + +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* the service table hashed by */ +static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + +/* + * Hash table: for real service lookups + */ +#define IP_VS_RTAB_BITS 4 +#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) +#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) + +static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; + +/* + * Trash for destinations + */ +static LIST_HEAD(ip_vs_dest_trash); + +/* + * FTP & NULL virtual service counters + */ +static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); +static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); + + +/* + * Returns hash value for virtual service + */ +static __inline__ unsigned +ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port) +{ + register unsigned porth = ntohs(port); + + return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) + & IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + 
*/ +static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +{ + return fwmark & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes ip_vs_service in the ip_vs_svc_table by + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + * Returns bool success. + */ +static int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash it by in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); + list_add(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash it by fwmark in ip_vs_svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->fwmark); + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes ip_vs_service from ip_vs_svc_table/ip_vs_svc_fwm_table. + * Should be called with locked tables. + * Returns bool success. + */ +static int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Remove it from the ip_vs_svc_table table. + */ + list_del(&svc->s_list); + } else { + /* + * Remove it from the ip_vs_svc_fwm_table table. + */ + list_del(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Get service by {proto,addr,port} in the service table. + */ +static __inline__ struct ip_vs_service * +__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport) +{ + unsigned hash; + struct ip_vs_service *svc; + struct list_head *l,*e; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_svc_hashkey(protocol, vaddr, vport); + + l = &ip_vs_svc_table[hash]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, s_list); + if ((svc->addr == vaddr) + && (svc->port == vport) + && (svc->protocol == protocol)) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + + +/* + * Get service by {fwmark} in the service table. 
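 *
 * (A minimal usage sketch of the lookup pattern, assuming a caller
 * like ip_vs_service_get() below; an illustration, not patch code:
 *
 *     read_lock(&__ip_vs_svc_lock);
 *     svc = __ip_vs_svc_fwm_get(fwmark);   // usecnt++ on a hit
 *     read_unlock(&__ip_vs_svc_lock);
 *     if (svc) {
 *             ... use the service ...
 *             ip_vs_service_put(svc);      // drop the usecnt reference
 *     }
 * )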
+ */ +static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) +{ + unsigned hash; + struct ip_vs_service *svc; + struct list_head *l,*e; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_svc_fwm_hashkey(fwmark); + + l = &ip_vs_svc_fwm_table[hash]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, f_list); + if (svc->fwmark == fwmark) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + +struct ip_vs_service * +ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport) +{ + struct ip_vs_service *svc; + + read_lock(&__ip_vs_svc_lock); + + /* + * Check the table hashed by fwmark first + */ + if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark))) + goto out; + + /* + * Check the table hashed by + * for "full" addressed entries + */ + svc = __ip_vs_service_get(protocol, vaddr, vport); + + if (svc == NULL + && protocol == IPPROTO_TCP + && atomic_read(&ip_vs_ftpsvc_counter) + && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_service_get(protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ip_vs_nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_service_get(protocol, vaddr, 0); + } + + out: + read_unlock(&__ip_vs_svc_lock); + + IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + NIPQUAD(vaddr), ntohs(vport), + svc?"hit":"not hit"); + + return svc; +} + + +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + dest->svc = svc; +} + +static inline void +__ip_vs_unbind_svc(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = dest->svc; + + dest->svc = NULL; + if (atomic_dec_and_test(&svc->refcnt)) + kfree(svc); +} + +/* + * Returns hash value for real service + */ +static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port) +{ + register unsigned porth = ntohs(port); + + return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) + & IP_VS_RTAB_MASK; +} + +/* + * Hashes ip_vs_dest in ip_vs_rtable by proto,addr,port. + * should be called with locked tables. + * returns bool success. + */ +static int ip_vs_rs_hash(struct ip_vs_dest *dest) +{ + unsigned hash; + + if (!list_empty(&dest->d_list)) { + return 0; + } + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->addr, dest->port); + list_add(&dest->d_list, &ip_vs_rtable[hash]); + + return 1; +} + +/* + * UNhashes ip_vs_dest from ip_vs_rtable. + * should be called with locked tables. + * returns bool success. + */ +static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the ip_vs_rtable table. + */ + if (!list_empty(&dest->d_list)) { + list_del(&dest->d_list); + INIT_LIST_HEAD(&dest->d_list); + } + + return 1; +} + +/* + * Lookup real service by {proto,addr,port} in the real service table. 
+ */ +struct ip_vs_dest * +ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport) +{ + unsigned hash; + struct ip_vs_dest *dest; + struct list_head *l,*e; + + /* + * Check for "full" addressed entries + * Return the first found entry + */ + hash = ip_vs_rs_hashkey(daddr, dport); + + l = &ip_vs_rtable[hash]; + + read_lock(&__ip_vs_rs_lock); + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, d_list); + if ((dest->addr == daddr) + && (dest->port == dport) + && ((dest->protocol == protocol) || + dest->vfwmark)) { + /* HIT */ + read_unlock(&__ip_vs_rs_lock); + return dest; + } + } + read_unlock(&__ip_vs_rs_lock); + + return NULL; +} + +/* + * Lookup destination by {addr,port} in the given service + */ +static struct ip_vs_dest * +ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) +{ + struct ip_vs_dest *dest; + struct list_head *l, *e; + + /* + * Find the destination for the given service + */ + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + if ((dest->addr == daddr) && (dest->port == dport)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + + +/* + * Lookup dest by {svc,addr,port} in the destination trash. + * The destination trash is used to hold the destinations that are removed + * from the service table but are still referenced by some conn entries. + * The reason to add the destination trash is when the dest is temporary + * down (either by administrator or by monitor program), the dest can be + * picked back from the trash, the remaining connections to the dest can + * continue, and the counting information of the dest is also useful for + * scheduling. + */ +static struct ip_vs_dest * +ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) +{ + struct ip_vs_dest *dest; + struct list_head *l, *e; + + /* + * Find the destination in trash + */ + l = &ip_vs_dest_trash; + + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " + "refcnt=%d\n", + dest->vfwmark, + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->addr == daddr && + dest->port == dport && + dest->vfwmark == svc->fwmark && + dest->protocol == svc->protocol && + (svc->fwmark || + (dest->vaddr == svc->addr && + dest->vport == svc->port))) { + /* HIT */ + return dest; + } + + /* + * Try to purge the destination from trash if not referenced + */ + if (atomic_read(&dest->refcnt) == 1) { + IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " + "from trash\n", + dest->vfwmark, + NIPQUAD(dest->addr), ntohs(dest->port)); + e = e->prev; + list_del(&dest->n_list); + __ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } + } + + return NULL; +} + + +/* + * Clean up all the destinations in the trash + * Called by the ip_vs_control_cleanup() + * + * When the ip_vs_control_clearup is activated by ipvs module exit, + * the service tables must have been flushed and all the connections + * are expired, and the refcnt of each destination in the trash must + * be 1, so we simply release them here. 
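 *
 * (Why exactly 1: the trash list itself holds the one remaining
 * reference, taken by the atomic_inc(&dest->refcnt) in
 * __ip_vs_del_dest() below when the dest was moved onto
 * ip_vs_dest_trash.)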
+ */ +static void ip_vs_trash_cleanup(void) +{ + struct ip_vs_dest *dest; + struct list_head *l; + + l = &ip_vs_dest_trash; + + while (l->next != l) { + dest = list_entry(l->next, struct ip_vs_dest, n_list); + list_del(&dest->n_list); + __ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } +} + + +/* + * Update a destination in the given service + */ +static void __ip_vs_update_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct ip_vs_rule_user *ur) +{ + int conn_flags; + + /* + * Set the weight and the flags + */ + atomic_set(&dest->weight, ur->weight); + + conn_flags = ur->conn_flags | IP_VS_CONN_F_INACTIVE; + + /* + * Check if local node and update the flags + */ + if (inet_addr_type(ur->daddr) == RTN_LOCAL) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + + /* + * Set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading + */ + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { + conn_flags |= IP_VS_CONN_F_NOOUTPUT; + } else { + /* + * Put the real service in ip_vs_rtable if not present. + * For now only for NAT! + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_hash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + } + atomic_set(&dest->conn_flags, conn_flags); + + /* bind the service */ + if (!dest->svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (dest->svc != svc) { + __ip_vs_unbind_svc(dest); + __ip_vs_bind_svc(dest, svc); + } + } + + /* set the dest status flags */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; +} + + +/* + * Create a destination for the given service + */ +static int +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_rule_user *ur, + struct ip_vs_dest **destp) +{ + struct ip_vs_dest *dest; + unsigned atype; + + EnterFunction(2); + + atype = inet_addr_type(ur->daddr); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + + *destp = dest = (struct ip_vs_dest*) + kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); + if (dest == NULL) { + IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); + return -ENOMEM; + } + memset(dest, 0, sizeof(struct ip_vs_dest)); + + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + dest->addr = ur->daddr; + dest->port = ur->dport; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->refcnt, 0); + + INIT_LIST_HEAD(&dest->d_list); + dest->dst_lock = SPIN_LOCK_UNLOCKED; + dest->stats.lock = SPIN_LOCK_UNLOCKED; + __ip_vs_update_dest(svc, dest, ur); + ip_vs_new_estimator(&dest->stats); + + LeaveFunction(2); + return 0; +} + + +/* + * Add a destination into an existing service + */ +static int ip_vs_add_dest(struct ip_vs_service *svc, + struct ip_vs_rule_user *ur) +{ + struct ip_vs_dest *dest; + __u32 daddr = ur->daddr; + __u16 dport = ur->dport; + int ret; + + EnterFunction(2); + + if (ur->weight < 0) { + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); + return -ERANGE; + } + + /* + * Check if the dest already exists in the list + */ + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest != NULL) { + IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); + return -EEXIST; + } + + /* + * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest(svc, daddr, dport); + if (dest != NULL) { + IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " + "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", + NIPQUAD(daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + 
NIPQUAD(dest->vaddr), + ntohs(dest->vport)); + __ip_vs_update_dest(svc, dest, ur); + + /* + * Get the destination from the trash + */ + list_del(&dest->n_list); + + ip_vs_new_estimator(&dest->stats); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + while (atomic_read(&svc->usecnt) > 1) {}; + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + return 0; + } + + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(svc, ur, &dest); + if (ret) { + return ret; + } + + /* + * Add the dest entry into the list + */ + atomic_inc(&dest->refcnt); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + while (atomic_read(&svc->usecnt) > 1) {}; + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Edit a destination in the given service + */ +static int ip_vs_edit_dest(struct ip_vs_service *svc, + struct ip_vs_rule_user *ur) +{ + struct ip_vs_dest *dest; + __u32 daddr = ur->daddr; + __u16 dport = ur->dport; + + EnterFunction(2); + + if (ur->weight < 0) { + IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); + return -ERANGE; + } + + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); + return -ENOENT; + } + + __ip_vs_update_dest(svc, dest, ur); + + /* call the update_service, because server weight may be changed */ + svc->scheduler->update_service(svc); + + LeaveFunction(2); + + return 0; +} + + +/* + * Delete a destination (must be already unlinked from the service) + */ +static void __ip_vs_del_dest(struct ip_vs_dest *dest) +{ + ip_vs_kill_estimator(&dest->stats); + + /* + * Remove it from the d-linked list with the real services. + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_unhash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + + /* + * Decrease the refcnt of the dest, and free the dest + * if nobody refers to it (refcnt=0). Otherwise, throw + * the destination into the trash. + */ + if (atomic_dec_and_test(&dest->refcnt)) { + __ip_vs_dst_reset(dest); + /* simply decrease svc->refcnt here, let the caller check + and release the service if nobody refers to it. + Only user context can release destination and service, + and only one user context can update virtual service at a + time, so the operation here is OK */ + atomic_dec(&dest->svc->refcnt); + kfree(dest); + } else { + IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + list_add(&dest->n_list, &ip_vs_dest_trash); + atomic_inc(&dest->refcnt); + } +} + + +/* + * Unlink a destination from the given service + */ +static void __ip_vs_unlink_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + int svcupd) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. 
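 *
 * (Unlinking only: the refcnt drop, and the possible move of the
 * dest into the trash, happen separately in __ip_vs_del_dest()
 * above.)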
+ */ + list_del(&dest->n_list); + svc->num_dests--; + if (svcupd) { + /* + * Call the update_service function of its scheduler + */ + svc->scheduler->update_service(svc); + } +} + + +/* + * Delete a destination server in the given service + */ +static int ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_rule_user *ur) +{ + struct ip_vs_dest *dest; + __u32 daddr = ur->daddr; + __u16 dport = ur->dport; + + EnterFunction(2); + + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + while (atomic_read(&svc->usecnt) > 1) {}; + + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(svc, dest, 1); + + write_unlock_bh(&__ip_vs_svc_lock); + + /* + * Delete the destination + */ + __ip_vs_del_dest(dest); + + LeaveFunction(2); + + return 0; +} + + +/* + * Add a service into the service hash table + */ +static int +ip_vs_add_service(struct ip_vs_rule_user *ur, struct ip_vs_service **svc_p) +{ + int ret = 0; + struct ip_vs_scheduler *sched; + struct ip_vs_service *svc = NULL; + + MOD_INC_USE_COUNT; + + /* + * Lookup the scheduler, by 'ur->sched_name' + */ + sched = ip_vs_scheduler_get(ur->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n", + ur->sched_name); + ret = -ENOENT; + goto out_mod_dec; + } + + svc = (struct ip_vs_service*) + kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + if (svc == NULL) { + IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); + ret = -ENOMEM; + goto out_err; + } + memset(svc, 0, sizeof(struct ip_vs_service)); + + svc->protocol = ur->protocol; + svc->addr = ur->vaddr; + svc->port = ur->vport; + svc->fwmark = ur->vfwmark; + svc->flags = ur->vs_flags; + svc->timeout = ur->timeout * HZ; + svc->netmask = ur->netmask; + + INIT_LIST_HEAD(&svc->destinations); + svc->sched_lock = RW_LOCK_UNLOCKED; + svc->stats.lock = SPIN_LOCK_UNLOCKED; + + /* + * Bind the scheduler + */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + goto out_err; + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_inc(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ip_vs_nullsvc_counter); + + /* + * I'm the first user of the service + */ + atomic_set(&svc->usecnt, 1); + atomic_set(&svc->refcnt, 0); + + ip_vs_new_estimator(&svc->stats); + ip_vs_num_services++; + + /* + * Hash the service into the service table + */ + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_hash(svc); + write_unlock_bh(&__ip_vs_svc_lock); + + *svc_p = svc; + return 0; + + out_err: + if (svc) + kfree(svc); + ip_vs_scheduler_put(sched); + out_mod_dec: + MOD_DEC_USE_COUNT; + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +static int ip_vs_edit_service(struct ip_vs_service *svc, + struct ip_vs_rule_user *ur) +{ + struct ip_vs_scheduler *sched, *old_sched; + int ret = 0; + + /* + * Lookup the scheduler, by 'ur->sched_name' + */ + sched = ip_vs_scheduler_get(ur->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n", + ur->sched_name); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. 
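 *
 * (The caller already holds one usecnt reference from its own
 * lookup, so waiting for usecnt to drop to 1 ensures that no other
 * reader of the form
 *
 *     read_lock(&__ip_vs_svc_lock);
 *     svc = __ip_vs_service_get(...);   // takes a usecnt reference
 *     read_unlock(&__ip_vs_svc_lock);
 *
 * still uses the service while it is rewritten under the write
 * lock; a sketch of the pattern, not code from the patch.)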
+ */ + while (atomic_read(&svc->usecnt) > 1) {}; + + /* + * Set the flags and timeout value + */ + svc->flags = ur->vs_flags | IP_VS_SVC_F_HASHED; + svc->timeout = ur->timeout * HZ; + svc->netmask = ur->netmask; + + old_sched = svc->scheduler; + if (sched != old_sched) { + /* + * Unbind the old scheduler + */ + if ((ret = ip_vs_unbind_scheduler(svc))) { + old_sched = sched; + goto out; + } + + /* + * Bind the new scheduler + */ + if ((ret = ip_vs_bind_scheduler(svc, sched))) { + /* + * If ip_vs_bind_scheduler fails, restore the old + * scheduler. + * The main reason of failure is out of memory. + * + * The question is if the old scheduler can be + * restored all the time. TODO: if it cannot be + * restored some time, we must delete the service, + * otherwise the system may crash. + */ + ip_vs_bind_scheduler(svc, old_sched); + old_sched = sched; + } + } + + out: + write_unlock_bh(&__ip_vs_svc_lock); + + if (old_sched) + ip_vs_scheduler_put(old_sched); + + return ret; +} + + +/* + * Delete a service from the service list + * The service must be unlinked, unlocked and not referenced! + */ +static void __ip_vs_del_service(struct ip_vs_service *svc) +{ + struct list_head *l; + struct ip_vs_dest *dest; + struct ip_vs_scheduler *old_sched; + + ip_vs_num_services--; + ip_vs_kill_estimator(&svc->stats); + + /* + * Unbind scheduler + */ + old_sched = svc->scheduler; + ip_vs_unbind_scheduler(svc); + if (old_sched && old_sched->module) + __MOD_DEC_USE_COUNT(old_sched->module); + + /* + * Unlink the whole destination list + */ + l = &svc->destinations; + while (l->next != l) { + dest = list_entry(l->next, struct ip_vs_dest, n_list); + __ip_vs_unlink_dest(svc, dest, 0); + __ip_vs_del_dest(dest); + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ip_vs_nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + if (atomic_read(&svc->refcnt) == 0) + kfree(svc); + MOD_DEC_USE_COUNT; +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + + /* + * Unhash it from the service table + */ + write_lock_bh(&__ip_vs_svc_lock); + + ip_vs_svc_unhash(svc); + + /* + * Wait until all the svc users go away. + */ + while (atomic_read(&svc->usecnt) > 1) {}; + + __ip_vs_del_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + return 0; +} + + +/* + * Flush all the virtual services + */ +static int ip_vs_flush(void) +{ + int idx; + struct ip_vs_service *svc; + struct list_head *l; + + /* + * Flush the service table hashed by + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_table[idx]; + while (l->next != l) { + svc = list_entry(l->next,struct ip_vs_service,s_list); + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + while (atomic_read(&svc->usecnt) > 0) {}; + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_fwm_table[idx]; + while (l->next != l) { + svc = list_entry(l->next,struct ip_vs_service,f_list); + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. 
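 *
 * (Note the difference from ip_vs_del_service() above, which waits
 * for usecnt > 1: the flush loop walks the hash tables directly and
 * never took a usecnt reference of its own, so it waits for the
 * count to reach zero.)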
+ */ + while (atomic_read(&svc->usecnt) > 0) {}; + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + return 0; +} + + +/* + * Zero counters in a service or all services + */ +static inline void +__ip_vs_zero_stats(struct ip_vs_stats *stats) +{ + spin_lock_bh(&stats->lock); + memset(stats, 0, (char *)&stats->lock - (char *)stats); + spin_unlock_bh(&stats->lock); + ip_vs_zero_estimator(stats); +} + +static int ip_vs_zero_service(struct ip_vs_service *svc) +{ + struct list_head *l; + struct ip_vs_dest *dest; + + write_lock_bh(&__ip_vs_svc_lock); + list_for_each (l, &svc->destinations) { + dest = list_entry(l, struct ip_vs_dest, n_list); + __ip_vs_zero_stats(&dest->stats); + } + __ip_vs_zero_stats(&svc->stats); + write_unlock_bh(&__ip_vs_svc_lock); + return 0; +} + +static int ip_vs_zero_all(void) +{ + int idx; + struct list_head *l; + struct ip_vs_service *svc; + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each (l, &ip_vs_svc_table[idx]) { + svc = list_entry(l, struct ip_vs_service, s_list); + ip_vs_zero_service(svc); + } + } + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each (l, &ip_vs_svc_fwm_table[idx]) { + svc = list_entry(l, struct ip_vs_service, f_list); + ip_vs_zero_service(svc); + } + } + + __ip_vs_zero_stats(&ip_vs_stats); + return 0; +} + + +static int ip_vs_sysctl_defense_mode(ctl_table *ctl, int write, + struct file * filp, void *buffer, size_t *lenp) +{ + int *valp = ctl->data; + int val = *valp; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 3)) { + /* Restore the correct value */ + *valp = val; + } else { + local_bh_disable(); + update_defense_level(); + local_bh_enable(); + } + } + return ret; +} + + +/* + * IPVS sysctl table + */ +struct ip_vs_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table vs_vars[NET_IPV4_VS_LAST]; + ctl_table vs_dir[2]; + ctl_table ipv4_dir[2]; + ctl_table root_dir[2]; +}; + + +static struct ip_vs_sysctl_table ipv4_vs_table = { + NULL, + {{NET_IPV4_VS_AMEMTHRESH, "amemthresh", + &sysctl_ip_vs_amemthresh, sizeof(int), 0644, NULL, + &proc_dointvec}, +#ifdef CONFIG_IP_VS_DEBUG + {NET_IPV4_VS_DEBUG_LEVEL, "debug_level", + &sysctl_ip_vs_debug_level, sizeof(int), 0644, NULL, + &proc_dointvec}, +#endif + {NET_IPV4_VS_AMDROPRATE, "am_droprate", + &sysctl_ip_vs_am_droprate, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_VS_DROP_ENTRY, "drop_entry", + &sysctl_ip_vs_drop_entry, sizeof(int), 0644, NULL, + &ip_vs_sysctl_defense_mode}, + {NET_IPV4_VS_DROP_PACKET, "drop_packet", + &sysctl_ip_vs_drop_packet, sizeof(int), 0644, NULL, + &ip_vs_sysctl_defense_mode}, + {NET_IPV4_VS_SECURE_TCP, "secure_tcp", + &sysctl_ip_vs_secure_tcp, sizeof(int), 0644, NULL, + &ip_vs_sysctl_defense_mode}, + {NET_IPV4_VS_TO_ES, "timeout_established", + &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_SS, "timeout_synsent", + &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_SR, "timeout_synrecv", + &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_FW, "timeout_finwait", + &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_TW, "timeout_timewait", + &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, 
+ {NET_IPV4_VS_TO_CL, "timeout_close", + &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_CW, "timeout_closewait", + &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_LA, "timeout_lastack", + &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_LI, "timeout_listen", + &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_SA, "timeout_synack", + &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_UDP, "timeout_udp", + &vs_timeout_table_dos.timeout[IP_VS_S_UDP], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_TO_ICMP, "timeout_icmp", + &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {NET_IPV4_VS_CACHE_BYPASS, "cache_bypass", + &sysctl_ip_vs_cache_bypass, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_VS_EXPIRE_NODEST_CONN, "expire_nodest_conn", + &sysctl_ip_vs_expire_nodest_conn, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_VS_SYNC_THRESHOLD, "sync_threshold", + &sysctl_ip_vs_sync_threshold, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_VS_NAT_ICMP_SEND, "nat_icmp_send", + &sysctl_ip_vs_nat_icmp_send, sizeof(int), 0644, NULL, + &proc_dointvec}, + {0}}, + {{NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table.vs_vars}, + {0}}, + {{NET_IPV4, "ipv4", NULL, 0, 0555, ipv4_vs_table.vs_dir}, + {0}}, + {{CTL_NET, "net", NULL, 0, 0555, ipv4_vs_table.ipv4_dir}, + {0}} +}; + + +/* + * Write the contents of the VS rule table to a PROCfs file. + * (It is kept just for backward compatibility) + */ +static inline char *ip_vs_fwd_name(unsigned flags) +{ + char *fwd; + + switch (flags & IP_VS_CONN_F_FWD_MASK) { + case IP_VS_CONN_F_LOCALNODE: + fwd = "Local"; + break; + case IP_VS_CONN_F_TUNNEL: + fwd = "Tunnel"; + break; + case IP_VS_CONN_F_DROUTE: + fwd = "Route"; + break; + default: + fwd = "Masq"; + } + return fwd; +} + +static int ip_vs_get_info(char *buf, char **start, off_t offset, int length) +{ + int len=0; + off_t pos=0; + char temp[64], temp2[32]; + int idx; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct list_head *l, *e, *p, *q; + + /* + * Note: since the length of the buffer is usually the multiple + * of 512, it is good to use fixed record of the divisor of 512, + * so that records won't be truncated at buffer boundary. 
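 *
 * (Worked out: each record below is emitted with "%-63s\n", i.e.
 * exactly 64 bytes, so 512/64 = 8 records fit in a block; the three
 * 64-byte header lines are why pos starts at 192 = 3*64.)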
+ */ + pos = 192; + if (pos > offset) { + sprintf(temp, + "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + len += sprintf(buf+len, "%-63s\n", temp); + len += sprintf(buf+len, "%-63s\n", + "Prot LocalAddress:Port Scheduler Flags"); + len += sprintf(buf+len, "%-63s\n", + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn"); + } + + read_lock_bh(&__ip_vs_svc_lock); + + /* print the service table hashed by */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_table[idx]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, s_list); + pos += 64; + if (pos > offset) { + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + sprintf(temp2, "persistent %d %08X", + svc->timeout, + ntohl(svc->netmask)); + else + temp2[0] = '\0'; + + sprintf(temp, "%s %08X:%04X %s %s", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr), + ntohs(svc->port), + svc->scheduler->name, temp2); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + + p = &svc->destinations; + for (q=p->next; q!=p; q=q->next) { + dest = list_entry(q, struct ip_vs_dest, n_list); + pos += 64; + if (pos <= offset) + continue; + sprintf(temp, + " -> %08X:%04X %-7s %-6d %-10d %-10d", + ntohl(dest->addr), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + } + } + + /* print the service table hashed by fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + l = &ip_vs_svc_fwm_table[idx]; + for (e=l->next; e!=l; e=e->next) { + svc = list_entry(e, struct ip_vs_service, f_list); + pos += 64; + if (pos > offset) { + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + sprintf(temp2, "persistent %d %08X", + svc->timeout, + ntohl(svc->netmask)); + else + temp2[0] = '\0'; + + sprintf(temp, "FWM %08X %s %s", + svc->fwmark, + svc->scheduler->name, temp2); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + + p = &svc->destinations; + for (q=p->next; q!=p; q=q->next) { + dest = list_entry(q, struct ip_vs_dest, n_list); + pos += 64; + if (pos <= offset) + continue; + sprintf(temp, + " -> %08X:%04X %-7s %-6d %-10d %-10d", + ntohl(dest->addr), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + len += sprintf(buf+len, "%-63s\n", temp); + if (len >= length) + goto done; + } + } + } + + done: + read_unlock_bh(&__ip_vs_svc_lock); + + *start = buf+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} + + +struct ip_vs_stats ip_vs_stats; + +static int +ip_vs_stats_get_info(char *buf, char **start, off_t offset, int length) +{ + int len=0; + off_t pos=0; + char temp[64]; + + pos += 320; + if (pos > offset) { + len += sprintf(buf+len, "%-63s\n%-63s\n", +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + " Total Incoming Outgoing Incoming Outgoing", + " Conns Packets Packets Bytes Bytes"); + + spin_lock_bh(&ip_vs_stats.lock); + sprintf(temp, "%8X %8X %8X %8X%08X %8X%08X", + ip_vs_stats.conns, + ip_vs_stats.inpkts, + ip_vs_stats.outpkts, + (__u32)(ip_vs_stats.inbytes>>32), + (__u32)ip_vs_stats.inbytes, + (__u32)(ip_vs_stats.outbytes>>32), + (__u32)ip_vs_stats.outbytes); + len += 
sprintf(buf+len, "%-62s\n\n", temp); + + len += sprintf(buf+len, "%-63s\n", +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s"); + sprintf(temp, "%8X %8X %8X %16X %16X", + ip_vs_stats.cps, + ip_vs_stats.inpps, + ip_vs_stats.outpps, + ip_vs_stats.inbps, + ip_vs_stats.outbps); + len += sprintf(buf+len, "%-63s\n", temp); + + spin_unlock_bh(&ip_vs_stats.lock); + } + + *start = buf+len-(pos-offset); /* Start of wanted data */ + len = pos-offset; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +} + + +/* + * Set timeout values for tcp tcpfin udp in the vs_timeout_table. + */ +static int ip_vs_set_timeouts(struct ip_vs_rule_user *u) +{ + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", + u->tcp_timeout, + u->tcp_fin_timeout, + u->udp_timeout); + + if (u->tcp_timeout) { + vs_timeout_table.timeout[IP_VS_S_ESTABLISHED] + = u->tcp_timeout * HZ; + } + + if (u->tcp_fin_timeout) { + vs_timeout_table.timeout[IP_VS_S_FIN_WAIT] + = u->tcp_fin_timeout * HZ; + } + + if (u->udp_timeout) { + vs_timeout_table.timeout[IP_VS_S_UDP] + = u->udp_timeout * HZ; + } + return 0; +} + + +static int +do_ip_vs_set_ctl(struct sock *sk, int cmd, void *user, unsigned int len) +{ + int ret; + struct ip_vs_rule_user *urule; + struct ip_vs_service *svc = NULL; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* + * Check the size of mm, no overflow... + * len > 128000 is a sanity check. + */ + if (len < sizeof(struct ip_vs_rule_user)) { + IP_VS_ERR("set_ctl: len %u < %u\n", + len, sizeof(struct ip_vs_rule_user)); + return -EINVAL; + } else if (len > 128000) { + IP_VS_ERR("set_ctl: len %u > 128000\n", len); + return -EINVAL; + } else if ((urule = kmalloc(len, GFP_KERNEL)) == NULL) { + IP_VS_ERR("set_ctl: no mem for len %u\n", len); + return -ENOMEM; + } else if (copy_from_user(urule, user, len) != 0) { + ret = -EFAULT; + goto out_free; + } + + MOD_INC_USE_COUNT; + if (down_interruptible(&__ip_vs_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + + if (cmd == IP_VS_SO_SET_FLUSH) { + /* Flush the virtual service */ + ret = ip_vs_flush(); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_TIMEOUTS) { + /* Set timeout values for (tcp tcpfin udp) */ + ret = ip_vs_set_timeouts(urule); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { + ret = start_sync_thread(urule->state, urule->mcast_ifn); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { + ret = stop_sync_thread(); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_ZERO) { + /* if no service address is set, zero counters in all */ + if (!urule->vfwmark && !urule->vaddr && !urule->vport) { + ret = ip_vs_zero_all(); + goto out_unlock; + } + } + + /* + * Check for valid protocol: TCP or UDP. 
Even for fwmark!=0 + */ + if (urule->protocol!=IPPROTO_TCP && urule->protocol!=IPPROTO_UDP) { + IP_VS_INFO("vs_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s", + ntohs(urule->protocol), NIPQUAD(urule->vaddr), + ntohs(urule->vport), urule->sched_name); + ret = -EFAULT; + goto out_unlock; + } + + /* + * Lookup the exact service by or fwmark + */ + if (urule->vfwmark == 0) + svc = __ip_vs_service_get(urule->protocol, + urule->vaddr, urule->vport); + else + svc = __ip_vs_svc_fwm_get(urule->vfwmark); + + if (cmd != IP_VS_SO_SET_ADD + && (svc == NULL || svc->protocol != urule->protocol)) { + ret = -ESRCH; + goto out_unlock; + } + + switch (cmd) { + case IP_VS_SO_SET_ADD: + if (svc != NULL) + ret = -EEXIST; + else + ret = ip_vs_add_service(urule, &svc); + break; + case IP_VS_SO_SET_EDIT: + ret = ip_vs_edit_service(svc, urule); + break; + case IP_VS_SO_SET_DEL: + ret = ip_vs_del_service(svc); + if (!ret) + goto out_unlock; + break; + case IP_VS_SO_SET_ADDDEST: + ret = ip_vs_add_dest(svc, urule); + break; + case IP_VS_SO_SET_EDITDEST: + ret = ip_vs_edit_dest(svc, urule); + break; + case IP_VS_SO_SET_DELDEST: + ret = ip_vs_del_dest(svc, urule); + break; + case IP_VS_SO_SET_ZERO: + ret = ip_vs_zero_service(svc); + break; + default: + ret = -EINVAL; + } + + if (svc) + ip_vs_service_put(svc); + + out_unlock: + up(&__ip_vs_mutex); + out_dec: + MOD_DEC_USE_COUNT; + out_free: + kfree(urule); + return ret; +} + + +static inline void +__ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ + spin_lock_bh(&src->lock); + memcpy(dst, src, (char*)&src->lock - (char*)src); + spin_unlock_bh(&src->lock); +} + +static inline int +__ip_vs_get_service_entries(const struct ip_vs_get_services *get, + struct ip_vs_get_services *uptr) +{ + int idx, count=0; + struct ip_vs_service *svc; + struct list_head *l; + struct ip_vs_service_user entry; + int ret = 0; + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each (l, &ip_vs_svc_table[idx]) { + if (count >= get->num_services) + goto out; + svc = list_entry(l, struct ip_vs_service, s_list); + entry.protocol = svc->protocol; + entry.addr = svc->addr; + entry.port = svc->port; + entry.fwmark = svc->fwmark; + strcpy(entry.sched_name, svc->scheduler->name); + entry.flags = svc->flags; + entry.timeout = svc->timeout / HZ; + entry.netmask = svc->netmask; + entry.num_dests = svc->num_dests; + __ip_vs_copy_stats(&entry.stats, &svc->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each (l, &ip_vs_svc_fwm_table[idx]) { + if (count >= get->num_services) + goto out; + svc = list_entry(l, struct ip_vs_service, f_list); + entry.protocol = svc->protocol; + entry.addr = svc->addr; + entry.port = svc->port; + entry.fwmark = svc->fwmark; + strcpy(entry.sched_name, svc->scheduler->name); + entry.flags = svc->flags; + entry.timeout = svc->timeout / HZ; + entry.netmask = svc->netmask; + entry.num_dests = svc->num_dests; + __ip_vs_copy_stats(&entry.stats, &svc->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + out: + return ret; +} + +static inline int +__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, + struct ip_vs_get_dests *uptr) +{ + struct ip_vs_service *svc; + int ret = 0; + + if (get->fwmark) + svc = __ip_vs_svc_fwm_get(get->fwmark); + else + svc = __ip_vs_service_get(get->protocol, + get->addr, get->port); + if (svc) 
{ + int count = 0; + struct ip_vs_dest *dest; + struct list_head *l, *e; + struct ip_vs_dest_user entry; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + if (count >= get->num_dests) + break; + dest = list_entry(e, struct ip_vs_dest, n_list); + entry.addr = dest->addr; + entry.port = dest->port; + entry.flags = atomic_read(&dest->conn_flags); + entry.weight = atomic_read(&dest->weight); + entry.activeconns = atomic_read(&dest->activeconns); + entry.inactconns = atomic_read(&dest->inactconns); + __ip_vs_copy_stats(&entry.stats, &dest->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; + } + ip_vs_service_put(svc); + } else + ret = -ESRCH; + return ret; +} + +static inline void +__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) +{ + u->tcp_timeout = vs_timeout_table.timeout[IP_VS_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = vs_timeout_table.timeout[IP_VS_S_FIN_WAIT] / HZ; + u->udp_timeout = vs_timeout_table.timeout[IP_VS_S_UDP] / HZ; +} + +static int +do_ip_vs_get_ctl(struct sock *sk, int cmd, void *user, int *len) +{ + int ret = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (down_interruptible(&__ip_vs_mutex)) + return -ERESTARTSYS; + + switch (cmd) { + case IP_VS_SO_GET_VERSION: + { + char buf[64]; + + sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + if (*len < strlen(buf)+1) { + ret = -EINVAL; + goto out; + } + if (copy_to_user(user, buf, strlen(buf)+1) != 0) { + ret = -EFAULT; + goto out; + } + *len = strlen(buf)+1; + } + break; + + case IP_VS_SO_GET_INFO: + { + struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; + info.size = IP_VS_CONN_TAB_SIZE; + info.num_services = ip_vs_num_services; + if (copy_to_user(user, &info, sizeof(info)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_SERVICES: + { + struct ip_vs_get_services get; + + if (*len < sizeof(get)) { + IP_VS_ERR("length: %u < %u\n", *len, sizeof(get)); + ret = -EINVAL; + goto out; + } + if (copy_from_user(&get, user, sizeof(get))) { + ret = -EFAULT; + goto out; + } + if (*len != (sizeof(get)+sizeof(struct ip_vs_service_user)*get.num_services)) { + IP_VS_ERR("length: %u != %u\n", *len, + sizeof(get)+sizeof(struct ip_vs_service_user)*get.num_services); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_service_entries(&get, user); + } + break; + + case IP_VS_SO_GET_SERVICE: + { + struct ip_vs_service_user get; + struct ip_vs_service *svc; + + if (*len != sizeof(get)) { + IP_VS_ERR("length: %u != %u\n", *len, sizeof(get)); + ret = -EINVAL; + goto out; + } + if (copy_from_user(&get, user, sizeof(get))) { + ret = -EFAULT; + goto out; + } + + if (get.fwmark) + svc = __ip_vs_svc_fwm_get(get.fwmark); + else + svc = __ip_vs_service_get(get.protocol, + get.addr, get.port); + if (svc) { + strcpy(get.sched_name, svc->scheduler->name); + get.flags = svc->flags; + get.timeout = svc->timeout / HZ; + get.netmask = svc->netmask; + get.num_dests = svc->num_dests; + __ip_vs_copy_stats(&get.stats, &svc->stats); + if (copy_to_user(user, &get, *len) != 0) + ret = -EFAULT; + ip_vs_service_put(svc); + } else + ret = -ESRCH; + } + break; + + case IP_VS_SO_GET_DESTS: + { + struct ip_vs_get_dests get; + + if (*len < sizeof(get)) { + IP_VS_ERR("length: %u < %u\n", *len, sizeof(get)); + ret = -EINVAL; + goto out; + } + if (copy_from_user(&get, user, sizeof(get))) { + ret = -EFAULT; + goto out; + } + if (*len != (sizeof(get) + + sizeof(struct 
ip_vs_dest_user)*get.num_dests)) { + IP_VS_ERR("length: %u != %u\n", *len, + sizeof(get)+sizeof(struct ip_vs_dest_user)*get.num_dests); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_dest_entries(&get, user); + } + break; + + case IP_VS_SO_GET_TIMEOUTS: + { + struct ip_vs_timeout_user u; + + if (*len < sizeof(u)) { + IP_VS_ERR("length: %u < %u\n", *len, sizeof(u)); + ret = -EINVAL; + goto out; + } + __ip_vs_get_timeouts(&u); + if (copy_to_user(user, &u, sizeof(u)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_DAEMON: + { + struct ip_vs_daemon_user u; + + if (*len < sizeof(u)) { + IP_VS_ERR("length: %u < %u\n", *len, sizeof(u)); + ret = -EINVAL; + goto out; + } + u.state = ip_vs_sync_state; + strcpy(u.mcast_ifn, ip_vs_mcast_ifn); + if (copy_to_user(user, &u, sizeof(u)) != 0) + ret = -EFAULT; + } + break; + + default: + ret = -EINVAL; + } + + out: + up(&__ip_vs_mutex); + return ret; +} + + +static struct nf_sockopt_ops ip_vs_sockopts = { + { NULL, NULL }, PF_INET, + IP_VS_BASE_CTL, IP_VS_SO_SET_MAX+1, do_ip_vs_set_ctl, + IP_VS_BASE_CTL, IP_VS_SO_GET_MAX+1, do_ip_vs_get_ctl +}; + + +int ip_vs_control_init(void) +{ + int ret; + int idx; + + EnterFunction(2); + + ret = nf_register_sockopt(&ip_vs_sockopts); + if (ret) { + IP_VS_ERR("cannot register sockopt.\n"); + return ret; + } + + proc_net_create("ip_vs", 0, ip_vs_get_info); + proc_net_create("ip_vs_stats", 0, ip_vs_stats_get_info); + + ipv4_vs_table.sysctl_header = + register_sysctl_table(ipv4_vs_table.root_dir, 0); + /* + * Initilize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable, + * ip_vs_schedulers. + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_rtable[idx]); + } + + memset(&ip_vs_stats, 0, sizeof(ip_vs_stats)); + ip_vs_stats.lock = SPIN_LOCK_UNLOCKED; + ip_vs_new_estimator(&ip_vs_stats); + + /* Hook the defense timer */ + init_timer(&defense_timer); + defense_timer.function = defense_timer_handler; + defense_timer.expires = jiffies + DEFENSE_TIMER_PERIOD; + add_timer(&defense_timer); + + LeaveFunction(2); + return 0; +} + +void ip_vs_control_cleanup(void) +{ + EnterFunction(2); + ip_vs_trash_cleanup(); + del_timer_sync(&defense_timer); + ip_vs_kill_estimator(&ip_vs_stats); + unregister_sysctl_table(ipv4_vs_table.sysctl_header); + proc_net_remove("ip_vs_stats"); + proc_net_remove("ip_vs"); + nf_unregister_sockopt(&ip_vs_sockopts); + LeaveFunction(2); +} diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_dh.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_dh.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_dh.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_dh.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,265 @@ +/* + * IPVS: Destination Hashing scheduling module + * + * Version: $Id: ip_vs_dh.c,v 1.4 2001/10/19 15:05:17 wensong Exp $ + * + * Authors: Wensong Zhang + * + * Inspired by the consistent hashing scheduler patch from + * Thomas Proell + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The dh algorithm is to select server by the hash key of destination IP + * address. 
The pseudo code is as follows:
+ *
+ *       n <- servernode[dest_ip];
+ *       if (n is dead) OR
+ *          (n is overloaded, such as n.conns>2*n.weight) then
+ *                 return NULL;
+ *
+ *       return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from the packet destination IP address to the current
+ * server array. If the dh scheduler is used in a cache cluster, it is
+ * good to combine it with the cache_bypass feature. When the statically
+ * assigned server is dead or overloaded, the load balancer can bypass
+ * the cache server and send requests to the original server directly.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+
+/*
+ *      IPVS DH bucket
+ */
+struct ip_vs_dh_bucket {
+    struct ip_vs_dest *dest;    /* real server (cache) */
+};
+
+/*
+ *      for IPVS DH entry hash table
+ */
+#ifndef CONFIG_IP_VS_DH_TAB_BITS
+#define CONFIG_IP_VS_DH_TAB_BITS 8
+#endif
+#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
+#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
+#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
+
+
+/*
+ *      Returns hash value for IPVS DH entry
+ */
+static inline unsigned ip_vs_dh_hashkey(__u32 addr)
+{
+    return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
+}
+
+
+/*
+ *      Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr)
+{
+    return (tbl[ip_vs_dh_hashkey(addr)]).dest;
+}
+
+
+/*
+ *      Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+{
+    int i;
+    struct ip_vs_dh_bucket *b;
+    struct list_head *p;
+    struct ip_vs_dest *dest;
+
+    b = tbl;
+    p = &svc->destinations;
+    for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+        if (list_empty(p)) {
+            b->dest = NULL;
+        } else {
+            if (p == &svc->destinations)
+                p = p->next;
+
+            dest = list_entry(p, struct ip_vs_dest, n_list);
+            atomic_inc(&dest->refcnt);
+            b->dest = dest;
+
+            p = p->next;
+        }
+        b++;
+    }
+    return 0;
+}
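
/*
 * (Illustrative note, not part of the patch: 2654435761 is a prime
 * close to 2^32 divided by the golden ratio, the classic constant of
 * multiplicative hashing, so nearby destination addresses scatter
 * across the 256 buckets. Given three dests A, B and C,
 * ip_vs_dh_assign() above fills the buckets A,B,C,A,B,C,...
 * round-robin, so each server backs about a third of the buckets and
 * every connection to a given destination IP keeps hitting the same
 * server until the table is reassigned.)
 */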
+
+/*
+ *      Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+{
+    int i;
+    struct ip_vs_dh_bucket *b;
+
+    b = tbl;
+    for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+        if (b->dest) {
+            atomic_dec(&b->dest->refcnt);
+            b->dest = NULL;
+        }
+        b++;
+    }
+}
+
+
+static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
+{
+    struct ip_vs_dh_bucket *tbl;
+
+    /* allocate the DH table for this service */
+    tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
+                  GFP_ATOMIC);
+    if (tbl == NULL) {
+        IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
+        return -ENOMEM;
+    }
+    svc->sched_data = tbl;
+    IP_VS_DBG(6, "DH hash table (memory=%dbytes) allocated for "
+              "current service\n",
+              sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+    /* assign the hash buckets with the updated service */
+    ip_vs_dh_assign(tbl, svc);
+
+    return 0;
+}
+
+
+static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+{
+    struct ip_vs_dh_bucket *tbl = svc->sched_data;
+
+    /* got to clean up hash buckets here */
+    ip_vs_dh_flush(tbl);
+
+    /* release the table itself */
+    kfree(svc->sched_data);
+    IP_VS_DBG(6, "DH hash table (memory=%dbytes) released\n",
+              sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+    return 0;
+}
+
+
+static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+{
+    struct ip_vs_dh_bucket *tbl = svc->sched_data;
+
+    /* got to clean up hash buckets here */
+    ip_vs_dh_flush(tbl);
+
+    /* assign the hash buckets with the updated service */
+    ip_vs_dh_assign(tbl, svc);
+
+    return 0;
+}
+
+
+/*
+ *      If the number of active connections is more than twice the
+ *      server's weight, consider the server overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+    if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)*2) {
+        return 1;
+    }
+    return 0;
+}
+
+
+/*
+ *      Destination hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_dh_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+    struct ip_vs_dest *dest;
+    struct ip_vs_dh_bucket *tbl;
+
+    IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
+
+    tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
+    dest = ip_vs_dh_get(tbl, iph->daddr);
+    if (!dest
+        || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+        || atomic_read(&dest->weight) <= 0
+        || is_overloaded(dest)) {
+        return NULL;
+    }
+
+    IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
+              "--> server %u.%u.%u.%u:%d\n",
+              NIPQUAD(iph->daddr),
+              NIPQUAD(dest->addr),
+              ntohs(dest->port));
+
+    return dest;
+}
+
+
+/*
+ *      IPVS DH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_dh_scheduler =
+{
+    {0},                    /* n_list */
+    "dh",                   /* name */
+    ATOMIC_INIT(0),         /* refcnt */
+    THIS_MODULE,            /* this module */
+    ip_vs_dh_init_svc,      /* service initializer */
+    ip_vs_dh_done_svc,      /* service done */
+    ip_vs_dh_update_svc,    /* service updater */
+    ip_vs_dh_schedule,      /* select a server from the destination list */
+};
+
+
+static int __init ip_vs_dh_init(void)
+{
+    INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list);
+    return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+static void __exit ip_vs_dh_cleanup(void)
+{
+    unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+module_init(ip_vs_dh_init);
+module_exit(ip_vs_dh_cleanup);
+MODULE_LICENSE("GPL");
diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_est.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_est.c
--- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_est.c 1969-12-31 16:00:00.000000000 -0800
+++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_est.c 2003-08-26 15:54:22.000000000 -0700
@@ -0,0 +1,200 @@
+/*
+ * ip_vs_est.c Simple rate estimator for IPVS
+ *
+ * Version: $Id: ip_vs_est.c,v 1.3.2.1
2003/07/29 14:37:13 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ +#include +#include + +#include + +/* + This code is to estimate rate in a shorter interval (such as 8 + seconds) for virtual services and real servers. For measure rate in a + long interval, it is easy to implement a user level daemon which + periodically reads those statistical counters and measure rate. + + Currently, the measurement is activated by slow timer handler. Hope + this measurement will not introduce too much load. + + We measure rate during the last 8 seconds every 2 seconds: + + avgrate = avgrate*(1-W) + rate*W + + where W = 2^(-2) + + NOTES. + + * The stored value for average bps is scaled by 2^5, so that maximal + rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. + + * A lot code is taken from net/sched/estimator.c + */ + + +struct ip_vs_estimator +{ + struct ip_vs_estimator *next; + struct ip_vs_stats *stats; + + u32 last_conns; + u32 last_inpkts; + u32 last_outpkts; + u64 last_inbytes; + u64 last_outbytes; + + u32 cps; + u32 inpps; + u32 outpps; + u32 inbps; + u32 outbps; +}; + + +static struct ip_vs_estimator *est_list = NULL; +static rwlock_t est_lock = RW_LOCK_UNLOCKED; +static struct timer_list est_timer; + +static void estimation_timer(unsigned long arg) +{ + struct ip_vs_estimator *e; + struct ip_vs_stats *s; + u32 n_conns; + u32 n_inpkts, n_outpkts; + u64 n_inbytes, n_outbytes; + u32 rate; + + read_lock(&est_lock); + for (e = est_list; e; e = e->next) { + s = e->stats; + + spin_lock(&s->lock); + n_conns = s->conns; + n_inpkts = s->inpkts; + n_outpkts = s->outpkts; + n_inbytes = s->inbytes; + n_outbytes = s->outbytes; + + /* scaled by 2^10, but divided 2 seconds */ + rate = (n_conns - e->last_conns)<<9; + e->last_conns = n_conns; + e->cps += ((long)rate - (long)e->cps)>>2; + s->cps = (e->cps+0x1FF)>>10; + + rate = (n_inpkts - e->last_inpkts)<<9; + e->last_inpkts = n_inpkts; + e->inpps += ((long)rate - (long)e->inpps)>>2; + s->inpps = (e->inpps+0x1FF)>>10; + + rate = (n_outpkts - e->last_outpkts)<<9; + e->last_outpkts = n_outpkts; + e->outpps += ((long)rate - (long)e->outpps)>>2; + s->outpps = (e->outpps+0x1FF)>>10; + + rate = (n_inbytes - e->last_inbytes)<<4; + e->last_inbytes = n_inbytes; + e->inbps += ((long)rate - (long)e->inbps)>>2; + s->inbps = (e->inbps+0xF)>>5; + + rate = (n_outbytes - e->last_outbytes)<<4; + e->last_outbytes = n_outbytes; + e->outbps += ((long)rate - (long)e->outbps)>>2; + s->outbps = (e->outbps+0xF)>>5; + spin_unlock(&s->lock); + } + read_unlock(&est_lock); + mod_timer(&est_timer, jiffies + 2*HZ); +} + +int ip_vs_new_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est; + + est = kmalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOMEM; + + memset(est, 0, sizeof(*est)); + est->stats = stats; + est->last_conns = stats->conns; + est->cps = stats->cps<<10; + + est->last_inpkts = stats->inpkts; + est->inpps = stats->inpps<<10; + + est->last_outpkts = stats->outpkts; + est->outpps = stats->outpps<<10; + + est->last_inbytes = stats->inbytes; + est->inbps = stats->inbps<<5; + + est->last_outbytes = stats->outbytes; + est->outbps = stats->outbps<<5; + + est->next = est_list; + if (est->next == NULL) { + init_timer(&est_timer); + est_timer.expires = jiffies + 
2*HZ; + est_timer.function = estimation_timer; + add_timer(&est_timer); + } + write_lock_bh(&est_lock); + est_list = est; + write_unlock_bh(&est_lock); + return 0; +} + +void ip_vs_kill_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est, **pest; + int killed = 0; + + write_lock_bh(&est_lock); + pest = &est_list; + while ((est=*pest) != NULL) { + if (est->stats != stats) { + pest = &est->next; + continue; + } + *pest = est->next; + kfree(est); + killed++; + } + if (killed && est_list == NULL) + del_timer_sync(&est_timer); + write_unlock_bh(&est_lock); +} + +void ip_vs_zero_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *e; + + write_lock_bh(&est_lock); + for (e = est_list; e; e = e->next) { + if (e->stats != stats) + continue; + + /* set counters zero */ + e->last_conns = 0; + e->last_inpkts = 0; + e->last_outpkts = 0; + e->last_inbytes = 0; + e->last_outbytes = 0; + e->cps = 0; + e->inpps = 0; + e->outpps = 0; + e->inbps = 0; + e->outbps = 0; + } + write_unlock_bh(&est_lock); +} diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_ftp.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_ftp.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_ftp.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_ftp.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,407 @@ +/* + * IP_VS ftp application module + * + * Version: $Id: ip_vs_ftp.c,v 1.12 2002/08/10 04:32:35 wensong Exp $ + * + * Authors: Wensong Zhang + * + * Changes: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference + * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. + * + * IP_MASQ_FTP ftp masquerading module + * + * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 + * + * Author: Wouter Gadeyne + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define SERVER_STRING "227 Entering Passive Mode (" +#define CLIENT_STRING "PORT " + + +/* + * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper + * First port is set to the default port. + */ +static int ports[IP_VS_APP_MAX_PORTS] = {21, 0}; +struct ip_vs_app *incarnations[IP_VS_APP_MAX_PORTS]; + +/* + * Debug level + */ +#ifdef CONFIG_IP_VS_DEBUG +static int debug=0; +MODULE_PARM(debug, "i"); +#endif + +MODULE_PARM(ports, "1-" __MODULE_STRING(IP_VS_APP_MAX_PORTS) "i"); + +/* Dummy variable */ +static int ip_vs_ftp_pasv; + + +static int +ip_vs_ftp_init_conn(struct ip_vs_app *vapp, struct ip_vs_conn *cp) +{ + return 0; +} + + +static int +ip_vs_ftp_done_conn(struct ip_vs_app *vapp, struct ip_vs_conn *cp) +{ + return 0; +} + + +/* + * Get from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern" and terminated with the "term" character. + * is in network order. 
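 *
 * (Illustrative example: scanning the reply
 * "227 Entering Passive Mode (192,168,0,10,4,21)" with the pattern
 * SERVER_STRING and term ')' yields addr 192.168.0.10 and port
 * 4*256 + 21 = 1045, both returned in network byte order.)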
+ */ +static int ip_vs_ftp_get_addrport(char *data, char *data_limit, + const char *pattern, size_t plen, char term, + __u32 *addr, __u16 *port, + char **start, char **end) +{ + unsigned char p1,p2,p3,p4,p5,p6; + + while (data < data_limit) { + if (strnicmp(data, pattern, plen) != 0) { + data++; + continue; + } + *start = data+plen; + p1 = simple_strtoul(data+plen, &data, 10); + if (*data != ',') + continue; + p2 = simple_strtoul(data+1, &data, 10); + if (*data != ',') + continue; + p3 = simple_strtoul(data+1, &data, 10); + if (*data != ',') + continue; + p4 = simple_strtoul(data+1, &data, 10); + if (*data != ',') + continue; + p5 = simple_strtoul(data+1, &data, 10); + if (*data != ',') + continue; + p6 = simple_strtoul(data+1, &data, 10); + if (*data != term) + continue; + + *end = data; + *addr = (p4<<24) | (p3<<16) | (p2<<8) | p1; + *port = (p6<<8) | p5; + return 1; + } + return 0; +} + + +/* + * Look at outgoing ftp packets to catch the response to a PASV command + * from the server (inside-to-outside). + * When we see one, we build a connection entry with the client address, + * client port 0 (unknown at the moment), the server address and the + * server port. Mark the current connection entry as a control channel + * of the new entry. All this work is just to make the data connection + * can be scheduled to the right server later. + * + * The outgoing packet should be something like + * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". + * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + */ +static int ip_vs_ftp_out(struct ip_vs_app *vapp, + struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + char *start, *end; + __u32 from; + __u16 port; + struct ip_vs_conn *n_cp; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned buf_len; + int diff; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_S_ESTABLISHED) + return 0; + + if (cp->app_data == &ip_vs_ftp_pasv) { + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)th + (th->doff << 2); + data_limit = skb->tail; + + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING, + sizeof(SERVER_STRING)-1, ')', + &from, &port, + &start, &end) == 0) + return 0; + + IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> " + "%u.%u.%u.%u:%d detected\n", + NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0); + + /* + * Now update or create an connection entry for it + */ + n_cp = ip_vs_conn_out_get(iph->protocol, from, port, + cp->caddr, 0); + if (!n_cp) { + n_cp = ip_vs_conn_new(IPPROTO_TCP, + cp->caddr, 0, + cp->vaddr, port, + from, port, + IP_VS_CONN_F_NO_CPORT, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + + /* increase dest's inactive connection counter */ + if (cp->dest) + atomic_inc(&cp->dest->inactconns); + } + + /* + * Replace the old passive address with the new one + */ + from = n_cp->vaddr; + port = n_cp->vport; + sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), + port&255, port>>8&255); + buf_len = strlen(buf); + + /* + * Calculate required delta-offset to keep TCP happy + */ + diff = buf_len - (end-start); + + if (diff == 0) { + /* simply replace it with new passive address */ + memcpy(start, buf, buf_len); + } else { + /* fixme: return value isn't checked here */ + ip_vs_skb_replace(skb, GFP_ATOMIC, start, + end-start, buf, buf_len); + } + + cp->app_data = NULL; + ip_vs_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + 
return diff; + } + return 0; +} + + +/* + * Look at incoming ftp packets to catch the PASV/PORT command + * (outside-to-inside). + * + * The incoming packet having the PORT command should be something like + * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". + * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. + * In this case, we create a connection entry using the client address and + * port, so that the active ftp data connection from the server can reach + * the client. + */ +static int ip_vs_ftp_in(struct ip_vs_app *vapp, + struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_start, *data_limit; + char *start, *end; + __u32 to; + __u16 port; + struct ip_vs_conn *n_cp; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_S_ESTABLISHED) + return 0; + + /* + * Detecting whether it is passive + */ + iph = skb->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* Since there may be OPTIONS in the TCP packet and the HLEN is + the length of the header in 32-bit multiples, it is accurate + to calculate data address by th+HLEN*4 */ + data = data_start = (char *)th + (th->doff << 2); + data_limit = skb->tail; + + while (data < data_limit) { + if (strnicmp(data, "PASV\r\n", 6) == 0) { + IP_VS_DBG(1-debug, "got PASV at %d of %d\n", + data - data_start, + data_limit - data_start); + cp->app_data = &ip_vs_ftp_pasv; + return 0; + } + data++; + } + + /* + * To support virtual FTP server, the scenerio is as follows: + * FTP client ----> Load Balancer ----> FTP server + * First detect the port number in the application data, + * then create a new connection entry for the coming data + * connection. + */ + data = data_start; + data_limit = skb->h.raw + skb->len - 18; + + if (ip_vs_ftp_get_addrport(data, data_limit, + CLIENT_STRING, sizeof(CLIENT_STRING)-1, + '\r', &to, &port, + &start, &end) == 0) + return 0; + + IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n", + NIPQUAD(to), ntohs(port)); + + /* + * Now update or create a connection entry for it + */ + IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(to), ntohs(port), NIPQUAD(iph->daddr), 0); + + n_cp = ip_vs_conn_in_get(iph->protocol, + to, port, + iph->daddr, htons(ntohs(cp->vport)-1)); + if (!n_cp) { + n_cp = ip_vs_conn_new(IPPROTO_TCP, + to, port, + cp->vaddr, htons(ntohs(cp->vport)-1), + cp->daddr, htons(ntohs(cp->dport)-1), + 0, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + + /* increase dest's inactive connection counter */ + if (cp->dest) + atomic_inc(&cp->dest->inactconns); + } + + /* + * Move tunnel to listen state + */ + ip_vs_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + + /* no diff required for incoming packets */ + return 0; +} + + +static struct ip_vs_app ip_vs_ftp = { + {0}, /* n_list */ + "ftp", /* name */ + 0, /* type */ + THIS_MODULE, /* this module */ + ip_vs_ftp_init_conn, /* ip_vs_init_conn */ + ip_vs_ftp_done_conn, /* ip_vs_done_conn */ + ip_vs_ftp_out, /* pkt_out */ + ip_vs_ftp_in, /* pkt_in */ +}; + + +/* + * ip_vs_ftp initialization + */ +static int __init ip_vs_ftp_init(void) +{ + int i, j; + + for (i=0; i + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitialized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * Julian Anastasov : replaced del_timer call with del_timer_sync + * to avoid the possible race between timer + * handler and del_timer thread in SMP + * + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns<n.weight/2) then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * + * return n; + * + */ + +#include +#include +#include +#include +#include +#include + +/* for sysctl */ +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 +int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct list_head list; + __u32 addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + rwlock_t lock; /* lock for this table */ + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLC sysctl table + */ +struct ip_vs_lblc_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table vs_vars[2]; + ctl_table vs_dir[2]; + ctl_table ipv4_dir[2]; + ctl_table root_dir[2]; +}; + + +static struct ip_vs_lblc_sysctl_table lblc_sysctl_table = { + NULL, + {{NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration", + &sysctl_ip_vs_lblc_expiration, + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {0}}, + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblc_sysctl_table.vs_vars}, + {0}}, + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblc_sysctl_table.vs_dir}, + {0}}, + {{CTL_NET, "net", NULL, 0, 0555, lblc_sysctl_table.ipv4_dir}, + {0}} +}; + + +/* + * new/free an ip_vs_lblc_entry, which is a mapping of a destination + * IP address to a server.
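+ *
+ * The intended lifecycle is roughly (sketch; creation as done in
+ * ip_vs_lblc_schedule() below, teardown as done in the flush and
+ * expire paths):
+ *
+ *     en = ip_vs_lblc_new(iph->daddr, dest);  -- takes a dest refcnt
+ *     ip_vs_lblc_hash(tbl, en);               -- enter it into the table
+ *     ...
+ *     ip_vs_lblc_free(en);                    -- unlink, drop the refcnt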
+ */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); + if (en == NULL) { + IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); + return NULL; + } + + INIT_LIST_HEAD(&en->list); + en->addr = daddr; + + atomic_inc(&dest->refcnt); + en->dest = dest; + + return en; +} + + +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +{ + list_del(&en->list); + /* + * We don't kfree dest because it is refered either by its service + * or the trash dest list. + */ + atomic_dec(&en->dest->refcnt); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLC entry + */ +static inline unsigned ip_vs_lblc_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. + */ +static int +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned hash; + + if (!list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Hash by destination IP address + */ + hash = ip_vs_lblc_hashkey(en->addr); + + write_lock(&tbl->lock); + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); + write_unlock(&tbl->lock); + + return 1; +} + + +#if 0000 +/* + * Unhash ip_vs_lblc_entry from ip_vs_lblc_table. + * returns bool success. + */ +static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl, + struct ip_vs_lblc_entry *en) +{ + if (list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Remove it from the table + */ + write_lock(&tbl->lock); + list_del(&en->list); + INIT_LIST_HEAD(&en->list); + write_unlock(&tbl->lock); + + return 1; +} +#endif + + +/* + * Get ip_vs_lblc_entry associated with supplied parameters. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr) +{ + unsigned hash; + struct ip_vs_lblc_entry *en; + struct list_head *l,*e; + + hash = ip_vs_lblc_hashkey(addr); + l = &tbl->bucket[hash]; + + read_lock(&tbl->lock); + + for (e=l->next; e!=l; e=e->next) { + en = list_entry(e, struct ip_vs_lblc_entry, list); + if (en->addr == addr) { + /* HIT */ + read_unlock(&tbl->lock); + return en; + } + } + + read_unlock(&tbl->lock); + + return NULL; +} + + +/* + * Flush all the entries of the specified table. 
+ */ +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +{ + int i; + struct list_head *l; + struct ip_vs_lblc_entry *en; + + for (i=0; ilock); + for (l=&tbl->bucket[i]; l->next!=l; ) { + en = list_entry(l->next, + struct ip_vs_lblc_entry, list); + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } +} + + +static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) +{ + unsigned long now = jiffies; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblc_entry *en; + + for (i=0, j=tbl->rover; ibucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblc_entry, list); + if ((now - en->lastuse) < + sysctl_ip_vs_lblc_expiration) { + e = e->next; + continue; + } + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblc_check_expire(unsigned long data) +{ + struct ip_vs_lblc_table *tbl; + unsigned long now = jiffies; + int goal; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblc_entry *en; + + tbl = (struct ip_vs_lblc_table *)data; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(tbl); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; ibucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblc_entry, list); + if ((now - en->lastuse) < ENTRY_TIMEOUT) { + e = e->next; + continue; + } + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&tbl->lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLC hash table (memory=%dbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_lblc_table)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->lock = RW_LOCK_UNLOCKED; + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + init_timer(&tbl->periodic_timer); + tbl->periodic_timer.data = (unsigned long)tbl; + tbl->periodic_timer.function = ip_vs_lblc_check_expire; + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; + add_timer(&tbl->periodic_timer); + + return 0; +} + + +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to 
clean up table entries here */ + ip_vs_lblc_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "LBLC hash table (memory=%dbytes) released\n", + sizeof(struct ip_vs_lblc_table)); + + return 0; +} + + +static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + least = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&least->weight) > 0) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. 
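+ *
+ * Expressed as a predicate (restating the code below):
+ *
+ *     overloaded(dest) := dest->activeconns > dest->weight
+ *     has_spare(svc)   := some d in svc->destinations satisfies
+ *                         2*d->activeconns < d->weight
+ *
+ * and the return value is overloaded(dest) AND has_spare(svc).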
+ */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + register struct list_head *l, *e; + struct ip_vs_dest *d; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + d = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_lblc_table *tbl; + struct ip_vs_lblc_entry *en; + + IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_lblc_table *)svc->sched_data; + en = ip_vs_lblc_get(tbl, iph->daddr); + if (en == NULL) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + en = ip_vs_lblc_new(iph->daddr, dest); + if (en == NULL) { + return NULL; + } + ip_vs_lblc_hash(tbl, en); + } else { + dest = en->dest; + if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest, svc)) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + } + en->lastuse = jiffies; + + IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(en->addr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLC Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = +{ + {0}, /* n_list */ + "lblc", /* name */ + ATOMIC_INIT(0), /* refcnt */ + THIS_MODULE, /* this module */ + ip_vs_lblc_init_svc, /* service initializer */ + ip_vs_lblc_done_svc, /* service done */ + ip_vs_lblc_update_svc, /* service updater */ + ip_vs_lblc_schedule, /* select a server from the destination list */ +}; + + +static int __init ip_vs_lblc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); + lblc_sysctl_table.sysctl_header = + register_sysctl_table(lblc_sysctl_table.root_dir, 0); + return register_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +static void __exit ip_vs_lblc_cleanup(void) +{ + unregister_sysctl_table(lblc_sysctl_table.sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +module_init(ip_vs_lblc_init); +module_exit(ip_vs_lblc_cleanup); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_lblcr.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_lblcr.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_lblcr.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_lblcr.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,884 @@ +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Version: $Id: ip_vs_lblcr.c,v 1.10 2002/03/25 12:44:35 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. 
+ * + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns<n.weight/2) then + * n <- {weighted least-conn node}; + * add n to serverSet[dest_ip]; + * if |serverSet[dest_ip]| > 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#include +#include +#include +#include +#include +#include + +/* for sysctl */ +#include +#include +/* for proc_net_create/proc_net_remove */ +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 +int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_list { + struct ip_vs_dest_list *next; /* list link */ + struct ip_vs_dest *dest; /* destination server */ +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct ip_vs_dest_list *list; /* destination list */ + rwlock_t lock; /* lock for this list */ +}; + + +static struct ip_vs_dest_list * +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e; + + for (e=set->list; e!=NULL; e=e->next) { + if (e->dest == dest) + /* already existed */ + return NULL; + } + + e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); + if (e == NULL) { + IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); + return NULL; + } + + atomic_inc(&dest->refcnt); + e->dest = dest; + + /* link it to the list */ + write_lock(&set->lock); + e->next = set->list; + set->list = e; + atomic_inc(&set->size); + write_unlock(&set->lock); + + set->lastmod = jiffies; + return e; +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + if (e->dest == dest) { + /* HIT */ + *ep = e->next; + atomic_dec(&set->size); + set->lastmod = jiffies; + atomic_dec(&e->dest->refcnt); + kfree(e); + break; + } + ep = &e->next; + } + write_unlock(&set->lock); +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + *ep = e->next; + /* + * We don't kfree dest because it is referred either + * by its service or by the trash dest list.
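+ *
+ * (Reference counting note: each ip_vs_dest_list element holds
+ * exactly one dest reference, taken by the atomic_inc() in
+ * ip_vs_dest_set_insert() above; erase/eraseall only drop that
+ * reference, and freeing the dest itself is left to the holder
+ * of the last reference.)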
+ */ + atomic_dec(&e->dest->refcnt); + kfree(e); + } + write_unlock(&set->lock); +} + +/* get weighted least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + if (set == NULL) + return NULL; + + read_lock(&set->lock); + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + least = e->dest; + if ((atomic_read(&least->weight) > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + read_unlock(&set->lock); + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if ((loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + read_unlock(&set->lock); + + IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + read_lock(&set->lock); + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + most = e->dest; + if (atomic_read(&most->weight) > 0) { + moh = atomic_read(&most->activeconns) * 50 + + atomic_read(&most->inactconns); + goto nextstage; + } + } + read_unlock(&set->lock); + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if ((moh * atomic_read(&dest->weight) < + doh * atomic_read(&most->weight)) + && (atomic_read(&dest->weight) > 0)) { + most = dest; + moh = doh; + } + } + read_unlock(&set->lock); + + IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(most->addr), ntohs(most->port), + atomic_read(&most->activeconns), + atomic_read(&most->refcnt), + atomic_read(&most->weight), moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct list_head list; + __u32 addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + rwlock_t lock; /* lock for this table */ + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLCR 
sysctl table + */ +struct ip_vs_lblcr_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table vs_vars[2]; + ctl_table vs_dir[2]; + ctl_table ipv4_dir[2]; + ctl_table root_dir[2]; +}; + + +static struct ip_vs_lblcr_sysctl_table lblcr_sysctl_table = { + NULL, + {{NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration", + &sysctl_ip_vs_lblcr_expiration, + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, + {0}}, + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblcr_sysctl_table.vs_vars}, + {0}}, + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblcr_sysctl_table.vs_dir}, + {0}}, + {{CTL_NET, "net", NULL, 0, 0555, lblcr_sysctl_table.ipv4_dir}, + {0}} +}; + + +/* + * new/free a ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. + */ +static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr) +{ + struct ip_vs_lblcr_entry *en; + + en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); + if (en == NULL) { + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); + return NULL; + } + + INIT_LIST_HEAD(&en->list); + en->addr = daddr; + + /* initilize its dest set */ + atomic_set(&(en->set.size), 0); + en->set.list = NULL; + en->set.lock = RW_LOCK_UNLOCKED; + + return en; +} + + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + list_del(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned ip_vs_lblcr_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static int +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned hash; + + if (!list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Hash by destination IP address + */ + hash = ip_vs_lblcr_hashkey(en->addr); + + write_lock(&tbl->lock); + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); + write_unlock(&tbl->lock); + + return 1; +} + + +#if 0000 +/* + * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table. + * returns bool success. + */ +static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl, + struct ip_vs_lblcr_entry *en) +{ + if (list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Remove it from the table + */ + write_lock(&tbl->lock); + list_del(&en->list); + INIT_LIST_HEAD(&en->list); + write_unlock(&tbl->lock); + + return 1; +} +#endif + + +/* + * Get ip_vs_lblcr_entry associated with supplied parameters. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr) +{ + unsigned hash; + struct ip_vs_lblcr_entry *en; + struct list_head *l,*e; + + hash = ip_vs_lblcr_hashkey(addr); + l = &tbl->bucket[hash]; + + read_lock(&tbl->lock); + + for (e=l->next; e!=l; e=e->next) { + en = list_entry(e, struct ip_vs_lblcr_entry, list); + if (en->addr == addr) { + /* HIT */ + read_unlock(&tbl->lock); + return en; + } + } + + read_unlock(&tbl->lock); + + return NULL; +} + + +/* + * Flush all the entries of the specified table. 
+ */ +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +{ + int i; + struct list_head *l; + struct ip_vs_lblcr_entry *en; + + for (i=0; ilock); + for (l=&tbl->bucket[i]; l->next!=l; ) { + en = list_entry(l->next, + struct ip_vs_lblcr_entry, list); + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } +} + + +static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) +{ + unsigned long now = jiffies; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblcr_entry *en; + + for (i=0, j=tbl->rover; ibucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblcr_entry, list); + if ((now - en->lastuse) < + sysctl_ip_vs_lblcr_expiration) { + e = e->next; + continue; + } + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(unsigned long data) +{ + struct ip_vs_lblcr_table *tbl; + unsigned long now = jiffies; + int goal; + int i, j; + struct list_head *l, *e; + struct ip_vs_lblcr_entry *en; + + tbl = (struct ip_vs_lblcr_table *)data; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(tbl); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; ibucket[j]; + write_lock(&tbl->lock); + while (e->next != l) { + en = list_entry(e->next, + struct ip_vs_lblcr_entry, list); + if ((now - en->lastuse) < ENTRY_TIMEOUT) { + e = e->next; + continue; + } + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&tbl->lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +#ifdef CONFIG_IP_VS_LBLCR_DEBUG +static struct ip_vs_lblcr_table *lblcr_table_list; + +/* + * /proc/net/ip_vs_lblcr to display the mappings of + * destination IP address <==> its serverSet + */ +static int +ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length) +{ + off_t pos=0, begin; + int len=0, size; + struct ip_vs_lblcr_table *tbl; + unsigned long now = jiffies; + int i; + struct list_head *l, *e; + struct ip_vs_lblcr_entry *en; + + tbl = lblcr_table_list; + + size = sprintf(buffer, "LastTime Dest IP address Server set\n"); + pos += size; + len += size; + + for (i=0; ibucket[i]; + read_lock_bh(&tbl->lock); + for (e=l->next; e!=l; e=e->next) { + char tbuf[16]; + struct ip_vs_dest_list *d; + + en = list_entry(e, struct ip_vs_lblcr_entry, list); + sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr)); + size = sprintf(buffer+len, "%8lu %-16s ", + now-en->lastuse, tbuf); + + read_lock(&en->set.lock); + for (d=en->set.list; d!=NULL; d=d->next) { + size += sprintf(buffer+len+size, + "%u.%u.%u.%u ", + NIPQUAD(d->dest->addr)); + } + read_unlock(&en->set.lock); + size += sprintf(buffer+len+size, "\n"); + len += size; + pos += 
size; + if (pos <= offset) + len=0; + if (pos >= offset+length) { + read_unlock_bh(&tbl->lock); + goto done; + } + } + read_unlock_bh(&tbl->lock); + } + + done: + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + if(len>length) + len = length; + return len; +} +#endif + + +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLCR hash table (memory=%dbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_lblcr_table)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->lock = RW_LOCK_UNLOCKED; + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + init_timer(&tbl->periodic_timer); + tbl->periodic_timer.data = (unsigned long)tbl; + tbl->periodic_timer.function = ip_vs_lblcr_check_expire; + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; + add_timer(&tbl->periodic_timer); + +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + lblcr_table_list = tbl; +#endif + return 0; +} + + +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblcr_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "LBLCR hash table (memory=%dbytes) released\n", + sizeof(struct ip_vs_lblcr_table)); + + return 0; +} + + +static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + least = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&least->weight) > 0) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. 
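+ *
+ * A worked example of the integer comparison below:
+ *     least: activeconns=10, inactconns=50 => loh = 10*50+50 = 550
+ *     dest:  activeconns=4,  inactconns=30 => doh = 4*50+30 = 230
+ * with least->weight=1 and dest->weight=2, the test
+ *     loh*dest->weight > doh*least->weight  (1100 > 230)
+ * succeeds and dest becomes the new least -- exactly the float
+ * comparison 550/1 > 230/2, done in integers.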
+ */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + register struct list_head *l, *e; + struct ip_vs_dest *d; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + d = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_lblcr_table *tbl; + struct ip_vs_lblcr_entry *en; + + IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_lblcr_table *)svc->sched_data; + en = ip_vs_lblcr_get(tbl, iph->daddr); + if (en == NULL) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + en = ip_vs_lblcr_new(iph->daddr); + if (en == NULL) { + return NULL; + } + ip_vs_dest_set_insert(&en->set, dest); + ip_vs_lblcr_hash(tbl, en); + } else { + dest = ip_vs_dest_set_min(&en->set); + if (!dest || is_overloaded(dest, svc)) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + ip_vs_dest_set_insert(&en->set, dest); + } + if (atomic_read(&en->set.size) > 1 && + jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { + struct ip_vs_dest *m; + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + } + } + en->lastuse = jiffies; + + IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(en->addr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + {0}, /* n_list */ + "lblcr", /* name */ + ATOMIC_INIT(0), /* refcnt */ + THIS_MODULE, /* this module */ + ip_vs_lblcr_init_svc, /* service initializer */ + ip_vs_lblcr_done_svc, /* service done */ + ip_vs_lblcr_update_svc, /* service updater */ + ip_vs_lblcr_schedule, /* select a server from the destination list */ +}; + + +static int __init ip_vs_lblcr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); + lblcr_sysctl_table.sysctl_header = + register_sysctl_table(lblcr_sysctl_table.root_dir, 0); +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo); +#endif + return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +} + + +static void __exit ip_vs_lblcr_cleanup(void) +{ +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + proc_net_remove("ip_vs_lblcr"); +#endif + unregister_sysctl_table(lblcr_sysctl_table.sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); 
+} + + +module_init(ip_vs_lblcr_init); +module_exit(ip_vs_lblcr_cleanup); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_lc.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_lc.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_lc.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_lc.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,142 @@ +/* + * IPVS: Least-Connection Scheduling module + * + * Version: $Id: ip_vs_lc.c,v 1.8.2.1 2003/04/11 14:02:35 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include +#include +#include +#include +#include + +#include + + +static int ip_vs_lc_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_lc_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_lc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. (This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. + */ + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + least = list_entry (e, struct ip_vs_dest, n_list); + if (atomic_read(&least->weight) > 0) { + loh = ip_vs_lc_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. 
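+ *
+ * A worked example of the comparison below, using
+ * ip_vs_lc_dest_overhead() = (activeconns<<8) + inactconns:
+ *     server A: 3 active, 10 inactive  => 3*256+10  = 778
+ *     server B: 2 active, 200 inactive => 2*256+200 = 712
+ * so B is chosen: one active connection is weighted like 256
+ * inactive ones.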
+ */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&dest->weight) == 0) + continue; + doh = ip_vs_lc_dest_overhead(dest); + if (doh < loh) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + {0}, /* n_list */ + "lc", /* name */ + ATOMIC_INIT(0), /* refcnt */ + THIS_MODULE, /* this module */ + ip_vs_lc_init_svc, /* service initializer */ + ip_vs_lc_done_svc, /* service done */ + ip_vs_lc_update_svc, /* service updater */ + ip_vs_lc_schedule, /* select a server from the destination list */ +}; + + +static int __init ip_vs_lc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + +static void __exit ip_vs_lc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); +} + +module_init(ip_vs_lc_init); +module_exit(ip_vs_lc_cleanup); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_nq.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_nq.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_nq.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_nq.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,177 @@ +/* + * IPVS: Never Queue scheduling module + * + * Version: $Id: ip_vs_nq.c,v 1.1.2.1 2003/05/20 17:05:02 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The NQ algorithm adopts a two-speed model. When there is an idle server + * available, the job will be sent to the idle server, instead of waiting + * for a fast one. When there is no idle server available, the job will be + * sent to the server that minimize its expected delay (The Shortest + * Expected Delay scheduling algorithm). + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking NQ to me. + * + * The difference between NQ and SED is that NQ can improve overall + * system utilization. + * + */ + +#include +#include +#include +#include +#include +#include + +#include + + +static int +ip_vs_nq_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_nq_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_nq_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. 
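+ *
+ * The "+ 1" below accounts for the incoming job itself, so the
+ * value compared against the weights in ip_vs_nq_schedule() is the
+ * SED expected delay (Ci + 1)/Ui; NQ merely short-circuits to any
+ * idle server (Ci == 0) first.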
+ */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_nq_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + least = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&least->weight) > 0) { + loh = ip_vs_nq_dest_overhead(least); + + /* return the server directly if it is idle */ + if (atomic_read(&least->activeconns) == 0) + goto out; + + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + doh = ip_vs_nq_dest_overhead(dest); + + /* return the server directly if it is idle */ + if (atomic_read(&dest->activeconns) == 0) { + least = dest; + loh = doh; + goto out; + } + + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + out: + IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_nq_scheduler = +{ + .name = "nq", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_nq_init_svc, + .done_service = ip_vs_nq_done_svc, + .update_service = ip_vs_nq_update_svc, + .schedule = ip_vs_nq_schedule, +}; + + +static int __init ip_vs_nq_init(void) +{ + INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +static void __exit ip_vs_nq_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +module_init(ip_vs_nq_init); +module_exit(ip_vs_nq_cleanup); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_rr.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_rr.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_rr.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_rr.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,120 @@ +/* + * IPVS: Round-Robin Scheduling module + * + * Version: $Id: ip_vs_rr.c,v 1.8 2001/10/19 15:05:17 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Fixes/Changes: + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest + * Julian Anastasov : fixed the NULL pointer access bug in debugging + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_rr_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include +#include +#include +#include +#include + +#include + + +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +static int ip_vs_rr_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_rr_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *p, *q; + struct ip_vs_dest *dest; + + IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); + + write_lock(&svc->sched_lock); + p = (struct list_head *)svc->sched_data; + p = p->next; + q = p; + do { + if (q == &svc->destinations) { + q = q->next; + continue; + } + dest = list_entry(q, struct ip_vs_dest, n_list); + if (atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + q = q->next; + } while (q != p); + write_unlock(&svc->sched_lock); + return NULL; + + out: + svc->sched_data = q; + write_unlock(&svc->sched_lock); + IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + {0}, /* n_list */ + "rr", /* name */ + ATOMIC_INIT(0), /* refcnt */ + THIS_MODULE, /* this module */ + ip_vs_rr_init_svc, /* service initializer */ + ip_vs_rr_done_svc, /* service done */ + ip_vs_rr_update_svc, /* service updater */ + ip_vs_rr_schedule, /* select a server from the destination list */ +}; + +static int __init ip_vs_rr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +static void __exit ip_vs_rr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +module_init(ip_vs_rr_init); +module_exit(ip_vs_rr_cleanup); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_sched.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_sched.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_sched.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_sched.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,260 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_sched.c,v 1.11 2001/11/04 08:58:43 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include /* for local_bh_* */ +#include +#include + +#include + +/* + * IPVS scheduler list + */ +static LIST_HEAD(ip_vs_schedulers); + +/* lock for service table */ +static rwlock_t __ip_vs_sched_lock = RW_LOCK_UNLOCKED; + + +/* + * Bind a service with a scheduler + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + int ret; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + if (scheduler == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); + return -EINVAL; + } + + svc->scheduler = scheduler; + + if (scheduler->init_service) { + ret = scheduler->init_service(svc); + if (ret) { + IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); + return ret; + } + } + + return 0; +} + + +/* + * Unbind a service with its scheduler + */ +int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + + sched = svc->scheduler; + if (sched == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); + return -EINVAL; + } + + if (sched->done_service) { + if (sched->done_service(svc) != 0) { + IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); + return -EINVAL; + } + } + + svc->scheduler = NULL; + return 0; +} + + +/* + * Get scheduler in the scheduler list by name + */ +static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + struct list_head *l, *e; + + IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", + sched_name); + + l = &ip_vs_schedulers; + + read_lock_bh(&__ip_vs_sched_lock); + + for (e=l->next; e!=l; e=e->next) { + sched = list_entry(e, struct ip_vs_scheduler, n_list); + + /* + * Test and MOD_INC_USE_COUNT atomically + */ + if (sched->module && !try_inc_mod_count(sched->module)) { + /* + * This scheduler is just deleted + */ + continue; + } + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + read_unlock_bh(&__ip_vs_sched_lock); + return sched; + } + if (sched->module) + __MOD_DEC_USE_COUNT(sched->module); + } + + read_unlock_bh(&__ip_vs_sched_lock); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + char module_name[IP_VS_SCHEDNAME_MAXLEN+8]; + sprintf(module_name,"ip_vs_%s", sched_name); + request_module(module_name); + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + +void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) +{ + if (scheduler->module) + __MOD_DEC_USE_COUNT(scheduler->module); +} + + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + struct ip_vs_scheduler *sched; + + if (!scheduler) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + if (!scheduler->name) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); + return -EINVAL; + } + + MOD_INC_USE_COUNT; + + /* + * Make sure that the scheduler with this name doesn't exist + * in the scheduler list. 
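+ *
+ * (A scheduler module normally reaches this point from its
+ * module_init, as the schedulers above all do:
+ *
+ *     INIT_LIST_HEAD(&my_scheduler.n_list);
+ *     return register_ip_vs_scheduler(&my_scheduler);
+ *
+ * where my_scheduler stands for that module's ip_vs_scheduler
+ * instance.)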
+ */ + sched = ip_vs_sched_getbyname(scheduler->name); + if (sched) { + ip_vs_scheduler_put(sched); + MOD_DEC_USE_COUNT; + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already exists in the system\n", scheduler->name); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_sched_lock); + + if (scheduler->n_list.next != &scheduler->n_list) { + write_unlock_bh(&__ip_vs_sched_lock); + MOD_DEC_USE_COUNT; + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already linked\n", scheduler->name); + return -EINVAL; + } + + /* + * Add it into the d-linked scheduler list + */ + list_add(&scheduler->n_list, &ip_vs_schedulers); + write_unlock_bh(&__ip_vs_sched_lock); + + IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); + + return 0; +} + + +/* + * Unregister a scheduler from the scheduler list + */ +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + IP_VS_ERR("unregister_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_sched_lock); + if (scheduler->n_list.next == &scheduler->n_list) { + write_unlock_bh(&__ip_vs_sched_lock); + IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " + "is not in the list. failed\n", scheduler->name); + return -EINVAL; + } + + /* + * Remove it from the d-linked scheduler list + */ + list_del(&scheduler->n_list); + write_unlock_bh(&__ip_vs_sched_lock); + + MOD_DEC_USE_COUNT; + + IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); + + return 0; +} diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_sed.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_sed.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_sed.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_sed.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,167 @@ +/* + * IPVS: Shortest Expected Delay scheduling module + * + * Version: $Id: ip_vs_sed.c,v 1.1.2.1 2003/05/20 17:05:02 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The SED algorithm attempts to minimize each job's expected delay until + * completion. The expected delay that the job will experience is + * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of + * jobs on the ith server and Ui is the fixed service rate (weight) of + * the ith server. The SED algorithm adopts a greedy policy in which each + * job does what is in its own best interest, i.e. joins the queue that + * would minimize its expected delay of completion. + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking about SED with me. + * + * The difference between SED and WLC is that SED includes the incoming + * job in the cost function (the increment of 1). SED may outperform + * WLC when scheduling big jobs in highly heterogeneous systems (where + * the server weights vary a lot).
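+ * + * A worked example (illustrative numbers, not from the paper): with two + * servers where C1=3, U1=1 and C2=3, U2=2, SED compares (3+1)/1 = 4 + * against (3+1)/2 = 2 and sends the job to server 2, while WLC would + * compare 3/1 against 3/2. The code below avoids the division entirely + * by testing (C1+1)*U2 > (C2+1)*U1, which is equivalent whenever both + * weights are positive.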
+ * + */ + +#include +#include +#include +#include +#include +#include + +#include + + +static int +ip_vs_sed_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_sed_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_sed_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Shortest Expected Delay scheduling + */ +static struct ip_vs_dest * +ip_vs_sed_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + least = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&least->weight) > 0) { + loh = ip_vs_sed_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + doh = ip_vs_sed_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_sed_scheduler = +{ + .name = "sed", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_sed_init_svc, + .done_service = ip_vs_sed_done_svc, + .update_service = ip_vs_sed_update_svc, + .schedule = ip_vs_sed_schedule, +}; + + +static int __init ip_vs_sed_init(void) +{ + INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +static void __exit ip_vs_sed_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +module_init(ip_vs_sed_init); +module_exit(ip_vs_sed_cleanup); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_sh.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_sh.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_sh.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_sh.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,262 @@ +/* + * IPVS: Source Hashing scheduling module + * + * Version: $Id: ip_vs_sh.c,v 1.4 2001/10/19 15:05:17 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The sh algorithm selects a server by the hash key of the source IP + * address.
The pseudo code is as follows: + * + * n <- servernode[src_ip]; + * if (n is dead) OR + * (n is overloaded, such as n.conns>2*n.weight) then + * return NULL; + * + * return n; + * + * Note that servernode is a 256-bucket hash table that maps the hash + * index derived from the packet source IP address to the current server + * array. If the sh scheduler is used in a cache cluster, it is good to + * combine it with the cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include +#include +#include +#include +#include +#include + +#include + + +/* + * IPVS SH bucket + */ +struct ip_vs_sh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS SH entry hash table + */ +#ifndef CONFIG_IP_VS_SH_TAB_BITS +#define CONFIG_IP_VS_SH_TAB_BITS 8 +#endif +#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS +#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) +#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS SH entry + */ +static inline unsigned ip_vs_sh_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr) +{ + return (tbl[ip_vs_sh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_sh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + if (list_empty(p)) { + b->dest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table.
+ */ +static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +{ + int i; + struct ip_vs_sh_bucket *b; + + b = tbl; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + if (b->dest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_sh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl; + + /* allocate the SH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "SH hash table (memory=%dbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "SH hash table (memory=%dbytes) released\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +/* + * If the number of active connections is more than twice its weight, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)*2) { + return 1; + } + return 0; +} + + +/* + * Source Hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_sh_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_sh_bucket *tbl; + + IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_sh_bucket *)svc->sched_data; + dest = ip_vs_sh_get(tbl, iph->saddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->saddr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS SH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_sh_scheduler = +{ + {0}, /* n_list */ + "sh", /* name */ + ATOMIC_INIT(0), /* refcnt */ + THIS_MODULE, /* this module */ + ip_vs_sh_init_svc, /* service initializer */ + ip_vs_sh_done_svc, /* service done */ + ip_vs_sh_update_svc, /* service updater */ + ip_vs_sh_schedule, /* select a server from the destination list */ +}; + + +static int __init ip_vs_sh_init(void) +{ + INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +static void __exit ip_vs_sh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +module_init(ip_vs_sh_init); +module_exit(ip_vs_sh_cleanup); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_sync.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_sync.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_sync.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_sync.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,793 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system.
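A quick user-space check of the source-hash bucket math used by ip_vs_sh_hashkey() above (the constant and mask mirror ip_vs_sh.c; everything else is illustrative, not part of the patch):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define SH_TAB_BITS 8
#define SH_TAB_SIZE (1 << SH_TAB_BITS)
#define SH_TAB_MASK (SH_TAB_SIZE - 1)

/* Knuth multiplicative hash: 2654435761 is close to 2^32/phi, which
 * spreads consecutive keys across the 256 buckets. */
static unsigned sh_hashkey(uint32_t addr_be)
{
	return (ntohl(addr_be) * 2654435761UL) & SH_TAB_MASK;
}

int main(void)
{
	printf("192.168.1.7 -> bucket %u\n",
	       sh_hashkey(inet_addr("192.168.1.7")));
	printf("192.168.1.8 -> bucket %u\n",
	       sh_hashkey(inet_addr("192.168.1.8")));
	return 0;
}

Because the mapping is static, every connection from a given client lands on the same real server until the destination set changes, which is what makes this scheduler useful in front of caches.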
IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_sync.c,v 1.8 2002/08/17 14:06:02 wensong Exp $ + * + * Authors: Wensong Zhang + * + * ip_vs_sync: sync connection info from master load balancer to backups + * through multicast + */ + +#define __KERNEL_SYSCALLS__ /* for waitpid */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include /* for ip_mc_join_group */ + +#include +#include +#include /* for get_fs and set_fs */ + +#include + +#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ +#define IP_VS_SYNC_PORT 8848 /* multicast port */ + + +/* + * IPVS sync connection entry + */ +struct ip_vs_sync_conn { + __u8 reserved; + + /* Protocol, addresses and port numbers */ + __u8 protocol; /* Which protocol (TCP/UDP) */ + __u16 cport; + __u16 vport; + __u16 dport; + __u32 caddr; /* client address */ + __u32 vaddr; /* virtual address */ + __u32 daddr; /* destination address */ + + /* Flags and state transition */ + __u16 flags; /* status flags */ + __u16 state; /* state info */ + + /* The sequence options start here */ +}; + +struct ip_vs_sync_conn_options { + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + +#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ) +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) +#define FULL_CONN_SIZE \ +(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) + + +/* + The master mulitcasts messages to the backup load balancers in the + following format. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | Reserved | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | . | + | . | + | . 
| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (n) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ +#define SYNC_MESG_MAX_SIZE (24*50+4) +struct ip_vs_sync_mesg { + __u8 nr_conns; + __u8 reserved; + __u16 size; + + /* ip_vs_sync_conn entries start here */ +}; + + +struct ip_vs_sync_buff { + struct list_head list; + unsigned long firstuse; + + /* pointers for the message data */ + struct ip_vs_sync_mesg *mesg; + unsigned char *head; + unsigned char *end; +}; + + +/* the sync_buff list head and the lock */ +static LIST_HEAD(ip_vs_sync_queue); +static spinlock_t ip_vs_sync_lock = SPIN_LOCK_UNLOCKED; + +/* current sync_buff for accepting new conn entries */ +static struct ip_vs_sync_buff *curr_sb = NULL; +static spinlock_t curr_sb_lock = SPIN_LOCK_UNLOCKED; + +static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +{ + spin_lock(&ip_vs_sync_lock); + list_add_tail(&sb->list, &ip_vs_sync_queue); + spin_unlock(&ip_vs_sync_lock); +} + +static inline struct ip_vs_sync_buff * sb_dequeue(void) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ip_vs_sync_lock); + if (list_empty(&ip_vs_sync_queue)) { + sb = NULL; + } else { + sb = list_entry(ip_vs_sync_queue.next, + struct ip_vs_sync_buff, + list); + list_del(&sb->list); + } + spin_unlock_bh(&ip_vs_sync_lock); + + return sb; +} + +static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) +{ + struct ip_vs_sync_buff *sb; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + if (!(sb->mesg=kmalloc(SYNC_MESG_MAX_SIZE, GFP_ATOMIC))) { + kfree(sb); + return NULL; + } + sb->mesg->nr_conns = 0; + sb->mesg->size = 4; + sb->head = (unsigned char *)sb->mesg + 4; + sb->end = (unsigned char *)sb->mesg + SYNC_MESG_MAX_SIZE; + sb->firstuse = jiffies; + return sb; +} + +static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) +{ + kfree(sb->mesg); + kfree(sb); +} + +/* + * Get the current sync buffer if it has been created for more + * than the specified time or the specified time is zero. + */ +static inline struct ip_vs_sync_buff * +get_curr_sync_buff(unsigned long time) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&curr_sb_lock); + if (curr_sb && + (jiffies - curr_sb->firstuse > time || time == 0)) { + sb = curr_sb; + curr_sb = NULL; + } else + sb = NULL; + spin_unlock_bh(&curr_sb_lock); + return sb; +} + + +/* + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. + */ +void ip_vs_sync_conn(struct ip_vs_conn *cp) +{ + struct ip_vs_sync_mesg *m; + struct ip_vs_sync_conn *s; + int len; + + spin_lock(&curr_sb_lock); + if (!curr_sb) { + if (!(curr_sb=ip_vs_sync_buff_create())) { + spin_unlock(&curr_sb_lock); + IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? 
FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; + m = curr_sb->mesg; + s = (struct ip_vs_sync_conn *)curr_sb->head; + + /* copy members */ + s->protocol = cp->protocol; + s->cport = cp->cport; + s->vport = cp->vport; + s->dport = cp->dport; + s->caddr = cp->caddr; + s->vaddr = cp->vaddr; + s->daddr = cp->daddr; + s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); + s->state = htons(cp->state); + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + struct ip_vs_sync_conn_options *opt = + (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(opt, &cp->in_seq, sizeof(*opt)); + } + + m->nr_conns++; + m->size += len; + curr_sb->head += len; + + /* check if there is space for the next one */ + if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { + sb_queue_tail(curr_sb); + curr_sb = NULL; + } + spin_unlock(&curr_sb_lock); + + /* synchronize its controller if it has one */ + if (cp->control) + ip_vs_sync_conn(cp->control); +} + + +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. + */ +static void ip_vs_process_message(const char *buffer, const size_t buflen) +{ + struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; + struct ip_vs_sync_conn *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_conn *cp; + char *p; + int i; + + if (buflen != m->size) { + IP_VS_ERR("bogus message\n"); + return; + } + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); + for (i=0; i<m->nr_conns; i++) { + s = (struct ip_vs_sync_conn *)p; + cp = ip_vs_conn_in_get(s->protocol, + s->caddr, s->cport, + s->vaddr, s->vport); + if (!cp) { + cp = ip_vs_conn_new(s->protocol, + s->caddr, s->cport, + s->vaddr, s->vport, + s->daddr, s->dport, + ntohs(s->flags), NULL); + if (!cp) { + IP_VS_ERR("ip_vs_conn_new failed\n"); + return; + } + cp->state = ntohs(s->state); + } else if (!cp->dest) { + /* it is an entry created by the synchronization */ + cp->state = ntohs(s->state); + cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED; + } /* Note that we don't touch its state and flags + if it is a normal entry. */ + + if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(&cp->in_seq, opt, sizeof(*opt)); + p += FULL_CONN_SIZE; + } else + p += SIMPLE_CONN_SIZE; + + atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold); + cp->timeout = IP_VS_SYNC_CONN_TIMEOUT; + ip_vs_conn_put(cp); + + if (p > buffer+buflen) { + IP_VS_ERR("bogus message\n"); + return; + } + } +} + + +/* ipvs sync daemon state */ +volatile int ip_vs_sync_state = IP_VS_STATE_NONE; + +/* multicast interface name */ +char ip_vs_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + +/* multicast addr */ +static struct sockaddr_in mcast_addr; + + +/* + * Set up loopback of outgoing multicasts on a sending socket + */ +static void set_mcast_loop(struct sock *sk, u_char loop) +{ + /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ + lock_sock(sk); + sk->protinfo.af_inet.mc_loop = loop ?
1 : 0; + release_sock(sk); +} + +/* + * Specify TTL for outgoing multicasts on a sending socket + */ +static void set_mcast_ttl(struct sock *sk, u_char ttl) +{ + /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ + lock_sock(sk); + sk->protinfo.af_inet.mc_ttl = ttl; + release_sock(sk); +} + +/* + * Specifiy default interface for outgoing multicasts + */ +static int set_mcast_if(struct sock *sk, char *ifname) +{ + struct net_device *dev; + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + + if (sk->bound_dev_if && dev->ifindex != sk->bound_dev_if) + return -EINVAL; + + lock_sock(sk); + sk->protinfo.af_inet.mc_index = dev->ifindex; + /* sk->protinfo.af_inet.mc_addr = 0; */ + release_sock(sk); + + return 0; +} + +/* + * Join a multicast group. + * the group is specified by a class D multicast address 224.0.0.0/8 + * in the in_addr structure passed in as a parameter. + */ +static int +join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) +{ + struct ip_mreqn mreq; + struct net_device *dev; + int ret; + + memset(&mreq, 0, sizeof(mreq)); + memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + if (sk->bound_dev_if && dev->ifindex != sk->bound_dev_if) + return -EINVAL; + + mreq.imr_ifindex = dev->ifindex; + + lock_sock(sk); + ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); + + return ret; +} + + +static int bind_mcastif_addr(struct socket *sock, char *ifname) +{ + struct net_device *dev; + u32 addr; + struct sockaddr_in sin; + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + + addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + if (!addr) + IP_VS_ERR("You probably need to specify IP address on " + "multicast interface.\n"); + + IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", + ifname, NIPQUAD(addr)); + + /* Now bind the socket with the address of multicast interface */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + sin.sin_port = 0; + + return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); +} + +/* + * Set up sending multicast socket over UDP + */ +static struct socket * make_send_sock(void) +{ + struct socket *sock; + + /* First create a socket */ + if (sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return NULL; + } + + if (set_mcast_if(sock->sk, ip_vs_mcast_ifn) < 0) { + IP_VS_ERR("Error setting outbound mcast interface\n"); + goto error; + } + + set_mcast_loop(sock->sk, 0); + set_mcast_ttl(sock->sk, 1); + + if (bind_mcastif_addr(sock, ip_vs_mcast_ifn) < 0) { + IP_VS_ERR("Error binding address of the mcast interface\n"); + goto error; + } + + if (sock->ops->connect(sock, + (struct sockaddr*)&mcast_addr, + sizeof(struct sockaddr), 0) < 0) { + IP_VS_ERR("Error connecting to the multicast addr\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return NULL; +} + + +/* + * Set up receiving multicast socket over UDP + */ +static struct socket * make_receive_sock(void) +{ + struct socket *sock; + + /* First create a socket */ + if (sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return NULL; + } + + /* it is equivalent to the REUSEADDR option in user-space */ + sock->sk->reuse = 1; + + if (sock->ops->bind(sock, + (struct sockaddr*)&mcast_addr, + sizeof(struct sockaddr)) < 0) { + IP_VS_ERR("Error binding to the multicast addr\n"); + goto 
error; + } + + /* join the multicast group */ + if (join_mcast_group(sock->sk, + (struct in_addr*)&mcast_addr.sin_addr, + ip_vs_mcast_ifn) < 0) { + IP_VS_ERR("Error joining to the multicast group\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return NULL; +} + + +static int +ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) +{ + struct msghdr msg; + mm_segment_t oldfs; + struct iovec iov; + int len; + + EnterFunction(7); + iov.iov_base = (void *)buffer; + iov.iov_len = length; + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL; + + oldfs = get_fs(); set_fs(KERNEL_DS); + len = sock_sendmsg(sock, &msg, (size_t)(length)); + set_fs(oldfs); + + LeaveFunction(7); + return len; +} + + +static int +ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) +{ + struct msghdr msg; + struct iovec iov; + int len; + mm_segment_t oldfs; + + EnterFunction(7); + + /* Receive a packet */ + iov.iov_base = buffer; + iov.iov_len = (size_t)buflen; + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + oldfs = get_fs(); set_fs(KERNEL_DS); + len = sock_recvmsg(sock, &msg, buflen, 0); + set_fs(oldfs); + + if (len < 0) + return -1; + + LeaveFunction(7); + return len; +} + + +static int errno; + +static DECLARE_WAIT_QUEUE_HEAD(sync_wait); +static pid_t sync_pid = 0; + +static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait); +static int stop_sync = 0; + +static void sync_master_loop(void) +{ + struct socket *sock; + struct ip_vs_sync_buff *sb; + struct ip_vs_sync_mesg *m; + + /* create the sending multicast socket */ + sock = make_send_sock(); + if (!sock) + return; + + for (;;) { + while ((sb=sb_dequeue())) { + m = sb->mesg; + if (ip_vs_send_async(sock, (char *)m, + m->size) != m->size) + IP_VS_ERR("ip_vs_send_async error\n"); + ip_vs_sync_buff_release(sb); + } + + /* check if entries stay in curr_sb for 2 seconds */ + if ((sb = get_curr_sync_buff(2*HZ))) { + m = sb->mesg; + if (ip_vs_send_async(sock, (char *)m, + m->size) != m->size) + IP_VS_ERR("ip_vs_send_async error\n"); + ip_vs_sync_buff_release(sb); + } + + if (stop_sync) + break; + + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + __set_current_state(TASK_RUNNING); + } + + /* clean up the sync_buff queue */ + while ((sb=sb_dequeue())) { + ip_vs_sync_buff_release(sb); + } + + /* clean up the current sync_buff */ + if ((sb = get_curr_sync_buff(0))) { + ip_vs_sync_buff_release(sb); + } + + /* release the sending multicast socket */ + sock_release(sock); +} + + +static void sync_backup_loop(void) +{ + struct socket *sock; + char *buf; + int len; + + if (!(buf=kmalloc(SYNC_MESG_MAX_SIZE, GFP_ATOMIC))) { + IP_VS_ERR("sync_backup_loop: kmalloc error\n"); + return; + } + + /* create the receiving multicast socket */ + sock = make_receive_sock(); + if (!sock) + goto out; + + for (;;) { + /* do you have data now? 
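+ (For scale, from the definitions above: SIMPLE_CONN_SIZE is 24 bytes -- two 8-bit fields, three 16-bit ports, three 32-bit addresses, plus 16-bit flags and state -- so one SYNC_MESG_MAX_SIZE buffer of 24*50+4 = 1204 bytes carries the 4-byte header plus up to 50 option-less entries, or half as many when the sequence options are attached.)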
*/ + while (!skb_queue_empty(&(sock->sk->receive_queue))) { + if ((len=ip_vs_receive(sock, buf, + SYNC_MESG_MAX_SIZE))<=0) { + IP_VS_ERR("receiving message error\n"); + break; + } + /* disable bottom half, because it accessed the data + shared by softirq while getting/creating conns */ + local_bh_disable(); + ip_vs_process_message(buf, len); + local_bh_enable(); + } + + if (stop_sync) + break; + + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + __set_current_state(TASK_RUNNING); + } + + /* release the sending multicast socket */ + sock_release(sock); + + out: + kfree(buf); +} + + +static int sync_thread(void *startup) +{ + DECLARE_WAITQUEUE(wait, current); + mm_segment_t oldmm; + int state; + + MOD_INC_USE_COUNT; + daemonize(); + + oldmm = get_fs(); + set_fs(KERNEL_DS); + + if (ip_vs_sync_state == IP_VS_STATE_MASTER) + sprintf(current->comm, "ipvs syncmaster"); + else if (ip_vs_sync_state == IP_VS_STATE_BACKUP) + sprintf(current->comm, "ipvs syncbackup"); + else IP_VS_BUG(); + + spin_lock_irq(¤t->sigmask_lock); + siginitsetinv(¤t->blocked, 0); + recalc_sigpending(current); + spin_unlock_irq(¤t->sigmask_lock); + + /* set up multicast address */ + mcast_addr.sin_family = AF_INET; + mcast_addr.sin_port = htons(IP_VS_SYNC_PORT); + mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP); + + add_wait_queue(&sync_wait, &wait); + + state = ip_vs_sync_state; + sync_pid = current->pid; + IP_VS_INFO("sync thread started.\n"); + complete((struct completion *)startup); + + /* processing master/backup loop here */ + if (state == IP_VS_STATE_MASTER) + sync_master_loop(); + else if (state == IP_VS_STATE_BACKUP) + sync_backup_loop(); + else IP_VS_BUG(); + + remove_wait_queue(&sync_wait, &wait); + + /* thread exits */ + sync_pid = 0; + IP_VS_INFO("sync thread stopped!\n"); + + set_fs(oldmm); + MOD_DEC_USE_COUNT; + + stop_sync = 0; + wake_up(&stop_sync_wait); + + return 0; +} + + +static int fork_sync_thread(void *startup) +{ + /* fork the sync thread here, then the parent process of the + sync thread is the init process after this thread exits. */ + if (kernel_thread(sync_thread, startup, 0) < 0) + IP_VS_BUG(); + return 0; +} + + +int start_sync_thread(int state, char *mcast_ifn) +{ + DECLARE_COMPLETION(startup); + pid_t pid; + int waitpid_result; + + if (sync_pid) + return -EEXIST; + + IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %d bytes\n", + sizeof(struct ip_vs_sync_conn)); + + ip_vs_sync_state = state; + strcpy(ip_vs_mcast_ifn, mcast_ifn); + + if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) + IP_VS_BUG(); + + if ((waitpid_result = waitpid(pid, NULL, __WCLONE)) != pid) { + IP_VS_ERR("%s: waitpid(%d,...) 
failed, errno %d\n", + __FUNCTION__, pid, -waitpid_result); + } + + wait_for_completion(&startup); + + return 0; +} + + +int stop_sync_thread(void) +{ + DECLARE_WAITQUEUE(wait, current); + + if (!sync_pid) + return -ESRCH; + + IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); + IP_VS_INFO("stopping sync thread %d ...\n", sync_pid); + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&stop_sync_wait, &wait); + ip_vs_sync_state = IP_VS_STATE_NONE; + stop_sync = 1; + wake_up(&sync_wait); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&stop_sync_wait, &wait); + + /* Note: no need to reap the sync thread, because its parent + process is the init process */ + + if (stop_sync) + IP_VS_BUG(); + + return 0; +} diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_wlc.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_wlc.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_wlc.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_wlc.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,157 @@ +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Version: $Id: ip_vs_wlc.c,v 1.10.2.1 2003/04/11 14:02:35 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest + * Wensong Zhang : changed to use the inactconns in scheduling + * Wensong Zhang : changed some cosmetic things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wlc_update_svc + * Wensong Zhang : added: any dest with weight=0 is quiesced + * + */ + +#include +#include +#include +#include +#include +#include + +#include + + +static int +ip_vs_wlc_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_wlc_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_wlc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections on average. (This + * factor of 256 might not be accurate; we will change it later.) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections.
+ */ + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + least = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&least->weight) > 0) { + loh = ip_vs_wlc_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + + doh = ip_vs_wlc_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_wlc_scheduler = +{ + {0}, /* n_list */ + "wlc", /* name */ + ATOMIC_INIT(0), /* refcnt */ + THIS_MODULE, /* this module */ + ip_vs_wlc_init_svc, /* service initializer */ + ip_vs_wlc_done_svc, /* service done */ + ip_vs_wlc_update_svc, /* service updater */ + ip_vs_wlc_schedule, /* select a server from the destination list */ +}; + + +static int __init ip_vs_wlc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +static void __exit ip_vs_wlc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +module_init(ip_vs_wlc_init); +module_exit(ip_vs_wlc_cleanup); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_wrr.c linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_wrr.c --- linux-2.4.22-bk1/net/ipv4/ipvs/ip_vs_wrr.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/ipvs/ip_vs_wrr.c 2003-08-26 15:54:22.000000000 -0700 @@ -0,0 +1,240 @@ +/* + * IPVS: Weighted Round-Robin Scheduling module + * + * Version: $Id: ip_vs_wrr.c,v 1.11 2002/03/25 12:44:35 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
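The WLC scheduler above ranks servers by overhead/weight without dividing, where the overhead counts an active connection as 256 inactive ones. A user-space sketch of the same ranking (struct and numbers invented for illustration, not part of the patch):

#include <stdio.h>

struct dest {
	int active, inact, weight;
};

/* same cost as ip_vs_wlc_dest_overhead(): actives count 256x */
static unsigned overhead(const struct dest *d)
{
	return ((unsigned)d->active << 8) + d->inact;
}

/* pick argmin of overhead/weight over weight>0 entries, using the
 * cross-multiplied test oh1*w2 > oh2*w1 to stay in integers */
static int wlc_pick(const struct dest *d, int n)
{
	int i, best = -1;
	unsigned best_oh = 0;

	for (i = 0; i < n; i++) {
		if (d[i].weight <= 0)
			continue;	/* weight 0 means quiesced */
		if (best < 0 || best_oh * (unsigned)d[i].weight >
		    overhead(&d[i]) * (unsigned)d[best].weight) {
			best = i;
			best_oh = overhead(&d[i]);
		}
	}
	return best;	/* -1 when every server is quiesced */
}

int main(void)
{
	/* overheads: 10*256+3=2563 (w=1) vs 18*256=4608 (w=2): 2563*2 > 4608*1 */
	struct dest d[] = { { 10, 3, 1 }, { 18, 0, 2 }, { 2, 1, 0 } };

	printf("picked server %d\n", wlc_pick(d, 3));	/* prints 1 */
	return 0;
}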
+ * + * Changes: + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest + * Wensong Zhang : changed some cosmetic things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wrr_update_svc + * Julian Anastasov : fixed the bug of returning destination + * with weight 0 when all weights are zero + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +/* + * current destination pointer for weighted round-robin scheduling + */ +struct ip_vs_wrr_mark { + struct list_head *cl; /* current list head */ + int cw; /* current weight */ + int mw; /* maximum weight */ + int di; /* decreasing interval */ +}; + + +/* + * Get the gcd of server weights + */ +static int gcd(int a, int b) +{ + int c; + + while ((c = a % b)) { + a = b; + b = c; + } + return b; +} + +static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest; + int weight; + int g = 1; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + weight = atomic_read(&dest->weight); + if (weight > 0) { + g = weight; + break; + } + } + if (e == l) + return g; + + for (e=e->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + weight = atomic_read(&dest->weight); + if (weight > 0) + g = gcd(weight, g); + } + + return g; +} + + +/* + * Get the maximum weight of the service destinations. + */ +static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) +{ + register struct list_head *l, *e; + struct ip_vs_dest *dest; + int weight = 0; + + l = &svc->destinations; + for (e=l->next; e!=l; e=e->next) { + dest = list_entry(e, struct ip_vs_dest, n_list); + if (atomic_read(&dest->weight) > weight) + weight = atomic_read(&dest->weight); + } + + return weight; +} + + +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark; + + /* + * Allocate the mark variable for WRR scheduling + */ + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); + if (mark == NULL) { + IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); + return -ENOMEM; + } + mark->cl = &svc->destinations; + mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + svc->sched_data = mark; + + return 0; +} + + +static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +{ + /* + * Release the mark variable + */ + kfree(svc->sched_data); + + return 0; +} + + +static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + mark->cl = &svc->destinations; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + return 0; +} + + +/* + * Weighted Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_wrr_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_wrr_mark *mark = svc->sched_data; + + IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); + + /* + * This loop will always terminate, because mark->cw stays in + * (0, mark->mw] and at least one server has a weight no smaller + * than mark->cw. + */ + write_lock(&svc->sched_lock); + while (1) { + if (mark->cl == &svc->destinations) { + /* it is at the head of the destination list */ + + if (mark->cl == mark->cl->next) { + /* no dest entry */ + write_unlock(&svc->sched_lock); + return NULL; + } + + mark->cl = svc->destinations.next; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* + * Still zero, which means no available servers.
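+ * + * (A worked pass with illustrative weights: for A=4, B=2, C=1 we get di = gcd = 1 and mw = 4, so successive sweeps run with cw = 4, 3, 2, 1 and pick A | A | A B | A B C -- four A's, two B's and one C per cycle, in proportion to the weights.)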
+ */ + if (mark->cw == 0) { + mark->cl = &svc->destinations; + write_unlock(&svc->sched_lock); + IP_VS_INFO("ip_vs_wrr_schedule(): " + "no available servers\n"); + return NULL; + } + } + } + else mark->cl = mark->cl->next; + + if (mark->cl != &svc->destinations) { + /* not at the head of the list */ + dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + if (atomic_read(&dest->weight) >= mark->cw) { + write_unlock(&svc->sched_lock); + break; + } + } + } + + IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { + {0}, /* n_list */ + "wrr", /* name */ + ATOMIC_INIT(0), /* refcnt */ + THIS_MODULE, /* this module */ + ip_vs_wrr_init_svc, /* service initializer */ + ip_vs_wrr_done_svc, /* service done */ + ip_vs_wrr_update_svc, /* service updater */ + ip_vs_wrr_schedule, /* select a server from the destination list */ +}; + +static int __init ip_vs_wrr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; +} + +static void __exit ip_vs_wrr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); +} + +module_init(ip_vs_wrr_init); +module_exit(ip_vs_wrr_cleanup); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22-bk1/net/ipv4/netfilter/ip_conntrack_core.c linux-2.4.22-bk2/net/ipv4/netfilter/ip_conntrack_core.c --- linux-2.4.22-bk1/net/ipv4/netfilter/ip_conntrack_core.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv4/netfilter/ip_conntrack_core.c 2003-08-26 15:54:22.000000000 -0700 @@ -291,14 +291,15 @@ static void clean_from_lists(struct ip_conntrack *ct) { + unsigned int ho, hr; + DEBUGP("clean_from_lists(%p)\n", ct); MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); - LIST_DELETE(&ip_conntrack_hash - [hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)], - &ct->tuplehash[IP_CT_DIR_ORIGINAL]); - LIST_DELETE(&ip_conntrack_hash - [hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)], - &ct->tuplehash[IP_CT_DIR_REPLY]); + + ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); /* Destroy all un-established, pending expectations */ remove_expectations(ct, 1); @@ -370,9 +371,10 @@ const struct ip_conntrack *ignored_conntrack) { struct ip_conntrack_tuple_hash *h; + unsigned int hash = hash_conntrack(tuple); MUST_BE_READ_LOCKED(&ip_conntrack_lock); - h = LIST_FIND(&ip_conntrack_hash[hash_conntrack(tuple)], + h = LIST_FIND(&ip_conntrack_hash[hash], conntrack_tuple_cmp, struct ip_conntrack_tuple_hash *, tuple, ignored_conntrack); diff -urN linux-2.4.22-bk1/net/ipv4/netfilter/ip_fw_compat.c linux-2.4.22-bk2/net/ipv4/netfilter/ip_fw_compat.c --- linux-2.4.22-bk1/net/ipv4/netfilter/ip_fw_compat.c 2001-12-21 09:42:05.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/netfilter/ip_fw_compat.c 2003-08-26 15:54:22.000000000 -0700 @@ -47,6 +47,12 @@ extern int __init masq_init(void); extern void masq_cleanup(void); +#ifdef CONFIG_IP_VS +/* From ip_vs_core.c */ +extern unsigned int +check_for_ip_vs_out(struct sk_buff **skb_p, int (*okfn)(struct sk_buff *)); +#endif + /* They call these; we do what they want. 
*/ int register_firewall(int pf, struct firewall_ops *fw) { @@ -172,8 +178,14 @@ return NF_ACCEPT; case FW_MASQUERADE: - if (hooknum == NF_IP_FORWARD) + if (hooknum == NF_IP_FORWARD) { +#ifdef CONFIG_IP_VS + /* check if it is for ip_vs */ + if (check_for_ip_vs_out(pskb, okfn) == NF_STOLEN) + return NF_STOLEN; +#endif return do_masquerade(pskb, out); + } else return NF_ACCEPT; case FW_REDIRECT: diff -urN linux-2.4.22-bk1/net/ipv4/netfilter/ip_nat_core.c linux-2.4.22-bk2/net/ipv4/netfilter/ip_nat_core.c --- linux-2.4.22-bk1/net/ipv4/netfilter/ip_nat_core.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv4/netfilter/ip_nat_core.c 2003-08-26 15:54:22.000000000 -0700 @@ -67,6 +67,7 @@ static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn) { struct ip_nat_info *info = &conn->nat.info; + unsigned int hs, hp; if (!info->initialized) return; @@ -74,21 +75,18 @@ IP_NF_ASSERT(info->bysource.conntrack); IP_NF_ASSERT(info->byipsproto.conntrack); + hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src, + conn->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); + + hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip, + conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip, + conn->tuplehash[IP_CT_DIR_REPLY] + .tuple.dst.protonum); + WRITE_LOCK(&ip_nat_lock); - LIST_DELETE(&bysource[hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.src, - conn->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum)], - &info->bysource); - - LIST_DELETE(&byipsproto - [hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY] - .tuple.src.ip, - conn->tuplehash[IP_CT_DIR_REPLY] - .tuple.dst.ip, - conn->tuplehash[IP_CT_DIR_REPLY] - .tuple.dst.protonum)], - &info->byipsproto); + LIST_DELETE(&bysource[hs], &info->bysource); + LIST_DELETE(&byipsproto[hp], &info->byipsproto); WRITE_UNLOCK(&ip_nat_lock); } @@ -244,11 +242,12 @@ const struct ip_conntrack *conntrack) { unsigned int score = 0; + unsigned int h; MUST_BE_READ_LOCKED(&ip_nat_lock); - LIST_FIND(&byipsproto[hash_by_ipsproto(src, dst, protonum)], - fake_cmp, struct ip_nat_hash *, src, dst, protonum, &score, - conntrack); + h = hash_by_ipsproto(src, dst, protonum); + LIST_FIND(&byipsproto[h], fake_cmp, struct ip_nat_hash *, + src, dst, protonum, &score, conntrack); return score; } diff -urN linux-2.4.22-bk1/net/ipv4/netfilter/ip_nat_tftp.c linux-2.4.22-bk2/net/ipv4/netfilter/ip_nat_tftp.c --- linux-2.4.22-bk1/net/ipv4/netfilter/ip_nat_tftp.c 2003-06-13 07:51:39.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv4/netfilter/ip_nat_tftp.c 2003-08-26 15:54:22.000000000 -0700 @@ -153,7 +153,7 @@ static int __init init(void) { - int i, ret; + int i, ret = 0; char *tmpname; if (!ports[0]) diff -urN linux-2.4.22-bk1/net/ipv4/netfilter/ipt_LOG.c linux-2.4.22-bk2/net/ipv4/netfilter/ipt_LOG.c --- linux-2.4.22-bk1/net/ipv4/netfilter/ipt_LOG.c 2002-02-25 11:38:14.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/netfilter/ipt_LOG.c 2003-08-26 15:54:22.000000000 -0700 @@ -3,15 +3,14 @@ */ #include #include -#include #include +#include #include #include #include -#include - -struct in_device; #include + +#include #include #if 0 @@ -20,10 +19,20 @@ #define DEBUGP(format, args...) 
#endif +/* FIXME: move to ip.h like in 2.5 */ +struct ahhdr { + __u8 nexthdr; + __u8 hdrlen; + __u16 reserved; + __u32 spi; + __u32 seq_no; +}; + struct esphdr { __u32 spi; -}; /* FIXME evil kludge */ - + __u32 seq_no; +}; + /* Use lock to serialize, so printks don't overlap */ static spinlock_t log_lock = SPIN_LOCK_UNLOCKED; @@ -58,7 +67,8 @@ printk("FRAG:%u ", ntohs(iph->frag_off) & IP_OFFSET); if ((info->logflags & IPT_LOG_IPOPT) - && iph->ihl * 4 != sizeof(struct iphdr)) { + && iph->ihl * 4 != sizeof(struct iphdr) + && iph->ihl * 4 >= datalen) { unsigned int i; /* Max length: 127 "OPT (" 15*4*2chars ") " */ @@ -230,13 +240,30 @@ break; } /* Max Length */ - case IPPROTO_AH: + case IPPROTO_AH: { + struct ahhdr *ah = protoh; + + /* Max length: 9 "PROTO=AH " */ + printk("PROTO=AH "); + + if (ntohs(iph->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (datalen < sizeof (*ah)) { + printk("INCOMPLETE [%u bytes] ", datalen); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + printk("SPI=0x%x ", ntohl(ah->spi) ); + break; + } case IPPROTO_ESP: { struct esphdr *esph = protoh; - int esp= (iph->protocol==IPPROTO_ESP); /* Max length: 10 "PROTO=ESP " */ - printk("PROTO=%s ",esp? "ESP" : "AH"); + printk("PROTO=ESP "); if (ntohs(iph->frag_off) & IP_OFFSET) break; diff -urN linux-2.4.22-bk1/net/ipv4/netfilter/ipt_MASQUERADE.c linux-2.4.22-bk2/net/ipv4/netfilter/ipt_MASQUERADE.c --- linux-2.4.22-bk1/net/ipv4/netfilter/ipt_MASQUERADE.c 2001-09-30 12:26:08.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv4/netfilter/ipt_MASQUERADE.c 2003-08-26 15:54:22.000000000 -0700 @@ -87,13 +87,20 @@ key.dst = (*pskb)->nh.iph->daddr; key.src = 0; /* Unknown: that's what we're trying to establish */ key.tos = RT_TOS((*pskb)->nh.iph->tos)|RTO_CONN; - key.oif = out->ifindex; #ifdef CONFIG_IP_ROUTE_FWMARK key.fwmark = (*pskb)->nfmark; #endif if (ip_route_output_key(&rt, &key) != 0) { - /* Shouldn't happen */ - printk("MASQUERADE: No route: Rusty's brain broke!\n"); + /* Funky routing can do this. 
*/ + if (net_ratelimit()) + printk("MASQUERADE:" + " No route: Rusty's brain broke!\n"); + return NF_DROP; + } + if (rt->u.dst.dev != out) { + if (net_ratelimit()) + printk("MASQUERADE:" + " Route sent us somewhere else.\n"); return NF_DROP; } diff -urN linux-2.4.22-bk1/net/ipv4/netfilter/ipt_ah.c linux-2.4.22-bk2/net/ipv4/netfilter/ipt_ah.c --- linux-2.4.22-bk1/net/ipv4/netfilter/ipt_ah.c 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/netfilter/ipt_ah.c 2003-08-26 15:54:22.000000000 -0700 @@ -15,7 +15,11 @@ #endif struct ahhdr { + __u8 nexthdr; + __u8 hdrlen; + __u16 reserved; __u32 spi; + __u32 seq_no; }; /* Returns 1 if the spi is matched by the range, 0 otherwise */ diff -urN linux-2.4.22-bk1/net/ipv4/netfilter/ipt_esp.c linux-2.4.22-bk2/net/ipv4/netfilter/ipt_esp.c --- linux-2.4.22-bk1/net/ipv4/netfilter/ipt_esp.c 2002-02-25 11:38:14.000000000 -0800 +++ linux-2.4.22-bk2/net/ipv4/netfilter/ipt_esp.c 2003-08-26 15:54:22.000000000 -0700 @@ -16,6 +16,7 @@ struct esphdr { __u32 spi; + __u32 seq_no; }; /* Returns 1 if the spi is matched by the range, 0 otherwise */ diff -urN linux-2.4.22-bk1/net/ipv4/route.c linux-2.4.22-bk2/net/ipv4/route.c --- linux-2.4.22-bk1/net/ipv4/route.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv4/route.c 2003-08-26 15:54:22.000000000 -0700 @@ -375,7 +375,9 @@ */ static inline u32 rt_score(struct rtable *rt) { - u32 score = rt->u.dst.__use; + u32 score = jiffies - rt->u.dst.lastuse; + + score = ~score & ~(3<<30); if (rt_valuable(rt)) score |= (1<<31); @@ -703,8 +705,7 @@ * The second limit is less certain. At the moment it allows * only 2 entries per bucket. We will see. */ - if (chain_length > ip_rt_gc_elasticity || - (chain_length > 1 && !(min_score & (1<<31)))) { + if (chain_length > ip_rt_gc_elasticity) { *candp = cand->u.rt_next; rt_free(cand); } diff -urN linux-2.4.22-bk1/net/ipv6/addrconf.c linux-2.4.22-bk2/net/ipv6/addrconf.c --- linux-2.4.22-bk1/net/ipv6/addrconf.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv6/addrconf.c 2003-08-26 15:54:22.000000000 -0700 @@ -103,7 +103,7 @@ static int addrconf_ifdown(struct net_device *dev, int how); -static void addrconf_dad_start(struct inet6_ifaddr *ifp); +static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags); static void addrconf_dad_timer(unsigned long data); static void addrconf_dad_completed(struct inet6_ifaddr *ifp); static void addrconf_rs_timer(unsigned long data); @@ -898,7 +898,7 @@ rtmsg.rtmsg_dst_len = 8; rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; rtmsg.rtmsg_ifindex = dev->ifindex; - rtmsg.rtmsg_flags = RTF_UP|RTF_ADDRCONF; + rtmsg.rtmsg_flags = RTF_UP; rtmsg.rtmsg_type = RTMSG_NEWROUTE; ip6_route_add(&rtmsg, NULL); } @@ -925,7 +925,7 @@ struct in6_addr addr; ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); - addrconf_prefix_route(&addr, 64, dev, 0, RTF_ADDRCONF); + addrconf_prefix_route(&addr, 64, dev, 0, 0); } static struct inet6_dev *addrconf_add_dev(struct net_device *dev) @@ -1017,7 +1017,7 @@ } } else if (pinfo->onlink && valid_lft) { addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES); + dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); } if (rt) dst_release(&rt->u.dst); @@ -1063,7 +1063,7 @@ return; } - addrconf_dad_start(ifp); + addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT); } if (ifp && valid_lft == 0) { @@ -1175,7 +1175,7 @@ ifp = ipv6_add_addr(idev, pfx, plen, scope, IFA_F_PERMANENT); if (!IS_ERR(ifp)) { - addrconf_dad_start(ifp); + addrconf_dad_start(ifp, 
0); in6_ifa_put(ifp); return 0; } @@ -1350,7 +1350,7 @@ ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, IFA_F_PERMANENT); if (!IS_ERR(ifp)) { - addrconf_dad_start(ifp); + addrconf_dad_start(ifp, 0); in6_ifa_put(ifp); } } @@ -1588,8 +1588,7 @@ memset(&rtmsg, 0, sizeof(struct in6_rtmsg)); rtmsg.rtmsg_type = RTMSG_NEWROUTE; rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; - rtmsg.rtmsg_flags = (RTF_ALLONLINK | RTF_ADDRCONF | - RTF_DEFAULT | RTF_UP); + rtmsg.rtmsg_flags = (RTF_ALLONLINK | RTF_DEFAULT | RTF_UP); rtmsg.rtmsg_ifindex = ifp->idev->dev->ifindex; @@ -1603,7 +1602,7 @@ /* * Duplicate Address Detection */ -static void addrconf_dad_start(struct inet6_ifaddr *ifp) +static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags) { struct net_device *dev; unsigned long rand_num; @@ -1613,7 +1612,7 @@ addrconf_join_solict(dev, &ifp->addr); if (ifp->prefix_len != 128 && (ifp->flags&IFA_F_PERMANENT)) - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0, RTF_ADDRCONF); + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0, flags); net_srandom(ifp->addr.s6_addr32[3]); rand_num = net_random() % (ifp->idev->cnf.rtr_solicit_delay ? : 1); @@ -1895,6 +1894,7 @@ unsigned char *b = skb->tail; nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; ifm = NLMSG_DATA(nlh); ifm->ifa_family = AF_INET6; ifm->ifa_prefixlen = ifa->prefix_len; diff -urN linux-2.4.22-bk1/net/ipv6/ndisc.c linux-2.4.22-bk2/net/ipv6/ndisc.c --- linux-2.4.22-bk1/net/ipv6/ndisc.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv6/ndisc.c 2003-08-26 15:54:22.000000000 -0700 @@ -1336,6 +1336,26 @@ return 0; } +static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + switch (event) { + case NETDEV_CHANGEADDR: + neigh_changeaddr(&nd_tbl, dev); + fib6_run_gc(0); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +struct notifier_block ndisc_netdev_notifier = { + .notifier_call = ndisc_netdev_event, +}; + int __init ndisc_init(struct net_proto_family *ops) { struct sock *sk; @@ -1377,6 +1397,7 @@ neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH, "ipv6"); #endif + register_netdevice_notifier(&ndisc_netdev_notifier); return 0; } diff -urN linux-2.4.22-bk1/net/ipv6/raw.c linux-2.4.22-bk2/net/ipv6/raw.c --- linux-2.4.22-bk1/net/ipv6/raw.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv6/raw.c 2003-08-26 15:54:22.000000000 -0700 @@ -771,6 +771,7 @@ val = -1; else val = opt->offset; + break; default: return -ENOPROTOOPT; diff -urN linux-2.4.22-bk1/net/ipv6/route.c linux-2.4.22-bk2/net/ipv6/route.c --- linux-2.4.22-bk1/net/ipv6/route.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/ipv6/route.c 2003-08-26 15:54:22.000000000 -0700 @@ -1580,13 +1580,19 @@ struct in6_addr *src, int iif, int type, u32 pid, u32 seq, - struct nlmsghdr *in_nlh) + struct nlmsghdr *in_nlh, int prefix) { struct rtmsg *rtm; struct nlmsghdr *nlh; unsigned char *b = skb->tail; struct rta_cacheinfo ci; + if (prefix) { /* user wants prefix routes only */ + if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { + /* success since this is not a prefix route */ + return 1; + } + } if (!pid && in_nlh) { pid = in_nlh->nlmsg_pid; } @@ -1667,10 +1673,17 @@ static int rt6_dump_route(struct rt6_info *rt, void *p_arg) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; + struct rtmsg *rtm; + int prefix; + + rtm = NLMSG_DATA(arg->cb->nlh); + if (rtm) + prefix = (rtm->rtm_flags & 
RTM_F_PREFIX) != 0; + else prefix = 0; return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, - NULL); + NULL, prefix); } static int fib6_dump_node(struct fib6_walker_t *w) @@ -1821,7 +1834,7 @@ fl.nl_u.ip6_u.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).pid, - nlh->nlmsg_seq, nlh); + nlh->nlmsg_seq, nlh, 0); if (err < 0) { err = -EMSGSIZE; goto out_free; @@ -1847,7 +1860,7 @@ netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); return; } - if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh) < 0) { + if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0) < 0) { kfree_skb(skb); netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); return; diff -urN linux-2.4.22-bk1/net/netsyms.c linux-2.4.22-bk2/net/netsyms.c --- linux-2.4.22-bk1/net/netsyms.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/netsyms.c 2003-08-26 15:54:22.000000000 -0700 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -193,6 +194,7 @@ EXPORT_SYMBOL(neigh_parms_release); EXPORT_SYMBOL(neigh_rand_reach_time); EXPORT_SYMBOL(neigh_compat_output); +EXPORT_SYMBOL(neigh_changeaddr); /* dst_entry */ EXPORT_SYMBOL(dst_alloc); @@ -265,6 +267,7 @@ EXPORT_SYMBOL(in_aton); EXPORT_SYMBOL(ip_mc_inc_group); EXPORT_SYMBOL(ip_mc_dec_group); +EXPORT_SYMBOL(ip_mc_join_group); EXPORT_SYMBOL(ip_finish_output); EXPORT_SYMBOL(inet_stream_ops); EXPORT_SYMBOL(inet_dgram_ops); @@ -611,4 +614,10 @@ EXPORT_SYMBOL(wireless_send_event); #endif /* CONFIG_NET_RADIO || CONFIG_NET_PCMCIA_RADIO */ +/* ethtool.c */ +EXPORT_SYMBOL(ethtool_op_get_link); +EXPORT_SYMBOL(ethtool_op_get_tx_csum); +EXPORT_SYMBOL(ethtool_op_get_sg); +EXPORT_SYMBOL(ethtool_op_set_sg); + #endif /* CONFIG_NET */ diff -urN linux-2.4.22-bk1/net/sched/sch_htb.c linux-2.4.22-bk2/net/sched/sch_htb.c --- linux-2.4.22-bk1/net/sched/sch_htb.c 2003-08-25 04:44:44.000000000 -0700 +++ linux-2.4.22-bk2/net/sched/sch_htb.c 2003-08-26 15:54:22.000000000 -0700 @@ -19,9 +19,11 @@ * code review and helpful comments on shaping * Tomasz Wrona, * created test case so that I was able to fix nasty bug + * Wilfried Weissmann + * spotted bug in dequeue code and helped with fix * and many others. thanks. * - * $Id: sch_htb.c,v 1.20 2003/06/18 19:55:49 devik Exp devik $ + * $Id: sch_htb.c,v 1.24 2003/07/28 15:25:23 devik Exp devik $ */ #include #include @@ -73,7 +75,7 @@ #define HTB_HYSTERESIS 1/* whether to use mode hysteresis for speedup */ #define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock) #define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock) -#define HTB_VER 0x3000c /* major must be matched with number suplied by TC as version */ +#define HTB_VER 0x3000d /* major must be matched with number suplied by TC as version */ #if HTB_VER >> 16 != TC_HTB_PROTOVER #error "Mismatched sch_htb.c and pkt_sch.h" @@ -98,7 +100,8 @@ from LSB */ #ifdef HTB_DEBUG -#define HTB_DBG(S,L,FMT,ARG...) if (((q->debug>>(2*S))&3) >= L) \ +#define HTB_DBG_COND(S,L) (((q->debug>>(2*S))&3) >= L) +#define HTB_DBG(S,L,FMT,ARG...) if (HTB_DBG_COND(S,L)) \ printk(KERN_DEBUG FMT,##ARG) #define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC) #define HTB_PASSQ q, @@ -114,6 +117,7 @@ rb_erase(N,R); \ (N)->rb_color = -1; } while (0) #else +#define HTB_DBG_COND(S,L) (0) #define HTB_DBG(S,L,FMT,ARG...) #define HTB_PASSQ #define HTB_ARGQ @@ -454,12 +458,14 @@ { rb_node_t *p; if ((*n)->rb_right) { + /* child at right. 
use it or its leftmost ancestor */ *n = (*n)->rb_right; while ((*n)->rb_left) *n = (*n)->rb_left; return; } while ((p = (*n)->rb_parent) != NULL) { + /* if we've arrived from left child then we have next node */ if (p->rb_left == *n) break; *n = p; } @@ -912,6 +918,7 @@ rb_node_t **pptr; } stk[TC_HTB_MAXDEPTH],*sp = stk; + BUG_TRAP(tree->rb_node); sp->root = tree->rb_node; sp->pptr = pptr; @@ -945,15 +952,36 @@ htb_dequeue_tree(struct htb_sched *q,int prio,int level) { struct sk_buff *skb = NULL; - //struct htb_sched *q = (struct htb_sched *)sch->data; struct htb_class *cl,*start; /* look initial class up in the row */ start = cl = htb_lookup_leaf (q->row[level]+prio,prio,q->ptr[level]+prio); do { - BUG_TRAP(cl && cl->un.leaf.q->q.qlen); if (!cl) return NULL; +next: + BUG_TRAP(cl); + if (!cl) return NULL; HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n", prio,level,cl->classid,cl->un.leaf.deficit[level]); + + /* class can be empty - it is unlikely but can be true if leaf + qdisc drops packets in enqueue routine or if someone used + graft operation on the leaf since last dequeue; + simply deactivate and skip such class */ + if (unlikely(cl->un.leaf.q->q.qlen == 0)) { + struct htb_class *next; + htb_deactivate(q,cl); + + /* row/level might become empty */ + if ((q->row_mask[level] & (1 << prio)) == 0) + return NULL; + + next = htb_lookup_leaf (q->row[level]+prio, + prio,q->ptr[level]+prio); + if (cl == start) /* fix start if we just deleted it */ + start = next; + cl = next; + goto next; + } if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL)) break; @@ -1201,7 +1229,8 @@ gopt.direct_pkts = q->direct_pkts; #ifdef HTB_DEBUG - htb_debug_dump(q); + if (HTB_DBG_COND(0,2)) + htb_debug_dump(q); #endif gopt.version = HTB_VER; gopt.rate2quantum = q->rate2quantum; @@ -1282,6 +1311,9 @@ return -ENOBUFS; sch_tree_lock(sch); if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) { + if (cl->prio_activity) + htb_deactivate ((struct htb_sched*)sch->data,cl); + /* TODO: is it correct ? Why CBQ doesn't do it ? */ sch->q.qlen -= (*old)->q.qlen; qdisc_reset(*old);
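One last illustration: the htb_id_find_next_upper() comments above describe the textbook in-order successor walk on a binary search tree -- take the right child's leftmost descendant, otherwise climb until arriving from a left child. A self-contained user-space sketch (node layout invented for illustration, not the kernel's rb_node):

#include <stdio.h>
#include <stddef.h>

struct node {
	struct node *left, *right, *parent;
	int key;
};

/* in-order successor, mirroring the walk patched into sch_htb.c */
static struct node *successor(struct node *n)
{
	struct node *p;

	if (n->right) {		/* child at right: take its leftmost descendant */
		n = n->right;
		while (n->left)
			n = n->left;
		return n;
	}
	/* otherwise climb; we are done once we arrive from a left child */
	while ((p = n->parent) != NULL && p->right == n)
		n = p;
	return p;		/* NULL when n held the maximum key */
}

int main(void)
{
	struct node a = { 0 }, b = { 0 }, c = { 0 };

	/* tree: b(2) at the root, a(1) as left child, c(3) as right child */
	a.key = 1; b.key = 2; c.key = 3;
	b.left = &a; b.right = &c;
	a.parent = &b; c.parent = &b;

	printf("succ(a) = %d\n", successor(&a)->key);	/* 2 */
	printf("succ(b) = %d\n", successor(&b)->key);	/* 3 */
	return 0;
}

(Strictly, the walk takes the right child's leftmost descendant; the in-tree comment's "leftmost ancestor" is a slip.)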