1 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/Documentation/Configure.help linux-2.2.19-vs-1.0.8/Documentation/Configure.help
2 --- linux-2.2.19/Documentation/Configure.help Tue Mar 27 09:33:35 2001
3 +++ linux-2.2.19-vs-1.0.8/Documentation/Configure.help Tue Mar 27 09:32:02 2001
4 @@ -2807,6 +2807,118 @@
5 The module will be called ip_masq_markfw.o. If you want to compile
6 it as a module, say M here and read Documentation/modules.txt.
8 +IP: masquerading virtual server support
9 +CONFIG_IP_MASQUERADE_VS
10 + IP Virtual Server support will let you build a virtual server
11 + based on cluster of two or more real servers. This option must
12 + be enabled for at least one of the clustered computers that will
13 + take care of intercepting incoming connections to a single IP
14 + address and scheduling them to real servers.
16 + Three request dispatching techniques are implemented, they are
17 + virtual server via NAT, virtual server via tunneling and virtual
18 + server via direct routing. The round-robin scheduling, the weighted
19 + round-robin scheduling, the weighted least-connection scheduling,
20 + the locality-based least-connection scheduling, or the
21 + locality-based least-connection with replication scheduling
22 + algorithm can be used to choose which server the connection is
23 + directed to, thus load balancing can be achieved among the servers.
24 + For more information and its administration program, please visit
27 + http://www.linuxvirtualserver.org/
28 + If you want this, say Y.
30 +IP virtual server debugging
32 + Say Y here if you want to get additional messages useful in
33 + debugging the IP virtual server code. You can change the debug
34 + level in /proc/sys/net/ipv4/vs/debug_level
36 +IP masquerading VS table size (the Nth power of 2)
37 +CONFIG_IP_MASQUERADE_VS_TAB_BITS
38 + Using a big ipvs hash table for virtual server will greatly reduce
39 + conflicts in the ipvs hash table when there are hundreds of thousands
40 + of active connections.
42 + Note the table size must be power of 2. The table size will be the
43 + value of 2 raised to the power of your input number. For example,
44 + number is 12, so the table size is 4096. Don't input the number too
45 + small, otherwise you will lose performance on it. You can adapt the
46 + table size yourself, according to your virtual server application. It
47 + is good to set the table size not far less than the number of
48 + connections per second multiplying average lasting time of connection
49 + in the table. For example, your virtual server gets 200 connections
50 + per second, the connection lasts for 200 seconds in average in the
51 + masquerading table, the table size should be not far less than
52 + 200x200, it is good to set the table size 32768 (2**15).
54 + Another note that each connection occupies 128 bytes effectively and
55 + each hash entry uses 8 bytes, so you can estimate how much memory is
56 + needed for your box.
58 +IPVS: round-robin scheduling
59 +CONFIG_IP_MASQUERADE_VS_RR
60 + The round-robin scheduling algorithm simply directs network
61 + connections to different real servers in a round-robin manner.
62 + If you want to compile it in kernel, say Y. If you want to compile
63 + it as a module, say M here and read Documentation/modules.txt.
65 +IPVS: weighted round-robin scheduling
66 +CONFIG_IP_MASQUERADE_VS_WRR
67 + The weighted round-robin scheduling algorithm directs network
68 + connections to different real servers based on server weights
69 + in a round-robin manner. Servers with higher weights receive
70 + new connections before those with fewer weights, and servers
71 + with higher weights get more connections than those with fewer
72 + weights and servers with equal weights get equal connections.
73 + If you want to compile it in kernel, say Y. If you want to compile
74 + it as a module, say M here and read Documentation/modules.txt.
76 +IPVS: least-connection scheduling
77 +CONFIG_IP_MASQUERADE_VS_LC
78 + The least-connection scheduling algorithm directs network
79 + connections to the server with the least number of active
81 + If you want to compile it in kernel, say Y. If you want to compile
82 + it as a module, say M here and read Documentation/modules.txt.
84 +IPVS: weighted least-connection scheduling
85 +CONFIG_IP_MASQUERADE_VS_WLC
86 + The weighted least-connection scheduling algorithm directs network
87 + connections to the server with the least active connections
88 + normalized by the server weight.
89 + If you want to compile it in kernel, say Y. If you want to compile
90 + it as a module, say M here and read Documentation/modules.txt.
92 +IPVS: locality-based least-connection scheduling
93 +CONFIG_IP_MASQUERADE_VS_LBLC
94 + The locality-based least-connection scheduling algorithm is for
95 + destination IP load balancing. It is usually used in cache cluster.
96 + This algorithm usually directs packet destined for an IP address to
97 + its server if the server is alive and under load. If the server is
98 + overloaded (its active connection numbers is larger than its weight)
99 + and there is a server in its half load, then allocate the weighted
100 + least-connection server to this IP address.
101 + If you want to compile it in kernel, say Y. If you want to compile
102 + it as a module, say M here and read Documentation/modules.txt.
104 +IPVS: locality-based least-connection with replication scheduling
105 +CONFIG_IP_MASQUERADE_VS_LBLCR
106 + The locality-based least-connection with replication scheduling
107 + algorithm is also for destination IP load balancing. It is
108 + usually used in cache cluster. It differs from the LBLC scheduling
109 + as follows: the load balancer maintains mappings from a target
110 + to a set of server nodes that can serve the target. Requests for
111 + a target are assigned to the least-connection node in the target's
112 + server set. If all the nodes in the server set are overloaded,
113 + it picks up a least-connection node in the cluster and adds it
114 + in the server set for the target. If the server set has not been
115 + modified for the specified time, the most loaded node is removed
116 + from the server set, in order to avoid high degree of replication.
117 + If you want to compile it in kernel, say Y. If you want to compile
118 + it as a module, say M here and read Documentation/modules.txt.
122 Sometimes it is useful to give several IP addresses to a single
123 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/linux/ip_masq.h linux-2.2.19-vs-1.0.8/include/linux/ip_masq.h
124 --- linux-2.2.19/include/linux/ip_masq.h Sat Oct 23 17:02:32 1999
125 +++ linux-2.2.19-vs-1.0.8/include/linux/ip_masq.h Tue Dec 12 19:17:27 2000
128 #define IP_MASQ_MFW_SCHED 0x01
131 + * Virtual server stuff
134 + /* virtual service options */
135 + u_int16_t protocol;
136 + u_int32_t vaddr; /* virtual address */
138 + u_int32_t vfwmark; /* firewall mark of virtual */
139 + unsigned vs_flags; /* virtual service flags */
140 + unsigned timeout; /* persistent timeout in ticks */
141 + u_int32_t netmask; /* persistent netmask */
143 + /* destination specific options */
144 + u_int32_t daddr; /* real destination address */
146 + unsigned masq_flags; /* destination flags */
147 + int weight; /* destination weight */
151 #define IP_FW_MASQCTL_MAX 256
152 #define IP_MASQ_TNAME_MAX 32
155 struct ip_autofw_user autofw_user;
156 struct ip_mfw_user mfw_user;
157 struct ip_masq_user user;
158 + struct ip_vs_user vs_user;
159 unsigned char m_raw[IP_FW_MASQCTL_MAX];
163 #define IP_MASQ_TARGET_CORE 1
164 #define IP_MASQ_TARGET_MOD 2 /* masq_mod is selected by "name" */
165 #define IP_MASQ_TARGET_USER 3
166 -#define IP_MASQ_TARGET_LAST 4
167 +#define IP_MASQ_TARGET_VS 4
168 +#define IP_MASQ_TARGET_LAST 5
171 #define IP_MASQ_CMD_NONE 0 /* just peek */
172 #define IP_MASQ_CMD_INSERT 1
174 #define IP_MASQ_CMD_LIST 7 /* actually fake: done via /proc */
175 #define IP_MASQ_CMD_ENABLE 8
176 #define IP_MASQ_CMD_DISABLE 9
177 +#define IP_MASQ_CMD_ADD_DEST 10 /* for adding dest in IPVS */
178 +#define IP_MASQ_CMD_DEL_DEST 11 /* for deleting dest in IPVS */
179 +#define IP_MASQ_CMD_SET_DEST 12 /* for setting dest in IPVS */
181 #endif /* _LINUX_IP_MASQ_H */
183 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/linux/sysctl.h linux-2.2.19-vs-1.0.8/include/linux/sysctl.h
184 --- linux-2.2.19/include/linux/sysctl.h Tue Mar 27 09:33:48 2001
185 +++ linux-2.2.19-vs-1.0.8/include/linux/sysctl.h Tue Mar 27 09:32:20 2001
189 NET_IPV4_FIB_HASH=19,
192 NET_IPV4_TCP_TIMESTAMPS=33,
193 NET_IPV4_TCP_WINDOW_SCALING=34,
195 NET_IPV4_CONF_LOG_MARTIANS=11,
196 NET_IPV4_CONF_HIDDEN=12,
197 NET_IPV4_CONF_ARPFILTER=13
200 +/* /proc/sys/net/ipv4/vs */
204 + NET_IPV4_VS_AMEMTHRESH=1,
205 + NET_IPV4_VS_AMDROPRATE=2,
206 + NET_IPV4_VS_DROP_ENTRY=3,
207 + NET_IPV4_VS_DROP_PACKET=4,
208 + NET_IPV4_VS_SECURE_TCP=5,
209 + NET_IPV4_VS_TO_ES=6,
210 + NET_IPV4_VS_TO_SS=7,
211 + NET_IPV4_VS_TO_SR=8,
212 + NET_IPV4_VS_TO_FW=9,
213 + NET_IPV4_VS_TO_TW=10,
214 + NET_IPV4_VS_TO_CL=11,
215 + NET_IPV4_VS_TO_CW=12,
216 + NET_IPV4_VS_TO_LA=13,
217 + NET_IPV4_VS_TO_LI=14,
218 + NET_IPV4_VS_TO_SA=15,
219 + NET_IPV4_VS_TO_UDP=16,
220 + NET_IPV4_VS_TO_ICMP=17,
221 + NET_IPV4_VS_DEBUG_LEVEL=18,
222 + NET_IPV4_VS_LBLC_EXPIRE=19,
223 + NET_IPV4_VS_LBLCR_EXPIRE=20,
226 /* /proc/sys/net/ipv6 */
227 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip.h linux-2.2.19-vs-1.0.8/include/net/ip.h
228 --- linux-2.2.19/include/net/ip.h Tue Mar 27 09:33:48 2001
229 +++ linux-2.2.19-vs-1.0.8/include/net/ip.h Tue Mar 27 17:48:23 2001
231 #define IPSKB_MASQUERADED 1
232 #define IPSKB_TRANSLATED 2
233 #define IPSKB_FORWARDED 4
234 +#ifdef CONFIG_IP_MASQUERADE_VS
235 +#define IPSKB_REDIRECTED 8
240 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip_masq.h linux-2.2.19-vs-1.0.8/include/net/ip_masq.h
241 --- linux-2.2.19/include/net/ip_masq.h Tue Mar 27 09:33:48 2001
242 +++ linux-2.2.19-vs-1.0.8/include/net/ip_masq.h Wed Apr 18 16:17:59 2001
244 #include <linux/ip.h>
245 #include <linux/skbuff.h>
246 #include <linux/list.h>
247 +#ifdef CONFIG_SYSCTL
248 +#include <linux/sysctl.h>
250 #endif /* __KERNEL__ */
252 +#ifdef CONFIG_IP_MASQUERADE_VS
257 * This define affects the number of ports that can be handled
258 * by each of the protocol helper modules.
260 #define IP_MASQ_MOD_CTL 0x00
261 #define IP_MASQ_USER_CTL 0x01
265 -#define IP_MASQ_TAB_SIZE 256
267 #define IP_MASQ_F_NO_DADDR 0x0001 /* no daddr yet */
268 #define IP_MASQ_F_NO_DPORT 0x0002 /* no dport set yet */
269 #define IP_MASQ_F_NO_SADDR 0x0004 /* no sport set yet */
271 #define IP_MASQ_F_USER 0x2000 /* from uspace */
272 #define IP_MASQ_F_SIMPLE_HASH 0x8000 /* prevent s+d and m+d hashing */
274 +#ifdef CONFIG_IP_MASQUERADE_VS
275 +#define IP_MASQ_F_VS 0x00010000 /* virtual server related */
276 +#define IP_MASQ_F_VS_NO_OUTPUT 0x00020000 /* output packets avoid masq */
277 +#define IP_MASQ_F_VS_INACTIVE 0x00040000 /* not established */
278 +#define IP_MASQ_F_VS_FWD_MASK 0x00700000 /* mask for the fdw method */
279 +#define IP_MASQ_F_VS_LOCALNODE 0x00100000 /* local node destination */
280 +#define IP_MASQ_F_VS_TUNNEL 0x00200000 /* packets will be tunneled */
281 +#define IP_MASQ_F_VS_DROUTE 0x00400000 /* direct routing */
282 + /* masquerading otherwise */
283 +#define IP_MASQ_VS_FWD(ms) (ms->flags & IP_MASQ_F_VS_FWD_MASK)
284 +#endif /* CONFIG_IP_MASQUERADE_VS */
288 +#define IP_MASQ_TAB_SIZE 256
291 * Delta seq. info structure
292 * Each MASQ struct has 2 (output AND input seq. changes).
294 struct ip_masq *control; /* Master control connection */
295 atomic_t n_control; /* Number of "controlled" masqs */
296 unsigned flags; /* status flags */
297 - unsigned timeout; /* timeout */
298 + unsigned long timeout; /* timeout */
299 unsigned state; /* state info */
300 struct ip_masq_timeout_table *timeout_table;
301 +#ifdef CONFIG_IP_MASQUERADE_VS
302 + struct ip_vs_dest *dest; /* real server */
303 + atomic_t in_pkts; /* incoming packet counter */
304 +#endif /* CONFIG_IP_MASQUERADE_VS */
309 extern struct list_head ip_masq_d_table[IP_MASQ_TAB_SIZE];
310 extern const char * ip_masq_state_name(int state);
311 extern struct ip_masq_hook *ip_masq_user_hook;
312 -extern u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope);
313 +extern int ip_masq_select_addr(struct sk_buff *skb,__u32 *maddr);
316 * IP_MASQ_APP: IP application masquerading definitions
318 static const char *strProt[] = {"UDP","TCP","ICMP"};
319 int msproto = masq_proto_num(proto);
321 +#ifdef CONFIG_IP_MASQUERADE_VS
322 + if (proto == IPPROTO_IP)
324 +#endif /* CONFIG_IP_MASQUERADE_VS */
325 if (msproto<0||msproto>2) {
326 sprintf(buf, "IP_%d", proto);
329 IP_MASQ_S_CLOSE_WAIT,
332 +#ifdef CONFIG_IP_MASQUERADE_VS
342 + ms->timeout_table = NULL;
343 atomic_dec(&mstim->refcnt);
346 +#ifdef CONFIG_IP_MASQUERADE_VS
348 +extern struct ip_masq_timeout_table masq_timeout_table_dos;
349 +extern void ip_masq_secure_tcp_set(int on);
352 + * This is a simple mechanism to ignore packets when
353 + * we are loaded. Just set ip_masq_drop_rate to 'n' and
354 + * we start to drop 1/n of the packets
357 +extern int ip_masq_drop_rate;
358 +extern int ip_masq_drop_counter;
360 +static __inline__ int ip_masq_todrop(void)
362 + if (!ip_masq_drop_rate) return 0;
363 + if (--ip_masq_drop_counter > 0) return 0;
364 + ip_masq_drop_counter = ip_masq_drop_rate;
368 +#endif /* CONFIG_IP_MASQUERADE_VS */
370 #endif /* __KERNEL__ */
372 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip_vs.h linux-2.2.19-vs-1.0.8/include/net/ip_vs.h
373 --- linux-2.2.19/include/net/ip_vs.h Thu Jan 1 08:00:00 1970
374 +++ linux-2.2.19-vs-1.0.8/include/net/ip_vs.h Tue Apr 24 18:07:00 2001
377 + * IP virtual server
378 + * data structure and functionality definitions
381 +#include <linux/config.h>
386 +#define IP_VS_VERSION_CODE 0x010008
387 +#define NVERSION(version) \
388 + (version >> 16) & 0xFF, \
389 + (version >> 8) & 0xFF, \
393 + * Virtual Service Flags
395 +#define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */
396 +#define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */
399 + * Destination Server Flags
401 +#define IP_VS_DEST_F_AVAILABLE 0x0001 /* Available tag */
404 + * The default IP_VS_TEMPLATE_TIMEOUT is a little larger than average
405 + * connection time plus MASQUERADE_EXPIRE_TCP_FIN(2*60*HZ). Because the
406 + * template won't be released until its controlled masq entries are
408 + If IP_VS_TEMPLATE_TIMEOUT is too small, the template will soon expire
409 + * and will be put in expire again and again, which requires additional
410 + overhead. If it is too large, the same client will always visit the same
411 + * server, which will make dynamic load imbalance worse.
413 +#define IP_VS_TEMPLATE_TIMEOUT 6*60*HZ
417 +extern int ip_vs_forwarding_related_icmp(struct sk_buff *skb);
419 +#ifdef CONFIG_IP_VS_DEBUG
420 +extern int ip_vs_get_debug_level(void);
421 +#define IP_VS_DBG(level, msg...) \
423 + if (level <= ip_vs_get_debug_level()) \
424 + printk(KERN_DEBUG "IPVS: " ## msg); \
426 +#else /* NO DEBUGGING at ALL */
427 +#define IP_VS_DBG(level, msg...) do {} while (0)
430 +#define IP_VS_ERR(msg...) printk(KERN_ERR "IPVS: " ## msg )
431 +#define IP_VS_INFO(msg...) printk(KERN_INFO "IPVS: " ## msg )
432 +#define IP_VS_WARNING(msg...) \
433 + printk(KERN_WARNING "IPVS: " ## msg)
435 +#ifdef CONFIG_IP_VS_DEBUG
436 +#define EnterFunction(level) \
438 + if (level <= ip_vs_get_debug_level()) \
439 + printk(KERN_DEBUG "Enter: %s, %s line %i\n", \
440 + __FUNCTION__, __FILE__, __LINE__); \
442 +#define LeaveFunction(level) \
444 + if (level <= ip_vs_get_debug_level()) \
445 + printk(KERN_DEBUG "Leave: %s, %s line %i\n", \
446 + __FUNCTION__, __FILE__, __LINE__); \
449 +#define EnterFunction(level) do {} while (0)
450 +#define LeaveFunction(level) do {} while (0)
455 + * IPVS statistics object
459 + spinlock_t lock; /* spin lock */
460 + __u32 conns; /* connections scheduled */
461 + __u32 inpkts; /* incoming packets */
462 + __u32 outpkts; /* outgoing packets */
463 + __u64 inbytes; /* incoming bytes */
464 + __u64 outbytes; /* outgoing bytes */
469 + * The real server destination forwarding entry
470 + * with ip address, port
473 + struct list_head n_list; /* for the dests in the service */
474 + struct list_head d_list; /* for table with all the dests */
476 + __u32 addr; /* IP address of real server */
477 + __u16 port; /* port number of the service */
478 + unsigned flags; /* dest status flags */
479 + unsigned masq_flags; /* flags to copy to masq */
480 + atomic_t activeconns; /* active connections */
481 + atomic_t inactconns; /* inactive connections */
482 + atomic_t refcnt; /* reference counter */
483 + int weight; /* server weight */
484 + struct ip_vs_stats stats; /* statistics */
486 + /* for virtual service */
487 + struct ip_vs_service *svc; /* service that it belongs to */
488 + __u16 protocol; /* which protocol (TCP/UDP) */
489 + __u32 vaddr; /* IP address for virtual service */
490 + __u16 vport; /* port number for the service */
491 + __u32 vfwmark; /* firewall mark of the service */
496 + * The scheduler object
498 +struct ip_vs_scheduler {
499 + struct list_head n_list; /* d-linked list head */
500 + char *name; /* scheduler name */
501 + atomic_t refcnt; /* reference counter */
503 + /* scheduler initializing service */
504 + int (*init_service)(struct ip_vs_service *svc);
505 + /* scheduling service finish */
506 + int (*done_service)(struct ip_vs_service *svc);
507 + /* scheduler updating service */
508 + int (*update_service)(struct ip_vs_service *svc);
510 + /* selecting a server from the given service */
511 + struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc,
512 + struct iphdr *iph);
517 + * The information about the virtual service offered to the net
518 + * and the forwarding entries
520 +struct ip_vs_service {
521 + struct list_head s_list; /* for normal service table */
522 + struct list_head f_list; /* for fwmark-based service table */
523 + atomic_t refcnt; /* reference counter */
525 + __u16 protocol; /* which protocol (TCP/UDP) */
526 + __u32 addr; /* IP address for virtual service */
527 + __u16 port; /* port number for the service */
528 + __u32 fwmark; /* firewall mark of the service */
529 + unsigned flags; /* service status flags */
530 + unsigned timeout; /* persistent timeout in ticks */
531 + __u32 netmask; /* grouping granularity */
532 + struct list_head destinations; /* real server d-linked list */
533 + struct ip_vs_stats stats; /* statistics for the service */
535 + /* for scheduling */
536 + struct ip_vs_scheduler *scheduler; /* bound scheduler object */
537 + void *sched_data; /* scheduler application data */
542 + * IP Virtual Server masq entry hash table
544 +#define IP_VS_TAB_BITS CONFIG_IP_MASQUERADE_VS_TAB_BITS
545 +#define IP_VS_TAB_SIZE (1 << IP_VS_TAB_BITS)
546 +#define IP_VS_TAB_MASK (IP_VS_TAB_SIZE - 1)
547 +extern struct list_head *ip_vs_table;
550 + * Hash and unhash functions
552 +extern int ip_vs_hash(struct ip_masq *ms);
553 +extern int ip_vs_unhash(struct ip_masq *ms);
556 + * Registering/unregistering scheduler functions
558 +extern int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
559 +extern int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
562 + * Lookup functions for the hash table (caller must lock table)
564 +extern struct ip_masq * __ip_vs_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port);
565 +extern struct ip_masq * __ip_vs_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port);
568 + * Creating a masquerading entry for IPVS
570 +extern struct ip_masq * ip_masq_new_vs(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned flags);
573 + * IPVS data and functions
575 +extern rwlock_t __ip_vs_lock;
577 +extern void ip_vs_set_state(struct ip_masq *ms, int new_state);
578 +extern void ip_vs_bind_masq(struct ip_masq *ms, struct ip_vs_dest *dest);
579 +extern void ip_vs_unbind_masq(struct ip_masq *ms);
581 +extern int ip_vs_ctl(int optname, struct ip_masq_ctl *mctl, int optlen);
582 +extern struct ip_vs_service *
583 +ip_vs_lookup_service(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport);
584 +extern struct ip_vs_service * ip_vs_lookup_svc_fwm(__u32 fwmark);
585 +extern struct ip_vs_dest *
586 +__ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport);
587 +extern struct ip_vs_dest *ip_vs_lookup_dest(struct ip_vs_service *svc,
588 + __u32 daddr, __u16 dport);
589 +extern struct ip_masq * ip_vs_schedule(struct ip_vs_service *svc,
590 + struct iphdr *iph);
591 +extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb);
592 +extern int ip_vs_tunnel_xmit(struct sk_buff *skb, __u32 daddr);
593 +extern int ip_vs_dr_xmit(struct sk_buff *skb, __u32 daddr);
598 +extern int ip_vs_init(void);
601 + * init function prototypes for scheduling modules
602 + * these function will be called when they are built in kernel
604 +extern int ip_vs_rr_init(void);
605 +extern int ip_vs_wrr_init(void);
606 +extern int ip_vs_lc_init(void);
607 +extern int ip_vs_wlc_init(void);
608 +extern int ip_vs_lblc_init(void);
609 +extern int ip_vs_lblcr_init(void);
613 + * Slow timer functions for IPVS
615 +extern void add_sltimer(struct timer_list * timer);
616 +extern int del_sltimer(struct timer_list * timer);
617 +extern void mod_sltimer(struct timer_list *timer, unsigned long expires);
621 + * IP Virtual Server statistics
623 +extern struct ip_vs_stats ip_vs_stats;
625 +extern __inline__ void
626 +ip_vs_in_stats(struct ip_masq *ms, struct sk_buff *skb)
628 + struct ip_vs_dest *dest = ms->dest;
629 + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
630 + spin_lock(&dest->stats.lock);
631 + dest->stats.inpkts++;
632 + dest->stats.inbytes += skb->len;
633 + spin_unlock(&dest->stats.lock);
635 + spin_lock(&dest->svc->stats.lock);
636 + dest->svc->stats.inpkts++;
637 + dest->svc->stats.inbytes += skb->len;
638 + spin_unlock(&dest->svc->stats.lock);
640 + spin_lock(&ip_vs_stats.lock);
641 + ip_vs_stats.inpkts++;
642 + ip_vs_stats.inbytes += skb->len;
643 + spin_unlock(&ip_vs_stats.lock);
648 +extern __inline__ void
649 +ip_vs_out_stats(struct ip_masq *ms, struct sk_buff *skb)
651 + struct ip_vs_dest *dest = ms->dest;
652 + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
653 + spin_lock(&dest->stats.lock);
654 + dest->stats.outpkts++;
655 + dest->stats.outbytes += skb->len;
656 + spin_unlock(&dest->stats.lock);
658 + spin_lock(&dest->svc->stats.lock);
659 + dest->svc->stats.outpkts++;
660 + dest->svc->stats.outbytes += skb->len;
661 + spin_unlock(&dest->svc->stats.lock);
663 + spin_lock(&ip_vs_stats.lock);
664 + ip_vs_stats.outpkts++;
665 + ip_vs_stats.outbytes += skb->len;
666 + spin_unlock(&ip_vs_stats.lock);
671 +extern __inline__ void
672 +ip_vs_conn_stats(struct ip_masq *ms, struct ip_vs_service *svc)
674 + spin_lock(&ms->dest->stats.lock);
675 + ms->dest->stats.conns++;
676 + spin_unlock(&ms->dest->stats.lock);
678 + spin_lock(&svc->stats.lock);
679 + svc->stats.conns++;
680 + spin_unlock(&svc->stats.lock);
682 + spin_lock(&ip_vs_stats.lock);
683 + ip_vs_stats.conns++;
684 + spin_unlock(&ip_vs_stats.lock);
689 + * ip_vs_fwd_tag returns the forwarding tag of the masq
691 +extern __inline__ char ip_vs_fwd_tag(struct ip_masq *ms)
695 + switch (IP_MASQ_VS_FWD(ms)) {
696 + case IP_MASQ_F_VS_LOCALNODE: fwd = 'L'; break;
697 + case IP_MASQ_F_VS_TUNNEL: fwd = 'T'; break;
698 + case IP_MASQ_F_VS_DROUTE: fwd = 'R'; break;
704 +extern __inline__ char * ip_vs_fwd_name(unsigned masq_flags)
708 + switch (masq_flags & IP_MASQ_F_VS_FWD_MASK) {
709 + case IP_MASQ_F_VS_LOCALNODE:
712 + case IP_MASQ_F_VS_TUNNEL:
715 + case IP_MASQ_F_VS_DROUTE:
726 + * ip_vs_forward forwards the packet through tunneling, direct
727 + * routing or local node (passing to the upper layer).
728 + * Return values mean:
729 + * 0 skb must be passed to the upper layer
730 + * -1 skb must be released
731 + * -2 skb has been released
733 +extern __inline__ int ip_vs_forward(struct sk_buff *skb, struct ip_masq *ms)
737 + atomic_inc(&ms->in_pkts);
739 + switch (IP_MASQ_VS_FWD(ms)) {
740 + case IP_MASQ_F_VS_TUNNEL:
741 + if (ip_vs_tunnel_xmit(skb, ms->saddr) == 0) {
742 + IP_VS_DBG(10, "tunneling failed.\n");
744 + IP_VS_DBG(10, "tunneling succeeded.\n");
749 + case IP_MASQ_F_VS_DROUTE:
750 + if (ip_vs_dr_xmit(skb, ms->saddr) == 0) {
751 + IP_VS_DBG(10, "direct routing failed.\n");
753 + IP_VS_DBG(10, "direct routing succeeded.\n");
758 + case IP_MASQ_F_VS_LOCALNODE:
765 +#endif /* __KERNEL__ */
767 +#endif /* _IP_VS_H */
768 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/Config.in linux-2.2.19-vs-1.0.8/net/ipv4/Config.in
769 --- linux-2.2.19/net/ipv4/Config.in Sat Dec 16 23:10:12 2000
770 +++ linux-2.2.19-vs-1.0.8/net/ipv4/Config.in Tue Dec 12 18:35:06 2000
772 tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW
773 tristate 'IP: ip fwmark masq-forwarding support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_MFW
775 + bool 'IP: masquerading virtual server support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_VS
776 + if [ "$CONFIG_IP_MASQUERADE_VS" = "y" ]; then
777 + bool ' IP virtual server debugging' CONFIG_IP_VS_DEBUG
778 + int ' IP masquerading VS table size (the Nth power of 2)' CONFIG_IP_MASQUERADE_VS_TAB_BITS 12
779 + tristate ' IPVS: round-robin scheduling' CONFIG_IP_MASQUERADE_VS_RR
780 + tristate ' IPVS: weighted round-robin scheduling' CONFIG_IP_MASQUERADE_VS_WRR
781 + tristate ' IPVS: least-connection scheduling' CONFIG_IP_MASQUERADE_VS_LC
782 + tristate ' IPVS: weighted least-connection scheduling' CONFIG_IP_MASQUERADE_VS_WLC
783 + tristate ' IPVS: locality-based least-connection scheduling' CONFIG_IP_MASQUERADE_VS_LBLC
784 + tristate ' IPVS: locality-based least-connection with replication scheduling' CONFIG_IP_MASQUERADE_VS_LBLCR
789 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/Makefile linux-2.2.19-vs-1.0.8/net/ipv4/Makefile
790 --- linux-2.2.19/net/ipv4/Makefile Tue Jan 5 07:31:34 1999
791 +++ linux-2.2.19-vs-1.0.8/net/ipv4/Makefile Sat Dec 2 22:32:10 2000
796 +ifeq ($(CONFIG_IP_MASQUERADE_VS),y)
797 + IPV4X_OBJS += ip_vs.o
799 + ifeq ($(CONFIG_IP_MASQUERADE_VS_RR),y)
800 + IPV4_OBJS += ip_vs_rr.o
802 + ifeq ($(CONFIG_IP_MASQUERADE_VS_RR),m)
803 + M_OBJS += ip_vs_rr.o
807 + ifeq ($(CONFIG_IP_MASQUERADE_VS_WRR),y)
808 + IPV4_OBJS += ip_vs_wrr.o
810 + ifeq ($(CONFIG_IP_MASQUERADE_VS_WRR),m)
811 + M_OBJS += ip_vs_wrr.o
815 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LC),y)
816 + IPV4_OBJS += ip_vs_lc.o
818 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LC),m)
819 + M_OBJS += ip_vs_lc.o
823 + ifeq ($(CONFIG_IP_MASQUERADE_VS_WLC),y)
824 + IPV4_OBJS += ip_vs_wlc.o
826 + ifeq ($(CONFIG_IP_MASQUERADE_VS_WLC),m)
827 + M_OBJS += ip_vs_wlc.o
831 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLC),y)
832 + IPV4_OBJS += ip_vs_lblc.o
834 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLC),m)
835 + M_OBJS += ip_vs_lblc.o
839 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLCR),y)
840 + IPV4_OBJS += ip_vs_lblcr.o
842 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLCR),m)
843 + M_OBJS += ip_vs_lblcr.o
848 M_OBJS += ip_masq_user.o
849 M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o ip_masq_quake.o
850 M_OBJS += ip_masq_vdolive.o ip_masq_cuseeme.o
851 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_forward.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_forward.c
852 --- linux-2.2.19/net/ipv4/ip_forward.c Fri Jan 7 09:45:02 2000
853 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_forward.c Fri Feb 2 15:38:28 2001
855 #include <linux/ip_fw.h>
856 #ifdef CONFIG_IP_MASQUERADE
857 #include <net/ip_masq.h>
858 +#ifdef CONFIG_IP_MASQUERADE_VS
859 +#include <net/ip_vs.h>
862 #include <net/checksum.h>
863 #include <linux/route.h>
868 +#ifdef CONFIG_IP_MASQUERADE_VS
869 + if (iph->protocol == IPPROTO_ICMP &&
870 + !(IPCB(skb)->flags&IPSKB_MASQUERADED)) {
871 + /* Related ICMP packet for IPVS ? */
872 + fw_res = ip_vs_forwarding_related_icmp(skb);
873 + if (fw_res > 0) return ip_local_deliver(skb);
877 #ifdef CONFIG_IP_TRANSPARENT_PROXY
879 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_input.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_input.c
880 --- linux-2.2.19/net/ipv4/ip_input.c Tue Mar 27 09:33:49 2001
881 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_input.c Tue Mar 27 09:32:21 2001
887 +#ifdef CONFIG_IP_MASQUERADE_VS
888 + if((IPCB(skb)->flags&IPSKB_REDIRECTED)) {
889 + printk(KERN_DEBUG "ip_input(): ipvs recursion detected. Check ipvs configuration\n");
896 * Some masq modules can re-inject packets if
901 ret = ip_fw_demasquerade(&skb);
902 +#ifdef CONFIG_IP_MASQUERADE_VS
904 + /* skb has already been released */
911 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_masq.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_masq.c
912 --- linux-2.2.19/net/ipv4/ip_masq.c Tue Mar 27 09:33:49 2001
913 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_masq.c Wed Apr 18 19:58:48 2001
915 * Kai Bankett : do not toss other IP protos in proto_doff()
916 * Dan Kegel : pointed correct NAT behavior for UDP streams
917 * Julian Anastasov : use daddr and dport as hash keys
919 + * Wensong Zhang : Added virtual server support
920 + * Peter Kese : added masq TCP state handling for input-only
921 + * Julian Anastasov : step to mSR after SYN in INPUT_ONLY table
922 + * Julian Anastasov : fixed huge expire bug for IPVS after bad checksum
923 + * Wensong Zhang : added server status checking for IPVS
927 #include <linux/config.h>
929 #include <linux/ip_fw.h>
930 #include <linux/ip_masq.h>
932 +#ifdef CONFIG_IP_MASQUERADE_VS
933 +#include <net/ip_vs.h>
934 +#endif /* CONFIG_IP_MASQUERADE_VS */
936 int sysctl_ip_masq_debug = 0;
937 int sysctl_ip_masq_udp_dloose = 0;
941 struct ip_masq_hook *ip_masq_user_hook = NULL;
943 +#ifdef CONFIG_IP_MASQUERADE_VS
945 + * Use different state/timeout tables
947 +#ifndef IP_MASQ_MANY_STATE_TABLES
948 +#define IP_MASQ_MANY_STATE_TABLES
951 +int ip_masq_drop_rate = 0;
952 +int ip_masq_drop_counter = 0;
956 +#ifndef CONFIG_IP_MASQUERADE_VS
959 * Timeout table[state]
961 @@ -106,38 +130,104 @@
962 ATOMIC_INIT(0), /* refcnt */
965 - 30*60*HZ, /* IP_MASQ_S_NONE, */
966 - 15*60*HZ, /* IP_MASQ_S_ESTABLISHED, */
967 - 2*60*HZ, /* IP_MASQ_S_SYN_SENT, */
968 - 1*60*HZ, /* IP_MASQ_S_SYN_RECV, */
969 - 2*60*HZ, /* IP_MASQ_S_FIN_WAIT, */
970 - 2*60*HZ, /* IP_MASQ_S_TIME_WAIT, */
971 - 10*HZ, /* IP_MASQ_S_CLOSE, */
972 - 60*HZ, /* IP_MASQ_S_CLOSE_WAIT, */
973 - 30*HZ, /* IP_MASQ_S_LAST_ACK, */
974 - 2*60*HZ, /* IP_MASQ_S_LISTEN, */
975 - 5*60*HZ, /* IP_MASQ_S_UDP, */
976 - 1*60*HZ, /* IP_MASQ_S_ICMP, */
977 - 2*HZ,/* IP_MASQ_S_LAST */
978 + [IP_MASQ_S_NONE] = 30*60*HZ,
979 + [IP_MASQ_S_ESTABLISHED] = 15*60*HZ,
980 + [IP_MASQ_S_SYN_SENT] = 2*60*HZ,
981 + [IP_MASQ_S_SYN_RECV] = 1*60*HZ,
982 + [IP_MASQ_S_FIN_WAIT] = 2*60*HZ,
983 + [IP_MASQ_S_TIME_WAIT] = 2*60*HZ,
984 + [IP_MASQ_S_CLOSE] = 10*HZ,
985 + [IP_MASQ_S_CLOSE_WAIT] = 60*HZ,
986 + [IP_MASQ_S_LAST_ACK] = 30*HZ,
987 + [IP_MASQ_S_LISTEN] = 2*60*HZ,
988 + [IP_MASQ_S_UDP] = 5*60*HZ,
989 + [IP_MASQ_S_ICMP] = 1*60*HZ,
990 + [IP_MASQ_S_LAST] = 2*HZ,
994 +#else /* CONFIG_IP_MASQUERADE_VS */
997 + * Timeout table[state]
999 +/* static int masq_timeout_table[IP_MASQ_S_LAST+1] = { */
1000 +static struct ip_masq_timeout_table masq_timeout_table = {
1001 + ATOMIC_INIT(0), /* refcnt */
1004 + [IP_MASQ_S_NONE] = 30*60*HZ,
1005 + [IP_MASQ_S_ESTABLISHED] = 15*60*HZ,
1006 + [IP_MASQ_S_SYN_SENT] = 2*60*HZ,
1007 + [IP_MASQ_S_SYN_RECV] = 1*60*HZ,
1008 + [IP_MASQ_S_FIN_WAIT] = 2*60*HZ,
1009 + [IP_MASQ_S_TIME_WAIT] = 2*60*HZ,
1010 + [IP_MASQ_S_CLOSE] = 10*HZ,
1011 + [IP_MASQ_S_CLOSE_WAIT] = 60*HZ,
1012 + [IP_MASQ_S_LAST_ACK] = 30*HZ,
1013 + [IP_MASQ_S_LISTEN] = 2*60*HZ,
1014 + [IP_MASQ_S_SYNACK] = 120*HZ,
1015 + [IP_MASQ_S_UDP] = 5*60*HZ,
1016 + [IP_MASQ_S_ICMP] = 1*60*HZ,
1017 + [IP_MASQ_S_LAST] = 2*HZ,
1022 +struct ip_masq_timeout_table masq_timeout_table_dos = {
1023 + ATOMIC_INIT(0), /* refcnt */
1026 + [IP_MASQ_S_NONE] = 15*60*HZ,
1027 + [IP_MASQ_S_ESTABLISHED] = 8*60*HZ,
1028 + [IP_MASQ_S_SYN_SENT] = 60*HZ,
1029 + [IP_MASQ_S_SYN_RECV] = 10*HZ,
1030 + [IP_MASQ_S_FIN_WAIT] = 60*HZ,
1031 + [IP_MASQ_S_TIME_WAIT] = 60*HZ,
1032 + [IP_MASQ_S_CLOSE] = 10*HZ,
1033 + [IP_MASQ_S_CLOSE_WAIT] = 60*HZ,
1034 + [IP_MASQ_S_LAST_ACK] = 30*HZ,
1035 + [IP_MASQ_S_LISTEN] = 2*60*HZ,
1036 + [IP_MASQ_S_SYNACK] = 100*HZ,
1037 + [IP_MASQ_S_UDP] = 3*60*HZ,
1038 + [IP_MASQ_S_ICMP] = 1*60*HZ,
1039 + [IP_MASQ_S_LAST] = 2*HZ,
1044 + * Timeout table to use for the VS entries
1045 + * If NULL we use the default table (masq_timeout_table).
1046 + * Under flood attack we switch to masq_timeout_table_dos
1049 +struct ip_masq_timeout_table *ip_vs_timeout_table = &masq_timeout_table;
1051 +#endif /* CONFIG_IP_MASQUERADE_VS */
1053 +#ifdef CONFIG_IP_MASQUERADE_VS
1054 +#define MASQUERADE_EXPIRE_RETRY(ms) (ms->timeout_table? ms->timeout_table->timeout[IP_MASQ_S_TIME_WAIT] : masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT])
1056 #define MASQUERADE_EXPIRE_RETRY masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT]
1059 static const char * state_name_table[IP_MASQ_S_LAST+1] = {
1060 - "NONE", /* IP_MASQ_S_NONE, */
1061 - "ESTABLISHED", /* IP_MASQ_S_ESTABLISHED, */
1062 - "SYN_SENT", /* IP_MASQ_S_SYN_SENT, */
1063 - "SYN_RECV", /* IP_MASQ_S_SYN_RECV, */
1064 - "FIN_WAIT", /* IP_MASQ_S_FIN_WAIT, */
1065 - "TIME_WAIT", /* IP_MASQ_S_TIME_WAIT, */
1066 - "CLOSE", /* IP_MASQ_S_CLOSE, */
1067 - "CLOSE_WAIT", /* IP_MASQ_S_CLOSE_WAIT, */
1068 - "LAST_ACK", /* IP_MASQ_S_LAST_ACK, */
1069 - "LISTEN", /* IP_MASQ_S_LISTEN, */
1070 - "UDP", /* IP_MASQ_S_UDP, */
1071 - "ICMP", /* IP_MASQ_S_ICMP, */
1072 - "BUG!", /* IP_MASQ_S_LAST */
1073 + [IP_MASQ_S_NONE] = "NONE",
1074 + [IP_MASQ_S_ESTABLISHED] = "ESTABLISHED",
1075 + [IP_MASQ_S_SYN_SENT] = "SYN_SENT",
1076 + [IP_MASQ_S_SYN_RECV] = "SYN_RECV",
1077 + [IP_MASQ_S_FIN_WAIT] = "FIN_WAIT",
1078 + [IP_MASQ_S_TIME_WAIT] = "TIME_WAIT",
1079 + [IP_MASQ_S_CLOSE] = "CLOSE",
1080 + [IP_MASQ_S_CLOSE_WAIT] = "CLOSE_WAIT",
1081 + [IP_MASQ_S_LAST_ACK] = "LAST_ACK",
1082 + [IP_MASQ_S_LISTEN] = "LISTEN",
1083 +#ifdef CONFIG_IP_MASQUERADE_VS
1084 + [IP_MASQ_S_SYNACK] = "SYNACK",
1086 + [IP_MASQ_S_UDP] = "UDP",
1087 + [IP_MASQ_S_ICMP] = "ICMP",
1088 + [IP_MASQ_S_LAST] = "BUG!",
1091 #define mNO IP_MASQ_S_NONE
1093 #define mCW IP_MASQ_S_CLOSE_WAIT
1094 #define mLA IP_MASQ_S_LAST_ACK
1095 #define mLI IP_MASQ_S_LISTEN
1096 +#ifdef CONFIG_IP_MASQUERADE_VS
1097 +#define mSA IP_MASQ_S_SYNACK
1100 struct masq_tcp_states_t {
1101 int next_state[IP_MASQ_S_LAST]; /* should be _LAST_TCP */
1102 @@ -159,46 +252,111 @@
1104 if (state >= IP_MASQ_S_LAST)
1106 - return state_name_table[state];
1107 + return state_name_table[state] ? state_name_table[state] : "?";
1110 +#ifndef CONFIG_IP_MASQUERADE_VS
1112 struct masq_tcp_states_t masq_tcp_states [] = {
1114 /* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */
1115 /*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR }},
1116 /*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI }},
1117 -/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI }},
1118 +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI }},
1119 /*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI }},
1122 /* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */
1123 -/*syn*/ {{mSS, mES, mSS, mES, mSS, mSS, mSS, mSS, mSS, mLI }},
1124 +/*syn*/ {{mSS, mES, mSS, mSR, mSS, mSS, mSS, mSS, mSS, mLI }},
1125 /*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI }},
1126 -/*ack*/ {{mES, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mES }},
1127 +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES }},
1128 /*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL }},
1131 -static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int output)
1132 +#else /* CONFIG_IP_MASQUERADE_VS */
1134 +struct masq_tcp_states_t masq_tcp_states [] = {
1136 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1137 +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSR }},
1138 +/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI, mTW }},
1139 +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }},
1140 +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mSR }},
1143 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1144 +/*syn*/ {{mSS, mES, mSS, mSR, mSS, mSS, mSS, mSS, mSS, mLI, mSR }},
1145 +/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI, mTW }},
1146 +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES, mES }},
1147 +/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL, mCL }},
1150 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1151 +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSR }},
1152 +/*fin*/ {{mCL, mFW, mSS, mTW, mFW, mTW, mCL, mCW, mLA, mLI, mTW }},
1153 +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }},
1154 +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }},
1157 +struct masq_tcp_states_t masq_tcp_states_dos [] = {
1159 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1160 +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSA }},
1161 +/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI, mSA }},
1162 +/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI, mSA }},
1163 +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }},
1166 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1167 +/*syn*/ {{mSS, mES, mSS, mSA, mSS, mSS, mSS, mSS, mSS, mLI, mSA }},
1168 +/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI, mTW }},
1169 +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES, mES }},
1170 +/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL, mCL }},
1173 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1174 +/*syn*/ {{mSA, mES, mES, mSR, mSA, mSA, mSA, mSA, mSA, mSA, mSA }},
1175 +/*fin*/ {{mCL, mFW, mSS, mTW, mFW, mTW, mCL, mCW, mLA, mLI, mTW }},
1176 +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }},
1177 +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }},
1180 +struct masq_tcp_states_t *ip_vs_state_table = masq_tcp_states;
1182 +void ip_masq_secure_tcp_set(int on)
1185 + ip_vs_state_table = masq_tcp_states_dos;
1186 + ip_vs_timeout_table = &masq_timeout_table_dos;
1188 + ip_vs_state_table = masq_tcp_states;
1189 + ip_vs_timeout_table = &masq_timeout_table;
1193 +#endif /* CONFIG_IP_MASQUERADE_VS */
1195 +#define MASQ_STATE_INPUT 0
1196 +#define MASQ_STATE_OUTPUT 4
1197 +#define MASQ_STATE_INPUT_ONLY 8
1199 +static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int state_off)
1202 - * [0-3]: input states, [4-7]: output.
1203 + * [0-3]: input states, [4-7]: output, [8-11] input only states.
1210 + return state_off+3;
1213 + return state_off+0;
1216 + return state_off+1;
1219 + return state_off+2;
1225 static int masq_set_state_timeout(struct ip_masq *ms, int state)
1227 struct ip_masq_timeout_table *mstim = ms->timeout_table;
1228 @@ -221,18 +379,34 @@
1232 -static int masq_tcp_state(struct ip_masq *ms, int output, struct tcphdr *th)
1233 +static int masq_tcp_state(struct ip_masq *ms, int state_off, struct tcphdr *th)
1236 int new_state = IP_MASQ_S_CLOSE;
1238 - if ((state_idx = masq_tcp_state_idx(th, output)) < 0) {
1239 +#ifdef CONFIG_IP_MASQUERADE_VS
1241 + * Update state offset to INPUT_ONLY if necessary
1242 + * or delete NO_OUTPUT flag if output packet detected
1244 + if (ms->flags & IP_MASQ_F_VS_NO_OUTPUT) {
1245 + if (state_off == MASQ_STATE_OUTPUT)
1246 + ms->flags &= ~IP_MASQ_F_VS_NO_OUTPUT;
1247 + else state_off = MASQ_STATE_INPUT_ONLY;
1251 + if ((state_idx = masq_tcp_state_idx(th, state_off)) < 0) {
1252 IP_MASQ_DEBUG(1, "masq_state_idx(%d)=%d!!!\n",
1253 - output, state_idx);
1254 + state_off, state_idx);
1258 +#ifdef CONFIG_IP_MASQUERADE_VS
1259 + new_state = ip_vs_state_table[state_idx].next_state[ms->state];
1261 new_state = masq_tcp_states[state_idx].next_state[ms->state];
1265 if (new_state!=ms->state)
1266 @@ -247,6 +421,15 @@
1267 ntohl(ms->daddr), ntohs(ms->dport),
1268 ip_masq_state_name(ms->state),
1269 ip_masq_state_name(new_state));
1271 +#ifdef CONFIG_IP_MASQUERADE_VS
1273 + * Increase/Decrease the active connection counter and
1274 + * set ms->flags according to ms->state and new_state.
1276 + ip_vs_set_state(ms, new_state);
1277 +#endif /* CONFIG_IP_MASQUERADE_VS */
1279 return masq_set_state_timeout(ms, new_state);
1284 * Handle state transitions
1286 -static int masq_set_state(struct ip_masq *ms, int output, struct iphdr *iph, void *tp)
1287 +static int masq_set_state(struct ip_masq *ms, int state_off, struct iphdr *iph, void *tp)
1289 switch (iph->protocol) {
1293 return masq_set_state_timeout(ms, IP_MASQ_S_UDP);
1295 - return masq_tcp_state(ms, output, tp);
1296 + return masq_tcp_state(ms, state_off, tp);
1302 EXPORT_SYMBOL(ip_masq_get_debug_level);
1303 EXPORT_SYMBOL(ip_masq_new);
1304 +#ifdef CONFIG_IP_MASQUERADE_VS
1305 +EXPORT_SYMBOL(ip_masq_new_vs);
1306 +#endif /* CONFIG_IP_MASQUERADE_VS */
1307 EXPORT_SYMBOL(ip_masq_listen);
1308 EXPORT_SYMBOL(ip_masq_free_ports);
1309 EXPORT_SYMBOL(ip_masq_out_get);
1310 @@ -423,9 +609,17 @@
1313 ms->timer.expires = jiffies+tout;
1314 +#ifdef CONFIG_IP_MASQUERADE_VS
1315 + add_sltimer(&ms->timer);
1317 add_timer(&ms->timer);
1320 +#ifdef CONFIG_IP_MASQUERADE_VS
1321 + del_sltimer(&ms->timer);
1323 del_timer(&ms->timer);
1328 @@ -741,6 +935,10 @@
1331 read_lock(&__ip_masq_lock);
1332 +#ifdef CONFIG_IP_MASQUERADE_VS
1333 + ms = __ip_vs_out_get(protocol, s_addr, s_port, d_addr, d_port);
1335 +#endif /* CONFIG_IP_MASQUERADE_VS */
1336 ms = __ip_masq_out_get(protocol, s_addr, s_port, d_addr, d_port);
1337 read_unlock(&__ip_masq_lock);
1339 @@ -754,7 +952,11 @@
1342 read_lock(&__ip_masq_lock);
1343 - ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port);
1344 +#ifdef CONFIG_IP_MASQUERADE_VS
1345 + ms = __ip_vs_in_get(protocol, s_addr, s_port, d_addr, d_port);
1347 +#endif /* CONFIG_IP_MASQUERADE_VS */
1348 + ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port);
1349 read_unlock(&__ip_masq_lock);
1352 @@ -791,7 +993,11 @@
1353 static void masq_expire(unsigned long data)
1355 struct ip_masq *ms = (struct ip_masq *)data;
1356 +#ifdef CONFIG_IP_MASQUERADE_VS
1357 + ms->timeout = MASQUERADE_EXPIRE_RETRY(ms);
1359 ms->timeout = MASQUERADE_EXPIRE_RETRY;
1364 @@ -826,6 +1032,15 @@
1366 ip_masq_control_del(ms);
1368 +#ifdef CONFIG_IP_MASQUERADE_VS
1369 + if (ms->flags & IP_MASQ_F_VS) {
1370 + if (ip_vs_unhash(ms)) {
1371 + ip_vs_unbind_masq(ms);
1372 + ip_masq_unbind_app(ms);
1376 +#endif /* CONFIG_IP_MASQUERADE_VS */
1377 if (ip_masq_unhash(ms)) {
1378 if (ms->flags&IP_MASQ_F_MPORT) {
1379 atomic_dec(&mport_count);
1380 @@ -839,6 +1054,9 @@
1381 * refcnt==1 implies I'm the only one referrer
1383 if (atomic_read(&ms->refcnt) == 1) {
1384 +#ifdef IP_MASQ_MANY_STATE_TABLES
1385 + ip_masq_timeout_detach(ms);
1387 kfree_s(ms,sizeof(*ms));
1388 sysctl_ip_always_defrag--;
1390 @@ -1077,6 +1295,83 @@
1395 +#ifdef CONFIG_IP_MASQUERADE_VS
1397 + * Create a new masquerade entry for IPVS, all parameters {maddr,
1398 + * mport, saddr, sport, daddr, dport, mflags} are known. No need
1399 + * to allocate a free mport. And, hash it into the ip_vs_table.
1401 + * Be careful, it can be called from u-space
1404 +struct ip_masq * ip_masq_new_vs(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags)
1406 + struct ip_masq *ms;
1407 + static int n_fails = 0;
1410 + prio = (mflags&IP_MASQ_F_USER) ? GFP_KERNEL : GFP_ATOMIC;
1412 + ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), prio);
1414 + if (++n_fails < 5)
1415 + IP_VS_ERR("ip_masq_new_vs(proto=%s): no memory available.\n",
1416 + masq_proto_name(proto));
1419 + MOD_INC_USE_COUNT;
1421 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,2,14)
1422 + sysctl_ip_always_defrag++;
1424 + memset(ms, 0, sizeof(*ms));
1425 + INIT_LIST_HEAD(&ms->s_list);
1426 + INIT_LIST_HEAD(&ms->m_list);
1427 + INIT_LIST_HEAD(&ms->d_list);
1428 + init_timer(&ms->timer);
1429 + ms->timer.data = (unsigned long)ms;
1430 + ms->timer.function = masq_expire;
1431 + ip_masq_timeout_attach(ms,ip_vs_timeout_table);
1432 + ms->protocol = proto;
1433 + ms->saddr = saddr;
1434 + ms->sport = sport;
1435 + ms->daddr = daddr;
1436 + ms->dport = dport;
1437 + ms->maddr = maddr;
1438 + ms->mport = mport;
1439 + ms->flags = mflags;
1440 + ms->app_data = NULL;
1441 + ms->control = NULL;
1443 + atomic_set(&ms->n_control,0);
1444 + atomic_set(&ms->refcnt,0);
1445 + atomic_set(&ms->in_pkts,0);
1447 + if (mflags & IP_MASQ_F_USER)
1448 + write_lock_bh(&__ip_masq_lock);
1450 + write_lock(&__ip_masq_lock);
1453 + * Hash it in the ip_vs_table
1457 + if (mflags & IP_MASQ_F_USER)
1458 + write_unlock_bh(&__ip_masq_lock);
1460 + write_unlock(&__ip_masq_lock);
1462 + ip_masq_bind_app(ms);
1464 + atomic_inc(&ms->refcnt);
1465 + masq_set_state_timeout(ms, IP_MASQ_S_NONE);
1468 +#endif /* CONFIG_IP_MASQUERADE_VS */
1472 * Get transport protocol data offset, check against size
1474 @@ -1153,25 +1448,20 @@
1478 +#ifndef CONFIG_IP_MASQUERADE_VS
1479 /* Lets determine our maddr now, shall we? */
1481 - struct rtable *rt;
1482 - struct rtable *skb_rt = (struct rtable*)skb->dst;
1483 - struct device *skb_dev = skb_rt->u.dst.dev;
1485 - if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos)|RTO_CONN, skb_dev?skb_dev->ifindex:0)) {
1486 - /* Fallback on old method */
1487 - /* This really shouldn't happen... */
1488 - maddr = inet_select_addr(skb_dev, skb_rt->rt_gateway, RT_SCOPE_UNIVERSE);
1490 - /* Route lookup succeeded */
1491 - maddr = rt->rt_src;
1494 + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1499 switch (iph->protocol) {
1501 +#ifdef CONFIG_IP_MASQUERADE_VS
1502 + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1506 return(ip_fw_masq_icmp(skb_p, maddr));
1508 if (h.uh->check == 0)
1509 @@ -1230,6 +1520,17 @@
1511 ms = ip_masq_out_get_iph(iph);
1513 +#ifdef CONFIG_IP_MASQUERADE_VS
1514 + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1516 + * Drop this packet but don't
1517 + * start the timer from the beginning
1519 + __ip_masq_put(ms);
1520 + add_sltimer(&ms->timer);
1526 * If sysctl !=0 and no pkt has been received yet
1527 @@ -1280,6 +1581,33 @@
1528 ms->daddr = iph->daddr;
1531 +#ifdef CONFIG_IP_MASQUERADE_VS
1532 + struct ip_vs_dest *dest;
1535 + * Check if the packet is from our real service
1537 + read_lock(&__ip_vs_lock);
1538 + dest = __ip_vs_lookup_real_service(iph->protocol,
1539 + iph->saddr, h.portp[0]);
1540 + read_unlock(&__ip_vs_lock);
1543 + * Notify the real server: there is
1544 + * no existing entry if it is not RST packet
1545 + * or not TCP packet.
1547 + if (!h.th->rst || iph->protocol != IPPROTO_TCP)
1548 + icmp_send(skb, ICMP_DEST_UNREACH,
1549 + ICMP_PORT_UNREACH, 0);
1553 + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1559 * Nope, not found, create a new entry for it
1561 @@ -1392,11 +1720,17 @@
1562 IP_MASQ_DEBUG(2, "O-routed from %08X:%04X with masq.addr %08X\n",
1563 ntohl(ms->maddr),ntohs(ms->mport),ntohl(maddr));
1565 - masq_set_state(ms, 1, iph, h.portp);
1566 +#ifdef CONFIG_IP_MASQUERADE_VS
1567 + /* do the IPVS statistics */
1568 + if (ms->flags & IP_MASQ_F_VS)
1569 + ip_vs_out_stats(ms, skb);
1572 + masq_set_state(ms, MASQ_STATE_OUTPUT, iph, h.portp);
1580 * Restore original addresses and ports in the original IP
1581 @@ -1438,6 +1772,12 @@
1582 ms = __ip_masq_out_get(iph->protocol,
1583 iph->daddr, portp[1],
1584 iph->saddr, portp[0]);
1585 +#ifdef CONFIG_IP_MASQUERADE_VS
1587 + ms = __ip_vs_out_get(iph->protocol,
1588 + iph->daddr, portp[1],
1589 + iph->saddr, portp[0]);
1590 +#endif /* CONFIG_IP_MASQUERADE_VS */
1591 read_unlock(&__ip_masq_lock);
1593 IP_MASQ_DEBUG(1, "Incoming frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n",
1594 @@ -1459,6 +1799,12 @@
1595 ms = __ip_masq_in_get(iph->protocol,
1596 iph->daddr, portp[1],
1597 iph->saddr, portp[0]);
1598 +#ifdef CONFIG_IP_MASQUERADE_VS
1600 + ms = __ip_vs_in_get(iph->protocol,
1601 + iph->daddr, portp[1],
1602 + iph->saddr, portp[0]);
1603 +#endif /* CONFIG_IP_MASQUERADE_VS */
1604 read_unlock(&__ip_masq_lock);
1606 IP_MASQ_DEBUG(1, "Outgoing frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n",
1607 @@ -1469,8 +1815,8 @@
1615 * Handle ICMP messages in forward direction.
1616 * Find any that might be relevant, check against existing connections,
1617 @@ -1556,7 +1902,7 @@
1618 ntohs(icmp_id(icmph)),
1621 - masq_set_state(ms, 1, iph, icmph);
1622 + masq_set_state(ms, MASQ_STATE_OUTPUT, iph, icmph);
1626 @@ -1684,11 +2030,28 @@
1630 +#ifdef CONFIG_IP_MASQUERADE_VS
1632 + ms = __ip_vs_out_get(ciph->protocol,
1633 + ciph->daddr, pptr[1],
1634 + ciph->saddr, pptr[0]);
1636 +#endif /* CONFIG_IP_MASQUERADE_VS */
1637 read_unlock(&__ip_masq_lock);
1642 +#ifdef CONFIG_IP_MASQUERADE_VS
1643 + if (IP_MASQ_VS_FWD(ms) != 0) {
1644 + IP_VS_INFO("shouldn't get here, because tun/dr is on the half connection\n");
1647 + /* do the IPVS statistics */
1648 + if (ms->flags & IP_MASQ_F_VS)
1649 + ip_vs_out_stats(ms, skb);
1650 +#endif /* CONFIG_IP_MASQUERADE_VS */
1652 /* Now we do real damage to this packet...! */
1653 /* First change the source IP address, and recalc checksum */
1654 iph->saddr = ms->maddr;
1655 @@ -1739,6 +2102,87 @@
1659 +#ifdef CONFIG_IP_MASQUERADE_VS
1662 + * Check whether this ICMP packet in the FORWARD path is for a
1663 + * related IPVS connection and needs to be delivered locally
1666 +int ip_vs_forwarding_related_icmp(struct sk_buff *skb)
1668 + struct iphdr *iph = skb->nh.iph;
1669 + struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2));
1670 + unsigned short size = ntohs(iph->tot_len) - (iph->ihl * 4);
1671 + struct iphdr *ciph; /* The ip header contained within the ICMP */
1672 + __u16 *pptr; /* port numbers from TCP/UDP contained header */
1673 + struct ip_masq *ms;
1674 + union ip_masq_tphdr h;
1678 + * PACKET_HOST only, see ip_forward
1681 + h.raw = (char*) iph + iph->ihl * 4;
1683 + doff = proto_doff(iph->protocol, h.raw, size);
1685 + if (doff <= 0) return 0;
1687 + IP_VS_DBG(10, "icmp fwd/rev (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n",
1688 + icmph->type, ntohs(icmp_id(icmph)),
1689 + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
1691 + if ((icmph->type != ICMP_DEST_UNREACH) &&
1692 + (icmph->type != ICMP_SOURCE_QUENCH) &&
1693 + (icmph->type != ICMP_TIME_EXCEEDED))
1697 + * If we get here we have an ICMP error of one of the above 3 types
1698 + * Now find the contained IP header
1701 + ciph = (struct iphdr *) (icmph + 1);
1702 + size -= sizeof(struct icmphdr);
1703 + if (size < sizeof(struct iphdr)) return 0;
1705 + /* We are only interested in ICMPs generated from TCP or UDP packets */
1706 + if (ciph->protocol == IPPROTO_TCP) {
1707 + if (size < sizeof(struct tcphdr)) return 0;
1710 + if (ciph->protocol == IPPROTO_UDP) {
1711 + if (size < sizeof(struct udphdr)) return 0;
1715 + /* We don't ensure for now the checksum is correct */
1717 + /* This is pretty much what __ip_masq_in_get_iph() does,
1718 + except params are wrong way round */
1719 + pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
1721 + read_lock(&__ip_masq_lock);
1722 + ms = __ip_vs_in_get(ciph->protocol,
1727 + read_unlock(&__ip_masq_lock);
1729 + if (!ms) return 0;
1730 + IP_VS_DBG(10, "Delivering locally ICMP for %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u to %u.%u.%u.%u\n",
1731 + NIPQUAD(ciph->daddr), ntohs(pptr[1]),
1732 + NIPQUAD(ciph->saddr), ntohs(pptr[0]),
1733 + NIPQUAD(ms->saddr));
1734 + __ip_masq_put(ms);
1738 +#endif /* CONFIG_IP_MASQUERADE_VS */
1741 * Handle ICMP messages in reverse (demasquerade) direction.
1742 * Find any that might be relevant, check against existing connections,
1743 @@ -1812,7 +2256,7 @@
1744 ntohs(icmp_id(icmph)),
1747 - masq_set_state(ms, 0, iph, icmph);
1748 + masq_set_state(ms, MASQ_STATE_INPUT, iph, icmph);
1752 @@ -1914,9 +2358,11 @@
1753 * *outgoing* so the ports are reversed (and addresses)
1755 pptr = (__u16 *)&(((char *)ciph)[csize]);
1756 +#ifndef CONFIG_IP_MASQUERADE_VS
1757 if (ntohs(pptr[0]) < PORT_MASQ_BEGIN ||
1758 ntohs(pptr[0]) > PORT_MASQ_END)
1762 /* Ensure the checksum is correct */
1763 if (ip_compute_csum((unsigned char *) icmph, len))
1764 @@ -1927,7 +2373,6 @@
1769 IP_MASQ_DEBUG(2, "Handling reverse ICMP for %08X:%04X -> %08X:%04X\n",
1770 ntohl(ciph->saddr), ntohs(pptr[0]),
1771 ntohl(ciph->daddr), ntohs(pptr[1]));
1772 @@ -1935,6 +2380,14 @@
1774 /* This is pretty much what __ip_masq_in_get_iph() does, except params are wrong way round */
1775 read_lock(&__ip_masq_lock);
1776 +#ifdef CONFIG_IP_MASQUERADE_VS
1777 + ms = __ip_vs_in_get(ciph->protocol,
1783 +#endif /* CONFIG_IP_MASQUERADE_VS */
1784 ms = __ip_masq_in_get(ciph->protocol,
1787 @@ -1945,10 +2398,23 @@
1791 +#ifdef CONFIG_IP_MASQUERADE_VS
1792 + /* do the IPVS statistics */
1793 + if (ms->flags & IP_MASQ_F_VS)
1794 + ip_vs_in_stats(ms, skb);
1796 + if (IP_MASQ_VS_FWD(ms) != 0) {
1797 + int ret = ip_vs_forward(skb, ms);
1798 + __ip_masq_put(ms);
1801 +#endif /* CONFIG_IP_MASQUERADE_VS */
1803 if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) {
1808 ciph = (struct iphdr *) (icmph + 1);
1809 pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
1811 @@ -1998,7 +2464,10 @@
1816 +#ifdef CONFIG_IP_MASQUERADE_VS
1817 + struct ip_vs_service *svc = NULL;
1821 * Big tappo: only PACKET_HOST (nor loopback neither mcasts)
1822 * ... don't know why 1st test DOES NOT include 2nd (?)
1823 @@ -2039,13 +2508,21 @@
1824 return(ip_fw_demasq_icmp(skb_p));
1829 * Make sure packet is in the masq range
1830 * ... or some mod-ule relaxes input range
1831 * ... or there is still some `special' mport opened
1833 +#ifdef CONFIG_IP_MASQUERADE_VS
1834 + svc = ip_vs_lookup_service(skb->fwmark,
1835 + iph->protocol, maddr, h.portp[1]);
1837 + (ntohs(h.portp[1]) < PORT_MASQ_BEGIN
1838 + || ntohs(h.portp[1]) > PORT_MASQ_END)
1840 if ((ntohs(h.portp[1]) < PORT_MASQ_BEGIN
1841 || ntohs(h.portp[1]) > PORT_MASQ_END)
1842 +#endif /* CONFIG_IP_MASQUERADE_VS */
1843 #ifdef CONFIG_IP_MASQUERADE_MOD
1844 && (ip_masq_mod_in_rule(skb, iph) != 1)
1846 @@ -2100,6 +2577,21 @@
1848 ms = ip_masq_in_get_iph(iph);
1850 +#ifdef CONFIG_IP_MASQUERADE_VS
1852 + * Checking the server status
1854 + if (ms && ms->dest && !(ms->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1856 + * If the dest is not available, don't restart the timer
1857 + * of the packet, but silently drop it.
1859 + add_sltimer(&ms->timer);
1860 + __ip_masq_put(ms);
1866 * Give additional modules a chance to create an entry
1868 @@ -2116,6 +2608,27 @@
1869 ip_masq_mod_in_update(skb, iph, ms);
1872 +#ifdef CONFIG_IP_MASQUERADE_VS
1874 + (h.th->syn || (iph->protocol!=IPPROTO_TCP)) && svc) {
1875 + if (ip_masq_todrop()) {
1877 + * It seems that we are very loaded.
1878 + * We have to drop this packet :(
1883 + * Let the virtual server select a real server
1884 + * for the incoming connection, and create a
1885 + * masquerading entry.
1887 + ms = ip_vs_schedule(svc, iph);
1889 + return ip_vs_leave(svc, skb);
1890 + ip_vs_conn_stats(ms, svc);
1892 +#endif /* CONFIG_IP_MASQUERADE_VS */
1896 @@ -2168,13 +2681,43 @@
1901 +#ifdef CONFIG_IP_MASQUERADE_VS
1902 + /* do the IPVS statistics */
1903 + if (ms->flags & IP_MASQ_F_VS)
1904 + ip_vs_in_stats(ms, skb);
1906 + if (IP_MASQ_VS_FWD(ms) != 0) {
1910 + * Sorry for setting state of masq entry so early
1911 + * no matter whether the packet is forwarded
1912 + * successfully or not, because ip_vs_forward may
1913 + * have already released the skb. Although it
1914 + * breaks the original semantics, it won't lead to
1915 + * serious errors. We look forward to fixing it
1916 + * under Rusty's netfilter framework both for
1917 + * correctness and modularization.
1919 + masq_set_state(ms, MASQ_STATE_INPUT, iph, h.portp);
1921 + ret = ip_vs_forward(skb, ms);
1926 + IP_VS_DBG(10, "masquerading packet...\n");
1927 +#endif /* CONFIG_IP_MASQUERADE_VS */
1929 if ((skb=masq_skb_cow(skb_p, &iph, &h.raw)) == NULL) {
1934 iph->daddr = ms->saddr;
1935 h.portp[1] = ms->sport;
1939 * Invalidate csum saving if tunnel has masq helper
1941 @@ -2231,15 +2774,28 @@
1942 h.uh->check = 0xFFFF;
1945 - ip_send_check(iph);
1946 + ip_send_check(iph);
1948 IP_MASQ_DEBUG(2, "I-routed to %08X:%04X\n",ntohl(iph->daddr),ntohs(h.portp[1]));
1950 - masq_set_state (ms, 0, iph, h.portp);
1951 + masq_set_state(ms, MASQ_STATE_INPUT, iph, h.portp);
1956 +#ifdef CONFIG_IP_MASQUERADE_VS
1959 + * Drop packet if it belongs to virtual service but no entry
1960 + * is found or created. Furthermore, send DEST_UNREACH icmp
1961 + * packet to clients if it is not RST or it is not TCP.
1963 + if (!h.th->rst || iph->protocol != IPPROTO_TCP) {
1964 + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
1970 /* sorry, all this trouble for a no-hit :) */
1972 @@ -2350,7 +2906,6 @@
1973 len += sprintf(buffer+len, "%-127s\n", temp);
1977 read_unlock_bh(&__ip_masq_lock);
1980 @@ -2358,9 +2913,52 @@
1981 read_unlock_bh(&__ip_masq_lock);
1986 +#ifdef CONFIG_IP_MASQUERADE_VS
1987 + for(idx = 0; idx < IP_VS_TAB_SIZE; idx++)
1990 + * Lock is actually only needed in the next loop
1991 + * we are called from uspace: must stop bh.
1993 + read_lock_bh(&__ip_masq_lock);
1995 + l = &ip_vs_table[idx];
1996 + for (e=l->next; e!=l; e=e->next) {
1997 + ms = list_entry(e, struct ip_masq, m_list);
1999 + if (pos <= offset) {
2005 + * We have locked the tables, no need to del/add timers
2009 + sprintf(temp,"%s %08X:%04X %08X:%04X %04X %08X %6d %6d %7lu",
2010 + masq_proto_name(ms->protocol),
2011 + ntohl(ms->saddr), ntohs(ms->sport),
2012 + ntohl(ms->daddr), ntohs(ms->dport),
2014 + ms->out_seq.init_seq,
2015 + ms->out_seq.delta,
2016 + ms->out_seq.previous_delta,
2017 + ms->timer.expires-jiffies);
2018 + len += sprintf(buffer+len, "%-127s\n", temp);
2020 + if(len >= length) {
2021 + read_unlock_bh(&__ip_masq_lock);
2025 + read_unlock_bh(&__ip_masq_lock);
2028 +#endif /* CONFIG_IP_MASQUERADE_VS */
2031 begin = len - (pos - offset);
2032 *start = buffer + begin;
2034 @@ -2386,17 +2984,29 @@
2035 len, sizeof(struct ip_fw_masq));
2037 masq = (struct ip_fw_masq *)m;
2038 - if (masq->tcp_timeout)
2039 + if (masq->tcp_timeout) {
2040 masq_timeout_table.timeout[IP_MASQ_S_ESTABLISHED]
2041 +#ifdef CONFIG_IP_MASQUERADE_VS
2042 + = masq_timeout_table_dos.timeout[IP_MASQ_S_ESTABLISHED]
2044 = masq->tcp_timeout;
2047 - if (masq->tcp_fin_timeout)
2048 + if (masq->tcp_fin_timeout) {
2049 masq_timeout_table.timeout[IP_MASQ_S_FIN_WAIT]
2050 +#ifdef CONFIG_IP_MASQUERADE_VS
2051 + = masq_timeout_table_dos.timeout[IP_MASQ_S_FIN_WAIT]
2053 = masq->tcp_fin_timeout;
2056 - if (masq->udp_timeout)
2057 + if (masq->udp_timeout) {
2058 masq_timeout_table.timeout[IP_MASQ_S_UDP]
2059 +#ifdef CONFIG_IP_MASQUERADE_VS
2060 + = masq_timeout_table_dos.timeout[IP_MASQ_S_UDP]
2062 = masq->udp_timeout;
2067 @@ -2468,6 +3078,11 @@
2068 ret = ip_masq_mod_ctl(optname, &masq_ctl, optlen);
2071 +#ifdef CONFIG_IP_MASQUERADE_VS
2072 + case IP_MASQ_TARGET_VS:
2073 + ret = ip_vs_ctl(optname, &masq_ctl, optlen);
2079 @@ -2529,12 +3144,25 @@
2082 #endif /* CONFIG_PROC_FS */
2085 - * Wrapper over inet_select_addr()
2086 + * Determine maddr from skb
2088 -u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope)
2089 +int ip_masq_select_addr(struct sk_buff *skb, __u32 *maddr)
2091 - return inet_select_addr(dev, dst, scope);
2092 + struct rtable *rt;
2093 + struct rtable *skb_rt = (struct rtable*)skb->dst;
2094 + struct device *skb_dev = skb_rt->u.dst.dev;
2095 + struct iphdr *iph = skb->nh.iph;
2097 + if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos)|RTO_CONN, skb_dev?skb_dev->ifindex:0)) {
2100 + /* Route lookup succeeded */
2101 + *maddr = rt->rt_src;
2108 @@ -2587,7 +3215,7 @@
2109 (char *) IPPROTO_ICMP,
2113 +#endif /* CONFIG_PROC_FS */
2114 #ifdef CONFIG_IP_MASQUERADE_IPAUTOFW
2117 @@ -2596,6 +3224,9 @@
2119 #ifdef CONFIG_IP_MASQUERADE_MFW
2122 +#ifdef CONFIG_IP_MASQUERADE_VS
2127 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs.c
2128 --- linux-2.2.19/net/ipv4/ip_vs.c Thu Jan 1 08:00:00 1970
2129 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs.c Mon May 14 22:04:50 2001
2132 + * IPVS An implementation of the IP virtual server support for the
2133 + * LINUX operating system. IPVS is now implemented as a part
2134 + * of IP masquerading code. IPVS can be used to build a
2135 + * high-performance and highly available server based on a
2136 + * cluster of servers.
2140 + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
2141 + * Peter Kese <peter.kese@ijs.si>
2143 + * This program is free software; you can redistribute it and/or
2144 + * modify it under the terms of the GNU General Public License
2145 + * as published by the Free Software Foundation; either version
2146 + * 2 of the License, or (at your option) any later version.
2149 + * Wensong Zhang : fixed the overflow bug in ip_vs_procinfo
2150 + * Wensong Zhang : added editing dest and service functions
2151 + * Wensong Zhang : changed the names of some functions
2152 + * Wensong Zhang : fixed the unlocking bug in ip_vs_del_dest
2153 + * Wensong Zhang : added a separate hash table for IPVS
2154 + * Wensong Zhang : added slow timer for IPVS masq entries
2155 + * Julian Anastasov : fixed the number of active connections
2156 + * Wensong Zhang : added persistent port
2157 + * Wensong Zhang : fixed the incorrect lookup in hash table
2158 + * Wensong Zhang : added server status checking
2159 + * Wensong Zhang : fixed the incorrect slow timer vector layout
2160 + * Wensong Zhang : fixed the sltimer added twice bug of mst
2161 + * Julian Anastasov : fixed the IP_MASQ_F_VS_INACTIVE cleared bug after editing dest
2162 + * Wensong Zhang : added the inactive connection counter
2163 + * Wensong Zhang : changed the body of ip_vs_schedule
2164 + * Julian Anastasov : fixed the unlocking bug in ip_vs_schedule
2165 + * Julian Anastasov : fixed the uncounting bug in creating masqs by template
2166 + * Wensong Zhang : changed some condition orders for a bit performance
2167 + * Julian Anastasov : don't touch counters in ip_vs_unbind_masq for templates
2168 + * Wensong Zhang : added the hash table for virtual services
2169 + * Wensong Zhang : changed destination lists to d-linked lists
2170 + * Wensong Zhang : changed the scheduler list to the d-linked list
2171 + * Wensong Zhang : added new persistent service handling
2172 + * Julian Anastasov : fixed the counting bug in ip_vs_unbind_masq again
2173 + * (don't touch counters for templates)
2174 + * Wensong Zhang : changed some IP_VS_ERR to IP_VS_DBG in the ip_vs_tunnel_xmit
2175 + * Wensong Zhang : added different timeout support for persistent svc
2176 + * Wensong Zhang : fixed the bug that persistent svc cannot be edited
2177 + * Julian Anastasov : removed extra read_unlock in __ip_vs_lookup_service
2178 + * Julian Anastasov : changed not to restart template timers if dest is unavailable
2179 + * Julian Anastasov : added the destination trash
2180 + * Wensong Zhang : added the update_service call in ip_vs_del_dest
2181 + * Wensong Zhang : added the ip_vs_leave function
2182 + * Lars Marowsky-Bree : added persistence granularity support
2183 + * Julian Anastasov : changed some cosmetic things for debugging
2184 + * Wensong Zhang : use vmalloc to allocate big ipvs hash table
2185 + * Wensong Zhang : changed the tunneling/direct routing methods a little
2186 + * Julian Anastasov : fixed the return bug of ip_vs_leave(-2 instead of -3)
2187 + * Roberto Nibali : fixed the undefined variable bug in the IP_VS_DBG of ip_vs_dr_xmit
2188 + * Julian Anastasov : changed ICMP_PROT_UNREACH to ICMP_PORT_UNREACH in ip_vs_leave
2189 + * Wensong Zhang : added port zero support for persistent services
2190 + * Wensong Zhang : fixed the bug that virtual ftp service blocks other services not listed in ipvs table
2191 + * Wensong Zhang : invalidate a persistent template when its dest is unavailable
2192 + * Julian Anastasov : changed two IP_VS_ERR calls to IP_VS_DBG
2193 + * Wensong Zhang : added random drop of syn entries
2194 + * Wensong Zhang : added random drop of UDP entries
2195 + * Julian Anastasov : added droprate defense against DoS attack
2196 + * Julian Anastasov : added secure_tcp defense against DoS attack
2197 + * Wensong Zhang : revisited dropentry defense against DoS attack
2198 + * Horms : added the fwmark service feature
2199 + * Wensong Zhang : changed to two service hash tables
2200 + * Julian Anastasov : corrected trash_dest lookup for both
2201 + * normal service and fwmark service
2205 +#include <linux/config.h>
2206 +#include <linux/module.h>
2207 +#include <linux/types.h>
2208 +#include <linux/kernel.h>
2209 +#include <linux/errno.h>
2210 +#include <linux/vmalloc.h>
2211 +#include <linux/swap.h>
2212 +#include <net/ip_masq.h>
2214 +#include <linux/sysctl.h>
2215 +#include <linux/ip_fw.h>
2216 +#include <linux/ip_masq.h>
2217 +#include <linux/proc_fs.h>
2219 +#include <linux/inetdevice.h>
2220 +#include <linux/ip.h>
2221 +#include <net/icmp.h>
2222 +#include <net/ip.h>
2223 +#include <net/route.h>
2224 +#include <net/ip_vs.h>
2227 +#include <linux/kmod.h>
2230 +EXPORT_SYMBOL(register_ip_vs_scheduler);
2231 +EXPORT_SYMBOL(unregister_ip_vs_scheduler);
2232 +EXPORT_SYMBOL(ip_vs_bind_masq);
2233 +EXPORT_SYMBOL(ip_vs_unbind_masq);
2234 +EXPORT_SYMBOL(ip_vs_lookup_dest);
2235 +#ifdef CONFIG_IP_VS_DEBUG
2236 +EXPORT_SYMBOL(ip_vs_get_debug_level);
2239 +int sysctl_ip_vs_drop_entry = 0;
2240 +int sysctl_ip_vs_drop_packet = 0;
2241 +int sysctl_ip_vs_secure_tcp = 0;
2242 +int sysctl_ip_vs_amemthresh = 1024;
2243 +int sysctl_ip_vs_am_droprate = 10;
2245 +#ifdef CONFIG_IP_VS_DEBUG
2246 +static int sysctl_ip_vs_debug_level = 0;
2248 +int ip_vs_get_debug_level(void)
2250 + return sysctl_ip_vs_debug_level;
2255 +int ip_vs_dropentry = 0;
2257 +static inline void update_defense_level(void)
2259 + int ip_vs_amem = nr_free_pages+page_cache_size+(buffermem>>PAGE_SHIFT);
2260 + int nomem = (ip_vs_amem < sysctl_ip_vs_amemthresh);
2263 + switch (sysctl_ip_vs_drop_entry) {
2265 + ip_vs_dropentry = 0;
2269 + ip_vs_dropentry = 1;
2270 + sysctl_ip_vs_drop_entry = 2;
2272 + ip_vs_dropentry = 0;
2277 + ip_vs_dropentry = 1;
2279 + ip_vs_dropentry = 0;
2280 + sysctl_ip_vs_drop_entry = 1;
2284 + ip_vs_dropentry = 1;
2289 + switch (sysctl_ip_vs_drop_packet) {
2291 + ip_masq_drop_rate = 0;
2295 + ip_masq_drop_rate = ip_masq_drop_counter
2296 + = sysctl_ip_vs_amemthresh /
2297 + (sysctl_ip_vs_amemthresh-ip_vs_amem);
2298 + sysctl_ip_vs_drop_packet = 2;
2300 + ip_masq_drop_rate = 0;
2305 + ip_masq_drop_rate = ip_masq_drop_counter
2306 + = sysctl_ip_vs_amemthresh /
2307 + (sysctl_ip_vs_amemthresh-ip_vs_amem);
2309 + ip_masq_drop_rate = 0;
2310 + sysctl_ip_vs_drop_packet = 1;
2314 + ip_masq_drop_rate = sysctl_ip_vs_am_droprate;
2319 + switch (sysctl_ip_vs_secure_tcp) {
2321 + ip_masq_secure_tcp_set(0);
2325 + ip_masq_secure_tcp_set(1);
2326 + sysctl_ip_vs_secure_tcp = 2;
2328 + ip_masq_secure_tcp_set(0);
2333 + ip_masq_secure_tcp_set(1);
2335 + ip_masq_secure_tcp_set(0);
2336 + sysctl_ip_vs_secure_tcp = 1;
2340 + ip_masq_secure_tcp_set(1);
2346 +static inline int todrop_entry(struct ip_masq *ms)
2349 + * The drop rate array needs tuning for real environments.
2351 + static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
2352 + static char todrop_counter[9] = {0};
2355 + if (ms->timeout+jiffies-ms->timer.expires < 60*HZ)
2358 + i = atomic_read(&ms->in_pkts);
2359 + if (i > 8) return 0;
2361 + if (!todrop_rate[i]) return 0;
2362 + if (--todrop_counter[i] > 0) return 0;
2364 + todrop_counter[i] = todrop_rate[i];
2368 +static inline void ip_vs_random_dropentry(void)
2371 + struct ip_masq *ms;
2372 + struct list_head *l,*e;
2373 + struct ip_masq *mst;
2374 + void (*fn)(unsigned long);
2377 + * Randomly scan 1/32 of the whole table every second
2379 + for (i=0; i < (IP_VS_TAB_SIZE>>5); i++) {
2381 + * Lock is actually needed in this loop.
2383 + write_lock(&__ip_masq_lock);
2385 + l = &ip_vs_table[net_random()&IP_VS_TAB_MASK];
2386 + for (e=l->next; e!=l; e=e->next) {
2387 + ms = list_entry(e, struct ip_masq, m_list);
2388 + if (ms->dport == 0)
2389 + /* masq template */
2391 + switch(ms->state) {
2392 + case IP_MASQ_S_SYN_RECV:
2393 + case IP_MASQ_S_SYNACK:
2396 + case IP_MASQ_S_ESTABLISHED:
2397 + case IP_MASQ_S_UDP:
2398 + if (todrop_entry(ms))
2407 + * Drop the entry, and drop its mst if not referenced
2409 + write_unlock(&__ip_masq_lock);
2410 + IP_VS_DBG(4, "Drop masq\n");
2411 + mst = ms->control;
2412 + fn = (ms->timer).function;
2413 + del_sltimer(&ms->timer);
2414 + fn((unsigned long)ms);
2415 + if (mst && !atomic_read(&mst->n_control)) {
2416 + IP_VS_DBG(4, "Drop masq template\n");
2417 + del_sltimer(&mst->timer);
2418 + fn((unsigned long)mst);
2420 + write_lock(&__ip_masq_lock);
2422 + write_unlock(&__ip_masq_lock);
2428 + * The following block implements slow timers for IPVS, most code is stolen
2429 + * from linux/kernel/sched.c
2430 + * Slow timer is used to avoid the overhead of cascading timers, when lots
2431 + * of masq entries (>50,000) are cluttered in the system.
2433 +#define SHIFT_BITS 6
2435 +#define TVR_BITS 10
2436 +#define TVN_SIZE (1 << TVN_BITS)
2437 +#define TVR_SIZE (1 << TVR_BITS)
2438 +#define TVN_MASK (TVN_SIZE - 1)
2439 +#define TVR_MASK (TVR_SIZE - 1)
2441 +struct sltimer_vec {
2443 + struct timer_list *vec[TVN_SIZE];
2446 +struct sltimer_vec_root {
2448 + struct timer_list *vec[TVR_SIZE];
2451 +static struct sltimer_vec sltv3 = { 0 };
2452 +static struct sltimer_vec sltv2 = { 0 };
2453 +static struct sltimer_vec_root sltv1 = { 0 };
2455 +static struct sltimer_vec * const sltvecs[] = {
2456 + (struct sltimer_vec *)&sltv1, &sltv2, &sltv3
2459 +#define NOOF_SLTVECS (sizeof(sltvecs) / sizeof(sltvecs[0]))
2461 +static unsigned long sltimer_jiffies = 0;
2463 +static inline void insert_sltimer(struct timer_list *timer,
2464 + struct timer_list **vec, int idx)
2466 + if ((timer->next = vec[idx]))
2467 + vec[idx]->prev = timer;
2469 + timer->prev = (struct timer_list *)&vec[idx];
2472 +static inline void internal_add_sltimer(struct timer_list *timer)
2475 + * must be cli-ed when calling this
2477 + unsigned long expires = timer->expires;
2478 + unsigned long idx = (expires - sltimer_jiffies) >> SHIFT_BITS;
2480 + if (idx < TVR_SIZE) {
2481 + int i = (expires >> SHIFT_BITS) & TVR_MASK;
2482 + insert_sltimer(timer, sltv1.vec, i);
2483 + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
2484 + int i = (expires >> (SHIFT_BITS+TVR_BITS)) & TVN_MASK;
2485 + insert_sltimer(timer, sltv2.vec, i);
2486 + } else if ((signed long) idx < 0) {
2488 + * can happen if you add a timer with expires == jiffies,
2489 + * or you set a timer to go off in the past
2491 + insert_sltimer(timer, sltv1.vec, sltv1.index);
2492 + } else if (idx <= 0xffffffffUL) {
2493 + int i = (expires >> (SHIFT_BITS+TVR_BITS+TVN_BITS)) & TVN_MASK;
2494 + insert_sltimer(timer, sltv3.vec, i);
2496 + /* Can only get here on architectures with 64-bit jiffies */
2497 + timer->next = timer->prev = timer;
2501 +rwlock_t sltimerlist_lock = RW_LOCK_UNLOCKED;
2503 +void add_sltimer(struct timer_list *timer)
2505 + write_lock(&sltimerlist_lock);
2508 + internal_add_sltimer(timer);
2510 + write_unlock(&sltimerlist_lock);
2514 + printk("bug: kernel sltimer added twice at %p.\n",
2515 + __builtin_return_address(0));
2519 +static inline int detach_sltimer(struct timer_list *timer)
2521 + struct timer_list *prev = timer->prev;
2523 + struct timer_list *next = timer->next;
2524 + prev->next = next;
2526 + next->prev = prev;
2532 +void mod_sltimer(struct timer_list *timer, unsigned long expires)
2534 + write_lock(&sltimerlist_lock);
2535 + timer->expires = expires;
2536 + detach_sltimer(timer);
2537 + internal_add_sltimer(timer);
2538 + write_unlock(&sltimerlist_lock);
2541 +int del_sltimer(struct timer_list * timer)
2545 + write_lock(&sltimerlist_lock);
2546 + ret = detach_sltimer(timer);
2547 + timer->next = timer->prev = 0;
2548 + write_unlock(&sltimerlist_lock);
2553 +static inline void cascade_sltimers(struct sltimer_vec *tv)
2556 + * cascade all the timers from tv up one level
2558 + struct timer_list *timer;
2559 + timer = tv->vec[tv->index];
2561 + * We are removing _all_ timers from the list, so we don't have to
2562 + * detach them individually, just clear the list afterwards.
2565 + struct timer_list *tmp = timer;
2566 + timer = timer->next;
2567 + internal_add_sltimer(tmp);
2569 + tv->vec[tv->index] = NULL;
2570 + tv->index = (tv->index + 1) & TVN_MASK;
2573 +static inline void run_sltimer_list(void)
2575 + write_lock(&sltimerlist_lock);
2576 + while ((long)(jiffies - sltimer_jiffies) >= 0) {
2577 + struct timer_list *timer;
2578 + if (!sltv1.index) {
2581 + cascade_sltimers(sltvecs[n]);
2582 + } while (sltvecs[n]->index == 1 && ++n < NOOF_SLTVECS);
2584 + while ((timer = sltv1.vec[sltv1.index])) {
2585 + void (*fn)(unsigned long) = timer->function;
2586 + unsigned long data = timer->data;
2587 + detach_sltimer(timer);
2588 + timer->next = timer->prev = NULL;
2589 + write_unlock(&sltimerlist_lock);
2591 + write_lock(&sltimerlist_lock);
2593 + sltimer_jiffies += 1<<SHIFT_BITS;
2594 + sltv1.index = (sltv1.index + 1) & TVR_MASK;
2596 + write_unlock(&sltimerlist_lock);
2599 +static void sltimer_handler(unsigned long data);
2601 +struct timer_list slow_timer = {
2608 + * Slow timer handler is activated every second
2610 +#define SLTIMER_PERIOD 1*HZ
2612 +void sltimer_handler(unsigned long data)
2614 + run_sltimer_list();
2616 + update_defense_level();
2617 + if (ip_vs_dropentry)
2618 + ip_vs_random_dropentry();
2620 + mod_timer(&slow_timer, (jiffies + SLTIMER_PERIOD));
2625 + * The port number of FTP service (in network order).
2627 +#define FTPPORT __constant_htons(21)
2628 +#define FTPDATA __constant_htons(20)
2633 +rwlock_t __ip_vs_lock = RW_LOCK_UNLOCKED;
2636 + * Hash table: for input and output packets lookups of IPVS
2638 +#define IP_MASQ_NTABLES 3
2640 +struct list_head *ip_vs_table;
2643 + * Hash table: for virtual service lookups
2645 +#define IP_VS_SVC_TAB_BITS 8
2646 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
2647 +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
2649 +/* the service table hashed by <protocol, addr, port> */
2650 +struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
2651 +/* the service table hashed by fwmark */
2652 +struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
2655 + * Hash table: for real service lookups
2657 +#define IP_VS_RTAB_BITS 4
2658 +#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
2659 +#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
2661 +struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
2664 + * IPVS scheduler list
2666 +struct list_head ip_vs_schedulers;
2669 + * Trash for destinations
2671 +struct list_head ip_vs_dest_trash;
2674 + * FTP & NULL virtual service counters
2676 +atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
2677 +atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
2680 + * Register a scheduler in the scheduler list
2682 +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
2685 + IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
2689 + if (!scheduler->name) {
2690 + IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
2694 + if (scheduler->n_list.next != &scheduler->n_list) {
2695 + IP_VS_ERR("register_ip_vs_scheduler(): scheduler already linked\n");
2700 + * Add it into the d-linked scheduler list
2702 + list_add(&scheduler->n_list, &ip_vs_schedulers);
2709 + * Unregister a scheduler in the scheduler list
2711 +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
2714 + IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
2719 + * Only allow unregistration if it is not referenced
2721 + if (atomic_read(&scheduler->refcnt)) {
2722 + IP_VS_ERR("unregister_ip_vs_scheduler(): is in use by %d guys. failed\n",
2723 + atomic_read(&scheduler->refcnt));
2727 + if (scheduler->n_list.next == &scheduler->n_list) {
2728 + IP_VS_ERR("unregister_ip_vs_scheduler(): scheduler is not in the list. failed\n");
2733 + * Removed it from the d-linked scheduler list
2735 + list_del(&scheduler->n_list);
2742 + * Bind a service with a scheduler
2743 + * Must be called with the __ip_vs_lock lock, and return bool.
2745 +int ip_vs_bind_scheduler(struct ip_vs_service *svc,
2746 + struct ip_vs_scheduler *scheduler)
2748 + if (svc == NULL) {
2749 + IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
2752 + if (scheduler == NULL) {
2753 + IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
2757 + svc->scheduler = scheduler;
2758 + atomic_inc(&scheduler->refcnt);
2760 + if(scheduler->init_service)
2761 + if(scheduler->init_service(svc) != 0) {
2762 + IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
2771 + * Unbind a service with its scheduler
2772 + * Must be called with the __ip_vs_lock lock, and return bool.
2774 +int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
2776 + struct ip_vs_scheduler *sched;
2778 + if (svc == NULL) {
2779 + IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
2783 + sched = svc->scheduler;
2784 + if (sched == NULL) {
2785 + IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
2789 + if(sched->done_service)
2790 + if(sched->done_service(svc) != 0) {
2791 + IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
2795 + atomic_dec(&sched->refcnt);
2796 + svc->scheduler = NULL;
2803 + * Get scheduler in the scheduler list by name
2805 +struct ip_vs_scheduler * ip_vs_sched_getbyname(const char *sched_name)
2807 + struct ip_vs_scheduler *sched;
2808 + struct list_head *l, *e;
2810 + IP_VS_DBG(6, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
2813 + read_lock_bh(&__ip_vs_lock);
2815 + l = &ip_vs_schedulers;
2816 + for (e=l->next; e!=l; e=e->next) {
2817 + sched = list_entry(e, struct ip_vs_scheduler, n_list);
2818 + if (strcmp(sched_name, sched->name)==0) {
2820 + read_unlock_bh(&__ip_vs_lock);
2825 + read_unlock_bh(&__ip_vs_lock);
2831 + * Lookup scheduler and try to load it if it doesn't exist
2833 +struct ip_vs_scheduler * ip_vs_lookup_scheduler(const char *sched_name)
2835 + struct ip_vs_scheduler *sched;
2838 + * Search for the scheduler by sched_name
2840 + sched = ip_vs_sched_getbyname(sched_name);
2843 + * If scheduler not found, load the module and search again
2845 + if (sched == NULL) {
2846 + char module_name[IP_MASQ_TNAME_MAX+8];
2847 + sprintf(module_name,"ip_vs_%s",sched_name);
2849 + request_module(module_name);
2850 +#endif /* CONFIG_KMOD */
2851 + sched = ip_vs_sched_getbyname(sched_name);
2859 + * Returns hash value for IPVS masq entry
2862 +static __inline__ unsigned
2863 +ip_vs_hash_key(unsigned proto, __u32 addr, __u16 port)
2865 + unsigned addrh = ntohl(addr);
2867 + return (proto^addrh^(addrh>>IP_VS_TAB_BITS)^ntohs(port))
2873 + * Hashes ip_masq in ip_vs_table by proto,addr,port.
2874 + * should be called with locked tables.
2875 + * returns bool success.
2877 +int ip_vs_hash(struct ip_masq *ms)
2881 + if (ms->flags & IP_MASQ_F_HASHED) {
2882 + IP_VS_ERR("ip_vs_hash(): request for already hashed, "
2883 + "called from %p\n", __builtin_return_address(0));
2888 + * Note: because ip_masq_put sets masq expire only if its
2889 + * refcnt==IP_MASQ_NTABLES, otherwise the masq entry
2890 + * will never expire.
2892 + atomic_add(IP_MASQ_NTABLES, &ms->refcnt);
2895 + * Hash by proto,d{addr,port},
2896 + * which are client address and port in IPVS.
2898 + hash = ip_vs_hash_key(ms->protocol, ms->daddr, ms->dport);
2899 + list_add(&ms->m_list, &ip_vs_table[hash]);
2901 + ms->flags |= IP_MASQ_F_HASHED;
2907 + * Unhashes ip_masq from ip_vs_table.
2908 + * should be called with locked tables.
2909 + * returns bool success.
2911 +int ip_vs_unhash(struct ip_masq *ms)
2913 + if (!(ms->flags & IP_MASQ_F_HASHED)) {
2914 + IP_VS_ERR("ip_vs_unhash(): request for unhash flagged, "
2915 + "called from %p\n", __builtin_return_address(0));
2920 + * Remove it from the list and decrease its reference counter.
2922 + list_del(&ms->m_list);
2923 + atomic_sub(IP_MASQ_NTABLES, &ms->refcnt);
2925 + ms->flags &= ~IP_MASQ_F_HASHED;
2931 + * Gets ip_masq associated with supplied parameters in the ip_vs_table.
2932 + * Called for pkts coming from OUTside-to-INside.
2933 + * s_addr, s_port: pkt source address (foreign host)
2934 + * d_addr, d_port: pkt dest address (load balancer)
2935 + * Caller must lock tables
2937 +struct ip_masq * __ip_vs_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
2940 + struct ip_masq *ms;
2941 + struct list_head *l,*e;
2943 + hash = ip_vs_hash_key(protocol, s_addr, s_port);
2945 + l = &ip_vs_table[hash];
2946 + for (e=l->next; e!=l; e=e->next) {
2947 + ms = list_entry(e, struct ip_masq, m_list);
2948 + if (s_addr==ms->daddr && s_port==ms->dport &&
2949 + d_port==ms->mport && d_addr==ms->maddr &&
2950 + protocol==ms->protocol) {
2952 + atomic_inc(&ms->refcnt);
2959 + IP_VS_DBG(7, "look/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
2960 + masq_proto_name(protocol),
2961 + NIPQUAD(s_addr), ntohs(s_port),
2962 + NIPQUAD(d_addr), ntohs(d_port),
2963 + ms?"hit":"not hit");
2970 + * Gets ip_masq associated with supplied parameters in the ip_vs_table.
2971 + * Called for pkts coming from inside-to-OUTside.
2972 + * s_addr, s_port: pkt source address (inside host)
2973 + * d_addr, d_port: pkt dest address (foreign host)
2974 + * Caller must lock tables
2976 +struct ip_masq * __ip_vs_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
2979 + struct ip_masq *ms;
2980 + struct list_head *l,*e;
2983 + * Check for "full" addressed entries
2985 + hash = ip_vs_hash_key(protocol, d_addr, d_port);
2987 + l = &ip_vs_table[hash];
2988 + for (e=l->next; e!=l; e=e->next) {
2989 + ms = list_entry(e, struct ip_masq, m_list);
2990 + if (d_addr == ms->daddr && d_port == ms->dport &&
2991 + s_port == ms->sport && s_addr == ms->saddr &&
2992 + protocol == ms->protocol) {
2994 + atomic_inc(&ms->refcnt);
3001 + IP_VS_DBG(7, "look/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
3002 + masq_proto_name(protocol),
3003 + NIPQUAD(s_addr), ntohs(s_port),
3004 + NIPQUAD(d_addr), ntohs(d_port),
3005 + ms?"hit":"not hit");
3012 + * Called by ip_vs_sched_persist to look for masq template.
3014 +static __inline__ struct ip_masq *ip_vs_in_get
3015 +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
3017 + struct ip_masq *ms;
3019 + read_lock(&__ip_masq_lock);
3020 + ms = __ip_vs_in_get(protocol, s_addr, s_port, d_addr, d_port);
3021 + read_unlock(&__ip_masq_lock);
3028 + * Returns hash value for virtual service
3030 +static __inline__ unsigned
3031 +ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
3033 + register unsigned porth = ntohs(port);
3035 + return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
3036 + & IP_VS_SVC_TAB_MASK;
3040 + * Returns hash value of fwmark for virtual service lookup
3042 +static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
3044 + return fwmark & IP_VS_SVC_TAB_MASK;
3048 + * Hashes ip_vs_service in the ip_vs_svc_table by <proto,addr,port>
3049 + * or in the ip_vs_svc_fwm_table by fwmark.
3050 + * Should be called with locked tables.
3051 + * Returns bool success.
3053 +int ip_vs_svc_hash(struct ip_vs_service *svc)
3057 + if (svc->flags & IP_VS_SVC_F_HASHED) {
3058 + IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
3059 + "called from %p\n", __builtin_return_address(0));
3063 + if (svc->fwmark == 0) {
3065 + * Hash by <protocol,addr,port> in ip_vs_svc_table
3067 + hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
3068 + list_add(&svc->s_list, &ip_vs_svc_table[hash]);
3071 + * Hash by fwmark in ip_vs_svc_fwm_table
3073 + hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
3074 + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
3077 + svc->flags |= IP_VS_SVC_F_HASHED;
3078 + atomic_inc(&svc->refcnt);
3084 + * Unhashes ip_vs_service from ip_vs_svc_table/ip_vs_svc_fwm_table.
3085 + * Should be called with locked tables.
3086 + * Returns bool success.
3088 +int ip_vs_svc_unhash(struct ip_vs_service *svc)
3090 + if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
3091 + IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
3092 + "called from %p\n", __builtin_return_address(0));
3096 + if (svc->fwmark == 0) {
3098 + * Remove it from the ip_vs_svc_table table.
3100 + list_del(&svc->s_list);
3103 + * Remove it from the ip_vs_svc_fwm_table table.
3105 + list_del(&svc->f_list);
3108 + svc->flags &= ~IP_VS_SVC_F_HASHED;
3109 + atomic_dec(&svc->refcnt);
3115 + * Lookup service by {proto,addr,port} in the service table.
3117 +static __inline__ struct ip_vs_service *
3118 +__ip_vs_lookup_service(__u16 protocol, __u32 vaddr, __u16 vport)
3121 + struct ip_vs_service *svc;
3122 + struct list_head *l,*e;
3125 + * Check for "full" addressed entries
3126 + * Note: as long as IP_VS_SVC_TAB_BITS is larger than zero,
3127 + * <TCP,addr,port> and <UDP,addr,port> have different hash
3128 + * keys, there is no need to do protocol checking.
3130 + hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
3132 + l = &ip_vs_svc_table[hash];
3133 + for (e=l->next; e!=l; e=e->next) {
3134 + svc = list_entry(e, struct ip_vs_service, s_list);
3135 + if ((svc->addr == vaddr)
3136 + && (svc->port == vport)) {
3147 + * Lookup service by fwmark in the service table.
3149 +static __inline__ struct ip_vs_service * __ip_vs_lookup_svc_fwm(__u32 fwmark)
3152 + struct ip_vs_service *svc;
3153 + struct list_head *l,*e;
3156 + * Check for fwmark-indexed entries
3158 + hash = ip_vs_svc_fwm_hashkey(fwmark);
3160 + l = &ip_vs_svc_fwm_table[hash];
3161 + for (e=l->next; e!=l; e=e->next) {
3162 + svc = list_entry(e, struct ip_vs_service, f_list);
3163 + if (svc->fwmark == fwmark) {
3172 +struct ip_vs_service *
3173 +ip_vs_lookup_service(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
3175 + struct ip_vs_service *svc;
3177 + read_lock(&__ip_vs_lock);
3181 + * Check the table hashed by fwmark first
3183 + svc = __ip_vs_lookup_svc_fwm(fwmark);
3189 + * Check the table hashed by <protocol,addr,port>
3190 + * first for "full" addressed entries
3192 + svc = __ip_vs_lookup_service(protocol, vaddr, vport);
3195 + && protocol == IPPROTO_TCP
3196 + && atomic_read(&ip_vs_ftpsvc_counter)
3197 + && (vport==FTPDATA || ntohs(vport)>=PROT_SOCK)){
3199 + * Check if ftp service entry exists, the packet
3200 + * might belong to FTP data connections.
3202 + svc = __ip_vs_lookup_service(protocol, vaddr, FTPPORT);
3206 + && atomic_read(&ip_vs_nullsvc_counter)) {
3208 + * Check if the catch-all port (port zero) exists
3210 + svc = __ip_vs_lookup_service(protocol, vaddr, 0);
3214 + read_unlock(&__ip_vs_lock);
3216 + IP_VS_DBG(5, "lookup_service fwm %d %s %u.%u.%u.%u:%d %s\n",
3218 + masq_proto_name(protocol),
3219 + NIPQUAD(vaddr), ntohs(vport),
3220 + svc?"hit":"not hit");
3227 + * Bind a destination with a service
3230 +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
3232 + atomic_inc(&svc->refcnt);
3237 + * Unbind a destination with its service
3240 +__ip_vs_unbind_svc(struct ip_vs_dest *dest)
3242 + struct ip_vs_service *svc = dest->svc;
3245 + if (atomic_dec_and_test(&svc->refcnt)) {
3246 + IP_VS_DBG(2, "release svc %s %u.%u.%u.%u:%d\n",
3247 + masq_proto_name(svc->protocol),
3248 + NIPQUAD(svc->addr), ntohs(svc->port));
3249 + kfree_s(svc, sizeof(struct ip_vs_service));
3255 + * Returns hash value for real service
3257 +static __inline__ unsigned
3258 +ip_vs_rs_hashkey(__u32 addr, __u16 port)
3260 + register unsigned porth = ntohs(port);
3262 + return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) & IP_VS_RTAB_MASK;
3266 + * Hashes ip_vs_dest in ip_vs_rtable by proto,addr,port.
3267 + * should be called with locked tables.
3268 + * returns bool success.
3270 +int ip_vs_rs_hash(struct ip_vs_dest *dest)
3274 + if (!list_empty(&dest->d_list)) {
3279 + * Hash by proto,addr,port,
3280 + * which are the parameters of the real service.
3282 + hash = ip_vs_rs_hashkey(dest->addr, dest->port);
3283 + list_add(&dest->d_list, &ip_vs_rtable[hash]);
3289 + * Unhashes ip_vs_dest from ip_vs_rtable.
3290 + * should be called with locked tables.
3291 + * returns bool success.
3293 +int ip_vs_rs_unhash(struct ip_vs_dest *dest)
3296 + * Remove it from the ip_vs_rtable table.
3298 + if (!list_empty(&dest->d_list)) {
3299 + list_del(&dest->d_list);
3300 + INIT_LIST_HEAD(&dest->d_list);
3307 + * Lookup real service by {proto,addr,port} in the real service table.
3309 +struct ip_vs_dest * __ip_vs_lookup_real_service(__u16 protocol,
3310 + __u32 daddr, __u16 dport)
3313 + struct ip_vs_dest *dest;
3314 + struct list_head *l,*e;
3317 + * Check for "full" addressed entries
3318 + * Return the first found entry
3320 + hash = ip_vs_rs_hashkey(daddr, dport);
3322 + l = &ip_vs_rtable[hash];
3323 + for (e=l->next; e!=l; e=e->next) {
3324 + dest = list_entry(e, struct ip_vs_dest, d_list);
3325 + if ((dest->addr == daddr)
3326 + && (dest->port == dport)
3327 + && ((dest->protocol == protocol) || dest->vfwmark)) {
3337 + * Lookup destination by {addr,port} in the given service
3339 +struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc,
3340 + __u32 daddr, __u16 dport)
3342 + struct ip_vs_dest *dest;
3343 + struct list_head *l, *e;
3345 + read_lock_bh(&__ip_vs_lock);
3348 + * Find the destination for the given service
3350 + l = &svc->destinations;
3351 + for (e=l->next; e!=l; e=e->next) {
3352 + dest = list_entry(e, struct ip_vs_dest, n_list);
3353 + if ((dest->addr == daddr) && (dest->port == dport)) {
3355 + read_unlock_bh(&__ip_vs_lock);
3360 + read_unlock_bh(&__ip_vs_lock);
3366 + * Lookup dest by {svc,addr,port} in the destination trash.
3367 + * Called by ip_vs_add_dest with the __ip_vs_lock.
3368 + * The destination trash is used to hold the destinations that are removed
3369 + * from the service table but are still referenced by some masq entries.
3370 + * The reason to add the destination trash is when the dest is temporary
3371 + * down (either by administrator or by monitor program), the dest can be
3372 + * picked back from the trash, the remaining connections to the dest can
3373 + * continue, and the counting information of the dest is also useful for
3376 +struct ip_vs_dest * __ip_vs_get_trash_dest(struct ip_vs_service *svc,
3377 + __u32 daddr, __u16 dport)
3379 + struct ip_vs_dest *dest;
3380 + struct list_head *l, *e;
3383 + * Find the destination in trash
3385 + l = &ip_vs_dest_trash;
3386 + for (e=l->next; e!=l; e=e->next) {
3387 + dest = list_entry(e, struct ip_vs_dest, n_list);
3388 + IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%d still in trash, "
3391 + NIPQUAD(dest->addr), ntohs(dest->port),
3392 + atomic_read(&dest->refcnt));
3393 + if (dest->addr == daddr &&
3394 + dest->port == dport &&
3395 + dest->vfwmark == svc->fwmark &&
3397 + (dest->protocol == svc->protocol &&
3398 + dest->vaddr == svc->addr &&
3399 + dest->vport == svc->port))) {
3405 + * Try to purge the destination from trash if not referenced
3407 + if (atomic_read(&dest->refcnt) == 1) {
3408 + IP_VS_DBG(3, "Remove destination %u/%u.%u.%u.%u:%d "
3411 + NIPQUAD(dest->addr), ntohs(dest->port));
3413 + list_del(&dest->n_list);
3414 + __ip_vs_unbind_svc(dest);
3415 + kfree_s(dest, sizeof(*dest));
3423 + * Update a destination in the given service
3425 +void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
3426 + struct ip_masq_ctl *mctl)
3428 + struct ip_vs_user *mm = &mctl->u.vs_user;
3431 + * Set the weight and the flags
3433 + dest->weight = mm->weight;
3434 + dest->masq_flags = mm->masq_flags;
3436 + dest->masq_flags |= IP_MASQ_F_VS;
3437 + dest->masq_flags |= IP_MASQ_F_VS_INACTIVE;
3440 + * Check if local node and update the flags
3442 + if (inet_addr_type(mm->daddr) == RTN_LOCAL) {
3443 + dest->masq_flags = (dest->masq_flags & ~IP_MASQ_F_VS_FWD_MASK)
3444 + | IP_MASQ_F_VS_LOCALNODE;
3448 + * Set the IP_MASQ_F_VS_NO_OUTPUT flag if not masquerading
3450 + if ((dest->masq_flags & IP_MASQ_F_VS_FWD_MASK) != 0) {
3451 + dest->masq_flags |= IP_MASQ_F_VS_NO_OUTPUT;
3454 + * Put the real service in ip_vs_rtable if not present.
3455 + * For now only for NAT!
3457 + ip_vs_rs_hash(dest);
3461 + /* bind the service */
3463 + __ip_vs_bind_svc(dest, svc);
3465 + if (dest->svc != svc) {
3466 + __ip_vs_unbind_svc(dest);
3467 + __ip_vs_bind_svc(dest, svc);
3472 + * Set the dest status flags
3474 + dest->flags |= IP_VS_DEST_F_AVAILABLE;
3479 + * Create a destination for the given service
3481 +struct ip_vs_dest *ip_vs_new_dest(struct ip_vs_service *svc,
3482 + struct ip_masq_ctl *mctl)
3484 + struct ip_vs_dest *dest;
3485 + struct ip_vs_user *mm = &mctl->u.vs_user;
3489 + dest = (struct ip_vs_dest*) kmalloc(sizeof(struct ip_vs_dest),
3491 + if (dest == NULL) {
3492 + IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
3495 + memset(dest, 0, sizeof(struct ip_vs_dest));
3497 + dest->protocol = svc->protocol;
3498 + dest->vaddr = svc->addr;
3499 + dest->vport = svc->port;
3500 + dest->vfwmark = svc->fwmark;
3501 + dest->addr = mm->daddr;
3502 + dest->port = mm->dport;
3504 + atomic_set(&dest->activeconns, 0);
3505 + atomic_set(&dest->inactconns, 0);
3506 + atomic_set(&dest->refcnt, 0);
3508 + INIT_LIST_HEAD(&dest->d_list);
3509 + dest->stats.lock = SPIN_LOCK_UNLOCKED;
3510 + __ip_vs_update_dest(svc, dest, mctl);
3519 + * Add a destination into an existing service
3521 +int ip_vs_add_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3523 + struct ip_vs_dest *dest;
3524 + struct ip_vs_user *mm = &mctl->u.vs_user;
3525 + __u32 daddr = mm->daddr;
3526 + __u16 dport = mm->dport;
3530 + if (mm->weight < 0) {
3531 + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
3536 + * Check if the dest already exists in the list
3538 + dest = ip_vs_lookup_dest(svc, daddr, dport);
3539 + if (dest != NULL) {
3540 + IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
3544 + write_lock_bh(&__ip_vs_lock);
3547 + * Check if the dest already exists in the trash and
3548 + * is from the same service
3550 + dest = __ip_vs_get_trash_dest(svc, daddr, dport);
3551 + if (dest != NULL) {
3552 + IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%d from trash, "
3553 + "refcnt=%d, service %u.%u.%u.%u:%d\n",
3554 + NIPQUAD(daddr), ntohs(dport),
3555 + atomic_read(&dest->refcnt),
3556 + NIPQUAD(dest->vaddr),
3557 + ntohs(dest->vport));
3560 + * Get the destination from the trash
3562 + list_del(&dest->n_list);
3563 + list_add(&dest->n_list, &svc->destinations);
3565 + __ip_vs_update_dest(svc, dest, mctl);
3567 + write_unlock_bh(&__ip_vs_lock);
3572 + * Allocate and initialize the dest structure
3574 + dest = ip_vs_new_dest(svc, mctl);
3575 + if (dest == NULL) {
3576 + write_unlock_bh(&__ip_vs_lock);
3577 + IP_VS_ERR("ip_vs_add_dest(): out of memory\n");
3582 + * Add the dest entry into the list
3584 + list_add(&dest->n_list, &svc->destinations);
3585 + atomic_inc(&dest->refcnt);
3587 + write_unlock_bh(&__ip_vs_lock);
3595 + * Edit a destination in the given service
3597 +int ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3599 + struct ip_vs_dest *dest;
3600 + struct ip_vs_user *mm = &mctl->u.vs_user;
3601 + __u32 daddr = mm->daddr;
3602 + __u16 dport = mm->dport;
3606 + if (mm->weight < 0) {
3607 + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
3612 + * Lookup the destination list
3614 + dest = ip_vs_lookup_dest(svc, daddr, dport);
3615 + if (dest == NULL) {
3616 + IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
3620 + write_lock_bh(&__ip_vs_lock);
3622 + __ip_vs_update_dest(svc, dest, mctl);
3624 + write_unlock_bh(&__ip_vs_lock);
3632 + * Delete a destination from the given service
3634 +void __ip_vs_del_dest(struct ip_vs_dest *dest)
3636 + dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
3639 + * Remove it from the d-linked destination list.
3641 + list_del(&dest->n_list);
3644 + * Remove it from the d-linked list with the real services.
3646 + ip_vs_rs_unhash(dest);
3649 + * Decrease the refcnt of the dest, and free the dest
3650 + * if nobody refers to it (refcnt=0). Otherwise, throw
3651 + * the destination into the trash.
3653 + if (atomic_dec_and_test(&dest->refcnt)) {
3654 + /* simply decrease svc->refcnt here, let the caller check
3655 + and release the service if nobody refers to it.
3656 + Only user context can release destination and service,
3657 + and only user context can update virtual service at a
3658 + time, so the operation here is OK */
3659 + atomic_dec(&dest->svc->refcnt);
3660 + kfree_s(dest, sizeof(*dest));
3662 + IP_VS_DBG(3, "Move dest %u.%u.%u.%u:%d into trash, "
3664 + NIPQUAD(dest->addr), ntohs(dest->port),
3665 + atomic_read(&dest->refcnt));
3666 + list_add(&dest->n_list, &ip_vs_dest_trash);
3667 + atomic_inc(&dest->refcnt);
3671 +int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3673 + struct ip_vs_dest *dest;
3674 + struct ip_vs_user *mm = &mctl->u.vs_user;
3675 + __u32 daddr = mm->daddr;
3676 + __u16 dport = mm->dport;
3681 + * Lookup the destination list
3683 + dest = ip_vs_lookup_dest(svc, daddr, dport);
3684 + if (dest == NULL) {
3685 + IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
3689 + write_lock_bh(&__ip_vs_lock);
3692 + * Remove dest from the destination list
3694 + __ip_vs_del_dest(dest);
3697 + * Called the update_service function of its scheduler
3699 + svc->scheduler->update_service(svc);
3701 + write_unlock_bh(&__ip_vs_lock);
3710 + * Add a service into the service hash table
3712 +int ip_vs_add_service(struct ip_masq_ctl *mctl)
3714 + struct ip_vs_user *mm = &mctl->u.vs_user;
3715 + __u16 protocol = mm->protocol;
3716 + __u32 vaddr = mm->vaddr;
3717 + __u16 vport = mm->vport;
3718 + __u32 vfwmark = mm->vfwmark;
3721 + struct ip_vs_scheduler *sched;
3722 + struct ip_vs_service *svc;
3727 + * Lookup the scheduler, by 'mctl->m_tname'
3729 + sched = ip_vs_lookup_scheduler(mctl->m_tname);
3730 + if (sched == NULL) {
3731 + IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n",
3736 + write_lock_bh(&__ip_vs_lock);
3739 + * Check if the service already exists
3742 + svc = __ip_vs_lookup_service(protocol, vaddr, vport);
3744 + svc = __ip_vs_lookup_svc_fwm(vfwmark);
3746 + if (svc != NULL) {
3747 + IP_VS_DBG(1, "ip_vs_add_service: service already exists.\n");
3752 + svc = (struct ip_vs_service*)
3753 + kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
3754 + if (svc == NULL) {
3755 + IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
3759 + memset(svc, 0, sizeof(struct ip_vs_service));
3761 + svc->protocol = protocol;
3762 + svc->addr = vaddr;
3763 + svc->port = vport;
3764 + svc->fwmark = vfwmark;
3765 + svc->flags = mm->vs_flags;
3766 + svc->timeout = mm->timeout;
3767 + svc->netmask = mm->netmask;
3769 + INIT_LIST_HEAD(&svc->destinations);
3770 + atomic_set(&svc->refcnt, 0);
3771 + svc->stats.lock = SPIN_LOCK_UNLOCKED;
3774 + * Bind the scheduler
3776 + ip_vs_bind_scheduler(svc, sched);
3779 + * Hash the service into the service table
3781 + ip_vs_svc_hash(svc);
3784 + * Update the virtual service counters
3786 + if (vport == FTPPORT)
3787 + atomic_inc(&ip_vs_ftpsvc_counter);
3788 + else if (vport == 0)
3789 + atomic_inc(&ip_vs_nullsvc_counter);
3792 + write_unlock_bh(&__ip_vs_lock);
3799 + * Edit a service and bind it with a new scheduler
3801 +int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3803 + struct ip_vs_user *mm = &mctl->u.vs_user;
3804 + struct ip_vs_scheduler *sched;
3809 + * Lookup the scheduler, by 'mctl->m_tname'
3811 + sched = ip_vs_lookup_scheduler(mctl->m_tname);
3812 + if (sched == NULL) {
3813 + IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n",
3818 + write_lock_bh(&__ip_vs_lock);
3821 + * Set the flags and timeout value
3823 + svc->flags = mm->vs_flags | IP_VS_SVC_F_HASHED;
3824 + svc->timeout = mm->timeout;
3825 + svc->netmask = mm->netmask;
3828 + * Unbind the old scheduler
3830 + ip_vs_unbind_scheduler(svc);
3833 + * Bind the new scheduler
3835 + ip_vs_bind_scheduler(svc, sched);
3837 + write_unlock_bh(&__ip_vs_lock);
3845 + * Delete a service from the service list
3847 +int __ip_vs_del_service(struct ip_vs_service *svc)
3849 + struct list_head *l;
3850 + struct ip_vs_dest *dest;
3853 + * Unbind scheduler
3855 + ip_vs_unbind_scheduler(svc);
3858 + * Unlink the whole destination list
3860 + l = &svc->destinations;
3861 + while (l->next != l) {
3862 + dest = list_entry(l->next, struct ip_vs_dest, n_list);
3863 + __ip_vs_del_dest(dest);
3867 + * Unhash it from the service table
3869 + if (ip_vs_svc_unhash(svc)) {
3871 + * Update the virtual service counters
3873 + if (svc->port == FTPPORT)
3874 + atomic_dec(&ip_vs_ftpsvc_counter);
3875 + else if (svc->port == 0)
3876 + atomic_dec(&ip_vs_nullsvc_counter);
3879 + * Free the service if nobody refers to it
3881 + if (atomic_read(&svc->refcnt) == 0) {
3882 + IP_VS_DBG(2, "release svc %s %u.%u.%u.%u:%d\n",
3883 + masq_proto_name(svc->protocol),
3884 + NIPQUAD(svc->addr), ntohs(svc->port));
3885 + kfree_s(svc, sizeof(struct ip_vs_service));
3889 + * Called the update_service function of its scheduler
3891 + svc->scheduler->update_service(svc);
3898 +int ip_vs_del_service(struct ip_vs_service *svc)
3905 + write_lock_bh(&__ip_vs_lock);
3907 + __ip_vs_del_service(svc);
3909 + write_unlock_bh(&__ip_vs_lock);
3916 + * Flush all the virtual services
3918 +int ip_vs_flush(void)
3921 + struct ip_vs_service *svc;
3922 + struct list_head *l;
3924 + write_lock_bh(&__ip_vs_lock);
3927 + * Flush the service table hashed by <protocol,addr,port>
3929 + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3930 + l = &ip_vs_svc_table[idx];
3931 + while (l->next != l) {
3932 + svc = list_entry(l->next,struct ip_vs_service,s_list);
3934 + if (__ip_vs_del_service(svc))
3940 + * Flush the service table hashed by fwmark
3942 + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3943 + l = &ip_vs_svc_fwm_table[idx];
3944 + while (l->next != l) {
3945 + svc = list_entry(l->next,struct ip_vs_service,f_list);
3947 + if (__ip_vs_del_service(svc))
3953 + write_unlock_bh(&__ip_vs_lock);
3959 + * Change the connection counter and the flags if the masq state changes
3960 + * Called by the masq_tcp_state function.
3962 +void ip_vs_set_state(struct ip_masq *ms, int new_state)
3964 + struct ip_vs_dest *dest = ms->dest;
3967 + (ms->flags & IP_MASQ_F_VS) && (new_state != ms->state)) {
3968 + if (!(ms->flags & IP_MASQ_F_VS_INACTIVE) &&
3969 + (new_state != IP_MASQ_S_ESTABLISHED)) {
3970 + atomic_dec(&dest->activeconns);
3971 + atomic_inc(&dest->inactconns);
3972 + ms->flags |= IP_MASQ_F_VS_INACTIVE;
3973 + } else if ((ms->flags & IP_MASQ_F_VS_INACTIVE) &&
3974 + (new_state == IP_MASQ_S_ESTABLISHED)) {
3975 + atomic_inc(&dest->activeconns);
3976 + atomic_dec(&dest->inactconns);
3977 + ms->flags &= ~IP_MASQ_F_VS_INACTIVE;
3980 + IP_VS_DBG(8, "Set-state masq fwd:%c s:%s c:%u.%u.%u.%u:%d "
3981 + "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d flg:%X cnt:%d\n",
3982 + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
3983 + NIPQUAD(ms->daddr), ntohs(ms->dport),
3984 + NIPQUAD(ms->maddr), ntohs(ms->mport),
3985 + NIPQUAD(ms->saddr), ntohs(ms->sport),
3986 + ms->flags, atomic_read(&ms->refcnt));
3992 + * Bind a masq entry with a virtual service destination
3993 + * Called when a new masq entry is created for VS.
3995 +void ip_vs_bind_masq(struct ip_masq *ms, struct ip_vs_dest *dest)
3997 + ms->flags |= dest->masq_flags;
4001 + * Increase the refcnt counter of the dest.
4003 + atomic_inc(&dest->refcnt);
4005 + IP_VS_DBG(9, "Bind-masq fwd:%c s:%s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
4006 + "d:%u.%u.%u.%u:%d flg:%X cnt:%d destcnt:%d\n",
4007 + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
4008 + NIPQUAD(ms->daddr), ntohs(ms->dport),
4009 + NIPQUAD(ms->maddr), ntohs(ms->mport),
4010 + NIPQUAD(ms->saddr), ntohs(ms->sport),
4011 + ms->flags, atomic_read(&ms->refcnt),
4012 + atomic_read(&dest->refcnt));
4017 + * Unbind a masq entry with its VS destination
4018 + * Called by the masq_expire function.
4020 +void ip_vs_unbind_masq(struct ip_masq *ms)
4022 + struct ip_vs_dest *dest = ms->dest;
4024 + IP_VS_DBG(9, "Unbind-masq fwd:%c s:%s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
4025 + "d:%u.%u.%u.%u:%d flg:%X cnt:%d destcnt:%d\n",
4026 + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
4027 + NIPQUAD(ms->daddr),ntohs(ms->dport),
4028 + NIPQUAD(ms->maddr),ntohs(ms->mport),
4029 + NIPQUAD(ms->saddr),ntohs(ms->sport),
4030 + ms->flags, atomic_read(&ms->refcnt),
4031 + atomic_read(&dest->refcnt));
4035 + * Decrease the inactconns or activeconns counter
4036 + * if it is not a masq template (ms->dport!=0).
4039 + if (ms->flags & IP_MASQ_F_VS_INACTIVE) {
4040 + atomic_dec(&dest->inactconns);
4042 + atomic_dec(&dest->activeconns);
4047 + * Simply decrease the refcnt of the dest, because the
4048 + * dest will be either in service's destination list
4049 + * or in the trash.
4051 + atomic_dec(&dest->refcnt);
4057 + * Checking if the destination of a masq template is available.
4058 + * If available, return 1, otherwise return 0 and invalidate this
4061 +int ip_vs_check_template(struct ip_masq *mst)
4063 + struct ip_vs_dest *dest = mst->dest;
4066 + * Checking the dest server status.
4068 + if ((dest == NULL) ||
4069 + !(dest->flags & IP_VS_DEST_F_AVAILABLE)) {
4070 + IP_VS_DBG(9, "check_template: dest not available for prot %s "
4071 + "src %u.%u.%u.%u:%d dest %u.%u.%u.%u:%d -> %X:%X\n",
4072 + masq_proto_name(mst->protocol),
4073 + NIPQUAD(mst->daddr), ntohs(mst->dport),
4074 + NIPQUAD(mst->maddr), ntohs(mst->mport),
4075 + (dest!=NULL)? ntohl(dest->addr):0,
4076 + (dest!=NULL)? ntohs(dest->port):0);
4079 + * Invalidate the masq template
4081 + ip_vs_unhash(mst);
4082 + mst->sport = 65535;
4083 + mst->mport = 65535;
4088 + * Simply decrease the refcnt of the template,
4089 + * don't restart its timer.
4091 + atomic_dec(&mst->refcnt);
4099 + * IPVS persistent scheduling function
4100 + * It creates a masq entry according to its template if exists, or selects
4101 + * a server and creates a masq entry plus a template.
4104 +ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
4106 + struct ip_masq *ms = NULL;
4107 + struct ip_vs_dest *dest;
4108 + const __u16 *portp;
4109 + struct ip_masq *mst;
4110 + __u16 dport; /* destination port to forward */
4111 + __u32 snet; /* source network of the client, after masking */
4113 + portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
4115 + /* Mask saddr with the netmask to adjust template granularity */
4116 + snet = iph->saddr & svc->netmask;
4118 + IP_VS_DBG(6, "P-schedule: src %u.%u.%u.%u:%d dest %u.%u.%u.%u:%d "
4119 + "snet %u.%u.%u.%u/%u.%u.%u.%u\n",
4120 + NIPQUAD(iph->saddr), ntohs(portp[0]),
4121 + NIPQUAD(iph->daddr), ntohs(portp[1]),
4122 + NIPQUAD(snet), NIPQUAD(svc->netmask));
4125 + * As far as we know, FTP is a very complicated network protocol, and
4126 + * it uses control connection and data connections. For active FTP,
4127 + * FTP server initilize data connection to the client, its source port
4128 + * is often 20. For passive FTP, FTP server tells the clients the port
4129 + * that it passively listens to, and the client issues the data
4130 + * connection. In the tunneling or direct routing mode, the load
4131 + * balancer is on the client-to-server half of connection, the port
4132 + * number is unknown to the load balancer. So, a template masq like
4133 + * <daddr, 0, maddr, 0, saddr, 0> is created for persistent FTP
4134 + * service, and a template like <daddr, 0, maddr, mport, saddr, sport>
4135 + * is created for other persistent services.
4137 + if (portp[1] == svc->port) {
4138 + /* Check if a template already exists */
4139 + if (svc->port != FTPPORT)
4140 + mst = ip_vs_in_get(iph->protocol, snet, 0,
4141 + iph->daddr, portp[1]);
4143 + mst = ip_vs_in_get(iph->protocol, snet, 0,
4146 + if (!mst || !ip_vs_check_template(mst)) {
4148 + * No template found or the dest of the masq
4149 + * template is not available.
4151 + read_lock(&__ip_vs_lock);
4153 + dest = svc->scheduler->schedule(svc, iph);
4154 + if (dest == NULL) {
4155 + IP_VS_DBG(1, "P-schedule: no dest found.\n");
4156 + read_unlock(&__ip_vs_lock);
4161 + * Create a template like <protocol,daddr,0,
4162 + * maddr,mport,saddr,sport> for non-ftp service,
4163 + * and <protocol,daddr,0,maddr,0,saddr,0>
4164 + * for ftp service.
4166 + if (svc->port != FTPPORT)
4167 + mst = ip_masq_new_vs(iph->protocol,
4168 + iph->daddr, portp[1],
4169 + dest->addr, dest->port,
4173 + mst = ip_masq_new_vs(iph->protocol,
4178 + if (mst == NULL) {
4179 + IP_VS_ERR("ip_masq_new_vs template failed\n");
4180 + read_unlock(&__ip_vs_lock);
4185 + * Bind the template with dest and set timeout.
4187 + ip_vs_bind_masq(mst, dest);
4188 + mst->timeout = svc->timeout;
4190 + read_unlock(&__ip_vs_lock);
4193 + * Template found and its destination is available.
4198 + * Delete its timer so that it can be put back.
4200 + del_sltimer(&mst->timer);
4202 + dport = dest->port;
4205 + * Note: persistent fwmark-based services and persistent
4206 + * port zero service are handled here.
4207 + * fwmark template: <IPPROTO_IP,daddr,0,fwmark,0,saddr,0>
4208 + * port zero template: <protocol,daddr,0,maddr,0,saddr,0>
4211 + mst = ip_vs_in_get(IPPROTO_IP, snet, 0,
4212 + htonl(svc->fwmark), 0);
4214 + mst = ip_vs_in_get(iph->protocol,
4215 + snet, 0, iph->daddr, 0);
4217 + if (!mst || !ip_vs_check_template(mst)) {
4219 + * If it is not persistent port zero, return NULL.
4224 + read_lock(&__ip_vs_lock);
4226 + dest = svc->scheduler->schedule(svc, iph);
4227 + if (dest == NULL) {
4228 + IP_VS_DBG(1, "P-schedule: no dest found.\n");
4229 + read_unlock(&__ip_vs_lock);
4234 + * Create a template according to the service
4237 + mst = ip_masq_new_vs(IPPROTO_IP,
4238 + htonl(svc->fwmark), 0,
4243 + mst = ip_masq_new_vs(iph->protocol,
4248 + if (mst == NULL) {
4249 + IP_VS_ERR("ip_masq_new_vs template failed\n");
4250 + read_unlock(&__ip_vs_lock);
4255 + * Bind the template with dest and set timeout.
4257 + ip_vs_bind_masq(mst, dest);
4258 + mst->timeout = svc->timeout;
4259 + read_unlock(&__ip_vs_lock);
4264 + * Delete its timer so that it can be put back.
4266 + del_sltimer(&mst->timer);
4272 + * Create a new masq according to the template
4274 + ms = ip_masq_new_vs(iph->protocol,
4275 + iph->daddr, portp[1],
4276 + dest->addr, dport,
4277 + iph->saddr, portp[0],
4280 + IP_VS_ERR("ip_masq_new_vs failed\n");
4286 + * Bind the masq entry with the vs dest.
4288 + ip_vs_bind_masq(ms, dest);
4291 + * Increase the inactive connection counter
4292 + * because it is in Syn-Received
4293 + * state (inactive) when the masq is created.
4295 + atomic_inc(&dest->inactconns);
4300 + ip_masq_control_add(ms, mst);
4308 + * IPVS main scheduling function
4309 + * It selects a server according to the virtual service, and
4310 + * creates a masq entry.
4312 +struct ip_masq *ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
4314 + struct ip_masq *ms = NULL;
4315 + struct ip_vs_dest *dest;
4316 + const __u16 *portp;
4319 + * Persistent service
4321 + if (svc->flags & IP_VS_SVC_F_PERSISTENT)
4322 + return ip_vs_sched_persist(svc, iph);
4325 + * Non-persistent service
4327 + portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
4328 + if (!svc->fwmark && portp[1] != svc->port) {
4330 + IP_VS_ERR("Schedule: port zero only supported in persistent services, check your ipvs configuration\n");
4334 + read_lock(&__ip_vs_lock);
4336 + dest = svc->scheduler->schedule(svc, iph);
4337 + if (dest == NULL) {
4338 + IP_VS_DBG(1, "Schedule: no dest found.\n");
4339 + read_unlock(&__ip_vs_lock);
4344 + * Create a masquerading entry.
4346 + ms = ip_masq_new_vs(iph->protocol,
4347 + iph->daddr, portp[1],
4348 + dest->addr, dest->port?dest->port:portp[1],
4349 + iph->saddr, portp[0],
4352 + IP_VS_ERR("Schedule: ip_masq_new_vs failed\n");
4353 + read_unlock(&__ip_vs_lock);
4358 + * Bind the masq entry with the vs dest.
4360 + ip_vs_bind_masq(ms, dest);
4363 + * Increase the inactive connection counter because it is in
4364 + * Syn-Received state (inactive) when the masq is created.
4366 + atomic_inc(&dest->inactconns);
4368 + IP_VS_DBG(9, "Schedule masq fwd:%c s:%s c:%u.%u.%u.%u:%d "
4369 + "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d flg:%X cnt:%d\n",
4370 + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
4371 + NIPQUAD(ms->daddr),ntohs(ms->dport),
4372 + NIPQUAD(ms->maddr),ntohs(ms->mport),
4373 + NIPQUAD(ms->saddr),ntohs(ms->sport),
4374 + ms->flags, atomic_read(&ms->refcnt));
4376 + read_unlock(&__ip_vs_lock);
4383 + * Pass or drop the packet.
4384 + * Called by ip_fw_demasquerade, when the virtual service is available but
4385 + * no destination is available for a new connection.
4387 +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb)
4389 + struct iphdr *iph = skb->nh.iph;
4390 + __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
4393 + * When the virtual ftp service is presented, packets destined
4394 + * for other services on the VIP may get here (except services
4395 + * listed in the ipvs table), pass the packets, because it is
4396 + * not ipvs job to decide to drop the packets.
4398 + if ((svc->port == FTPPORT) && (portp[1] != FTPPORT))
4402 + * Notify the client that the destination is unreachable, and
4403 + * release the socket buffer.
4404 + * Since it is in IP layer, the TCP socket is not actually
4405 + * created, the TCP RST packet cannot be sent, instead that
4406 + * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
4408 + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
4415 + * IPVS user control entry
4417 +int ip_vs_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
4419 + struct ip_vs_service *svc = NULL;
4420 + struct ip_vs_user *mm = &mctl->u.vs_user;
4421 + __u32 vaddr = mm->vaddr;
4422 + __u16 vport = mm->vport;
4423 + int proto_num = masq_proto_num(mm->protocol);
4426 + * Check the size of mctl, no overflow...
4428 + if (optlen != sizeof(*mctl))
4432 + * Flush all the virtual service...
4434 + if (mctl->m_cmd == IP_MASQ_CMD_FLUSH)
4435 + return ip_vs_flush();
4438 + * Check for valid protocol: TCP or UDP
4440 + if (mm->vfwmark == 0 && (proto_num < 0 || proto_num > 1)) {
4441 + IP_VS_INFO("vs_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s",
4442 + ntohs(mm->protocol),
4443 + NIPQUAD(vaddr), ntohs(vport), mctl->m_tname);
4448 + * Lookup the exact service by (protocol, vaddr, vport)
4450 + read_lock(&__ip_vs_lock);
4452 + if (mm->vfwmark == 0)
4453 + svc = __ip_vs_lookup_service(mm->protocol, vaddr, vport);
4455 + svc = __ip_vs_lookup_svc_fwm(mm->vfwmark);
4457 + read_unlock(&__ip_vs_lock);
4459 + switch (mctl->m_cmd) {
4460 + case IP_MASQ_CMD_ADD:
4464 + return ip_vs_add_service(mctl);
4466 + case IP_MASQ_CMD_SET:
4470 + return ip_vs_edit_service(svc, mctl);
4472 + case IP_MASQ_CMD_DEL:
4476 + return ip_vs_del_service(svc);
4478 + case IP_MASQ_CMD_ADD_DEST:
4482 + return ip_vs_add_dest(svc, mctl);
4484 + case IP_MASQ_CMD_SET_DEST:
4488 + return ip_vs_edit_dest(svc, mctl);
4490 + case IP_MASQ_CMD_DEL_DEST:
4494 + return ip_vs_del_dest(svc, mctl);
4500 +#ifdef CONFIG_SYSCTL
4502 +static int ip_vs_sysctl_defense_mode(ctl_table *ctl, int write,
4503 + struct file * filp,void *buffer, size_t *lenp)
4505 + int *valp = ctl->data;
4509 + ret = proc_dointvec(ctl, write, filp, buffer, lenp);
4510 + if (write && (*valp != val)) {
4511 + if ((*valp < 0) || (*valp > 3)) {
4512 + /* Restore the correct value */
4515 + update_defense_level();
4521 +ctl_table ipv4_vs_table[] = {
4522 +#ifdef CONFIG_IP_VS_DEBUG
4523 + {NET_IPV4_VS_DEBUG_LEVEL, "debug_level",
4524 + &sysctl_ip_vs_debug_level, sizeof(int), 0644, NULL,
4527 + {NET_IPV4_VS_AMEMTHRESH, "amemthresh",
4528 + &sysctl_ip_vs_amemthresh, sizeof(int), 0644, NULL,
4530 + {NET_IPV4_VS_AMDROPRATE, "am_droprate",
4531 + &sysctl_ip_vs_am_droprate, sizeof(int), 0644, NULL,
4533 + {NET_IPV4_VS_DROP_ENTRY, "drop_entry",
4534 + &sysctl_ip_vs_drop_entry, sizeof(int), 0644, NULL,
4535 + &ip_vs_sysctl_defense_mode},
4536 + {NET_IPV4_VS_DROP_PACKET, "drop_packet",
4537 + &sysctl_ip_vs_drop_packet, sizeof(int), 0644, NULL,
4538 + &ip_vs_sysctl_defense_mode},
4539 + {NET_IPV4_VS_SECURE_TCP, "secure_tcp",
4540 + &sysctl_ip_vs_secure_tcp, sizeof(int), 0644, NULL,
4541 + &ip_vs_sysctl_defense_mode},
4542 + {NET_IPV4_VS_TO_ES, "timeout_established",
4543 + &masq_timeout_table_dos.timeout[IP_MASQ_S_ESTABLISHED],
4544 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4545 + {NET_IPV4_VS_TO_SS, "timeout_synsent",
4546 + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYN_SENT],
4547 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4548 + {NET_IPV4_VS_TO_SR, "timeout_synrecv",
4549 + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYN_RECV],
4550 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4551 + {NET_IPV4_VS_TO_FW, "timeout_finwait",
4552 + &masq_timeout_table_dos.timeout[IP_MASQ_S_FIN_WAIT],
4553 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4554 + {NET_IPV4_VS_TO_TW, "timeout_timewait",
4555 + &masq_timeout_table_dos.timeout[IP_MASQ_S_TIME_WAIT],
4556 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4557 + {NET_IPV4_VS_TO_CL, "timeout_close",
4558 + &masq_timeout_table_dos.timeout[IP_MASQ_S_CLOSE],
4559 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4560 + {NET_IPV4_VS_TO_CW, "timeout_closewait",
4561 + &masq_timeout_table_dos.timeout[IP_MASQ_S_CLOSE_WAIT],
4562 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4563 + {NET_IPV4_VS_TO_LA, "timeout_lastack",
4564 + &masq_timeout_table_dos.timeout[IP_MASQ_S_LAST_ACK],
4565 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4566 + {NET_IPV4_VS_TO_LI, "timeout_listen",
4567 + &masq_timeout_table_dos.timeout[IP_MASQ_S_LISTEN],
4568 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4569 + {NET_IPV4_VS_TO_SA, "timeout_synack",
4570 + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYNACK],
4571 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4572 + {NET_IPV4_VS_TO_UDP, "timeout_udp",
4573 + &masq_timeout_table_dos.timeout[IP_MASQ_S_UDP],
4574 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4575 + {NET_IPV4_VS_TO_ICMP, "timeout_icmp",
4576 + &masq_timeout_table_dos.timeout[IP_MASQ_S_ICMP],
4577 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4582 +#ifdef CONFIG_PROC_FS
4584 + * Write the contents of the VS rule table to a PROCfs file.
4586 +static int ip_vs_procinfo(char *buf, char **start, off_t offset,
4587 + int length, int *eof, void *data)
4591 + char temp[64], temp2[32];
4593 + struct ip_vs_service *svc;
4594 + struct ip_vs_dest *dest;
4595 + struct list_head *l, *e, *p, *q;
4598 + * Note: since the length of the buffer is usually the multiple
4599 + * of 512, it is good to use fixed record of the divisor of 512,
4600 + * so that records won't be truncated at buffer boundary.
4603 + if (pos > offset) {
4605 + "IP Virtual Server version %d.%d.%d (size=%d)",
4606 + NVERSION(IP_VS_VERSION_CODE), IP_VS_TAB_SIZE);
4607 + len += sprintf(buf+len, "%-63s\n", temp);
4608 + len += sprintf(buf+len, "%-63s\n",
4609 + "Prot LocalAddress:Port Scheduler Flags");
4610 + len += sprintf(buf+len, "%-63s\n",
4611 + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn");
4614 + read_lock_bh(&__ip_vs_lock);
4616 + /* print the service table hashed by <protocol,addr,port> */
4617 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4618 + l = &ip_vs_svc_table[idx];
4619 + for (e=l->next; e!=l; e=e->next) {
4620 + svc = list_entry(e, struct ip_vs_service, s_list);
4622 + if (pos > offset) {
4623 + if (svc->flags & IP_VS_SVC_F_PERSISTENT)
4624 + sprintf(temp2, "persistent %d %08X",
4626 + ntohl(svc->netmask));
4630 + sprintf(temp, "%s %08X:%04X %s %s",
4631 + masq_proto_name(svc->protocol),
4634 + svc->scheduler->name, temp2);
4635 + len += sprintf(buf+len, "%-63s\n", temp);
4636 + if (len >= length)
4640 + p = &svc->destinations;
4641 + for (q=p->next; q!=p; q=q->next) {
4642 + dest = list_entry(q, struct ip_vs_dest, n_list);
4644 + if (pos <= offset)
4647 + " -> %08X:%04X %-7s %-6d %-10d %-10d",
4648 + ntohl(dest->addr),
4649 + ntohs(dest->port),
4650 + ip_vs_fwd_name(dest->masq_flags),
4652 + atomic_read(&dest->activeconns),
4653 + atomic_read(&dest->inactconns));
4654 + len += sprintf(buf+len, "%-63s\n", temp);
4655 + if (len >= length)
4661 + /* print the service table hashed by fwmark */
4662 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4663 + l = &ip_vs_svc_fwm_table[idx];
4664 + for (e=l->next; e!=l; e=e->next) {
4665 + svc = list_entry(e, struct ip_vs_service, f_list);
4667 + if (pos > offset) {
4668 + if (svc->flags & IP_VS_SVC_F_PERSISTENT)
4669 + sprintf(temp2, "persistent %d %08X",
4671 + ntohl(svc->netmask));
4675 + sprintf(temp, "FWM %08X %s %s",
4677 + svc->scheduler->name, temp2);
4678 + len += sprintf(buf+len, "%-63s\n", temp);
4679 + if (len >= length)
4683 + p = &svc->destinations;
4684 + for (q=p->next; q!=p; q=q->next) {
4685 + dest = list_entry(q, struct ip_vs_dest, n_list);
4687 + if (pos <= offset)
4690 + " -> %08X:%04X %-7s %-6d %-10d %-10d",
4691 + ntohl(dest->addr),
4692 + ntohs(dest->port),
4693 + ip_vs_fwd_name(dest->masq_flags),
4695 + atomic_read(&dest->activeconns),
4696 + atomic_read(&dest->inactconns));
4697 + len += sprintf(buf+len, "%-63s\n", temp);
4698 + if (len >= length)
4705 + read_unlock_bh(&__ip_vs_lock);
4707 + *start = buf+len-(pos-offset); /* Start of wanted data */
4716 +struct proc_dir_entry ip_vs_proc_entry = {
4717 + 0, /* dynamic inode */
4718 + 2, "vs", /* namelen and name */
4719 + S_IFREG | S_IRUGO, /* mode */
4720 + 1, 0, 0, 0, /* nlinks, owner, group, size */
4721 + &proc_net_inode_operations, /* operations */
4722 + NULL, /* get_info */
4723 + NULL, /* fill_inode */
4724 + NULL, NULL, NULL, /* next, parent, subdir */
4726 + &ip_vs_procinfo, /* function to generate proc data */
4731 + * Write the IPVS statistic information to a PROCfs file.
4733 +struct ip_vs_stats ip_vs_stats = {SPIN_LOCK_UNLOCKED, 0, 0};
4736 +ip_vs_stats_get_info(char *buf, char **start, off_t offset,
4737 + int length, int *eof, void *data)
4743 + struct ip_vs_service *svc;
4744 + struct ip_vs_dest *dest;
4745 + struct list_head *l, *e, *p, *q;
4748 + if (pos > offset) {
4749 + len += sprintf(buf+len, "%-63s\n",
4750 +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
4751 + "TotalConns InPkts OutPkts InBytes OutBytes");
4752 + spin_lock(&ip_vs_stats.lock);
4753 + sprintf(temp, " %8X %8X %8X %8X%08X %8X%08X",
4754 + ip_vs_stats.conns,
4755 + ip_vs_stats.inpkts,
4756 + ip_vs_stats.outpkts,
4757 + (__u32)(ip_vs_stats.inbytes >> 32),
4758 + (__u32)ip_vs_stats.inbytes,
4759 + (__u32)(ip_vs_stats.outbytes >> 32),
4760 + (__u32)ip_vs_stats.outbytes);
4761 + spin_unlock(&ip_vs_stats.lock);
4762 + len += sprintf(buf+len, "%-63s\n", temp);
4765 + read_lock_bh(&__ip_vs_lock);
4767 + /* print the service statistics */
4769 + if (pos > offset) {
4770 + len += sprintf(buf+len, "%-127s\n",
4771 + "\nVirtual Service\n"
4772 + "Pro VirtService Conns InPkts OutPkts InBytes OutBytes");
4775 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4776 + l = &ip_vs_svc_table[idx];
4777 + for (e=l->next; e!=l; e=e->next) {
4778 + svc = list_entry(e, struct ip_vs_service, s_list);
4780 + if (pos <= offset)
4782 + spin_lock(&svc->stats.lock);
4783 + sprintf(temp, "%3s %08X:%04X %8X %8X %8X %8X%08X %8X%08X",
4784 + masq_proto_name(svc->protocol),
4788 + svc->stats.inpkts,
4789 + svc->stats.outpkts,
4790 + (__u32)(svc->stats.inbytes >> 32),
4791 + (__u32)svc->stats.inbytes,
4792 + (__u32)(svc->stats.outbytes >> 32),
4793 + (__u32)svc->stats.outbytes);
4794 + spin_unlock(&svc->stats.lock);
4795 + len += sprintf(buf+len, "%-127s\n", temp);
4796 + if (pos >= offset+length)
4801 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4802 + l = &ip_vs_svc_fwm_table[idx];
4803 + for (e=l->next; e!=l; e=e->next) {
4804 + svc = list_entry(e, struct ip_vs_service, f_list);
4806 + if (pos <= offset)
4808 + spin_lock(&svc->stats.lock);
4809 + sprintf(temp, "FWM %08X %8X %8X %8X %8X%08X %8X%08X",
4812 + svc->stats.inpkts,
4813 + svc->stats.outpkts,
4814 + (__u32)(svc->stats.inbytes >> 32),
4815 + (__u32)svc->stats.inbytes,
4816 + (__u32)(svc->stats.outbytes >> 32),
4817 + (__u32)svc->stats.outbytes);
4818 + spin_unlock(&svc->stats.lock);
4819 + len += sprintf(buf+len, "%-127s\n", temp);
4820 + if (pos >= offset+length)
4825 + /* print the real server statistics */
4827 + if (pos > offset) {
4828 + len += sprintf(buf+len, "%-127s\n",
4829 + "\nReal Service\n"
4830 + "Pro VirtService RealService Conns InPkts OutPkts InBytes OutBytes");
4833 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4834 + l = &ip_vs_svc_table[idx];
4835 + for (e=l->next; e!=l; e=e->next) {
4836 + svc = list_entry(e, struct ip_vs_service, s_list);
4837 + p = &svc->destinations;
4838 + for (q=p->next; q!=p; q=q->next) {
4839 + dest = list_entry(q, struct ip_vs_dest, n_list);
4841 + if (pos <= offset)
4843 + spin_lock(&dest->stats.lock);
4845 + "%3s %08X:%04X %08X:%04X %8X %8X %8X %8X%08X %8X%08X",
4846 + masq_proto_name(svc->protocol),
4849 + ntohl(dest->addr),
4850 + ntohs(dest->port),
4851 + dest->stats.conns,
4852 + dest->stats.inpkts,
4853 + dest->stats.outpkts,
4854 + (__u32)(dest->stats.inbytes >> 32),
4855 + (__u32)dest->stats.inbytes,
4856 + (__u32)(dest->stats.outbytes >> 32),
4857 + (__u32)dest->stats.outbytes);
4858 + spin_unlock(&dest->stats.lock);
4859 + len += sprintf(buf+len, "%-127s\n", temp);
4860 + if (pos >= offset+length)
4866 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4867 + l = &ip_vs_svc_fwm_table[idx];
4868 + for (e=l->next; e!=l; e=e->next) {
4869 + svc = list_entry(e, struct ip_vs_service, f_list);
4870 + p = &svc->destinations;
4871 + for (q=p->next; q!=p; q=q->next) {
4872 + dest = list_entry(q,struct ip_vs_dest,n_list);
4874 + if (pos <= offset)
4876 + spin_lock(&dest->stats.lock);
4878 + "FWM %08X %08X:%04X %8X %8X %8X %8X%08X %8X%08X",
4880 + ntohl(dest->addr),
4881 + ntohs(dest->port),
4882 + dest->stats.conns,
4883 + dest->stats.inpkts,
4884 + dest->stats.outpkts,
4885 + (__u32)(dest->stats.inbytes >> 32),
4886 + (__u32)dest->stats.inbytes,
4887 + (__u32)(dest->stats.outbytes >> 32),
4888 + (__u32)dest->stats.outbytes);
4889 + spin_unlock(&dest->stats.lock);
4890 + len += sprintf(buf+len, "%-127s\n", temp);
4891 + if (pos >= offset+length)
4897 + read_unlock_bh(&__ip_vs_lock);
4899 + *start = buf+len-(pos-offset); /* Start of wanted data */
4908 +struct proc_dir_entry ip_vs_stat_proc_entry = {
4909 + 0, /* dynamic inode */
4910 + 8, "vs_stats", /* namelen and name */
4911 + S_IFREG | S_IRUGO, /* mode */
4912 + 1, 0, 0, 0, /* nlinks, owner, group, size */
4913 + &proc_net_inode_operations, /* operations */
4914 + NULL, /* get_info */
4915 + NULL, /* fill_inode */
4916 + NULL, NULL, NULL, /* next, parent, subdir */
4918 + &ip_vs_stats_get_info, /* function to generate proc data */
4925 + * This function encapsulates the packet in a new IP header, its destination
4926 + * will be set to the daddr. Most code of this function is from ipip.c.
4928 + * It is called in the ip_vs_forward() function. The load balancer
4929 + * selects a real server from a cluster based on a scheduling algorithm,
4930 + * encapsulates the packet and forwards it to the selected server. All real
4931 + * servers are configured with "ifconfig tunl0 <Virtual IP Address> up".
4932 + * When the server receives the encapsulated packet, it decapsulates the
4933 + * packet, processes the request and return the reply packets directly to
4934 + * the client without passing the load balancer. This can greatly
4935 + * increase the scalability of virtual server.
4937 + * if succeeded, return 1; otherwise, return 0.
4940 +int ip_vs_tunnel_xmit(struct sk_buff *skb, __u32 daddr)
4942 + struct rtable *rt; /* Route to the other host */
4943 + struct device *tdev; /* Device to other host */
4944 + struct iphdr *old_iph = skb->nh.iph;
4945 + u8 tos = old_iph->tos;
4946 + u16 df = old_iph->frag_off;
4947 + struct iphdr *iph; /* Our new IP header */
4948 + int max_headroom; /* The extra header space needed */
4953 + if (skb->protocol != __constant_htons(ETH_P_IP)) {
4954 + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): protocol error, ETH_P_IP: %d, skb protocol: %d\n",
4955 + __constant_htons(ETH_P_IP),skb->protocol);
4959 + if (ip_route_output(&rt, dst, src, RT_TOS(tos), 0)) {
4960 + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): route error, dest: "
4961 + "%u.%u.%u.%u\n", NIPQUAD(dst));
4962 + goto tx_error_icmp;
4964 + tdev = rt->u.dst.dev;
4966 + mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
4969 + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): mtu less than 68\n");
4972 + if (skb->dst && mtu < skb->dst->pmtu)
4973 + skb->dst->pmtu = mtu;
4975 + df |= (old_iph->frag_off&__constant_htons(IP_DF));
4977 + if ((old_iph->frag_off&__constant_htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
4978 + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
4980 + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): frag needed\n");
4984 + skb->h.raw = skb->nh.raw;
4987 + * Okay, now see if we can stuff it in the buffer as-is.
4989 + max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
4991 + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
4992 + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
4996 + IP_VS_ERR("ip_vs_tunnel_xmit(): no memory for new_skb\n");
5003 + skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
5004 + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
5005 + dst_release(skb->dst);
5006 + skb->dst = &rt->u.dst;
5009 + * Push down and install the IPIP header.
5012 + iph = skb->nh.iph;
5014 + iph->ihl = sizeof(struct iphdr)>>2;
5015 + iph->frag_off = df;
5016 + iph->protocol = IPPROTO_IPIP;
5018 + iph->daddr = rt->rt_dst;
5019 + iph->saddr = rt->rt_src;
5020 + iph->ttl = old_iph->ttl;
5021 + iph->tot_len = htons(skb->len);
5022 + iph->id = htons(ip_id_count++);
5023 + ip_send_check(iph);
5025 + IPCB(skb)->flags |= IPSKB_REDIRECTED;
5026 + IPCB(skb)->flags |= IPSKB_MASQUERADED;
5032 + dst_link_failure(skb);
5042 +int ip_vs_dr_xmit(struct sk_buff *skb, __u32 daddr)
5044 + struct rtable *rt; /* Route to the other host */
5045 + struct iphdr *iph = skb->nh.iph;
5046 + u8 tos = iph->tos;
5049 + if (ip_route_output(&rt, daddr, 0, RT_TOS(tos), 0)) {
5050 + IP_VS_DBG(0, "ip_vs_dr_xmit(): route error, dest: %u.%u.%u.%u\n",
5052 + goto tx_error_icmp;
5055 + /* MTU checking */
5056 + mtu = rt->u.dst.pmtu;
5057 + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
5058 + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
5060 + IP_VS_DBG(0, "ip_vs_dr_xmit(): frag needed\n");
5064 + dst_release(skb->dst);
5065 + skb->dst = &rt->u.dst;
5067 + IPCB(skb)->flags |= IPSKB_REDIRECTED;
5068 + IPCB(skb)->flags |= IPSKB_MASQUERADED;
5074 + dst_link_failure(skb);
5082 + * Initialize IP virtual server
5084 +__initfunc(int ip_vs_init(void))
5089 + * Allocate the ip_vs_table and initialize its list head.
5090 + * Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable,
5091 + * ip_vs_schedulers and ip_vs_dest_trash.
5093 + if (!(ip_vs_table =
5094 + vmalloc(IP_VS_TAB_SIZE*sizeof(struct list_head)))) {
5097 + for(idx = 0; idx < IP_VS_TAB_SIZE; idx++) {
5098 + INIT_LIST_HEAD(&ip_vs_table[idx]);
5100 + IP_VS_INFO("Connection hash table configured "
5101 + "(size=%d, memory=%ldKbytes)\n",
5103 + (long) (IP_VS_TAB_SIZE*sizeof(struct list_head))/1024);
5105 + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
5106 + INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
5107 + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
5109 + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
5110 + INIT_LIST_HEAD(&ip_vs_rtable[idx]);
5112 + INIT_LIST_HEAD(&ip_vs_schedulers);
5113 + INIT_LIST_HEAD(&ip_vs_dest_trash);
5116 + * Hook the slow_timer handler in the system timer.
5118 + slow_timer.expires = jiffies+SLTIMER_PERIOD;
5119 + add_timer(&slow_timer);
5121 +#ifdef CONFIG_PROC_FS
5122 + ip_masq_proc_register(&ip_vs_proc_entry);
5123 + ip_masq_proc_register(&ip_vs_stat_proc_entry);
5126 +#ifdef CONFIG_IP_MASQUERADE_VS_RR
5129 +#ifdef CONFIG_IP_MASQUERADE_VS_WRR
5132 +#ifdef CONFIG_IP_MASQUERADE_VS_LC
5135 +#ifdef CONFIG_IP_MASQUERADE_VS_WLC
5138 +#ifdef CONFIG_IP_MASQUERADE_VS_LBLC
5139 + ip_vs_lblc_init();
5141 +#ifdef CONFIG_IP_MASQUERADE_VS_LBLCR
5142 + ip_vs_lblcr_init();
5146 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lblc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblc.c
5147 --- linux-2.2.19/net/ipv4/ip_vs_lblc.c Thu Jan 1 08:00:00 1970
5148 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblc.c Fri Feb 2 18:49:08 2001
5151 + * IPVS: Locality-Based Least-Connection scheduling module
5155 + * Authors: Wensong Zhang <wensong@gnuchina.org>
5157 + * This program is free software; you can redistribute it and/or
5158 + * modify it under the terms of the GNU General Public License
5159 + * as published by the Free Software Foundation; either version
5160 + * 2 of the License, or (at your option) any later version.
5163 + * Martin Hamilton : fixed the terrible locking bugs
5164 + * *lock(tbl->lock) ==> *lock(&tbl->lock)
5165 + * Wensong Zhang : fixed the uninitialized tbl->lock bug
5166 + * Wensong Zhang : added doing full expiration check to
5167 + * collect stale entries of 24+ hours when
5168 + * no partial expire check in a half hour
5173 + * The lblc algorithm is as follows (pseudo code):
5175 + * if cachenode[dest_ip] is null then
5176 + * n, cachenode[dest_ip] <- {weighted least-conn node};
5178 + * n <- cachenode[dest_ip];
5179 + * if (n is dead) OR
5180 + * (n.conns>n.weight AND
5181 + * there is a node m with m.conns<m.weight/2) then
5182 + * n, cachenode[dest_ip] <- {weighted least-conn node};
5186 + * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
5187 + * me to write this module.
5190 +#include <linux/config.h>
5191 +#include <linux/module.h>
5193 +#include <linux/kmod.h>
5195 +#include <linux/types.h>
5196 +#include <linux/kernel.h>
5197 +#include <linux/errno.h>
5198 +#include <linux/vmalloc.h>
5199 +#include <net/ip_masq.h>
5200 +#ifdef CONFIG_IP_MASQUERADE_MOD
5201 +#include <net/ip_masq_mod.h>
5203 +#include <linux/sysctl.h>
5204 +#include <linux/proc_fs.h>
5205 +#include <linux/ip_fw.h>
5206 +#include <net/ip_vs.h>
5210 + * It is for garbage collection of stale IPVS lblc entries,
5211 + * when the table is full.
5213 +#define CHECK_EXPIRE_INTERVAL (60*HZ)
5214 +#define ENTRY_TIMEOUT (5*60*HZ)
5217 + * It is for full expiration check.
5218 + * When there is no partial expiration check (garbage collection)
5219 + * in a half hour, do a full expiration check to collect stale
5220 + * entries that haven't been touched for a day (by default).
5222 +#define COUNT_FOR_FULL_EXPIRATION 30
5223 +int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
5227 + * for IPVS lblc entry hash table
5229 +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
5230 +#define CONFIG_IP_VS_LBLC_TAB_BITS 10
5232 +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
5233 +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
5234 +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
5238 + * IPVS lblc entry represents an association between destination
5239 + * IP address and its destination server
5241 +struct ip_vs_lblc_entry {
5242 + struct list_head list;
5243 + __u32 addr; /* destination IP address */
5244 + struct ip_vs_dest *dest; /* real server (cache) */
5245 + unsigned long lastuse; /* last used time */
5250 + * IPVS lblc hash table
5252 +struct ip_vs_lblc_table {
5253 + rwlock_t lock; /* lock for this table */
5254 + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
5255 + atomic_t entries; /* number of entries */
5256 + int max_size; /* maximum size of entries */
5257 + struct timer_list periodic_timer; /* collect stale entries */
5258 + int rover; /* rover for expire check */
5259 + int counter; /* counter for no expire */
5265 + * IPVS LBLC sysctl table
5267 +struct ip_vs_lblc_sysctl_table {
5268 + struct ctl_table_header *sysctl_header;
5269 + ctl_table vs_vars[2];
5270 + ctl_table vs_dir[2];
5271 + ctl_table ipv4_dir[2];
5272 + ctl_table root_dir[2];
5276 +static struct ip_vs_lblc_sysctl_table lblc_sysctl_table = {
5278 + {{NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration",
5279 + &sysctl_ip_vs_lblc_expiration,
5280 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
5282 + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblc_sysctl_table.vs_vars},
5284 + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblc_sysctl_table.vs_dir},
5286 + {{CTL_NET, "net", NULL, 0, 0555, lblc_sysctl_table.ipv4_dir},
5292 + * new/free a ip_vs_lblc_entry, which is a mapping of a destination
5293 + * IP address to a server.
5295 +static inline struct ip_vs_lblc_entry *
5296 +ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
5298 + struct ip_vs_lblc_entry *en;
5300 + en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
5302 + IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
5306 + INIT_LIST_HEAD(&en->list);
5309 + atomic_inc(&dest->refcnt);
5316 +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
5318 + list_del(&en->list);
5319 + atomic_dec(&en->dest->refcnt);
5325 + * Returns hash value for IPVS LBLC entry
5327 +static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
5329 + return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
5334 + * Hash an entry in the ip_vs_lblc_table.
5335 + * returns bool success.
5338 +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
5342 + if (!list_empty(&en->list)) {
5343 + IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
5344 + "called from %p\n", __builtin_return_address(0));
5349 + * Hash by destination IP address
5351 + hash = ip_vs_lblc_hashkey(en->addr);
5353 + write_lock(&tbl->lock);
5354 + list_add(&en->list, &tbl->bucket[hash]);
5355 + atomic_inc(&tbl->entries);
5356 + write_unlock(&tbl->lock);
5364 + * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
5365 + * returns bool success.
5367 +static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
5368 + struct ip_vs_lblc_entry *en)
5370 + if (list_empty(&en->list)) {
5371 + IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
5372 + "called from %p\n", __builtin_return_address(0));
5377 + * Remove it from the table
5379 + write_lock(&tbl->lock);
5380 + list_del(&en->list);
5381 + INIT_LIST_HEAD(&en->list);
5382 + write_unlock(&tbl->lock);
5390 + * Get ip_vs_lblc_entry associated with supplied parameters.
5392 +static inline struct ip_vs_lblc_entry *
5393 +ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
5396 + struct ip_vs_lblc_entry *en;
5397 + struct list_head *l,*e;
5399 + hash = ip_vs_lblc_hashkey(addr);
5401 + read_lock(&tbl->lock);
5403 + l = &tbl->bucket[hash];
5404 + for (e=l->next; e!=l; e=e->next) {
5405 + en = list_entry(e, struct ip_vs_lblc_entry, list);
5406 + if (en->addr == addr) {
5408 + read_unlock(&tbl->lock);
5413 + read_unlock(&tbl->lock);
5420 + * Flush all the entries of the specified table.
5422 +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
5425 + struct list_head *l;
5426 + struct ip_vs_lblc_entry *en;
5428 + for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
5429 + write_lock(&tbl->lock);
5430 + for (l=&tbl->bucket[i]; l->next!=l; ) {
5431 + en = list_entry(l->next,
5432 + struct ip_vs_lblc_entry, list);
5433 + ip_vs_lblc_free(en);
5434 + atomic_dec(&tbl->entries);
5436 + write_unlock(&tbl->lock);
5441 +static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
5443 + unsigned long now = jiffies;
5445 + struct list_head *l, *e;
5446 + struct ip_vs_lblc_entry *en;
5448 + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
5449 + j = (j + 1) & IP_VS_LBLC_TAB_MASK;
5450 + e = l = &tbl->bucket[j];
5451 + write_lock(&tbl->lock);
5452 + while (e->next != l) {
5453 + en = list_entry(e->next,
5454 + struct ip_vs_lblc_entry, list);
5455 + if ((now - en->lastuse) <
5456 + sysctl_ip_vs_lblc_expiration) {
5460 + ip_vs_lblc_free(en);
5461 + atomic_dec(&tbl->entries);
5463 + write_unlock(&tbl->lock);
5470 + * Periodical timer handler for IPVS lblc table
5471 + * It is used to collect stale entries when the number of entries
5472 + * exceeds the maximum size of the table.
5474 + * Fixme: we probably need more complicated algorithm to collect
5475 + * entries that have not been used for a long time even
5476 + * if the number of entries doesn't exceed the maximum size
5478 + * The full expiration check is for this purpose now.
5480 +static void ip_vs_lblc_check_expire(unsigned long data)
5482 + struct ip_vs_lblc_table *tbl;
5483 + unsigned long now = jiffies;
5486 + struct list_head *l, *e;
5487 + struct ip_vs_lblc_entry *en;
5489 + tbl = (struct ip_vs_lblc_table *)data;
5491 + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
5492 + /* do full expiration check */
5493 + ip_vs_lblc_full_check(tbl);
5498 + if (atomic_read(&tbl->entries) < tbl->max_size) {
5503 + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
5504 + if (goal > tbl->max_size/2)
5505 + goal = tbl->max_size/2;
5507 + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
5508 + j = (j + 1) & IP_VS_LBLC_TAB_MASK;
5509 + e = l = &tbl->bucket[j];
5510 + write_lock(&tbl->lock);
5511 + while (e->next != l) {
5512 + en = list_entry(e->next,
5513 + struct ip_vs_lblc_entry, list);
5514 + if ((now - en->lastuse) < ENTRY_TIMEOUT) {
5518 + ip_vs_lblc_free(en);
5519 + atomic_dec(&tbl->entries);
5522 + write_unlock(&tbl->lock);
5529 + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
5533 +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
5536 + struct ip_vs_lblc_table *tbl;
5539 + * Allocate the ip_vs_lblc_table for this service
5541 + tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
5542 + if (tbl == NULL) {
5543 + IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
5546 + svc->sched_data = tbl;
5547 + IP_VS_DBG(0, "LBLC hash table (memory=%dbytes) allocated for "
5548 + "current service\n",
5549 + sizeof(struct ip_vs_lblc_table));
5552 + * Initialize the hash buckets
5554 + for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
5555 + INIT_LIST_HEAD(&tbl->bucket[i]);
5557 + tbl->lock = RW_LOCK_UNLOCKED;
5558 + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
5563 + * Hook periodic timer for garbage collection
5565 + init_timer(&tbl->periodic_timer);
5566 + tbl->periodic_timer.data = (unsigned long)tbl;
5567 + tbl->periodic_timer.function = ip_vs_lblc_check_expire;
5568 + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
5569 + add_timer(&tbl->periodic_timer);
5571 + MOD_INC_USE_COUNT;
5576 +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
5578 + struct ip_vs_lblc_table *tbl = svc->sched_data;
5580 + /* remove periodic timer */
5581 + del_timer(&tbl->periodic_timer);
5583 + /* got to clean up table entries here */
5584 + ip_vs_lblc_flush(tbl);
5586 + /* release the table itself */
5587 + kfree(svc->sched_data);
5588 + IP_VS_DBG(0, "LBLC hash table (memory=%dbytes) released\n",
5589 + sizeof(struct ip_vs_lblc_table));
5591 + MOD_DEC_USE_COUNT;
5596 +static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
5602 +static inline struct ip_vs_dest *
5603 +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
5605 + register struct list_head *l, *e;
5606 + struct ip_vs_dest *dest, *least;
5609 + l = &svc->destinations;
5614 + * We think the overhead of processing active connections is fifty
5615 + * times that of inactive connections on average. (This fifty
5616 + * times might be not accurate, we will change it later.) We use
5617 + * the following formula to estimate the overhead:
5618 + * dest->activeconns*50 + dest->inactconns
5620 + * (dest overhead) / dest->weight
5622 + * Remember -- no floats in kernel mode!!!
5623 + * The comparison of h1*w2 > h2*w1 is equivalent to that of
5625 + * if every weight is larger than zero.
5627 + * The server with weight=0 is quiesced and will not receive any
5631 + for (e=l->next; e!=l; e=e->next) {
5632 + least = list_entry(e, struct ip_vs_dest, n_list);
5633 + if (least->weight > 0) {
5634 + loh = atomic_read(&least->activeconns) * 50
5635 + + atomic_read(&least->inactconns);
5642 + * Find the destination with the least load.
5645 + for (e=e->next; e!=l; e=e->next)
5647 + dest = list_entry(e, struct ip_vs_dest, n_list);
5648 + doh = atomic_read(&dest->activeconns) * 50
5649 + + atomic_read(&dest->inactconns);
5650 + if (loh * dest->weight > doh * least->weight)
5657 + IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
5658 + "activeconns %d refcnt %d weight %d overhead %d\n",
5659 + NIPQUAD(least->addr), ntohs(least->port),
5660 + atomic_read(&least->activeconns),
5661 + atomic_read(&least->refcnt), least->weight, loh);
5668 + * If this destination server is overloaded and there is a less loaded
5669 + * server, then return true.
5672 +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
5674 + if (atomic_read(&dest->activeconns) > dest->weight) {
5675 + register struct list_head *l, *e;
5676 + struct ip_vs_dest *d;
5678 + l = &svc->destinations;
5679 + for (e=l->next; e!=l; e=e->next) {
5680 + d = list_entry(e, struct ip_vs_dest, n_list);
5681 + if (atomic_read(&d->activeconns)*2 < d->weight) {
5691 + * Locality-Based (weighted) Least-Connection scheduling
5693 +static struct ip_vs_dest *
5694 +ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
5696 + struct ip_vs_dest *dest;
5697 + struct ip_vs_lblc_table *tbl;
5698 + struct ip_vs_lblc_entry *en;
5700 + IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
5702 + tbl = (struct ip_vs_lblc_table *)svc->sched_data;
5703 + en = ip_vs_lblc_get(tbl, iph->daddr);
5705 + dest = __ip_vs_wlc_schedule(svc, iph);
5706 + if (dest == NULL) {
5707 + IP_VS_DBG(1, "no destination available\n");
5710 + en = ip_vs_lblc_new(iph->daddr, dest);
5714 + ip_vs_lblc_hash(tbl, en);
5717 + if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
5718 + || dest->weight <= 0
5719 + || is_overloaded(dest, svc)) {
5720 + dest = __ip_vs_wlc_schedule(svc, iph);
5721 + if (dest == NULL) {
5722 + IP_VS_DBG(1, "no destination available\n");
5725 + atomic_dec(&en->dest->refcnt);
5726 + atomic_inc(&dest->refcnt);
5730 + en->lastuse = jiffies;
5732 + IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
5733 + "--> server %u.%u.%u.%u:%d\n",
5734 + NIPQUAD(en->addr),
5735 + NIPQUAD(dest->addr),
5736 + ntohs(dest->port));
5742 +static struct ip_vs_scheduler ip_vs_lblc_scheduler =
5745 + "lblc", /* name */
5746 + ATOMIC_INIT(0), /* refcnt */
5747 + ip_vs_lblc_init_svc, /* service initializer */
5748 + ip_vs_lblc_done_svc, /* service done */
5749 + ip_vs_lblc_update_svc, /* service updater */
5750 + ip_vs_lblc_schedule, /* select a server from the destination list */
5754 +__initfunc(int ip_vs_lblc_init(void))
5756 + IP_VS_INFO("Initializing LBLC scheduling\n");
5757 + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
5758 + lblc_sysctl_table.sysctl_header =
5759 + register_sysctl_table(lblc_sysctl_table.root_dir, 0);
5760 + return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
5767 +int init_module(void)
5769 + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
5771 + /* module initialization by 'request_module' */
5772 + if (register_ip_vs_scheduler(&ip_vs_lblc_scheduler) != 0)
5775 + lblc_sysctl_table.sysctl_header =
5776 + register_sysctl_table(lblc_sysctl_table.root_dir, 0);
5778 + IP_VS_INFO("LBLC scheduling module loaded.\n");
5783 +void cleanup_module(void)
5785 + /* module cleanup by 'release_module' */
5786 + if (unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler) != 0) {
5787 + IP_VS_INFO("cannot remove LBLC scheduling module\n");
5789 + IP_VS_INFO("LBLC scheduling module unloaded.\n");
5791 + unregister_sysctl_table(lblc_sysctl_table.sysctl_header);
5794 +#endif /* MODULE */
5795 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lblcr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblcr.c
5796 --- linux-2.2.19/net/ipv4/ip_vs_lblcr.c Thu Jan 1 08:00:00 1970
5797 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblcr.c Tue Mar 27 17:37:00 2001
5800 + * IPVS: Locality-Based Least-Connection with Replication scheduler
5804 + * Authors: Wensong Zhang <wensong@gnuchina.org>
5806 + * This program is free software; you can redistribute it and/or
5807 + * modify it under the terms of the GNU General Public License
5808 + * as published by the Free Software Foundation; either version
5809 + * 2 of the License, or (at your option) any later version.
5812 + * Julian Anastasov : Added the missing (dest->weight>0)
5813 + * condition in the ip_vs_dest_set_max.
5818 + * The lblc/r algorithm is as follows (pseudo code):
5820 + * if serverSet[dest_ip] is null then
5821 + * n, serverSet[dest_ip] <- {weighted least-conn node};
5823 + * n <- {least-conn (alive) node in serverSet[dest_ip]};
5824 + * if (n is null) OR
5825 + * (n.conns>n.weight AND
5826 + * there is a node m with m.conns<m.weight/2) then
5827 + * n <- {weighted least-conn node};
5828 + * add n to serverSet[dest_ip];
5829 + * if |serverSet[dest_ip]| > 1 AND
5830 + * now - serverSet[dest_ip].lastMod > T then
5831 + * m <- {most conn node in serverSet[dest_ip]};
5832 + * remove m from serverSet[dest_ip];
5833 + * if serverSet[dest_ip] changed then
5834 + * serverSet[dest_ip].lastMod <- now;
5840 +#include <linux/config.h>
5841 +#include <linux/module.h>
5843 +#include <linux/kmod.h>
5845 +#include <linux/types.h>
5846 +#include <linux/kernel.h>
5847 +#include <linux/errno.h>
5848 +#include <linux/vmalloc.h>
5849 +#include <net/ip_masq.h>
5850 +#ifdef CONFIG_IP_MASQUERADE_MOD
5851 +#include <net/ip_masq_mod.h>
5853 +#include <linux/sysctl.h>
5854 +#include <linux/proc_fs.h>
5855 +#include <linux/ip_fw.h>
5856 +#include <net/ip_vs.h>
5860 + * It is for garbage collection of stale IPVS lblcr entries,
5861 + * when the table is full.
5863 +#define CHECK_EXPIRE_INTERVAL (60*HZ)
5864 +#define ENTRY_TIMEOUT (6*60*HZ)
5867 + * It is for full expiration check.
5868 + * When there is no partial expiration check (garbage collection)
5869 + * in a half hour, do a full expiration check to collect stale
5870 + * entries that haven't been touched for a day.
5872 +#define COUNT_FOR_FULL_EXPIRATION 30
5873 +int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
5877 + * for IPVS lblcr entry hash table
5879 +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
5880 +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
5882 +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
5883 +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
5884 +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
5888 + * IPVS destination set structure and operations
5890 +struct ip_vs_dest_list {
5891 + struct ip_vs_dest_list *next; /* list link */
5892 + struct ip_vs_dest *dest; /* destination server */
5895 +struct ip_vs_dest_set {
5896 + atomic_t size; /* set size */
5897 + unsigned long lastmod; /* last modified time */
5898 + struct ip_vs_dest_list *list; /* destination list */
5899 + rwlock_t lock; /* lock for this list */
5903 +static struct ip_vs_dest_list *
5904 +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
5906 + struct ip_vs_dest_list *e;
5908 + for (e=set->list; e!=NULL; e=e->next) {
5909 + if (e->dest == dest)
5910 + /* already existed */
5914 + e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
5916 + IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
5920 + atomic_inc(&dest->refcnt);
5923 + /* link it to the list */
5924 + write_lock(&set->lock);
5925 + if (set->list != NULL) {
5926 + e->next = set->list->next;
5932 + write_unlock(&set->lock);
5934 + atomic_inc(&set->size);
5935 + set->lastmod = jiffies;
5940 +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
5942 + struct ip_vs_dest_list *e, **ep;
5944 + write_lock(&set->lock);
5945 + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
5946 + if (e->dest == dest) {
5949 + atomic_dec(&set->size);
5950 + set->lastmod = jiffies;
5951 + atomic_dec(&e->dest->refcnt);
5957 + write_unlock(&set->lock);
5960 +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
5962 + struct ip_vs_dest_list *e, **ep;
5964 + write_lock(&set->lock);
5965 + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
5968 + * We don't kfree dest because it is referred to either
5969 + * by its service or by the trash dest list.
5971 + atomic_dec(&e->dest->refcnt);
5974 + write_unlock(&set->lock);
5977 +/* get weighted least-connection node in the destination set */
5978 +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
5980 + register struct ip_vs_dest_list *e;
5981 + struct ip_vs_dest *dest, *least;
5987 + read_lock(&set->lock);
5988 + /* select the first destination server, whose weight > 0 */
5989 + for (e=set->list; e!=NULL; e=e->next) {
5991 + if ((least->weight > 0)
5992 + && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
5993 + loh = atomic_read(&least->activeconns) * 50
5994 + + atomic_read(&least->inactconns);
5998 + read_unlock(&set->lock);
6001 + /* find the destination with the weighted least load */
6003 + for (e=e->next; e!=NULL; e=e->next) {
6005 + doh = atomic_read(&dest->activeconns) * 50
6006 + + atomic_read(&dest->inactconns);
6007 + if ((loh*dest->weight > doh*least->weight)
6008 + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
6013 + read_unlock(&set->lock);
6015 + IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
6016 + "activeconns %d refcnt %d weight %d overhead %d\n",
6017 + NIPQUAD(least->addr), ntohs(least->port),
6018 + atomic_read(&least->activeconns),
6019 + atomic_read(&least->refcnt), least->weight, loh);
6024 +/* get weighted most-connection node in the destination set */
6025 +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
6027 + register struct ip_vs_dest_list *e;
6028 + struct ip_vs_dest *dest, *most;
6034 + read_lock(&set->lock);
6035 + /* select the first destination server, whose weight > 0 */
6036 + for (e=set->list; e!=NULL; e=e->next) {
6038 + if (most->weight > 0) {
6039 + moh = atomic_read(&most->activeconns) * 50
6040 + + atomic_read(&most->inactconns);
6044 + read_unlock(&set->lock);
6047 + /* find the destination with the weighted most load */
6049 + for (e=e->next; e!=NULL; e=e->next) {
6051 + doh = atomic_read(&dest->activeconns) * 50
6052 + + atomic_read(&dest->inactconns);
6053 + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
6054 + if (moh*dest->weight < doh*most->weight
6055 + && dest->weight > 0) {
6060 + read_unlock(&set->lock);
6062 + IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
6063 + "activeconns %d refcnt %d weight %d overhead %d\n",
6064 + NIPQUAD(most->addr), ntohs(most->port),
6065 + atomic_read(&most->activeconns),
6066 + atomic_read(&most->refcnt), most->weight, moh);
6072 + * IPVS lblcr entry represents an association between destination
6073 + * IP address and its destination server set
6075 +struct ip_vs_lblcr_entry {
6076 + struct list_head list;
6077 + __u32 addr; /* destination IP address */
6078 + struct ip_vs_dest_set set; /* destination server set */
6079 + unsigned long lastuse; /* last used time */
6084 + * IPVS lblcr hash table
6086 +struct ip_vs_lblcr_table {
6087 + rwlock_t lock; /* lock for this table */
6088 + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
6089 + atomic_t entries; /* number of entries */
6090 + int max_size; /* maximum size of entries */
6091 + struct timer_list periodic_timer; /* collect stale entries */
6092 + int rover; /* rover for expire check */
6093 + int counter; /* counter for no expire */
6098 + * IPVS LBLCR sysctl table
6100 +struct ip_vs_lblcr_sysctl_table {
6101 + struct ctl_table_header *sysctl_header;
6102 + ctl_table vs_vars[2];
6103 + ctl_table vs_dir[2];
6104 + ctl_table ipv4_dir[2];
6105 + ctl_table root_dir[2];
6109 +static struct ip_vs_lblcr_sysctl_table lblcr_sysctl_table = {
6111 + {{NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration",
6112 + &sysctl_ip_vs_lblcr_expiration,
6113 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
6115 + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblcr_sysctl_table.vs_vars},
6117 + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblcr_sysctl_table.vs_dir},
6119 + {{CTL_NET, "net", NULL, 0, 0555, lblcr_sysctl_table.ipv4_dir},
6125 + * new/free a ip_vs_lblcr_entry, which is a mapping of a destination
6126 + * IP address to a server.
6128 +static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
6130 + struct ip_vs_lblcr_entry *en;
6132 + en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
6134 + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
6138 + INIT_LIST_HEAD(&en->list);
6141 + /* initialize its dest set */
6142 + atomic_set(&(en->set.size), 0);
6143 + en->set.list = NULL;
6144 + en->set.lock = RW_LOCK_UNLOCKED;
6150 +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
6152 + list_del(&en->list);
6153 + ip_vs_dest_set_eraseall(&en->set);
6159 + * Returns hash value for IPVS LBLCR entry
6161 +static inline unsigned ip_vs_lblcr_hashkey(__u32 addr)
6163 + return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
6168 + * Hash an entry in the ip_vs_lblcr_table.
6169 + * returns bool success.
6172 +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
6176 + if (!list_empty(&en->list)) {
6177 + IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
6178 + "called from %p\n", __builtin_return_address(0));
6183 + * Hash by destination IP address
6185 + hash = ip_vs_lblcr_hashkey(en->addr);
6187 + write_lock(&tbl->lock);
6188 + list_add(&en->list, &tbl->bucket[hash]);
6189 + atomic_inc(&tbl->entries);
6190 + write_unlock(&tbl->lock);
6198 + * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
6199 + * returns bool success.
6201 +static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
6202 + struct ip_vs_lblcr_entry *en)
6204 + if (list_empty(&en->list)) {
6205 + IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
6206 + "called from %p\n", __builtin_return_address(0));
6211 + * Remove it from the table
6213 + write_lock(&tbl->lock);
6214 + list_del(&en->list);
6215 + INIT_LIST_HEAD(&en->list);
6216 + write_unlock(&tbl->lock);
6224 + * Get ip_vs_lblcr_entry associated with supplied parameters.
6226 +static inline struct ip_vs_lblcr_entry *
6227 +ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr)
6230 + struct ip_vs_lblcr_entry *en;
6231 + struct list_head *l,*e;
6233 + hash = ip_vs_lblcr_hashkey(addr);
6234 + l = &tbl->bucket[hash];
6236 + read_lock(&tbl->lock);
6238 + for (e=l->next; e!=l; e=e->next) {
6239 + en = list_entry(e, struct ip_vs_lblcr_entry, list);
6240 + if (en->addr == addr) {
6242 + read_unlock(&tbl->lock);
6247 + read_unlock(&tbl->lock);
6254 + * Flush all the entries of the specified table.
6256 +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
6259 + struct list_head *l;
6260 + struct ip_vs_lblcr_entry *en;
6262 + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6263 + write_lock(&tbl->lock);
6264 + for (l=&tbl->bucket[i]; l->next!=l; ) {
6265 + en = list_entry(l->next,
6266 + struct ip_vs_lblcr_entry, list);
6267 + ip_vs_lblcr_free(en);
6268 + atomic_dec(&tbl->entries);
6270 + write_unlock(&tbl->lock);
6275 +static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
6277 + unsigned long now = jiffies;
6279 + struct list_head *l, *e;
6280 + struct ip_vs_lblcr_entry *en;
6282 + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6283 + j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
6284 + e = l = &tbl->bucket[j];
6285 + write_lock(&tbl->lock);
6286 + while (e->next != l) {
6287 + en = list_entry(e->next,
6288 + struct ip_vs_lblcr_entry, list);
6289 + if ((now - en->lastuse) <
6290 + sysctl_ip_vs_lblcr_expiration) {
6294 + ip_vs_lblcr_free(en);
6295 + atomic_dec(&tbl->entries);
6297 + write_unlock(&tbl->lock);
6304 + * Periodical timer handler for IPVS lblcr table
6305 + * It is used to collect stale entries when the number of entries
6306 + * exceeds the maximum size of the table.
6308 + * Fixme: we probably need more complicated algorithm to collect
6309 + * entries that have not been used for a long time even
6310 + * if the number of entries doesn't exceed the maximum size
6312 + * The full expiration check is for this purpose now.
6314 +static void ip_vs_lblcr_check_expire(unsigned long data)
6316 + struct ip_vs_lblcr_table *tbl;
6317 + unsigned long now = jiffies;
6320 + struct list_head *l, *e;
6321 + struct ip_vs_lblcr_entry *en;
6323 + tbl = (struct ip_vs_lblcr_table *)data;
6325 + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
6326 + /* do full expiration check */
6327 + ip_vs_lblcr_full_check(tbl);
6332 + if (atomic_read(&tbl->entries) < tbl->max_size) {
6337 + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
6338 + if (goal > tbl->max_size/2)
6339 + goal = tbl->max_size/2;
6341 + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6342 + j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
6343 + e = l = &tbl->bucket[j];
6344 + write_lock(&tbl->lock);
6345 + while (e->next != l) {
6346 + en = list_entry(e->next,
6347 + struct ip_vs_lblcr_entry, list);
6348 + if ((now - en->lastuse) < ENTRY_TIMEOUT) {
6352 + ip_vs_lblcr_free(en);
6353 + atomic_dec(&tbl->entries);
6356 + write_unlock(&tbl->lock);
6363 + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
6367 +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
6370 + struct ip_vs_lblcr_table *tbl;
6373 + * Allocate the ip_vs_lblcr_table for this service
6375 + tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
6376 + if (tbl == NULL) {
6377 + IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
6380 + svc->sched_data = tbl;
6381 + IP_VS_DBG(0, "LBLCR hash table (memory=%dbytes) allocated for "
6382 + "current service\n",
6383 + sizeof(struct ip_vs_lblcr_table));
6386 + * Initialize the hash buckets
6388 + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6389 + INIT_LIST_HEAD(&tbl->bucket[i]);
6391 + tbl->lock = RW_LOCK_UNLOCKED;
6392 + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
6397 + * Hook periodic timer for garbage collection
6399 + init_timer(&tbl->periodic_timer);
6400 + tbl->periodic_timer.data = (unsigned long)tbl;
6401 + tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
6402 + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
6403 + add_timer(&tbl->periodic_timer);
6405 + MOD_INC_USE_COUNT;
6410 +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
6412 + struct ip_vs_lblcr_table *tbl = svc->sched_data;
6414 + /* remove periodic timer */
6415 + del_timer(&tbl->periodic_timer);
6417 + /* got to clean up table entries here */
6418 + ip_vs_lblcr_flush(tbl);
6420 + /* release the table itself */
6421 + kfree(svc->sched_data);
6422 + IP_VS_DBG(0, "LBLCR hash table (memory=%dbytes) released\n",
6423 + sizeof(struct ip_vs_lblcr_table));
6425 + MOD_DEC_USE_COUNT;
6430 +static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
6436 +static inline struct ip_vs_dest *
6437 +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
6439 + register struct list_head *l, *e;
6440 + struct ip_vs_dest *dest, *least;
6443 + l = &svc->destinations;
6448 + * We think the overhead of processing active connections is fifty
6449 + * times higher than that of inactive connections on average. (This fifty
6450 + * times might be not accurate, we will change it later.) We use
6451 + * the following formula to estimate the overhead:
6452 + * dest->activeconns*50 + dest->inactconns
6454 + * (dest overhead) / dest->weight
6456 + * Remember -- no floats in kernel mode!!!
6457 + * The comparison of h1*w2 > h2*w1 is equivalent to that of
6459 + * if every weight is larger than zero.
6461 + * The server with weight=0 is quiesced and will not receive any
6465 + for (e=l->next; e!=l; e=e->next) {
6466 + least = list_entry(e, struct ip_vs_dest, n_list);
6467 + if (least->weight > 0) {
6468 + loh = atomic_read(&least->activeconns) * 50
6469 + + atomic_read(&least->inactconns);
6476 + * Find the destination with the least load.
6479 + for (e=e->next; e!=l; e=e->next) {
6480 + dest = list_entry(e, struct ip_vs_dest, n_list);
6481 + doh = atomic_read(&dest->activeconns) * 50
6482 + + atomic_read(&dest->inactconns);
6483 + if (loh*dest->weight > doh*least->weight) {
6489 + IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
6490 + "activeconns %d refcnt %d weight %d overhead %d\n",
6491 + NIPQUAD(least->addr), ntohs(least->port),
6492 + atomic_read(&least->activeconns),
6493 + atomic_read(&least->refcnt), least->weight, loh);
6500 + * If this destination server is overloaded and there is a less loaded
6501 + * server, then return true.
6504 +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
6506 + if (atomic_read(&dest->activeconns) > dest->weight) {
6507 + register struct list_head *l, *e;
6508 + struct ip_vs_dest *d;
6510 + l = &svc->destinations;
6511 + for (e=l->next; e!=l; e=e->next) {
6512 + d = list_entry(e, struct ip_vs_dest, n_list);
6513 + if (atomic_read(&d->activeconns)*2 < d->weight) {
6523 + * Locality-Based (weighted) Least-Connection scheduling
6525 +static struct ip_vs_dest *
6526 +ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
6528 + struct ip_vs_dest *dest;
6529 + struct ip_vs_lblcr_table *tbl;
6530 + struct ip_vs_lblcr_entry *en;
6532 + IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
6534 + tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
6535 + en = ip_vs_lblcr_get(tbl, iph->daddr);
6537 + dest = __ip_vs_wlc_schedule(svc, iph);
6538 + if (dest == NULL) {
6539 + IP_VS_DBG(1, "no destination available\n");
6542 + en = ip_vs_lblcr_new(iph->daddr);
6546 + ip_vs_dest_set_insert(&en->set, dest);
6547 + ip_vs_lblcr_hash(tbl, en);
6549 + dest = ip_vs_dest_set_min(&en->set);
6550 + if (!dest || is_overloaded(dest, svc)) {
6551 + dest = __ip_vs_wlc_schedule(svc, iph);
6552 + if (dest == NULL) {
6553 + IP_VS_DBG(1, "no destination available\n");
6556 + ip_vs_dest_set_insert(&en->set, dest);
6558 + if (atomic_read(&en->set.size) > 1 &&
6559 + jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
6560 + struct ip_vs_dest *m;
6561 + m = ip_vs_dest_set_max(&en->set);
6562 + if (m) ip_vs_dest_set_erase(&en->set, m);
6565 + en->lastuse = jiffies;
6567 + IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
6568 + "--> server %u.%u.%u.%u:%d\n",
6569 + NIPQUAD(en->addr),
6570 + NIPQUAD(dest->addr),
6571 + ntohs(dest->port));
6578 + * IPVS LBLCR Scheduler structure
6580 +static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
6583 + "lblcr", /* name */
6584 + ATOMIC_INIT(0), /* refcnt */
6585 + ip_vs_lblcr_init_svc, /* service initializer */
6586 + ip_vs_lblcr_done_svc, /* service done */
6587 + ip_vs_lblcr_update_svc, /* service updater */
6588 + ip_vs_lblcr_schedule, /* select a server from the destination list */
6592 +__initfunc(int ip_vs_lblcr_init(void))
6594 + IP_VS_INFO("Initializing LBLCR scheduling\n");
6595 + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
6596 + lblcr_sysctl_table.sysctl_header =
6597 + register_sysctl_table(lblcr_sysctl_table.root_dir, 0);
6598 + return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
6605 +int init_module(void)
6607 + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
6609 + /* module initialization by 'request_module' */
6610 + if (register_ip_vs_scheduler(&ip_vs_lblcr_scheduler) != 0)
6613 + lblcr_sysctl_table.sysctl_header =
6614 + register_sysctl_table(lblcr_sysctl_table.root_dir, 0);
6616 + IP_VS_INFO("LBLCR scheduling module loaded.\n");
6621 +void cleanup_module(void)
6623 + /* module cleanup by 'release_module' */
6624 + if (unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler) != 0) {
6625 + IP_VS_INFO("cannot remove LBLCR scheduling module\n");
6627 + IP_VS_INFO("LBLCR scheduling module unloaded.\n");
6629 + unregister_sysctl_table(lblcr_sysctl_table.sysctl_header);
6632 +#endif /* MODULE */
6633 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lc.c
6634 --- linux-2.2.19/net/ipv4/ip_vs_lc.c Thu Jan 1 08:00:00 1970
6635 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lc.c Fri Nov 24 10:02:53 2000
6638 + * IPVS: Least-Connection Scheduling module
6642 + * Authors: Wensong Zhang <wensong@iinchina.net>
6644 + * This program is free software; you can redistribute it and/or
6645 + * modify it under the terms of the GNU General Public License
6646 + * as published by the Free Software Foundation; either version
6647 + * 2 of the License, or (at your option) any later version.
6650 + * Wensong Zhang : added the ip_vs_lc_update_svc
6651 + * Wensong Zhang : added any dest with weight=0 is quiesced
6655 +#include <linux/config.h>
6656 +#include <linux/module.h>
6658 +#include <linux/kmod.h>
6660 +#include <linux/types.h>
6661 +#include <linux/kernel.h>
6662 +#include <linux/errno.h>
6663 +#include <net/ip_masq.h>
6664 +#ifdef CONFIG_IP_MASQUERADE_MOD
6665 +#include <net/ip_masq_mod.h>
6667 +#include <linux/ip_fw.h>
6668 +#include <net/ip_vs.h>
6671 +static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
6673 + MOD_INC_USE_COUNT;
6678 +static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
6680 + MOD_DEC_USE_COUNT;
6685 +static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
6692 + * Least Connection scheduling
6694 +static struct ip_vs_dest* ip_vs_lc_schedule(struct ip_vs_service *svc,
6695 + struct iphdr *iph)
6697 + struct list_head *l, *e;
6698 + struct ip_vs_dest *dest, *least;
6701 + IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
6703 + l = &svc->destinations;
6708 + * Simply select the server with the least number of
6709 + * (activeconns<<5) + inactconns
6710 + * Except whose weight is equal to zero.
6711 + * If the weight is equal to zero, it means that the server is
6712 + * quiesced, the existing connections to the server still get
6713 + * served, but no new connection is assigned to the server.
6716 + for (e=l->next; e!=l; e=e->next) {
6717 + least = list_entry (e, struct ip_vs_dest, n_list);
6718 + if (least->weight > 0) {
6719 + lac = (atomic_read(&least->activeconns) << 5)
6720 + + atomic_read(&least->inactconns);
6727 + * Find the destination with the least load.
6730 + for (e=e->next; e!=l; e=e->next) {
6731 + dest = list_entry(e, struct ip_vs_dest, n_list);
6732 + if (dest->weight == 0)
6734 + dac = (atomic_read(&dest->activeconns) << 5)
6735 + + atomic_read(&dest->inactconns);
6742 + IP_VS_DBG(6, "LC: server %d.%d.%d.%d:%d activeconns %d inactconns %d\n",
6743 + NIPQUAD(least->addr), ntohs(least->port),
6744 + atomic_read(&least->activeconns),
6745 + atomic_read(&least->inactconns));
6751 +static struct ip_vs_scheduler ip_vs_lc_scheduler = {
6754 + ATOMIC_INIT(0), /* refcnt */
6755 + ip_vs_lc_init_svc, /* service initializer */
6756 + ip_vs_lc_done_svc, /* service done */
6757 + ip_vs_lc_update_svc, /* service updater */
6758 + ip_vs_lc_schedule, /* select a server from the destination list */
6762 +__initfunc(int ip_vs_lc_init(void))
6764 + IP_VS_INFO("Initializing LC scheduling\n");
6765 + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
6766 + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
6773 +int init_module(void)
6775 + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
6777 + /* module initialization by 'request_module' */
6778 + if(register_ip_vs_scheduler(&ip_vs_lc_scheduler) != 0)
6781 + IP_VS_INFO("LC scheduling module loaded.\n");
6786 +void cleanup_module(void)
6788 + /* module cleanup by 'release_module' */
6789 + if(unregister_ip_vs_scheduler(&ip_vs_lc_scheduler) != 0)
6790 + IP_VS_INFO("cannot remove LC scheduling module\n");
6792 + IP_VS_INFO("LC scheduling module unloaded.\n");
6795 +#endif /* MODULE */
6796 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_rr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_rr.c
6797 --- linux-2.2.19/net/ipv4/ip_vs_rr.c Thu Jan 1 08:00:00 1970
6798 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_rr.c Fri Nov 24 10:04:12 2000
6801 + * IPVS: Round-Robin Scheduling module
6805 + * Authors: Wensong Zhang <wensong@iinchina.net>
6806 + * Peter Kese <peter.kese@ijs.si>
6808 + * This program is free software; you can redistribute it and/or
6809 + * modify it under the terms of the GNU General Public License
6810 + * as published by the Free Software Foundation; either version
6811 + * 2 of the License, or (at your option) any later version.
6814 + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
6815 + * Julian Anastasov : fixed the NULL pointer access bug in debugging
6816 + * Wensong Zhang : changed some cosmetic things for debugging
6817 + * Wensong Zhang : changed for the d-linked destination list
6818 + * Wensong Zhang : added the ip_vs_rr_update_svc
6819 + * Wensong Zhang : added any dest with weight=0 is quiesced
6823 +#include <linux/config.h>
6824 +#include <linux/module.h>
6826 +#include <linux/kmod.h>
6828 +#include <linux/types.h>
6829 +#include <linux/kernel.h>
6830 +#include <linux/errno.h>
6831 +#include <net/ip_masq.h>
6832 +#ifdef CONFIG_IP_MASQUERADE_MOD
6833 +#include <net/ip_masq_mod.h>
6835 +#include <linux/ip_fw.h>
6836 +#include <net/ip_vs.h>
6839 +static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
6841 + svc->sched_data = &svc->destinations;
6842 + MOD_INC_USE_COUNT;
6847 +static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
6849 + MOD_DEC_USE_COUNT;
6854 +static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
6856 + svc->sched_data = &svc->destinations;
6862 + * Round-Robin Scheduling
6864 +static struct ip_vs_dest* ip_vs_rr_schedule(struct ip_vs_service *svc,
6865 + struct iphdr *iph)
6867 + register struct list_head *p, *q;
6868 + struct ip_vs_dest *dest;
6870 + IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
6872 + p = (struct list_head *)svc->sched_data;
6876 + if (q == &svc->destinations) {
6880 + dest = list_entry(q, struct ip_vs_dest, n_list);
6881 + if (dest->weight > 0)
6889 + svc->sched_data = q;
6890 + IP_VS_DBG(6, "RR: server %d.%d.%d.%d:%d "
6891 + "activeconns %d refcnt %d weight %d\n",
6892 + NIPQUAD(dest->addr), ntohs(dest->port),
6893 + atomic_read(&dest->activeconns),
6894 + atomic_read(&dest->refcnt), dest->weight);
6900 +static struct ip_vs_scheduler ip_vs_rr_scheduler = {
6903 + ATOMIC_INIT(0), /* refcnt */
6904 + ip_vs_rr_init_svc, /* service initializer */
6905 + ip_vs_rr_done_svc, /* service done */
6906 + ip_vs_rr_update_svc, /* service updater */
6907 + ip_vs_rr_schedule, /* select a server from the destination list */
6911 +__initfunc(int ip_vs_rr_init(void))
6913 + IP_VS_INFO("Initializing RR scheduling\n");
6914 + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
6915 + return register_ip_vs_scheduler(&ip_vs_rr_scheduler) ;
6922 +int init_module(void)
6924 + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
6926 + /* module initialization by 'request_module' */
6927 + if(register_ip_vs_scheduler(&ip_vs_rr_scheduler) != 0)
6930 + IP_VS_INFO("RR scheduling module loaded.\n");
6935 +void cleanup_module(void)
6937 + /* module cleanup by 'release_module' */
6938 + if(unregister_ip_vs_scheduler(&ip_vs_rr_scheduler) != 0)
6939 + IP_VS_INFO("cannot remove RR scheduling module\n");
6941 + IP_VS_INFO("RR scheduling module unloaded.\n");
6944 +#endif /* MODULE */
6945 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_wlc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wlc.c
6946 --- linux-2.2.19/net/ipv4/ip_vs_wlc.c Thu Jan 1 08:00:00 1970
6947 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wlc.c Fri Nov 24 09:59:32 2000
6950 + * IPVS: Weighted Least-Connection Scheduling module
6954 + * Authors: Wensong Zhang <wensong@iinchina.net>
6955 + * Peter Kese <peter.kese@ijs.si>
6957 + * This program is free software; you can redistribute it and/or
6958 + * modify it under the terms of the GNU General Public License
6959 + * as published by the Free Software Foundation; either version
6960 + * 2 of the License, or (at your option) any later version.
6963 + * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
6964 + * Wensong Zhang : changed to use the inactconns in scheduling
6965 + * Wensong Zhang : changed some cosmetic things for debugging
6966 + * Wensong Zhang : changed for the d-linked destination list
6967 + * Wensong Zhang : added the ip_vs_wlc_update_svc
6968 + * Wensong Zhang : added any dest with weight=0 is quiesced
6972 +#include <linux/config.h>
6973 +#include <linux/module.h>
6975 +#include <linux/kmod.h>
6977 +#include <linux/types.h>
6978 +#include <linux/kernel.h>
6979 +#include <linux/errno.h>
6980 +#include <net/ip_masq.h>
6981 +#ifdef CONFIG_IP_MASQUERADE_MOD
6982 +#include <net/ip_masq_mod.h>
6984 +#include <linux/ip_fw.h>
6985 +#include <net/ip_vs.h>
6989 +ip_vs_wlc_init_svc(struct ip_vs_service *svc)
6991 + MOD_INC_USE_COUNT;
6997 +ip_vs_wlc_done_svc(struct ip_vs_service *svc)
6999 + MOD_DEC_USE_COUNT;
7005 +ip_vs_wlc_update_svc(struct ip_vs_service *svc)
7012 + * Weighted Least Connection scheduling
7014 +static struct ip_vs_dest *
7015 +ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
7017 + register struct list_head *l, *e;
7018 + struct ip_vs_dest *dest, *least;
7021 + IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
7023 + l = &svc->destinations;
7028 + * We think the overhead of processing active connections is fifty
7029 + * times higher than that of inactive connections on average. (This fifty
7030 + * times might be not accurate, we will change it later.) We use
7031 + * the following formula to estimate the overhead:
7032 + * dest->activeconns*50 + dest->inactconns
7034 + * (dest overhead) / dest->weight
7036 + * Remember -- no floats in kernel mode!!!
7037 + * The comparison of h1*w2 > h2*w1 is equivalent to that of
7039 + * if every weight is larger than zero.
7041 + * The server with weight=0 is quiesced and will not receive any
7045 + for (e=l->next; e!=l; e=e->next) {
7046 + least = list_entry(e, struct ip_vs_dest, n_list);
7047 + if (least->weight > 0) {
7048 + loh = atomic_read(&least->activeconns) * 50
7049 + + atomic_read(&least->inactconns);
7056 + * Find the destination with the least load.
7059 + for (e=e->next; e!=l; e=e->next) {
7060 + dest = list_entry(e, struct ip_vs_dest, n_list);
7061 + doh = atomic_read(&dest->activeconns) * 50
7062 + + atomic_read(&dest->inactconns);
7063 + if (loh * dest->weight > doh * least->weight) {
7069 + IP_VS_DBG(6, "WLC: server %d.%d.%d.%d:%d "
7070 + "activeconns %d refcnt %d weight %d overhead %d\n",
7071 + NIPQUAD(least->addr), ntohs(least->port),
7072 + atomic_read(&least->activeconns),
7073 + atomic_read(&least->refcnt), least->weight, loh);
7079 +static struct ip_vs_scheduler ip_vs_wlc_scheduler =
7083 + ATOMIC_INIT (0), /* refcnt */
7084 + ip_vs_wlc_init_svc, /* service initializer */
7085 + ip_vs_wlc_done_svc, /* service done */
7086 + ip_vs_wlc_update_svc, /* service updater */
7087 + ip_vs_wlc_schedule, /* select a server from the destination list */
7091 +__initfunc(int ip_vs_wlc_init (void))
7093 + IP_VS_INFO("Initializing WLC scheduling\n");
7094 + INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
7095 + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
7102 +int init_module(void)
7104 + INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
7106 + /* module initialization by 'request_module' */
7107 + if (register_ip_vs_scheduler(&ip_vs_wlc_scheduler) != 0)
7110 + IP_VS_INFO("WLC scheduling module loaded.\n");
7115 +void cleanup_module(void)
7117 + /* module cleanup by 'release_module' */
7118 + if (unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler) != 0)
7119 + IP_VS_INFO("cannot remove WLC scheduling module\n");
7121 + IP_VS_INFO("WLC scheduling module unloaded.\n");
7124 +#endif /* MODULE */
7125 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_wrr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wrr.c
7126 --- linux-2.2.19/net/ipv4/ip_vs_wrr.c Thu Jan 1 08:00:00 1970
7127 +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wrr.c Fri Nov 24 09:57:23 2000
7130 + * IPVS: Weighted Round-Robin Scheduling module
7134 + * Authors: Wensong Zhang <wensong@iinchina.net>
7136 + * This program is free software; you can redistribute it and/or
7137 + * modify it under the terms of the GNU General Public License
7138 + * as published by the Free Software Foundation; either version
7139 + * 2 of the License, or (at your option) any later version.
7142 + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
7143 + * Wensong Zhang : changed some comestics things for debugging
7144 + * Wensong Zhang : changed for the d-linked destination list
7145 + * Wensong Zhang : added the ip_vs_wrr_update_svc
7146 + * Julian Anastasov : return -ENOMEM instead of ENOMEM in the
7147 + * ip_vs_wrr_init_svc
7148 + * Julian Anastasov : fixed the bug of returning destination
7149 + * with weight 0 when all weights are zero
7153 +#include <linux/config.h>
7154 +#include <linux/module.h>
7156 +#include <linux/kmod.h>
7158 +#include <linux/types.h>
7159 +#include <linux/kernel.h>
7160 +#include <linux/errno.h>
7161 +#include <net/ip_masq.h>
7162 +#ifdef CONFIG_IP_MASQUERADE_MOD
7163 +#include <net/ip_masq_mod.h>
7165 +#include <linux/ip_fw.h>
7166 +#include <net/ip_vs.h>
7169 + * current destination pointer for weighted round-robin scheduling
7171 +struct ip_vs_wrr_mark {
7172 + struct list_head *cl; /* current list head */
7173 + int cw; /* current weight */
7177 +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
7180 + * Allocate the mark variable for WRR scheduling
7182 + svc->sched_data = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
7184 + if (svc->sched_data == NULL) {
7185 + IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
7188 + memset(svc->sched_data, 0, sizeof(struct ip_vs_wrr_mark));
7190 + ((struct ip_vs_wrr_mark*)svc->sched_data)->cl = &svc->destinations;
7192 + MOD_INC_USE_COUNT;
7197 +static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
7200 + * Release the mark variable
7202 + kfree_s(svc->sched_data, sizeof(struct ip_vs_wrr_mark));
7204 + MOD_DEC_USE_COUNT;
7209 +static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
7211 + ((struct ip_vs_wrr_mark*)svc->sched_data)->cl = &svc->destinations;
7217 + * Get the maximum weight of the service destinations.
7219 +int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
7221 + register struct list_head *l, *e;
7222 + struct ip_vs_dest *dest;
7225 + l = &svc->destinations;
7226 + for (e=l->next; e!=l; e=e->next) {
7227 + dest = list_entry(e, struct ip_vs_dest, n_list);
7228 + if (dest->weight > weight)
7229 + weight = dest->weight;
7237 + * Weighted Round-Robin Scheduling
7239 +static struct ip_vs_dest* ip_vs_wrr_schedule(struct ip_vs_service *svc,
7240 + struct iphdr *iph)
7242 + struct ip_vs_dest *dest;
7243 + struct ip_vs_wrr_mark *mark = svc->sched_data;
7245 + IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
7248 + * This loop will always terminate, because 0<mark->cw<max_weight,
7249 + * and at least one server has its weight equal to max_weight.
7252 + if (mark->cl == &svc->destinations) {
7253 + /* it is at the head of the destination list */
7255 + if (mark->cl == mark->cl->next)
7256 + /* no dest entry */
7259 + mark->cl = svc->destinations.next;
7261 + if (mark->cw <= 0) {
7262 + mark->cw = ip_vs_wrr_max_weight(svc);
7264 + * Still zero, which means no available servers.
7266 + if (mark->cw == 0) {
7267 + mark->cl = &svc->destinations;
7268 + IP_VS_INFO("ip_vs_wrr_schedule(): "
7269 + "no available servers\n");
7274 + else mark->cl = mark->cl->next;
7276 + if (mark->cl != &svc->destinations) {
7277 + /* not at the head of the list */
7278 + dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
7279 + if (dest->weight >= mark->cw)
7284 + IP_VS_DBG(6, "WRR: server %d.%d.%d.%d:%d "
7285 + "activeconns %d refcnt %d weight %d\n",
7286 + NIPQUAD(dest->addr), ntohs(dest->port),
7287 + atomic_read(&dest->activeconns),
7288 + atomic_read(&dest->refcnt), dest->weight);
7294 +static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
7297 + ATOMIC_INIT(0), /* refcnt */
7298 + ip_vs_wrr_init_svc, /* service initializer */
7299 + ip_vs_wrr_done_svc, /* service done */
7300 + ip_vs_wrr_update_svc, /* service updater */
7301 + ip_vs_wrr_schedule, /* select a server from the destination list */
7305 +__initfunc(int ip_vs_wrr_init(void))
7307 + IP_VS_INFO("Initializing WRR scheduling\n");
7308 + INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
7309 + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
7315 +int init_module(void)
7317 + INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
7319 + /* module initialization by 'request_module' */
7320 + if(register_ip_vs_scheduler(&ip_vs_wrr_scheduler) != 0)
7323 + IP_VS_INFO("WRR scheduling module loaded.\n");
7328 +void cleanup_module(void)
7330 + /* module cleanup by 'release_module' */
7331 + if(unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler) != 0)
7332 + IP_VS_INFO("cannot remove WRR scheduling module\n");
7334 + IP_VS_INFO("WRR scheduling module unloaded.\n");
7337 +#endif /* MODULE */
7338 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/sysctl_net_ipv4.c linux-2.2.19-vs-1.0.8/net/ipv4/sysctl_net_ipv4.c
7339 --- linux-2.2.19/net/ipv4/sysctl_net_ipv4.c Tue Mar 27 09:33:49 2001
7340 +++ linux-2.2.19-vs-1.0.8/net/ipv4/sysctl_net_ipv4.c Tue Mar 27 09:32:21 2001
7342 struct ipv4_config ipv4_config;
7344 extern ctl_table ipv4_route_table[];
7345 +#ifdef CONFIG_IP_MASQUERADE_VS
7346 +extern ctl_table ipv4_vs_table[];
7349 #ifdef CONFIG_SYSCTL
7351 @@ -198,7 +201,10 @@
7352 {NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships",
7353 &sysctl_igmp_max_memberships, sizeof(int), 0644, NULL, &proc_dointvec},
7355 +#ifdef CONFIG_IP_MASQUERADE_VS
7356 + {NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table},
7362 #endif /* CONFIG_SYSCTL */