1 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/Documentation/Configure.help linux-2.2.19-vs-1.0.7/Documentation/Configure.help
2 --- linux-2.2.19/Documentation/Configure.help Tue Mar 27 09:33:35 2001
3 +++ linux-2.2.19-vs-1.0.7/Documentation/Configure.help Tue Mar 27 09:32:02 2001
4 @@ -2807,6 +2807,118 @@
5 The module will be called ip_masq_markfw.o. If you want to compile
6 it as a module, say M here and read Documentation/modules.txt.
8 +IP: masquerading virtual server support
9 +CONFIG_IP_MASQUERADE_VS
10 + IP Virtual Server support will let you build a virtual server
11 + based on cluster of two or more real servers. This option must
12 + be enabled for at least one of the clustered computers that will
13 + take care of intercepting incoming connections to a single IP
14 + address and scheduling them to real servers.
16 + Three request dispatching techniques are implemented, they are
17 + virtual server via NAT, virtual server via tunneling and virtual
18 + server via direct routing. The round-robin scheduling, the weighted
19 + round-robin scheduling, the weighted least-connection scheduling,
20 + the locality-based least-connection scheduling, or the
21 + locality-based least-connection with replication scheduling
22 + algorithm can be used to choose which server the connection is
23 + directed to, thus load balancing can be achieved among the servers.
24 + For more information and its administration program, please visit
27 + http://www.linuxvirtualserver.org/
28 + If you want this, say Y.
30 +IP virtual server debugging
32 + Say Y here if you want to get additional messages useful in
33 + debugging the IP virtual server code. You can change the debug
34 + level in /proc/sys/net/ipv4/vs/debug_level
36 +IP masquerading VS table size (the Nth power of 2)
37 +CONFIG_IP_MASQUERADE_VS_TAB_BITS
38 + Using a big ipvs hash table for virtual server will greatly reduce
39 + conflicts in the ipvs hash table when there are hundreds of thousands
40 + of active connections.
42 + Note the table size must be power of 2. The table size will be the
43 + value of 2 raised to the power of your input number. For example, the default
44 + number is 12, so the table size is 4096. Don't input the number too
45 + small, otherwise you will lose performance on it. You can adapt the
46 + table size yourself, according to your virtual server application. It
47 + is good to set the table size not far less than the number of
48 + connections per second multiplying average lasting time of connection
49 + in the table. For example, your virtual server gets 200 connections
50 + per second, the connection lasts for 200 seconds in average in the
51 + masquerading table, the table size should be not far less than
52 + 200x200, it is good to set the table size 32768 (2**15).
54 + Note also that each connection occupies 128 bytes effectively and
55 + each hash entry uses 8 bytes, so you can estimate how much memory is
56 + needed for your box.
58 +IPVS: round-robin scheduling
59 +CONFIG_IP_MASQUERADE_VS_RR
60 + The round-robin scheduling algorithm simply directs network
61 + connections to different real servers in a round-robin manner.
62 + If you want to compile it in kernel, say Y. If you want to compile
63 + it as a module, say M here and read Documentation/modules.txt.
65 +IPVS: weighted round-robin scheduling
66 +CONFIG_IP_MASQUERADE_VS_WRR
67 + The weighted round-robin scheduling algorithm directs network
68 + connections to different real servers based on server weights
69 + in a round-robin manner. Servers with higher weights receive
70 + new connections before those with lower weights, and servers
71 + with higher weights get more connections than those with less
72 + weights and servers with equal weights get equal connections.
73 + If you want to compile it in kernel, say Y. If you want to compile
74 + it as a module, say M here and read Documentation/modules.txt.
76 +IPVS: least-connection scheduling
77 +CONFIG_IP_MASQUERADE_VS_LC
78 + The least-connection scheduling algorithm directs network
79 + connections to the server with the least number of active
81 + If you want to compile it in kernel, say Y. If you want to compile
82 + it as a module, say M here and read Documentation/modules.txt.
84 +IPVS: weighted least-connection scheduling
85 +CONFIG_IP_MASQUERADE_VS_WLC
86 + The weighted least-connection scheduling algorithm directs network
87 + connections to the server with the least active connections
88 + normalized by the server weight.
89 + If you want to compile it in kernel, say Y. If you want to compile
90 + it as a module, say M here and read Documentation/modules.txt.
92 +IPVS: locality-based least-connection scheduling
93 +CONFIG_IP_MASQUERADE_VS_LBLC
94 + The locality-based least-connection scheduling algorithm is for
95 + destination IP load balancing. It is usually used in cache cluster.
96 + This algorithm usually directs packet destined for an IP address to
97 + its server if the server is alive and under load. If the server is
98 + overloaded (its active connection number is larger than its weight)
99 + and there is a server at half of its load, then allocate the weighted
100 + least-connection server to this IP address.
101 + If you want to compile it in kernel, say Y. If you want to compile
102 + it as a module, say M here and read Documentation/modules.txt.
104 +IPVS: locality-based least-connection with replication scheduling
105 +CONFIG_IP_MASQUERADE_VS_LBLCR
106 + The locality-based least-connection with replication scheduling
107 + algorithm is also for destination IP load balancing. It is
108 + usually used in cache cluster. It differs from the LBLC scheduling
109 + as follows: the load balancer maintains mappings from a target
110 + to a set of server nodes that can serve the target. Requests for
111 + a target are assigned to the least-connection node in the target's
112 + server set. If all the nodes in the server set are overloaded,
113 + it picks up a least-connection node in the cluster and adds it
114 + to the server set for the target. If the server set has not been
115 + modified for the specified time, the most loaded node is removed
116 + from the server set, in order to avoid high degree of replication.
117 + If you want to compile it in kernel, say Y. If you want to compile
118 + it as a module, say M here and read Documentation/modules.txt.
122 Sometimes it is useful to give several IP addresses to a single
123 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/linux/ip_masq.h linux-2.2.19-vs-1.0.7/include/linux/ip_masq.h
124 --- linux-2.2.19/include/linux/ip_masq.h Sat Oct 23 17:02:32 1999
125 +++ linux-2.2.19-vs-1.0.7/include/linux/ip_masq.h Tue Dec 12 19:17:27 2000
128 #define IP_MASQ_MFW_SCHED 0x01
131 + * Virtual server stuff
134 + /* virtual service options */
135 + u_int16_t protocol;
136 + u_int32_t vaddr; /* virtual address */
138 + u_int32_t vfwmark; /* firewall mark of virtual */
139 + unsigned vs_flags; /* virtual service flags */
140 + unsigned timeout; /* persistent timeout in ticks */
141 + u_int32_t netmask; /* persistent netmask */
143 + /* destination specific options */
144 + u_int32_t daddr; /* real destination address */
146 + unsigned masq_flags; /* destination flags */
147 + int weight; /* destination weight */
151 #define IP_FW_MASQCTL_MAX 256
152 #define IP_MASQ_TNAME_MAX 32
155 struct ip_autofw_user autofw_user;
156 struct ip_mfw_user mfw_user;
157 struct ip_masq_user user;
158 + struct ip_vs_user vs_user;
159 unsigned char m_raw[IP_FW_MASQCTL_MAX];
163 #define IP_MASQ_TARGET_CORE 1
164 #define IP_MASQ_TARGET_MOD 2 /* masq_mod is selected by "name" */
165 #define IP_MASQ_TARGET_USER 3
166 -#define IP_MASQ_TARGET_LAST 4
167 +#define IP_MASQ_TARGET_VS 4
168 +#define IP_MASQ_TARGET_LAST 5
171 #define IP_MASQ_CMD_NONE 0 /* just peek */
172 #define IP_MASQ_CMD_INSERT 1
174 #define IP_MASQ_CMD_LIST 7 /* actually fake: done via /proc */
175 #define IP_MASQ_CMD_ENABLE 8
176 #define IP_MASQ_CMD_DISABLE 9
177 +#define IP_MASQ_CMD_ADD_DEST 10 /* for adding dest in IPVS */
178 +#define IP_MASQ_CMD_DEL_DEST 11 /* for deleting dest in IPVS */
179 +#define IP_MASQ_CMD_SET_DEST 12 /* for setting dest in IPVS */
181 #endif /* _LINUX_IP_MASQ_H */
183 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/linux/sysctl.h linux-2.2.19-vs-1.0.7/include/linux/sysctl.h
184 --- linux-2.2.19/include/linux/sysctl.h Tue Mar 27 09:33:48 2001
185 +++ linux-2.2.19-vs-1.0.7/include/linux/sysctl.h Tue Mar 27 09:32:20 2001
189 NET_IPV4_FIB_HASH=19,
192 NET_IPV4_TCP_TIMESTAMPS=33,
193 NET_IPV4_TCP_WINDOW_SCALING=34,
195 NET_IPV4_CONF_LOG_MARTIANS=11,
196 NET_IPV4_CONF_HIDDEN=12,
197 NET_IPV4_CONF_ARPFILTER=13
200 +/* /proc/sys/net/ipv4/vs */
204 + NET_IPV4_VS_AMEMTHRESH=1,
205 + NET_IPV4_VS_AMDROPRATE=2,
206 + NET_IPV4_VS_DROP_ENTRY=3,
207 + NET_IPV4_VS_DROP_PACKET=4,
208 + NET_IPV4_VS_SECURE_TCP=5,
209 + NET_IPV4_VS_TO_ES=6,
210 + NET_IPV4_VS_TO_SS=7,
211 + NET_IPV4_VS_TO_SR=8,
212 + NET_IPV4_VS_TO_FW=9,
213 + NET_IPV4_VS_TO_TW=10,
214 + NET_IPV4_VS_TO_CL=11,
215 + NET_IPV4_VS_TO_CW=12,
216 + NET_IPV4_VS_TO_LA=13,
217 + NET_IPV4_VS_TO_LI=14,
218 + NET_IPV4_VS_TO_SA=15,
219 + NET_IPV4_VS_TO_UDP=16,
220 + NET_IPV4_VS_TO_ICMP=17,
221 + NET_IPV4_VS_DEBUG_LEVEL=18,
222 + NET_IPV4_VS_LBLC_EXPIRE=19,
223 + NET_IPV4_VS_LBLCR_EXPIRE=20,
226 /* /proc/sys/net/ipv6 */
227 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip.h linux-2.2.19-vs-1.0.7/include/net/ip.h
228 --- linux-2.2.19/include/net/ip.h Tue Mar 27 09:33:48 2001
229 +++ linux-2.2.19-vs-1.0.7/include/net/ip.h Tue Mar 27 17:48:23 2001
231 #define IPSKB_MASQUERADED 1
232 #define IPSKB_TRANSLATED 2
233 #define IPSKB_FORWARDED 4
234 +#ifdef CONFIG_IP_MASQUERADE_VS
235 +#define IPSKB_REDIRECTED 8
240 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip_masq.h linux-2.2.19-vs-1.0.7/include/net/ip_masq.h
241 --- linux-2.2.19/include/net/ip_masq.h Tue Mar 27 09:33:48 2001
242 +++ linux-2.2.19-vs-1.0.7/include/net/ip_masq.h Wed Apr 18 16:17:59 2001
244 #include <linux/ip.h>
245 #include <linux/skbuff.h>
246 #include <linux/list.h>
247 +#ifdef CONFIG_SYSCTL
248 +#include <linux/sysctl.h>
250 #endif /* __KERNEL__ */
252 +#ifdef CONFIG_IP_MASQUERADE_VS
257 * This define affects the number of ports that can be handled
258 * by each of the protocol helper modules.
260 #define IP_MASQ_MOD_CTL 0x00
261 #define IP_MASQ_USER_CTL 0x01
265 -#define IP_MASQ_TAB_SIZE 256
267 #define IP_MASQ_F_NO_DADDR 0x0001 /* no daddr yet */
268 #define IP_MASQ_F_NO_DPORT 0x0002 /* no dport set yet */
269 #define IP_MASQ_F_NO_SADDR 0x0004 /* no sport set yet */
271 #define IP_MASQ_F_USER 0x2000 /* from uspace */
272 #define IP_MASQ_F_SIMPLE_HASH 0x8000 /* prevent s+d and m+d hashing */
274 +#ifdef CONFIG_IP_MASQUERADE_VS
275 +#define IP_MASQ_F_VS 0x00010000 /* virtual server related */
276 +#define IP_MASQ_F_VS_NO_OUTPUT 0x00020000 /* output packets avoid masq */
277 +#define IP_MASQ_F_VS_INACTIVE 0x00040000 /* not established */
278 +#define IP_MASQ_F_VS_FWD_MASK 0x00700000 /* mask for the fwd method */
279 +#define IP_MASQ_F_VS_LOCALNODE 0x00100000 /* local node destination */
280 +#define IP_MASQ_F_VS_TUNNEL 0x00200000 /* packets will be tunneled */
281 +#define IP_MASQ_F_VS_DROUTE 0x00400000 /* direct routing */
282 + /* masquerading otherwise */
283 +#define IP_MASQ_VS_FWD(ms) (ms->flags & IP_MASQ_F_VS_FWD_MASK)
284 +#endif /* CONFIG_IP_MASQUERADE_VS */
288 +#define IP_MASQ_TAB_SIZE 256
291 * Delta seq. info structure
292 * Each MASQ struct has 2 (output AND input seq. changes).
294 struct ip_masq *control; /* Master control connection */
295 atomic_t n_control; /* Number of "controlled" masqs */
296 unsigned flags; /* status flags */
297 - unsigned timeout; /* timeout */
298 + unsigned long timeout; /* timeout */
299 unsigned state; /* state info */
300 struct ip_masq_timeout_table *timeout_table;
301 +#ifdef CONFIG_IP_MASQUERADE_VS
302 + struct ip_vs_dest *dest; /* real server */
303 + atomic_t in_pkts; /* incoming packet counter */
304 +#endif /* CONFIG_IP_MASQUERADE_VS */
309 extern struct list_head ip_masq_d_table[IP_MASQ_TAB_SIZE];
310 extern const char * ip_masq_state_name(int state);
311 extern struct ip_masq_hook *ip_masq_user_hook;
312 -extern u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope);
313 +extern int ip_masq_select_addr(struct sk_buff *skb,__u32 *maddr);
316 * IP_MASQ_APP: IP application masquerading definitions
318 static const char *strProt[] = {"UDP","TCP","ICMP"};
319 int msproto = masq_proto_num(proto);
321 +#ifdef CONFIG_IP_MASQUERADE_VS
322 + if (proto == IPPROTO_IP)
324 +#endif /* CONFIG_IP_MASQUERADE_VS */
325 if (msproto<0||msproto>2) {
326 sprintf(buf, "IP_%d", proto);
329 IP_MASQ_S_CLOSE_WAIT,
332 +#ifdef CONFIG_IP_MASQUERADE_VS
342 + ms->timeout_table = NULL;
343 atomic_dec(&mstim->refcnt);
346 +#ifdef CONFIG_IP_MASQUERADE_VS
348 +extern struct ip_masq_timeout_table masq_timeout_table_dos;
349 +extern void ip_masq_secure_tcp_set(int on);
352 + * This is a simple mechanism to ignore packets when
353 + * we are loaded. Just set ip_masq_drop_rate to 'n' and
354 + * we start to drop 1/n of the packets
357 +extern int ip_masq_drop_rate;
358 +extern int ip_masq_drop_counter;
360 +static __inline__ int ip_masq_todrop(void)
362 + if (!ip_masq_drop_rate) return 0;
363 + if (--ip_masq_drop_counter > 0) return 0;
364 + ip_masq_drop_counter = ip_masq_drop_rate;
368 +#endif /* CONFIG_IP_MASQUERADE_VS */
370 #endif /* __KERNEL__ */
372 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip_vs.h linux-2.2.19-vs-1.0.7/include/net/ip_vs.h
373 --- linux-2.2.19/include/net/ip_vs.h Thu Jan 1 08:00:00 1970
374 +++ linux-2.2.19-vs-1.0.7/include/net/ip_vs.h Thu Apr 19 22:33:09 2001
377 + * IP virtual server
378 + * data structure and functionality definitions
381 +#include <linux/config.h>
386 +#define IP_VS_VERSION_CODE 0x010007
387 +#define NVERSION(version) \
388 + (version >> 16) & 0xFF, \
389 + (version >> 8) & 0xFF, \
393 + * Virtual Service Flags
395 +#define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */
396 +#define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */
399 + * Destination Server Flags
401 +#define IP_VS_DEST_F_AVAILABLE 0x0001 /* Available tag */
404 + * The default IP_VS_TEMPLATE_TIMEOUT is a little larger than average
405 + * connection time plus MASQUERADE_EXPIRE_TCP_FIN(2*60*HZ). Because the
406 + * template won't be released until its controlled masq entries are
408 + * If IP_VS_TEMPLATE_TIMEOUT is too small, the template will soon expire
409 + * and will be put in expire again and again, which requires additional
410 + * overhead. If it is too large, the same client will always visit the same
411 + * server, which will make dynamic load imbalance worse.
413 +#define IP_VS_TEMPLATE_TIMEOUT 6*60*HZ
417 +extern int ip_vs_forwarding_related_icmp(struct sk_buff *skb);
419 +#ifdef CONFIG_IP_VS_DEBUG
420 +extern int ip_vs_get_debug_level(void);
421 +#define IP_VS_DBG(level, msg...) \
423 + if (level <= ip_vs_get_debug_level()) \
424 + printk(KERN_DEBUG "IPVS: " ## msg); \
426 +#else /* NO DEBUGGING at ALL */
427 +#define IP_VS_DBG(level, msg...) do {} while (0)
430 +#define IP_VS_ERR(msg...) printk(KERN_ERR "IPVS: " ## msg )
431 +#define IP_VS_INFO(msg...) printk(KERN_INFO "IPVS: " ## msg )
432 +#define IP_VS_WARNING(msg...) \
433 + printk(KERN_WARNING "IPVS: " ## msg)
435 +#ifdef CONFIG_IP_VS_DEBUG
436 +#define EnterFunction(level) \
438 + if (level <= ip_vs_get_debug_level()) \
439 + printk(KERN_DEBUG "Enter: %s, %s line %i\n", \
440 + __FUNCTION__, __FILE__, __LINE__); \
442 +#define LeaveFunction(level) \
444 + if (level <= ip_vs_get_debug_level()) \
445 + printk(KERN_DEBUG "Leave: %s, %s line %i\n", \
446 + __FUNCTION__, __FILE__, __LINE__); \
449 +#define EnterFunction(level) do {} while (0)
450 +#define LeaveFunction(level) do {} while (0)
455 + * IPVS statistics object
459 + spinlock_t lock; /* spin lock */
460 + __u32 conns; /* connections scheduled */
461 + __u32 inpkts; /* incoming packets */
462 + __u32 outpkts; /* outgoing packets */
463 + __u64 inbytes; /* incoming bytes */
464 + __u64 outbytes; /* outgoing bytes */
469 + * The real server destination forwarding entry
470 + * with ip address, port
473 + struct list_head n_list; /* for the dests in the service */
474 + struct list_head d_list; /* for table with all the dests */
476 + __u32 addr; /* IP address of real server */
477 + __u16 port; /* port number of the service */
478 + unsigned flags; /* dest status flags */
479 + unsigned masq_flags; /* flags to copy to masq */
480 + atomic_t activeconns; /* active connections */
481 + atomic_t inactconns; /* inactive connections */
482 + atomic_t refcnt; /* reference counter */
483 + int weight; /* server weight */
484 + struct ip_vs_stats stats; /* statistics */
486 + /* for virtual service */
487 + struct ip_vs_service *svc; /* service that it belongs to */
488 + __u16 protocol; /* which protocol (TCP/UDP) */
489 + __u32 vaddr; /* IP address for virtual service */
490 + __u16 vport; /* port number for the service */
491 + __u32 vfwmark; /* firewall mark of the service */
496 + * The scheduler object
498 +struct ip_vs_scheduler {
499 + struct list_head n_list; /* d-linked list head */
500 + char *name; /* scheduler name */
501 + atomic_t refcnt; /* reference counter */
503 + /* scheduler initializing service */
504 + int (*init_service)(struct ip_vs_service *svc);
505 + /* scheduling service finish */
506 + int (*done_service)(struct ip_vs_service *svc);
507 + /* scheduler updating service */
508 + int (*update_service)(struct ip_vs_service *svc);
510 + /* selecting a server from the given service */
511 + struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc,
512 + struct iphdr *iph);
517 + * The information about the virtual service offered to the net
518 + * and the forwarding entries
520 +struct ip_vs_service {
521 + struct list_head s_list; /* hashed d-linked list head */
522 + struct list_head f_list; /* hashed d-linked list head */
523 + __u16 protocol; /* which protocol (TCP/UDP) */
524 + __u32 addr; /* IP address for virtual service */
525 + __u16 port; /* port number for the service */
526 + __u32 fwmark; /* firewall mark of the service */
527 + unsigned flags; /* service status flags */
528 + unsigned timeout; /* persistent timeout in ticks */
529 + __u32 netmask; /* grouping granularity */
530 + struct list_head destinations; /* real server d-linked list */
531 + struct ip_vs_scheduler *scheduler; /* bound scheduler object */
532 + void *sched_data; /* scheduler application data */
533 + struct ip_vs_stats stats; /* statistics for the service */
538 + * IP Virtual Server masq entry hash table
540 +#define IP_VS_TAB_BITS CONFIG_IP_MASQUERADE_VS_TAB_BITS
541 +#define IP_VS_TAB_SIZE (1 << IP_VS_TAB_BITS)
542 +#define IP_VS_TAB_MASK (IP_VS_TAB_SIZE - 1)
543 +extern struct list_head *ip_vs_table;
546 + * Hash and unhash functions
548 +extern int ip_vs_hash(struct ip_masq *ms);
549 +extern int ip_vs_unhash(struct ip_masq *ms);
552 + * Registering/unregistering scheduler functions
554 +extern int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
555 +extern int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
558 + * Lookup functions for the hash table (caller must lock table)
560 +extern struct ip_masq * __ip_vs_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port);
561 +extern struct ip_masq * __ip_vs_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port);
564 + * Creating a masquerading entry for IPVS
566 +extern struct ip_masq * ip_masq_new_vs(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned flags);
569 + * IPVS data and functions
571 +extern rwlock_t __ip_vs_lock;
573 +extern void ip_vs_set_state(struct ip_masq *ms, int new_state);
574 +extern void ip_vs_bind_masq(struct ip_masq *ms, struct ip_vs_dest *dest);
575 +extern void ip_vs_unbind_masq(struct ip_masq *ms);
577 +extern int ip_vs_ctl(int optname, struct ip_masq_ctl *mctl, int optlen);
578 +extern struct ip_vs_service *
579 +ip_vs_lookup_service(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport);
580 +extern struct ip_vs_service * ip_vs_lookup_svc_fwm(__u32 fwmark);
581 +extern struct ip_vs_dest *
582 +__ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport);
583 +extern struct ip_vs_dest *ip_vs_lookup_dest(struct ip_vs_service *svc,
584 + __u32 daddr, __u16 dport);
585 +extern struct ip_masq * ip_vs_schedule(struct ip_vs_service *svc,
586 + struct iphdr *iph);
587 +extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb);
588 +extern int ip_vs_tunnel_xmit(struct sk_buff *skb, __u32 daddr);
589 +extern int ip_vs_dr_xmit(struct sk_buff *skb, __u32 daddr);
594 +extern int ip_vs_init(void);
597 + * init function prototypes for scheduling modules
598 + * these function will be called when they are built in kernel
600 +extern int ip_vs_rr_init(void);
601 +extern int ip_vs_wrr_init(void);
602 +extern int ip_vs_lc_init(void);
603 +extern int ip_vs_wlc_init(void);
604 +extern int ip_vs_lblc_init(void);
605 +extern int ip_vs_lblcr_init(void);
609 + * Slow timer functions for IPVS
611 +extern void add_sltimer(struct timer_list * timer);
612 +extern int del_sltimer(struct timer_list * timer);
613 +extern void mod_sltimer(struct timer_list *timer, unsigned long expires);
617 + * IP Virtual Server statistics
619 +extern struct ip_vs_stats ip_vs_stats;
621 +extern __inline__ void
622 +ip_vs_in_stats(struct ip_masq *ms, struct sk_buff *skb)
624 + struct ip_vs_dest *dest = ms->dest;
625 + read_lock(&__ip_vs_lock);
626 + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
627 + spin_lock(&dest->stats.lock);
628 + dest->stats.inpkts++;
629 + dest->stats.inbytes += skb->len;
630 + spin_unlock(&dest->stats.lock);
632 + spin_lock(&dest->svc->stats.lock);
633 + dest->svc->stats.inpkts++;
634 + dest->svc->stats.inbytes += skb->len;
635 + spin_unlock(&dest->svc->stats.lock);
637 + spin_lock(&ip_vs_stats.lock);
638 + ip_vs_stats.inpkts++;
639 + ip_vs_stats.inbytes += skb->len;
640 + spin_unlock(&ip_vs_stats.lock);
642 + read_unlock(&__ip_vs_lock);
646 +extern __inline__ void
647 +ip_vs_out_stats(struct ip_masq *ms, struct sk_buff *skb)
649 + struct ip_vs_dest *dest = ms->dest;
650 + read_lock(&__ip_vs_lock);
651 + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
652 + spin_lock(&dest->stats.lock);
653 + dest->stats.outpkts++;
654 + dest->stats.outbytes += skb->len;
655 + spin_unlock(&dest->stats.lock);
657 + spin_lock(&dest->svc->stats.lock);
658 + dest->svc->stats.outpkts++;
659 + dest->svc->stats.outbytes += skb->len;
660 + spin_unlock(&dest->svc->stats.lock);
662 + spin_lock(&ip_vs_stats.lock);
663 + ip_vs_stats.outpkts++;
664 + ip_vs_stats.outbytes += skb->len;
665 + spin_unlock(&ip_vs_stats.lock);
667 + read_unlock(&__ip_vs_lock);
671 +extern __inline__ void
672 +ip_vs_conn_stats(struct ip_masq *ms, struct ip_vs_service *svc)
674 + spin_lock(&ms->dest->stats.lock);
675 + ms->dest->stats.conns++;
676 + spin_unlock(&ms->dest->stats.lock);
678 + spin_lock(&svc->stats.lock);
679 + svc->stats.conns++;
680 + spin_unlock(&svc->stats.lock);
682 + spin_lock(&ip_vs_stats.lock);
683 + ip_vs_stats.conns++;
684 + spin_unlock(&ip_vs_stats.lock);
689 + * ip_vs_fwd_tag returns the forwarding tag of the masq
691 +extern __inline__ char ip_vs_fwd_tag(struct ip_masq *ms)
695 + switch (IP_MASQ_VS_FWD(ms)) {
696 + case IP_MASQ_F_VS_LOCALNODE: fwd = 'L'; break;
697 + case IP_MASQ_F_VS_TUNNEL: fwd = 'T'; break;
698 + case IP_MASQ_F_VS_DROUTE: fwd = 'R'; break;
704 +extern __inline__ char * ip_vs_fwd_name(unsigned masq_flags)
708 + switch (masq_flags & IP_MASQ_F_VS_FWD_MASK) {
709 + case IP_MASQ_F_VS_LOCALNODE:
712 + case IP_MASQ_F_VS_TUNNEL:
715 + case IP_MASQ_F_VS_DROUTE:
726 + * ip_vs_forward forwards the packet through tunneling, direct
727 + * routing or local node (passing to the upper layer).
728 + * Return values mean:
729 + * 0 skb must be passed to the upper layer
730 + * -1 skb must be released
731 + * -2 skb has been released
733 +extern __inline__ int ip_vs_forward(struct sk_buff *skb, struct ip_masq *ms)
737 + atomic_inc(&ms->in_pkts);
739 + switch (IP_MASQ_VS_FWD(ms)) {
740 + case IP_MASQ_F_VS_TUNNEL:
741 + if (ip_vs_tunnel_xmit(skb, ms->saddr) == 0) {
742 + IP_VS_DBG(10, "tunneling failed.\n");
744 + IP_VS_DBG(10, "tunneling succeeded.\n");
749 + case IP_MASQ_F_VS_DROUTE:
750 + if (ip_vs_dr_xmit(skb, ms->saddr) == 0) {
751 + IP_VS_DBG(10, "direct routing failed.\n");
753 + IP_VS_DBG(10, "direct routing succeeded.\n");
758 + case IP_MASQ_F_VS_LOCALNODE:
765 +#endif /* __KERNEL__ */
767 +#endif /* _IP_VS_H */
768 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/Config.in linux-2.2.19-vs-1.0.7/net/ipv4/Config.in
769 --- linux-2.2.19/net/ipv4/Config.in Sat Dec 16 23:10:12 2000
770 +++ linux-2.2.19-vs-1.0.7/net/ipv4/Config.in Tue Dec 12 18:35:06 2000
772 tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW
773 tristate 'IP: ip fwmark masq-forwarding support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_MFW
775 + bool 'IP: masquerading virtual server support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_VS
776 + if [ "$CONFIG_IP_MASQUERADE_VS" = "y" ]; then
777 + bool ' IP virtual server debugging' CONFIG_IP_VS_DEBUG
778 + int ' IP masquerading VS table size (the Nth power of 2)' CONFIG_IP_MASQUERADE_VS_TAB_BITS 12
779 + tristate ' IPVS: round-robin scheduling' CONFIG_IP_MASQUERADE_VS_RR
780 + tristate ' IPVS: weighted round-robin scheduling' CONFIG_IP_MASQUERADE_VS_WRR
781 + tristate ' IPVS: least-connection scheduling' CONFIG_IP_MASQUERADE_VS_LC
782 + tristate ' IPVS: weighted least-connection scheduling' CONFIG_IP_MASQUERADE_VS_WLC
783 + tristate ' IPVS: locality-based least-connection scheduling' CONFIG_IP_MASQUERADE_VS_LBLC
784 + tristate ' IPVS: locality-based least-connection with replication scheduling' CONFIG_IP_MASQUERADE_VS_LBLCR
789 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/Makefile linux-2.2.19-vs-1.0.7/net/ipv4/Makefile
790 --- linux-2.2.19/net/ipv4/Makefile Tue Jan 5 07:31:34 1999
791 +++ linux-2.2.19-vs-1.0.7/net/ipv4/Makefile Sat Dec 2 22:32:10 2000
796 +ifeq ($(CONFIG_IP_MASQUERADE_VS),y)
797 + IPV4X_OBJS += ip_vs.o
799 + ifeq ($(CONFIG_IP_MASQUERADE_VS_RR),y)
800 + IPV4_OBJS += ip_vs_rr.o
802 + ifeq ($(CONFIG_IP_MASQUERADE_VS_RR),m)
803 + M_OBJS += ip_vs_rr.o
807 + ifeq ($(CONFIG_IP_MASQUERADE_VS_WRR),y)
808 + IPV4_OBJS += ip_vs_wrr.o
810 + ifeq ($(CONFIG_IP_MASQUERADE_VS_WRR),m)
811 + M_OBJS += ip_vs_wrr.o
815 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LC),y)
816 + IPV4_OBJS += ip_vs_lc.o
818 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LC),m)
819 + M_OBJS += ip_vs_lc.o
823 + ifeq ($(CONFIG_IP_MASQUERADE_VS_WLC),y)
824 + IPV4_OBJS += ip_vs_wlc.o
826 + ifeq ($(CONFIG_IP_MASQUERADE_VS_WLC),m)
827 + M_OBJS += ip_vs_wlc.o
831 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLC),y)
832 + IPV4_OBJS += ip_vs_lblc.o
834 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLC),m)
835 + M_OBJS += ip_vs_lblc.o
839 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLCR),y)
840 + IPV4_OBJS += ip_vs_lblcr.o
842 + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLCR),m)
843 + M_OBJS += ip_vs_lblcr.o
848 M_OBJS += ip_masq_user.o
849 M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o ip_masq_quake.o
850 M_OBJS += ip_masq_vdolive.o ip_masq_cuseeme.o
851 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_forward.c linux-2.2.19-vs-1.0.7/net/ipv4/ip_forward.c
852 --- linux-2.2.19/net/ipv4/ip_forward.c Fri Jan 7 09:45:02 2000
853 +++ linux-2.2.19-vs-1.0.7/net/ipv4/ip_forward.c Fri Feb 2 15:38:28 2001
855 #include <linux/ip_fw.h>
856 #ifdef CONFIG_IP_MASQUERADE
857 #include <net/ip_masq.h>
858 +#ifdef CONFIG_IP_MASQUERADE_VS
859 +#include <net/ip_vs.h>
862 #include <net/checksum.h>
863 #include <linux/route.h>
868 +#ifdef CONFIG_IP_MASQUERADE_VS
869 + if (iph->protocol == IPPROTO_ICMP &&
870 + !(IPCB(skb)->flags&IPSKB_MASQUERADED)) {
871 + /* Related ICMP packet for IPVS ? */
872 + fw_res = ip_vs_forwarding_related_icmp(skb);
873 + if (fw_res > 0) return ip_local_deliver(skb);
877 #ifdef CONFIG_IP_TRANSPARENT_PROXY
879 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_input.c linux-2.2.19-vs-1.0.7/net/ipv4/ip_input.c
880 --- linux-2.2.19/net/ipv4/ip_input.c Tue Mar 27 09:33:49 2001
881 +++ linux-2.2.19-vs-1.0.7/net/ipv4/ip_input.c Tue Mar 27 09:32:21 2001
887 +#ifdef CONFIG_IP_MASQUERADE_VS
888 + if((IPCB(skb)->flags&IPSKB_REDIRECTED)) {
889 + printk(KERN_DEBUG "ip_input(): ipvs recursion detected. Check ipvs configuration\n");
896 * Some masq modules can re-inject packets if
901 ret = ip_fw_demasquerade(&skb);
902 +#ifdef CONFIG_IP_MASQUERADE_VS
904 + /* skb has already been released */
911 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_masq.c linux-2.2.19-vs-1.0.7/net/ipv4/ip_masq.c
912 --- linux-2.2.19/net/ipv4/ip_masq.c Tue Mar 27 09:33:49 2001
913 +++ linux-2.2.19-vs-1.0.7/net/ipv4/ip_masq.c Wed Apr 18 19:58:48 2001
915 * Kai Bankett : do not toss other IP protos in proto_doff()
916 * Dan Kegel : pointed correct NAT behavior for UDP streams
917 * Julian Anastasov : use daddr and dport as hash keys
919 + * Wensong Zhang : Added virtual server support
920 + * Peter Kese : added masq TCP state handling for input-only
921 + * Julian Anastasov : step to mSR after SYN in INPUT_ONLY table
922 + * Julian Anastasov : fixed huge expire bug for IPVS after bad checksum
923 + * Wensong Zhang : added server status checking for IPVS
927 #include <linux/config.h>
929 #include <linux/ip_fw.h>
930 #include <linux/ip_masq.h>
932 +#ifdef CONFIG_IP_MASQUERADE_VS
933 +#include <net/ip_vs.h>
934 +#endif /* CONFIG_IP_MASQUERADE_VS */
936 int sysctl_ip_masq_debug = 0;
937 int sysctl_ip_masq_udp_dloose = 0;
941 struct ip_masq_hook *ip_masq_user_hook = NULL;
943 +#ifdef CONFIG_IP_MASQUERADE_VS
945 + * Use different state/timeout tables
947 +#ifndef IP_MASQ_MANY_STATE_TABLES
948 +#define IP_MASQ_MANY_STATE_TABLES
951 +int ip_masq_drop_rate = 0;
952 +int ip_masq_drop_counter = 0;
956 +#ifndef CONFIG_IP_MASQUERADE_VS
959 * Timeout table[state]
961 @@ -106,38 +130,104 @@
962 ATOMIC_INIT(0), /* refcnt */
965 - 30*60*HZ, /* IP_MASQ_S_NONE, */
966 - 15*60*HZ, /* IP_MASQ_S_ESTABLISHED, */
967 - 2*60*HZ, /* IP_MASQ_S_SYN_SENT, */
968 - 1*60*HZ, /* IP_MASQ_S_SYN_RECV, */
969 - 2*60*HZ, /* IP_MASQ_S_FIN_WAIT, */
970 - 2*60*HZ, /* IP_MASQ_S_TIME_WAIT, */
971 - 10*HZ, /* IP_MASQ_S_CLOSE, */
972 - 60*HZ, /* IP_MASQ_S_CLOSE_WAIT, */
973 - 30*HZ, /* IP_MASQ_S_LAST_ACK, */
974 - 2*60*HZ, /* IP_MASQ_S_LISTEN, */
975 - 5*60*HZ, /* IP_MASQ_S_UDP, */
976 - 1*60*HZ, /* IP_MASQ_S_ICMP, */
977 - 2*HZ,/* IP_MASQ_S_LAST */
978 + [IP_MASQ_S_NONE] = 30*60*HZ,
979 + [IP_MASQ_S_ESTABLISHED] = 15*60*HZ,
980 + [IP_MASQ_S_SYN_SENT] = 2*60*HZ,
981 + [IP_MASQ_S_SYN_RECV] = 1*60*HZ,
982 + [IP_MASQ_S_FIN_WAIT] = 2*60*HZ,
983 + [IP_MASQ_S_TIME_WAIT] = 2*60*HZ,
984 + [IP_MASQ_S_CLOSE] = 10*HZ,
985 + [IP_MASQ_S_CLOSE_WAIT] = 60*HZ,
986 + [IP_MASQ_S_LAST_ACK] = 30*HZ,
987 + [IP_MASQ_S_LISTEN] = 2*60*HZ,
988 + [IP_MASQ_S_UDP] = 5*60*HZ,
989 + [IP_MASQ_S_ICMP] = 1*60*HZ,
990 + [IP_MASQ_S_LAST] = 2*HZ,
994 +#else /* CONFIG_IP_MASQUERADE_VS */
997 + * Timeout table[state]
999 +/* static int masq_timeout_table[IP_MASQ_S_LAST+1] = { */
1000 +static struct ip_masq_timeout_table masq_timeout_table = {
1001 + ATOMIC_INIT(0), /* refcnt */
1004 + [IP_MASQ_S_NONE] = 30*60*HZ,
1005 + [IP_MASQ_S_ESTABLISHED] = 15*60*HZ,
1006 + [IP_MASQ_S_SYN_SENT] = 2*60*HZ,
1007 + [IP_MASQ_S_SYN_RECV] = 1*60*HZ,
1008 + [IP_MASQ_S_FIN_WAIT] = 2*60*HZ,
1009 + [IP_MASQ_S_TIME_WAIT] = 2*60*HZ,
1010 + [IP_MASQ_S_CLOSE] = 10*HZ,
1011 + [IP_MASQ_S_CLOSE_WAIT] = 60*HZ,
1012 + [IP_MASQ_S_LAST_ACK] = 30*HZ,
1013 + [IP_MASQ_S_LISTEN] = 2*60*HZ,
1014 + [IP_MASQ_S_SYNACK] = 120*HZ,
1015 + [IP_MASQ_S_UDP] = 5*60*HZ,
1016 + [IP_MASQ_S_ICMP] = 1*60*HZ,
1017 + [IP_MASQ_S_LAST] = 2*HZ,
1022 +struct ip_masq_timeout_table masq_timeout_table_dos = {
1023 + ATOMIC_INIT(0), /* refcnt */
1026 + [IP_MASQ_S_NONE] = 15*60*HZ,
1027 + [IP_MASQ_S_ESTABLISHED] = 8*60*HZ,
1028 + [IP_MASQ_S_SYN_SENT] = 60*HZ,
1029 + [IP_MASQ_S_SYN_RECV] = 10*HZ,
1030 + [IP_MASQ_S_FIN_WAIT] = 60*HZ,
1031 + [IP_MASQ_S_TIME_WAIT] = 60*HZ,
1032 + [IP_MASQ_S_CLOSE] = 10*HZ,
1033 + [IP_MASQ_S_CLOSE_WAIT] = 60*HZ,
1034 + [IP_MASQ_S_LAST_ACK] = 30*HZ,
1035 + [IP_MASQ_S_LISTEN] = 2*60*HZ,
1036 + [IP_MASQ_S_SYNACK] = 100*HZ,
1037 + [IP_MASQ_S_UDP] = 3*60*HZ,
1038 + [IP_MASQ_S_ICMP] = 1*60*HZ,
1039 + [IP_MASQ_S_LAST] = 2*HZ,
1044 + * Timeout table to use for the VS entries
1045 + * If NULL we use the default table (masq_timeout_table).
1046 + * Under flood attack we switch to masq_timeout_table_dos
1049 +struct ip_masq_timeout_table *ip_vs_timeout_table = &masq_timeout_table;
1051 +#endif /* CONFIG_IP_MASQUERADE_VS */
1053 +#ifdef CONFIG_IP_MASQUERADE_VS
1054 +#define MASQUERADE_EXPIRE_RETRY(ms) (ms->timeout_table? ms->timeout_table->timeout[IP_MASQ_S_TIME_WAIT] : masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT])
1056 #define MASQUERADE_EXPIRE_RETRY masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT]
1059 static const char * state_name_table[IP_MASQ_S_LAST+1] = {
1060 - "NONE", /* IP_MASQ_S_NONE, */
1061 - "ESTABLISHED", /* IP_MASQ_S_ESTABLISHED, */
1062 - "SYN_SENT", /* IP_MASQ_S_SYN_SENT, */
1063 - "SYN_RECV", /* IP_MASQ_S_SYN_RECV, */
1064 - "FIN_WAIT", /* IP_MASQ_S_FIN_WAIT, */
1065 - "TIME_WAIT", /* IP_MASQ_S_TIME_WAIT, */
1066 - "CLOSE", /* IP_MASQ_S_CLOSE, */
1067 - "CLOSE_WAIT", /* IP_MASQ_S_CLOSE_WAIT, */
1068 - "LAST_ACK", /* IP_MASQ_S_LAST_ACK, */
1069 - "LISTEN", /* IP_MASQ_S_LISTEN, */
1070 - "UDP", /* IP_MASQ_S_UDP, */
1071 - "ICMP", /* IP_MASQ_S_ICMP, */
1072 - "BUG!", /* IP_MASQ_S_LAST */
1073 + [IP_MASQ_S_NONE] = "NONE",
1074 + [IP_MASQ_S_ESTABLISHED] = "ESTABLISHED",
1075 + [IP_MASQ_S_SYN_SENT] = "SYN_SENT",
1076 + [IP_MASQ_S_SYN_RECV] = "SYN_RECV",
1077 + [IP_MASQ_S_FIN_WAIT] = "FIN_WAIT",
1078 + [IP_MASQ_S_TIME_WAIT] = "TIME_WAIT",
1079 + [IP_MASQ_S_CLOSE] = "CLOSE",
1080 + [IP_MASQ_S_CLOSE_WAIT] = "CLOSE_WAIT",
1081 + [IP_MASQ_S_LAST_ACK] = "LAST_ACK",
1082 + [IP_MASQ_S_LISTEN] = "LISTEN",
1083 +#ifdef CONFIG_IP_MASQUERADE_VS
1084 + [IP_MASQ_S_SYNACK] = "SYNACK",
1086 + [IP_MASQ_S_UDP] = "UDP",
1087 + [IP_MASQ_S_ICMP] = "ICMP",
1088 + [IP_MASQ_S_LAST] = "BUG!",
1091 #define mNO IP_MASQ_S_NONE
1093 #define mCW IP_MASQ_S_CLOSE_WAIT
1094 #define mLA IP_MASQ_S_LAST_ACK
1095 #define mLI IP_MASQ_S_LISTEN
1096 +#ifdef CONFIG_IP_MASQUERADE_VS
1097 +#define mSA IP_MASQ_S_SYNACK
1100 struct masq_tcp_states_t {
1101 int next_state[IP_MASQ_S_LAST]; /* should be _LAST_TCP */
1102 @@ -159,46 +252,111 @@
1104 if (state >= IP_MASQ_S_LAST)
1106 - return state_name_table[state];
1107 + return state_name_table[state] ? state_name_table[state] : "?";
1110 +#ifndef CONFIG_IP_MASQUERADE_VS
1112 struct masq_tcp_states_t masq_tcp_states [] = {
1114 /* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */
1115 /*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR }},
1116 /*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI }},
1117 -/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI }},
1118 +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI }},
1119 /*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI }},
1122 /* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */
1123 -/*syn*/ {{mSS, mES, mSS, mES, mSS, mSS, mSS, mSS, mSS, mLI }},
1124 +/*syn*/ {{mSS, mES, mSS, mSR, mSS, mSS, mSS, mSS, mSS, mLI }},
1125 /*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI }},
1126 -/*ack*/ {{mES, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mES }},
1127 +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES }},
1128 /*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL }},
1131 -static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int output)
1132 +#else /* CONFIG_IP_MASQUERADE_VS */
1134 +struct masq_tcp_states_t masq_tcp_states [] = {
1136 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1137 +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSR }},
1138 +/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI, mTW }},
1139 +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }},
1140 +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mSR }},
1143 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1144 +/*syn*/ {{mSS, mES, mSS, mSR, mSS, mSS, mSS, mSS, mSS, mLI, mSR }},
1145 +/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI, mTW }},
1146 +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES, mES }},
1147 +/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL, mCL }},
1150 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1151 +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSR }},
1152 +/*fin*/ {{mCL, mFW, mSS, mTW, mFW, mTW, mCL, mCW, mLA, mLI, mTW }},
1153 +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }},
1154 +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }},
1157 +struct masq_tcp_states_t masq_tcp_states_dos [] = {
1159 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1160 +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSA }},
1161 +/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI, mSA }},
1162 +/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI, mSA }},
1163 +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }},
1166 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1167 +/*syn*/ {{mSS, mES, mSS, mSA, mSS, mSS, mSS, mSS, mSS, mLI, mSA }},
1168 +/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI, mTW }},
1169 +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES, mES }},
1170 +/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL, mCL }},
1173 +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1174 +/*syn*/ {{mSA, mES, mES, mSR, mSA, mSA, mSA, mSA, mSA, mSA, mSA }},
1175 +/*fin*/ {{mCL, mFW, mSS, mTW, mFW, mTW, mCL, mCW, mLA, mLI, mTW }},
1176 +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }},
1177 +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }},
1180 +struct masq_tcp_states_t *ip_vs_state_table = masq_tcp_states;
1182 +void ip_masq_secure_tcp_set(int on)
1185 + ip_vs_state_table = masq_tcp_states_dos;
1186 + ip_vs_timeout_table = &masq_timeout_table_dos;
1188 + ip_vs_state_table = masq_tcp_states;
1189 + ip_vs_timeout_table = &masq_timeout_table;
1193 +#endif /* CONFIG_IP_MASQUERADE_VS */
1195 +#define MASQ_STATE_INPUT 0
1196 +#define MASQ_STATE_OUTPUT 4
1197 +#define MASQ_STATE_INPUT_ONLY 8
1199 +static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int state_off)
1202 - * [0-3]: input states, [4-7]: output.
1203 + * [0-3]: input states, [4-7]: output, [8-11] input only states.
1210 + return state_off+3;
1213 + return state_off+0;
1216 + return state_off+1;
1219 + return state_off+2;
1225 static int masq_set_state_timeout(struct ip_masq *ms, int state)
1227 struct ip_masq_timeout_table *mstim = ms->timeout_table;
1228 @@ -221,18 +379,34 @@
1232 -static int masq_tcp_state(struct ip_masq *ms, int output, struct tcphdr *th)
1233 +static int masq_tcp_state(struct ip_masq *ms, int state_off, struct tcphdr *th)
1236 int new_state = IP_MASQ_S_CLOSE;
1238 - if ((state_idx = masq_tcp_state_idx(th, output)) < 0) {
1239 +#ifdef CONFIG_IP_MASQUERADE_VS
1241 + * Update state offset to INPUT_ONLY if necessary
1242 + * or delete NO_OUTPUT flag if output packet detected
1244 + if (ms->flags & IP_MASQ_F_VS_NO_OUTPUT) {
1245 + if (state_off == MASQ_STATE_OUTPUT)
1246 + ms->flags &= ~IP_MASQ_F_VS_NO_OUTPUT;
1247 + else state_off = MASQ_STATE_INPUT_ONLY;
1251 + if ((state_idx = masq_tcp_state_idx(th, state_off)) < 0) {
1252 IP_MASQ_DEBUG(1, "masq_state_idx(%d)=%d!!!\n",
1253 - output, state_idx);
1254 + state_off, state_idx);
1258 +#ifdef CONFIG_IP_MASQUERADE_VS
1259 + new_state = ip_vs_state_table[state_idx].next_state[ms->state];
1261 new_state = masq_tcp_states[state_idx].next_state[ms->state];
1265 if (new_state!=ms->state)
1266 @@ -247,6 +421,15 @@
1267 ntohl(ms->daddr), ntohs(ms->dport),
1268 ip_masq_state_name(ms->state),
1269 ip_masq_state_name(new_state));
1271 +#ifdef CONFIG_IP_MASQUERADE_VS
1273 + * Increase/Decrease the active connection counter and
1274 + * set ms->flags according to ms->state and new_state.
1276 + ip_vs_set_state(ms, new_state);
1277 +#endif /* CONFIG_IP_MASQUERADE_VS */
1279 return masq_set_state_timeout(ms, new_state);
1284 * Handle state transitions
1286 -static int masq_set_state(struct ip_masq *ms, int output, struct iphdr *iph, void *tp)
1287 +static int masq_set_state(struct ip_masq *ms, int state_off, struct iphdr *iph, void *tp)
1289 switch (iph->protocol) {
1293 return masq_set_state_timeout(ms, IP_MASQ_S_UDP);
1295 - return masq_tcp_state(ms, output, tp);
1296 + return masq_tcp_state(ms, state_off, tp);
1302 EXPORT_SYMBOL(ip_masq_get_debug_level);
1303 EXPORT_SYMBOL(ip_masq_new);
1304 +#ifdef CONFIG_IP_MASQUERADE_VS
1305 +EXPORT_SYMBOL(ip_masq_new_vs);
1306 +#endif /* CONFIG_IP_MASQUERADE_VS */
1307 EXPORT_SYMBOL(ip_masq_listen);
1308 EXPORT_SYMBOL(ip_masq_free_ports);
1309 EXPORT_SYMBOL(ip_masq_out_get);
1310 @@ -423,9 +609,17 @@
1313 ms->timer.expires = jiffies+tout;
1314 +#ifdef CONFIG_IP_MASQUERADE_VS
1315 + add_sltimer(&ms->timer);
1317 add_timer(&ms->timer);
1320 +#ifdef CONFIG_IP_MASQUERADE_VS
1321 + del_sltimer(&ms->timer);
1323 del_timer(&ms->timer);
1328 @@ -741,6 +935,10 @@
1331 read_lock(&__ip_masq_lock);
1332 +#ifdef CONFIG_IP_MASQUERADE_VS
1333 + ms = __ip_vs_out_get(protocol, s_addr, s_port, d_addr, d_port);
1335 +#endif /* CONFIG_IP_MASQUERADE_VS */
1336 ms = __ip_masq_out_get(protocol, s_addr, s_port, d_addr, d_port);
1337 read_unlock(&__ip_masq_lock);
1339 @@ -754,7 +952,11 @@
1342 read_lock(&__ip_masq_lock);
1343 - ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port);
1344 +#ifdef CONFIG_IP_MASQUERADE_VS
1345 + ms = __ip_vs_in_get(protocol, s_addr, s_port, d_addr, d_port);
1347 +#endif /* CONFIG_IP_MASQUERADE_VS */
1348 + ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port);
1349 read_unlock(&__ip_masq_lock);
1352 @@ -791,7 +993,11 @@
1353 static void masq_expire(unsigned long data)
1355 struct ip_masq *ms = (struct ip_masq *)data;
1356 +#ifdef CONFIG_IP_MASQUERADE_VS
1357 + ms->timeout = MASQUERADE_EXPIRE_RETRY(ms);
1359 ms->timeout = MASQUERADE_EXPIRE_RETRY;
1364 @@ -826,6 +1032,15 @@
1366 ip_masq_control_del(ms);
1368 +#ifdef CONFIG_IP_MASQUERADE_VS
1369 + if (ms->flags & IP_MASQ_F_VS) {
1370 + if (ip_vs_unhash(ms)) {
1371 + ip_vs_unbind_masq(ms);
1372 + ip_masq_unbind_app(ms);
1376 +#endif /* CONFIG_IP_MASQUERADE_VS */
1377 if (ip_masq_unhash(ms)) {
1378 if (ms->flags&IP_MASQ_F_MPORT) {
1379 atomic_dec(&mport_count);
1380 @@ -839,6 +1054,9 @@
1381 * refcnt==1 implies I'm the only one referrer
1383 if (atomic_read(&ms->refcnt) == 1) {
1384 +#ifdef IP_MASQ_MANY_STATE_TABLES
1385 + ip_masq_timeout_detach(ms);
1387 kfree_s(ms,sizeof(*ms));
1388 sysctl_ip_always_defrag--;
1390 @@ -1077,6 +1295,83 @@
1395 +#ifdef CONFIG_IP_MASQUERADE_VS
1397 + * Create a new masquerade entry for IPVS, all parameters {maddr,
1398 + * mport, saddr, sport, daddr, dport, mflags} are known. No need
1399 + * to allocate a free mport. And, hash it into the ip_vs_table.
1401 + * Be careful, it can be called from u-space
1404 +struct ip_masq * ip_masq_new_vs(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags)
1406 + struct ip_masq *ms;
1407 + static int n_fails = 0;
1410 + prio = (mflags&IP_MASQ_F_USER) ? GFP_KERNEL : GFP_ATOMIC;
1412 + ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), prio);
1414 + if (++n_fails < 5)
1415 + IP_VS_ERR("ip_masq_new_vs(proto=%s): no memory available.\n",
1416 + masq_proto_name(proto));
1419 + MOD_INC_USE_COUNT;
1421 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,2,14)
1422 + sysctl_ip_always_defrag++;
1424 + memset(ms, 0, sizeof(*ms));
1425 + INIT_LIST_HEAD(&ms->s_list);
1426 + INIT_LIST_HEAD(&ms->m_list);
1427 + INIT_LIST_HEAD(&ms->d_list);
1428 + init_timer(&ms->timer);
1429 + ms->timer.data = (unsigned long)ms;
1430 + ms->timer.function = masq_expire;
1431 + ip_masq_timeout_attach(ms,ip_vs_timeout_table);
1432 + ms->protocol = proto;
1433 + ms->saddr = saddr;
1434 + ms->sport = sport;
1435 + ms->daddr = daddr;
1436 + ms->dport = dport;
1437 + ms->maddr = maddr;
1438 + ms->mport = mport;
1439 + ms->flags = mflags;
1440 + ms->app_data = NULL;
1441 + ms->control = NULL;
1443 + atomic_set(&ms->n_control,0);
1444 + atomic_set(&ms->refcnt,0);
1445 + atomic_set(&ms->in_pkts,0);
1447 + if (mflags & IP_MASQ_F_USER)
1448 + write_lock_bh(&__ip_masq_lock);
1450 + write_lock(&__ip_masq_lock);
1453 + * Hash it in the ip_vs_table
1457 + if (mflags & IP_MASQ_F_USER)
1458 + write_unlock_bh(&__ip_masq_lock);
1460 + write_unlock(&__ip_masq_lock);
1462 + ip_masq_bind_app(ms);
1464 + atomic_inc(&ms->refcnt);
1465 + masq_set_state_timeout(ms, IP_MASQ_S_NONE);
1468 +#endif /* CONFIG_IP_MASQUERADE_VS */
1472 * Get transport protocol data offset, check against size
1474 @@ -1153,25 +1448,20 @@
1478 +#ifndef CONFIG_IP_MASQUERADE_VS
1479 /* Lets determine our maddr now, shall we? */
1481 - struct rtable *rt;
1482 - struct rtable *skb_rt = (struct rtable*)skb->dst;
1483 - struct device *skb_dev = skb_rt->u.dst.dev;
1485 - if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos)|RTO_CONN, skb_dev?skb_dev->ifindex:0)) {
1486 - /* Fallback on old method */
1487 - /* This really shouldn't happen... */
1488 - maddr = inet_select_addr(skb_dev, skb_rt->rt_gateway, RT_SCOPE_UNIVERSE);
1490 - /* Route lookup succeeded */
1491 - maddr = rt->rt_src;
1494 + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1499 switch (iph->protocol) {
1501 +#ifdef CONFIG_IP_MASQUERADE_VS
1502 + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1506 return(ip_fw_masq_icmp(skb_p, maddr));
1508 if (h.uh->check == 0)
1509 @@ -1230,6 +1520,17 @@
1511 ms = ip_masq_out_get_iph(iph);
1513 +#ifdef CONFIG_IP_MASQUERADE_VS
1514 + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1516 + * Drop this packet but don't
1517 + * start the timer from the beginning
1519 + __ip_masq_put(ms);
1520 + add_sltimer(&ms->timer);
1526 * If sysctl !=0 and no pkt has been received yet
1527 @@ -1280,6 +1581,33 @@
1528 ms->daddr = iph->daddr;
1531 +#ifdef CONFIG_IP_MASQUERADE_VS
1532 + struct ip_vs_dest *dest;
1535 + * Check if the packet is from our real service
1537 + read_lock(&__ip_vs_lock);
1538 + dest = __ip_vs_lookup_real_service(iph->protocol,
1539 + iph->saddr, h.portp[0]);
1540 + read_unlock(&__ip_vs_lock);
1543 + * Notify the real server: there is
1544 + * no existing entry if it is not RST packet
1545 + * or not TCP packet.
1547 + if (!h.th->rst || iph->protocol != IPPROTO_TCP)
1548 + icmp_send(skb, ICMP_DEST_UNREACH,
1549 + ICMP_PORT_UNREACH, 0);
1553 + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1559 * Nope, not found, create a new entry for it
1561 @@ -1392,11 +1720,17 @@
1562 IP_MASQ_DEBUG(2, "O-routed from %08X:%04X with masq.addr %08X\n",
1563 ntohl(ms->maddr),ntohs(ms->mport),ntohl(maddr));
1565 - masq_set_state(ms, 1, iph, h.portp);
1566 +#ifdef CONFIG_IP_MASQUERADE_VS
1567 + /* do the IPVS statistics */
1568 + if (ms->flags & IP_MASQ_F_VS)
1569 + ip_vs_out_stats(ms, skb);
1572 + masq_set_state(ms, MASQ_STATE_OUTPUT, iph, h.portp);
1580 * Restore original addresses and ports in the original IP
1581 @@ -1438,6 +1772,12 @@
1582 ms = __ip_masq_out_get(iph->protocol,
1583 iph->daddr, portp[1],
1584 iph->saddr, portp[0]);
1585 +#ifdef CONFIG_IP_MASQUERADE_VS
1587 + ms = __ip_vs_out_get(iph->protocol,
1588 + iph->daddr, portp[1],
1589 + iph->saddr, portp[0]);
1590 +#endif /* CONFIG_IP_MASQUERADE_VS */
1591 read_unlock(&__ip_masq_lock);
1593 IP_MASQ_DEBUG(1, "Incoming frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n",
1594 @@ -1459,6 +1799,12 @@
1595 ms = __ip_masq_in_get(iph->protocol,
1596 iph->daddr, portp[1],
1597 iph->saddr, portp[0]);
1598 +#ifdef CONFIG_IP_MASQUERADE_VS
1600 + ms = __ip_vs_in_get(iph->protocol,
1601 + iph->daddr, portp[1],
1602 + iph->saddr, portp[0]);
1603 +#endif /* CONFIG_IP_MASQUERADE_VS */
1604 read_unlock(&__ip_masq_lock);
1606 IP_MASQ_DEBUG(1, "Outgoing frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n",
1607 @@ -1469,8 +1815,8 @@
1615 * Handle ICMP messages in forward direction.
1616 * Find any that might be relevant, check against existing connections,
1617 @@ -1556,7 +1902,7 @@
1618 ntohs(icmp_id(icmph)),
1621 - masq_set_state(ms, 1, iph, icmph);
1622 + masq_set_state(ms, MASQ_STATE_OUTPUT, iph, icmph);
1626 @@ -1684,11 +2030,28 @@
1630 +#ifdef CONFIG_IP_MASQUERADE_VS
1632 + ms = __ip_vs_out_get(ciph->protocol,
1633 + ciph->daddr, pptr[1],
1634 + ciph->saddr, pptr[0]);
1636 +#endif /* CONFIG_IP_MASQUERADE_VS */
1637 read_unlock(&__ip_masq_lock);
1642 +#ifdef CONFIG_IP_MASQUERADE_VS
1643 + if (IP_MASQ_VS_FWD(ms) != 0) {
1644 + IP_VS_INFO("shouldn't get here, because tun/dr is on the half connection\n");
1647 + /* do the IPVS statistics */
1648 + if (ms->flags & IP_MASQ_F_VS)
1649 + ip_vs_out_stats(ms, skb);
1650 +#endif /* CONFIG_IP_MASQUERADE_VS */
1652 /* Now we do real damage to this packet...! */
1653 /* First change the source IP address, and recalc checksum */
1654 iph->saddr = ms->maddr;
1655 @@ -1739,6 +2102,87 @@
1659 +#ifdef CONFIG_IP_MASQUERADE_VS
1662 + * Check whether this ICMP packet in the FORWARD path is for
1663 + * related IPVS connection and needs to be delivered locally
1666 +int ip_vs_forwarding_related_icmp(struct sk_buff *skb)
1668 + struct iphdr *iph = skb->nh.iph;
1669 + struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2));
1670 + unsigned short size = ntohs(iph->tot_len) - (iph->ihl * 4);
1671 + struct iphdr *ciph; /* The ip header contained within the ICMP */
1672 + __u16 *pptr; /* port numbers from TCP/UDP contained header */
1673 + struct ip_masq *ms;
1674 + union ip_masq_tphdr h;
1678 + * PACKET_HOST only, see ip_forward
1681 + h.raw = (char*) iph + iph->ihl * 4;
1683 + doff = proto_doff(iph->protocol, h.raw, size);
1685 + if (doff <= 0) return 0;
1687 + IP_VS_DBG(10, "icmp fwd/rev (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n",
1688 + icmph->type, ntohs(icmp_id(icmph)),
1689 + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
1691 + if ((icmph->type != ICMP_DEST_UNREACH) &&
1692 + (icmph->type != ICMP_SOURCE_QUENCH) &&
1693 + (icmph->type != ICMP_TIME_EXCEEDED))
1697 + * If we get here we have an ICMP error of one of the above 3 types
1698 + * Now find the contained IP header
1701 + ciph = (struct iphdr *) (icmph + 1);
1702 + size -= sizeof(struct icmphdr);
1703 + if (size < sizeof(struct iphdr)) return 0;
1705 +	/* We are only interested in ICMPs generated from TCP or UDP packets */
1706 + if (ciph->protocol == IPPROTO_TCP) {
1707 + if (size < sizeof(struct tcphdr)) return 0;
1710 + if (ciph->protocol == IPPROTO_UDP) {
1711 + if (size < sizeof(struct udphdr)) return 0;
1715 + /* We don't ensure for now the checksum is correct */
1717 + /* This is pretty much what __ip_masq_in_get_iph() does,
1718 + except params are wrong way round */
1719 + pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
1721 + read_lock(&__ip_masq_lock);
1722 + ms = __ip_vs_in_get(ciph->protocol,
1727 + read_unlock(&__ip_masq_lock);
1729 + if (!ms) return 0;
1730 + IP_VS_DBG(10, "Delivering locally ICMP for %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u to %u.%u.%u.%u\n",
1731 + NIPQUAD(ciph->daddr), ntohs(pptr[1]),
1732 + NIPQUAD(ciph->saddr), ntohs(pptr[0]),
1733 + NIPQUAD(ms->saddr));
1734 + __ip_masq_put(ms);
1738 +#endif /* CONFIG_IP_MASQUERADE_VS */
1741 * Handle ICMP messages in reverse (demasquerade) direction.
1742 * Find any that might be relevant, check against existing connections,
1743 @@ -1812,7 +2256,7 @@
1744 ntohs(icmp_id(icmph)),
1747 - masq_set_state(ms, 0, iph, icmph);
1748 + masq_set_state(ms, MASQ_STATE_INPUT, iph, icmph);
1752 @@ -1914,9 +2358,11 @@
1753 * *outgoing* so the ports are reversed (and addresses)
1755 pptr = (__u16 *)&(((char *)ciph)[csize]);
1756 +#ifndef CONFIG_IP_MASQUERADE_VS
1757 if (ntohs(pptr[0]) < PORT_MASQ_BEGIN ||
1758 ntohs(pptr[0]) > PORT_MASQ_END)
1762 /* Ensure the checksum is correct */
1763 if (ip_compute_csum((unsigned char *) icmph, len))
1764 @@ -1927,7 +2373,6 @@
1769 IP_MASQ_DEBUG(2, "Handling reverse ICMP for %08X:%04X -> %08X:%04X\n",
1770 ntohl(ciph->saddr), ntohs(pptr[0]),
1771 ntohl(ciph->daddr), ntohs(pptr[1]));
1772 @@ -1935,6 +2380,14 @@
1774 /* This is pretty much what __ip_masq_in_get_iph() does, except params are wrong way round */
1775 read_lock(&__ip_masq_lock);
1776 +#ifdef CONFIG_IP_MASQUERADE_VS
1777 + ms = __ip_vs_in_get(ciph->protocol,
1783 +#endif /* CONFIG_IP_MASQUERADE_VS */
1784 ms = __ip_masq_in_get(ciph->protocol,
1787 @@ -1945,10 +2398,23 @@
1791 +#ifdef CONFIG_IP_MASQUERADE_VS
1792 + /* do the IPVS statistics */
1793 + if (ms->flags & IP_MASQ_F_VS)
1794 + ip_vs_in_stats(ms, skb);
1796 + if (IP_MASQ_VS_FWD(ms) != 0) {
1797 + int ret = ip_vs_forward(skb, ms);
1798 + __ip_masq_put(ms);
1801 +#endif /* CONFIG_IP_MASQUERADE_VS */
1803 if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) {
1808 ciph = (struct iphdr *) (icmph + 1);
1809 pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
1811 @@ -1998,7 +2464,10 @@
1816 +#ifdef CONFIG_IP_MASQUERADE_VS
1817 + struct ip_vs_service *svc = NULL;
1821 * Big tappo: only PACKET_HOST (nor loopback neither mcasts)
1822 * ... don't know why 1st test DOES NOT include 2nd (?)
1823 @@ -2039,13 +2508,21 @@
1824 return(ip_fw_demasq_icmp(skb_p));
1829 * Make sure packet is in the masq range
1830 * ... or some mod-ule relaxes input range
1831 * ... or there is still some `special' mport opened
1833 +#ifdef CONFIG_IP_MASQUERADE_VS
1834 + svc = ip_vs_lookup_service(skb->fwmark,
1835 + iph->protocol, maddr, h.portp[1]);
1837 + (ntohs(h.portp[1]) < PORT_MASQ_BEGIN
1838 + || ntohs(h.portp[1]) > PORT_MASQ_END)
1840 if ((ntohs(h.portp[1]) < PORT_MASQ_BEGIN
1841 || ntohs(h.portp[1]) > PORT_MASQ_END)
1842 +#endif /* CONFIG_IP_MASQUERADE_VS */
1843 #ifdef CONFIG_IP_MASQUERADE_MOD
1844 && (ip_masq_mod_in_rule(skb, iph) != 1)
1846 @@ -2100,6 +2577,21 @@
1848 ms = ip_masq_in_get_iph(iph);
1850 +#ifdef CONFIG_IP_MASQUERADE_VS
1852 + * Checking the server status
1854 + if (ms && ms->dest && !(ms->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1856 +	 * If the dest is not available, don't restart the timer
1857 + * of the packet, but silently drop it.
1859 + add_sltimer(&ms->timer);
1860 + __ip_masq_put(ms);
1866 * Give additional modules a chance to create an entry
1868 @@ -2116,6 +2608,27 @@
1869 ip_masq_mod_in_update(skb, iph, ms);
1872 +#ifdef CONFIG_IP_MASQUERADE_VS
1874 + (h.th->syn || (iph->protocol!=IPPROTO_TCP)) && svc) {
1875 + if (ip_masq_todrop()) {
1877 + * It seems that we are very loaded.
1878 + * We have to drop this packet :(
1883 + * Let the virtual server select a real server
1884 +		 * for the incoming connection, and create a
1885 + * masquerading entry.
1887 + ms = ip_vs_schedule(svc, iph);
1889 + return ip_vs_leave(svc, skb);
1890 + ip_vs_conn_stats(ms, svc);
1892 +#endif /* CONFIG_IP_MASQUERADE_VS */
1896 @@ -2168,13 +2681,43 @@
1901 +#ifdef CONFIG_IP_MASQUERADE_VS
1902 + /* do the IPVS statistics */
1903 + if (ms->flags & IP_MASQ_F_VS)
1904 + ip_vs_in_stats(ms, skb);
1906 + if (IP_MASQ_VS_FWD(ms) != 0) {
1910 + * Sorry for setting state of masq entry so early
1911 + * no matter whether the packet is forwarded
1912 + * successfully or not, because ip_vs_forward may
1913 + * have already released the skb. Although it
1914 +		 * breaks the original semantics, it won't lead to
1915 + * serious errors. We look forward to fixing it
1916 +		 * under Rusty's netfilter framework both for
1917 + * correctness and modularization.
1919 + masq_set_state(ms, MASQ_STATE_INPUT, iph, h.portp);
1921 + ret = ip_vs_forward(skb, ms);
1926 + IP_VS_DBG(10, "masquerading packet...\n");
1927 +#endif /* CONFIG_IP_MASQUERADE_VS */
1929 if ((skb=masq_skb_cow(skb_p, &iph, &h.raw)) == NULL) {
1934 iph->daddr = ms->saddr;
1935 h.portp[1] = ms->sport;
1939 * Invalidate csum saving if tunnel has masq helper
1941 @@ -2231,15 +2774,28 @@
1942 h.uh->check = 0xFFFF;
1945 - ip_send_check(iph);
1946 + ip_send_check(iph);
1948 IP_MASQ_DEBUG(2, "I-routed to %08X:%04X\n",ntohl(iph->daddr),ntohs(h.portp[1]));
1950 - masq_set_state (ms, 0, iph, h.portp);
1951 + masq_set_state(ms, MASQ_STATE_INPUT, iph, h.portp);
1956 +#ifdef CONFIG_IP_MASQUERADE_VS
1959 + * Drop packet if it belongs to virtual service but no entry
1960 + * is found or created. Furthermore, send DEST_UNREACH icmp
1961 + * packet to clients if it is not RST or it is not TCP.
1963 + if (!h.th->rst || iph->protocol != IPPROTO_TCP) {
1964 + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
1970 /* sorry, all this trouble for a no-hit :) */
1972 @@ -2350,7 +2906,6 @@
1973 len += sprintf(buffer+len, "%-127s\n", temp);
1977 read_unlock_bh(&__ip_masq_lock);
1980 @@ -2358,9 +2913,52 @@
1981 read_unlock_bh(&__ip_masq_lock);
1986 +#ifdef CONFIG_IP_MASQUERADE_VS
1987 + for(idx = 0; idx < IP_VS_TAB_SIZE; idx++)
1990 + * Lock is actually only need in next loop
1991 + * we are called from uspace: must stop bh.
1993 + read_lock_bh(&__ip_masq_lock);
1995 + l = &ip_vs_table[idx];
1996 + for (e=l->next; e!=l; e=e->next) {
1997 + ms = list_entry(e, struct ip_masq, m_list);
1999 + if (pos <= offset) {
2005 + * We have locked the tables, no need to del/add timers
2009 + sprintf(temp,"%s %08X:%04X %08X:%04X %04X %08X %6d %6d %7lu",
2010 + masq_proto_name(ms->protocol),
2011 + ntohl(ms->saddr), ntohs(ms->sport),
2012 + ntohl(ms->daddr), ntohs(ms->dport),
2014 + ms->out_seq.init_seq,
2015 + ms->out_seq.delta,
2016 + ms->out_seq.previous_delta,
2017 + ms->timer.expires-jiffies);
2018 + len += sprintf(buffer+len, "%-127s\n", temp);
2020 + if(len >= length) {
2021 + read_unlock_bh(&__ip_masq_lock);
2025 + read_unlock_bh(&__ip_masq_lock);
2028 +#endif /* CONFIG_IP_MASQUERADE_VS */
2031 begin = len - (pos - offset);
2032 *start = buffer + begin;
2034 @@ -2386,17 +2984,29 @@
2035 len, sizeof(struct ip_fw_masq));
2037 masq = (struct ip_fw_masq *)m;
2038 - if (masq->tcp_timeout)
2039 + if (masq->tcp_timeout) {
2040 masq_timeout_table.timeout[IP_MASQ_S_ESTABLISHED]
2041 +#ifdef CONFIG_IP_MASQUERADE_VS
2042 + = masq_timeout_table_dos.timeout[IP_MASQ_S_ESTABLISHED]
2044 = masq->tcp_timeout;
2047 - if (masq->tcp_fin_timeout)
2048 + if (masq->tcp_fin_timeout) {
2049 masq_timeout_table.timeout[IP_MASQ_S_FIN_WAIT]
2050 +#ifdef CONFIG_IP_MASQUERADE_VS
2051 + = masq_timeout_table_dos.timeout[IP_MASQ_S_FIN_WAIT]
2053 = masq->tcp_fin_timeout;
2056 - if (masq->udp_timeout)
2057 + if (masq->udp_timeout) {
2058 masq_timeout_table.timeout[IP_MASQ_S_UDP]
2059 +#ifdef CONFIG_IP_MASQUERADE_VS
2060 + = masq_timeout_table_dos.timeout[IP_MASQ_S_UDP]
2062 = masq->udp_timeout;
2067 @@ -2468,6 +3078,11 @@
2068 ret = ip_masq_mod_ctl(optname, &masq_ctl, optlen);
2071 +#ifdef CONFIG_IP_MASQUERADE_VS
2072 + case IP_MASQ_TARGET_VS:
2073 + ret = ip_vs_ctl(optname, &masq_ctl, optlen);
2079 @@ -2529,12 +3144,25 @@
2082 #endif /* CONFIG_PROC_FS */
2085 - * Wrapper over inet_select_addr()
2086 + * Determine maddr from skb
2088 -u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope)
2089 +int ip_masq_select_addr(struct sk_buff *skb, __u32 *maddr)
2091 - return inet_select_addr(dev, dst, scope);
2092 + struct rtable *rt;
2093 + struct rtable *skb_rt = (struct rtable*)skb->dst;
2094 + struct device *skb_dev = skb_rt->u.dst.dev;
2095 + struct iphdr *iph = skb->nh.iph;
2097 + if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos)|RTO_CONN, skb_dev?skb_dev->ifindex:0)) {
2100 + /* Route lookup succeeded */
2101 + *maddr = rt->rt_src;
2108 @@ -2587,7 +3215,7 @@
2109 (char *) IPPROTO_ICMP,
2113 +#endif /* CONFIG_PROC_FS */
2114 #ifdef CONFIG_IP_MASQUERADE_IPAUTOFW
2117 @@ -2596,6 +3224,9 @@
2119 #ifdef CONFIG_IP_MASQUERADE_MFW
2122 +#ifdef CONFIG_IP_MASQUERADE_VS
2127 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs.c linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs.c
2128 --- linux-2.2.19/net/ipv4/ip_vs.c Thu Jan 1 08:00:00 1970
2129 +++ linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs.c Thu Apr 19 22:37:31 2001
2132 + * IPVS An implementation of the IP virtual server support for the
2133 + * LINUX operating system. IPVS is now implemented as a part
2134 + * of IP masquerading code. IPVS can be used to build a
2135 + * high-performance and highly available server based on a
2136 + * cluster of servers.
2140 + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
2141 + * Peter Kese <peter.kese@ijs.si>
2143 + * This program is free software; you can redistribute it and/or
2144 + * modify it under the terms of the GNU General Public License
2145 + * as published by the Free Software Foundation; either version
2146 + * 2 of the License, or (at your option) any later version.
2149 + * Wensong Zhang : fixed the overflow bug in ip_vs_procinfo
2150 + * Wensong Zhang : added editing dest and service functions
2151 + * Wensong Zhang : changed the names of some functions
2152 + * Wensong Zhang : fixed the unlocking bug in ip_vs_del_dest
2153 + * Wensong Zhang : added a separate hash table for IPVS
2154 + * Wensong Zhang : added slow timer for IPVS masq entries
2155 + * Julian Anastasov : fixed the number of active connections
2156 + * Wensong Zhang : added persistent port
2157 + * Wensong Zhang : fixed the incorrect lookup in hash table
2158 + * Wensong Zhang : added server status checking
2159 + * Wensong Zhang : fixed the incorrect slow timer vector layout
2160 + * Wensong Zhang : fixed the sltimer added twice bug of mst
2161 + * Julian Anastasov : fixed the IP_MASQ_F_VS_INACTIVE cleared bug after editing dest
2162 + * Wensong Zhang : added the inactive connection counter
2163 + * Wensong Zhang : changed the body of ip_vs_schedule
2164 + * Julian Anastasov : fixed the unlocking bug in ip_vs_schedule
2165 + * Julian Anastasov : fixed the uncounting bug in creating masqs by template
2166 + * Wensong Zhang : changed some condition orders for a bit performance
2167 + * Julian Anastasov : don't touch counters in ip_vs_unbind_masq for templates
2168 + * Wensong Zhang : added the hash table for virtual services
2169 + * Wensong Zhang : changed destination lists to d-linked lists
2170 + * Wensong Zhang : changed the scheduler list to the d-linked list
2171 + * Wensong Zhang : added new persistent service handling
2172 + * Julian Anastasov : fixed the counting bug in ip_vs_unbind_masq again
2173 + * (don't touch counters for templates)
2174 + * Wensong Zhang : changed some IP_VS_ERR to IP_VS_DBG in the ip_vs_tunnel_xmit
2175 + * Wensong Zhang : added different timeout support for persistent svc
2176 + * Wensong Zhang : fixed the bug that persistent svc cannot be edited
2177 + * Julian Anastasov : removed extra read_unlock in __ip_vs_lookup_service
2178 + * Julian Anastasov : changed not to restart template timers if dest is unavailable
2179 + * Julian Anastasov : added the destination trash
2180 + * Wensong Zhang : added the update_service call in ip_vs_del_dest
2181 + * Wensong Zhang : added the ip_vs_leave function
2182 + * Lars Marowsky-Bree : added persistence granularity support
2183 + *     Julian Anastasov        :    changed some cosmetic things for debugging
2184 + * Wensong Zhang : use vmalloc to allocate big ipvs hash table
2185 + * Wensong Zhang : changed the tunneling/direct routing methods a little
2186 + * Julian Anastasov : fixed the return bug of ip_vs_leave(-2 instead of -3)
2187 + * Roberto Nibali : fixed the undefined variable bug in the IP_VS_DBG of ip_vs_dr_xmit
2188 + * Julian Anastasov : changed ICMP_PROT_UNREACH to ICMP_PORT_UNREACH in ip_vs_leave
2189 + * Wensong Zhang : added port zero support for persistent services
2190 + * Wensong Zhang : fixed the bug that virtual ftp service blocks other services not listed in ipvs table
2191 + * Wensong Zhang : invalidate a persistent template when its dest is unavailable
2192 + * Julian Anastasov : changed two IP_VS_ERR calls to IP_VS_DBG
2193 + * Wensong Zhang : added random drop of syn entries
2194 + * Wensong Zhang : added random drop of UDP entries
2195 + * Julian Anastasov : added droprate defense against DoS attack
2196 + * Julian Anastasov : added secure_tcp defense against DoS attack
2197 + * Wensong Zhang : revisited dropentry defense against DoS attack
2198 + * Horms : added the fwmark service feature
2199 + * Wensong Zhang : changed to two service hash tables
2200 + * Julian Anastasov : corrected trash_dest lookup for both
2201 + * normal service and fwmark service
2205 +#include <linux/config.h>
2206 +#include <linux/module.h>
2207 +#include <linux/types.h>
2208 +#include <linux/kernel.h>
2209 +#include <linux/errno.h>
2210 +#include <linux/vmalloc.h>
2211 +#include <linux/swap.h>
2212 +#include <net/ip_masq.h>
2214 +#include <linux/sysctl.h>
2215 +#include <linux/ip_fw.h>
2216 +#include <linux/ip_masq.h>
2217 +#include <linux/proc_fs.h>
2219 +#include <linux/inetdevice.h>
2220 +#include <linux/ip.h>
2221 +#include <net/icmp.h>
2222 +#include <net/ip.h>
2223 +#include <net/route.h>
2224 +#include <net/ip_vs.h>
2227 +#include <linux/kmod.h>
2230 +EXPORT_SYMBOL(register_ip_vs_scheduler);
2231 +EXPORT_SYMBOL(unregister_ip_vs_scheduler);
2232 +EXPORT_SYMBOL(ip_vs_bind_masq);
2233 +EXPORT_SYMBOL(ip_vs_unbind_masq);
2234 +EXPORT_SYMBOL(ip_vs_lookup_dest);
2235 +#ifdef CONFIG_IP_VS_DEBUG
2236 +EXPORT_SYMBOL(ip_vs_get_debug_level);
2239 +int sysctl_ip_vs_drop_entry = 0;
2240 +int sysctl_ip_vs_drop_packet = 0;
2241 +int sysctl_ip_vs_secure_tcp = 0;
2242 +int sysctl_ip_vs_amemthresh = 1024;
2243 +int sysctl_ip_vs_am_droprate = 10;
2245 +#ifdef CONFIG_IP_VS_DEBUG
2246 +static int sysctl_ip_vs_debug_level = 0;
2248 +int ip_vs_get_debug_level(void)
2250 + return sysctl_ip_vs_debug_level;
2255 +int ip_vs_dropentry = 0;
2257 +static inline void update_defense_level(void)
2259 + int ip_vs_amem = nr_free_pages+page_cache_size+(buffermem>>PAGE_SHIFT);
2260 + int nomem = (ip_vs_amem < sysctl_ip_vs_amemthresh);
2263 + switch (sysctl_ip_vs_drop_entry) {
2265 + ip_vs_dropentry = 0;
2269 + ip_vs_dropentry = 1;
2270 + sysctl_ip_vs_drop_entry = 2;
2272 + ip_vs_dropentry = 0;
2277 + ip_vs_dropentry = 1;
2279 + ip_vs_dropentry = 0;
2280 + sysctl_ip_vs_drop_entry = 1;
2284 + ip_vs_dropentry = 1;
2289 + switch (sysctl_ip_vs_drop_packet) {
2291 + ip_masq_drop_rate = 0;
2295 + ip_masq_drop_rate = ip_masq_drop_counter
2296 + = sysctl_ip_vs_amemthresh /
2297 + (sysctl_ip_vs_amemthresh-ip_vs_amem);
2298 + sysctl_ip_vs_drop_packet = 2;
2300 + ip_masq_drop_rate = 0;
2305 + ip_masq_drop_rate = ip_masq_drop_counter
2306 + = sysctl_ip_vs_amemthresh /
2307 + (sysctl_ip_vs_amemthresh-ip_vs_amem);
2309 + ip_masq_drop_rate = 0;
2310 + sysctl_ip_vs_drop_packet = 1;
2314 + ip_masq_drop_rate = sysctl_ip_vs_am_droprate;
2319 + switch (sysctl_ip_vs_secure_tcp) {
2321 + ip_masq_secure_tcp_set(0);
2325 + ip_masq_secure_tcp_set(1);
2326 + sysctl_ip_vs_secure_tcp = 2;
2328 + ip_masq_secure_tcp_set(0);
2333 + ip_masq_secure_tcp_set(1);
2335 + ip_masq_secure_tcp_set(0);
2336 + sysctl_ip_vs_secure_tcp = 1;
2340 + ip_masq_secure_tcp_set(1);
2346 +static inline int todrop_entry(struct ip_masq *ms)
2349 + * The drop rate array needs tuning for real environments.
2351 + static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
2352 + static char todrop_counter[9] = {0};
2355 + if (ms->timeout+jiffies-ms->timer.expires < 60*HZ)
2358 + i = atomic_read(&ms->in_pkts);
2359 + if (i > 8) return 0;
2361 + if (!todrop_rate[i]) return 0;
2362 + if (--todrop_counter[i] > 0) return 0;
2364 + todrop_counter[i] = todrop_rate[i];
2368 +static inline void ip_vs_random_dropentry(void)
2371 + struct ip_masq *ms;
2372 + struct list_head *l,*e;
2373 + struct ip_masq *mst;
2374 + void (*fn)(unsigned long);
2377 + * Randomly scan 1/32 of the whole table every second
2379 + for (i=0; i < (IP_VS_TAB_SIZE>>5); i++) {
2381 + * Lock is actually needed in this loop.
2383 + write_lock(&__ip_masq_lock);
2385 + l = &ip_vs_table[net_random()&IP_VS_TAB_MASK];
2386 + for (e=l->next; e!=l; e=e->next) {
2387 + ms = list_entry(e, struct ip_masq, m_list);
2388 + if (ms->dport == 0)
2389 + /* masq template */
2391 + switch(ms->state) {
2392 + case IP_MASQ_S_SYN_RECV:
2393 + case IP_MASQ_S_SYNACK:
2396 + case IP_MASQ_S_ESTABLISHED:
2397 + case IP_MASQ_S_UDP:
2398 + if (todrop_entry(ms))
2407 + * Drop the entry, and drop its mst if not referenced
2409 + write_unlock(&__ip_masq_lock);
2410 + IP_VS_DBG(4, "Drop masq\n");
2411 + mst = ms->control;
2412 + fn = (ms->timer).function;
2413 + del_sltimer(&ms->timer);
2414 + fn((unsigned long)ms);
2415 + if (mst && !atomic_read(&mst->n_control)) {
2416 + IP_VS_DBG(4, "Drop masq template\n");
2417 + del_sltimer(&mst->timer);
2418 + fn((unsigned long)mst);
2420 + write_lock(&__ip_masq_lock);
2422 + write_unlock(&__ip_masq_lock);
2428 + * The following block implements slow timers for IPVS, most code is stolen
2429 + * from linux/kernel/sched.c
2430 + * Slow timer is used to avoid the overhead of cascading timers, when lots
2431 + * of masq entries (>50,000) are cluttered in the system.
2433 +#define SHIFT_BITS 6
2435 +#define TVR_BITS 10
2436 +#define TVN_SIZE (1 << TVN_BITS)
2437 +#define TVR_SIZE (1 << TVR_BITS)
2438 +#define TVN_MASK (TVN_SIZE - 1)
2439 +#define TVR_MASK (TVR_SIZE - 1)
2441 +struct sltimer_vec {
2443 + struct timer_list *vec[TVN_SIZE];
2446 +struct sltimer_vec_root {
2448 + struct timer_list *vec[TVR_SIZE];
2451 +static struct sltimer_vec sltv3 = { 0 };
2452 +static struct sltimer_vec sltv2 = { 0 };
2453 +static struct sltimer_vec_root sltv1 = { 0 };
2455 +static struct sltimer_vec * const sltvecs[] = {
2456 + (struct sltimer_vec *)&sltv1, &sltv2, &sltv3
2459 +#define NOOF_SLTVECS (sizeof(sltvecs) / sizeof(sltvecs[0]))
2461 +static unsigned long sltimer_jiffies = 0;
2463 +static inline void insert_sltimer(struct timer_list *timer,
2464 + struct timer_list **vec, int idx)
2466 + if ((timer->next = vec[idx]))
2467 + vec[idx]->prev = timer;
2469 + timer->prev = (struct timer_list *)&vec[idx];
2472 +static inline void internal_add_sltimer(struct timer_list *timer)
2475 + * must be cli-ed when calling this
2477 + unsigned long expires = timer->expires;
2478 + unsigned long idx = (expires - sltimer_jiffies) >> SHIFT_BITS;
2480 + if (idx < TVR_SIZE) {
2481 + int i = (expires >> SHIFT_BITS) & TVR_MASK;
2482 + insert_sltimer(timer, sltv1.vec, i);
2483 + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
2484 + int i = (expires >> (SHIFT_BITS+TVR_BITS)) & TVN_MASK;
2485 + insert_sltimer(timer, sltv2.vec, i);
2486 + } else if ((signed long) idx < 0) {
2488 + * can happen if you add a timer with expires == jiffies,
2489 + * or you set a timer to go off in the past
2491 + insert_sltimer(timer, sltv1.vec, sltv1.index);
2492 + } else if (idx <= 0xffffffffUL) {
2493 + int i = (expires >> (SHIFT_BITS+TVR_BITS+TVN_BITS)) & TVN_MASK;
2494 + insert_sltimer(timer, sltv3.vec, i);
2496 + /* Can only get here on architectures with 64-bit jiffies */
2497 + timer->next = timer->prev = timer;
2501 +rwlock_t sltimerlist_lock = RW_LOCK_UNLOCKED;
2503 +void add_sltimer(struct timer_list *timer)
2505 + write_lock(&sltimerlist_lock);
2508 + internal_add_sltimer(timer);
2510 + write_unlock(&sltimerlist_lock);
2514 + printk("bug: kernel sltimer added twice at %p.\n",
2515 + __builtin_return_address(0));
2519 +static inline int detach_sltimer(struct timer_list *timer)
2521 + struct timer_list *prev = timer->prev;
2523 + struct timer_list *next = timer->next;
2524 + prev->next = next;
2526 + next->prev = prev;
2532 +void mod_sltimer(struct timer_list *timer, unsigned long expires)
2534 + write_lock(&sltimerlist_lock);
2535 + timer->expires = expires;
2536 + detach_sltimer(timer);
2537 + internal_add_sltimer(timer);
2538 + write_unlock(&sltimerlist_lock);
2541 +int del_sltimer(struct timer_list * timer)
2545 + write_lock(&sltimerlist_lock);
2546 + ret = detach_sltimer(timer);
2547 + timer->next = timer->prev = 0;
2548 + write_unlock(&sltimerlist_lock);
2553 +static inline void cascade_sltimers(struct sltimer_vec *tv)
2556 + * cascade all the timers from tv up one level
2558 + struct timer_list *timer;
2559 + timer = tv->vec[tv->index];
2561 + * We are removing _all_ timers from the list, so we don't have to
2562 + * detach them individually, just clear the list afterwards.
2565 + struct timer_list *tmp = timer;
2566 + timer = timer->next;
2567 + internal_add_sltimer(tmp);
2569 + tv->vec[tv->index] = NULL;
2570 + tv->index = (tv->index + 1) & TVN_MASK;
2573 +static inline void run_sltimer_list(void)
2575 + write_lock(&sltimerlist_lock);
2576 + while ((long)(jiffies - sltimer_jiffies) >= 0) {
2577 + struct timer_list *timer;
2578 + if (!sltv1.index) {
2581 + cascade_sltimers(sltvecs[n]);
2582 + } while (sltvecs[n]->index == 1 && ++n < NOOF_SLTVECS);
2584 + while ((timer = sltv1.vec[sltv1.index])) {
2585 + void (*fn)(unsigned long) = timer->function;
2586 + unsigned long data = timer->data;
2587 + detach_sltimer(timer);
2588 + timer->next = timer->prev = NULL;
2589 + write_unlock(&sltimerlist_lock);
2591 + write_lock(&sltimerlist_lock);
2593 + sltimer_jiffies += 1<<SHIFT_BITS;
2594 + sltv1.index = (sltv1.index + 1) & TVR_MASK;
2596 + write_unlock(&sltimerlist_lock);
2599 +static void sltimer_handler(unsigned long data);
2601 +struct timer_list slow_timer = {
2608 + * Slow timer handler is activated every second
2610 +#define SLTIMER_PERIOD 1*HZ
2612 +void sltimer_handler(unsigned long data)
2614 + run_sltimer_list();
2616 + update_defense_level();
2617 + if (ip_vs_dropentry)
2618 + ip_vs_random_dropentry();
2620 + mod_timer(&slow_timer, (jiffies + SLTIMER_PERIOD));
2625 + * The port number of FTP service (in network order).
2627 +#define FTPPORT __constant_htons(21)
2628 +#define FTPDATA __constant_htons(20)
2633 +rwlock_t __ip_vs_lock = RW_LOCK_UNLOCKED;
2636 + * Hash table: for input and output packets lookups of IPVS
2638 +#define IP_MASQ_NTABLES 3
2640 +struct list_head *ip_vs_table;
2643 + * Hash table: for virtual service lookups
2645 +#define IP_VS_SVC_TAB_BITS 8
2646 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
2647 +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
2649 +/* the service table hashed by <protocol, addr, port> */
2650 +struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
2651 +/* the service table hashed by fwmark */
2652 +struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
2655 + * Hash table: for real service lookups
2657 +#define IP_VS_RTAB_BITS 4
2658 +#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
2659 +#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
2661 +struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
2664 + * IPVS scheduler list
2666 +struct list_head ip_vs_schedulers;
2669 + * Trash for destinations
2671 +struct list_head ip_vs_dest_trash;
2674 + * FTP & NULL virtual service counters
2676 +atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
2677 +atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
2680 + * Register a scheduler in the scheduler list
2682 +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
2685 + IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
2689 + if (!scheduler->name) {
2690 + IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
2694 + if (scheduler->n_list.next != &scheduler->n_list) {
2695 + IP_VS_ERR("register_ip_vs_scheduler(): scheduler already linked\n");
2700 + * Add it into the d-linked scheduler list
2702 + list_add(&scheduler->n_list, &ip_vs_schedulers);
2709 + * Unregister a scheduler in the scheduler list
2711 +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
2714 + IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
2719 + * Only allow unregistration if it is not referenced
2721 + if (atomic_read(&scheduler->refcnt)) {
2722 + IP_VS_ERR("unregister_ip_vs_scheduler(): is in use by %d guys. failed\n",
2723 + atomic_read(&scheduler->refcnt));
2727 + if (scheduler->n_list.next == &scheduler->n_list) {
2728 + IP_VS_ERR("unregister_ip_vs_scheduler(): scheduler is not in the list. failed\n");
2733 + * Removed it from the d-linked scheduler list
2735 + list_del(&scheduler->n_list);
2742 + * Bind a service with a scheduler
2743 + * Must be called with the __ip_vs_lock lock, and return bool.
2745 +int ip_vs_bind_scheduler(struct ip_vs_service *svc,
2746 + struct ip_vs_scheduler *scheduler)
2748 + if (svc == NULL) {
2749 + IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
2752 + if (scheduler == NULL) {
2753 + IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
2757 + svc->scheduler = scheduler;
2758 + atomic_inc(&scheduler->refcnt);
2760 + if(scheduler->init_service)
2761 + if(scheduler->init_service(svc) != 0) {
2762 + IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
2771 + * Unbind a service with its scheduler
2772 + * Must be called with the __ip_vs_lock lock, and return bool.
2774 +int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
2776 + struct ip_vs_scheduler *sched;
2778 + if (svc == NULL) {
2779 + IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
2783 + sched = svc->scheduler;
2784 + if (sched == NULL) {
2785 + IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
2789 + if(sched->done_service)
2790 + if(sched->done_service(svc) != 0) {
2791 + IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
2795 + atomic_dec(&sched->refcnt);
2796 + svc->scheduler = NULL;
2803 + * Get scheduler in the scheduler list by name
2805 +struct ip_vs_scheduler * ip_vs_sched_getbyname(const char *sched_name)
2807 + struct ip_vs_scheduler *sched;
2808 + struct list_head *l, *e;
2810 + IP_VS_DBG(6, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
2813 + read_lock_bh(&__ip_vs_lock);
2815 + l = &ip_vs_schedulers;
2816 + for (e=l->next; e!=l; e=e->next) {
2817 + sched = list_entry(e, struct ip_vs_scheduler, n_list);
2818 + if (strcmp(sched_name, sched->name)==0) {
2820 + read_unlock_bh(&__ip_vs_lock);
2825 + read_unlock_bh(&__ip_vs_lock);
2831 + * Lookup scheduler and try to load it if it doesn't exist
2833 +struct ip_vs_scheduler * ip_vs_lookup_scheduler(const char *sched_name)
2835 + struct ip_vs_scheduler *sched;
2838 + * Search for the scheduler by sched_name
2840 + sched = ip_vs_sched_getbyname(sched_name);
2843 + * If scheduler not found, load the module and search again
2845 + if (sched == NULL) {
2846 + char module_name[IP_MASQ_TNAME_MAX+8];
2847 + sprintf(module_name,"ip_vs_%s",sched_name);
2849 + request_module(module_name);
2850 +#endif /* CONFIG_KMOD */
2851 + sched = ip_vs_sched_getbyname(sched_name);
2859 + * Returns hash value for IPVS masq entry
2862 +static __inline__ unsigned
2863 +ip_vs_hash_key(unsigned proto, __u32 addr, __u16 port)
2865 + unsigned addrh = ntohl(addr);
2867 + return (proto^addrh^(addrh>>IP_VS_TAB_BITS)^ntohs(port))
2873 + * Hashes ip_masq in ip_vs_table by proto,addr,port.
2874 + * should be called with locked tables.
2875 + * returns bool success.
2877 +int ip_vs_hash(struct ip_masq *ms)
2881 + if (ms->flags & IP_MASQ_F_HASHED) {
2882 + IP_VS_ERR("ip_vs_hash(): request for already hashed, "
2883 + "called from %p\n", __builtin_return_address(0));
2888 + * Note: because ip_masq_put sets masq expire only if its
2889 + * refcnt==IP_MASQ_NTABLES, otherwise the masq entry
2890 + * will never expire.
2892 + atomic_add(IP_MASQ_NTABLES, &ms->refcnt);
2895 + * Hash by proto,d{addr,port},
2896 + * which are client address and port in IPVS.
2898 + hash = ip_vs_hash_key(ms->protocol, ms->daddr, ms->dport);
2899 + list_add(&ms->m_list, &ip_vs_table[hash]);
2901 + ms->flags |= IP_MASQ_F_HASHED;
2907 + * Unhashes ip_masq from ip_vs_table.
2908 + * should be called with locked tables.
2909 + * returns bool success.
2911 +int ip_vs_unhash(struct ip_masq *ms)
2913 + if (!(ms->flags & IP_MASQ_F_HASHED)) {
2914 + IP_VS_ERR("ip_vs_unhash(): request for unhash flagged, "
2915 + "called from %p\n", __builtin_return_address(0));
2920 + * Remove it from the list and decrease its reference counter.
2922 + list_del(&ms->m_list);
2923 + atomic_sub(IP_MASQ_NTABLES, &ms->refcnt);
2925 + ms->flags &= ~IP_MASQ_F_HASHED;
2931 + * Gets ip_masq associated with supplied parameters in the ip_vs_table.
2932 + * Called for pkts coming from OUTside-to-INside.
2933 + * s_addr, s_port: pkt source address (foreign host)
2934 + * d_addr, d_port: pkt dest address (load balancer)
2935 + * Caller must lock tables
2937 +struct ip_masq * __ip_vs_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
2940 + struct ip_masq *ms;
2941 + struct list_head *l,*e;
2943 + hash = ip_vs_hash_key(protocol, s_addr, s_port);
2945 + l = &ip_vs_table[hash];
2946 + for (e=l->next; e!=l; e=e->next) {
2947 + ms = list_entry(e, struct ip_masq, m_list);
2948 + if (s_addr==ms->daddr && s_port==ms->dport &&
2949 + d_port==ms->mport && d_addr==ms->maddr &&
2950 + protocol==ms->protocol) {
2952 + atomic_inc(&ms->refcnt);
2959 + IP_VS_DBG(7, "look/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
2960 + masq_proto_name(protocol),
2961 + NIPQUAD(s_addr), ntohs(s_port),
2962 + NIPQUAD(d_addr), ntohs(d_port),
2963 + ms?"hit":"not hit");
2970 + * Gets ip_masq associated with supplied parameters in the ip_vs_table.
2971 + * Called for pkts coming from inside-to-OUTside.
2972 + * s_addr, s_port: pkt source address (inside host)
2973 + * d_addr, d_port: pkt dest address (foreign host)
2974 + * Caller must lock tables
2976 +struct ip_masq * __ip_vs_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
2979 + struct ip_masq *ms;
2980 + struct list_head *l,*e;
2983 + * Check for "full" addressed entries
2985 + hash = ip_vs_hash_key(protocol, d_addr, d_port);
2987 + l = &ip_vs_table[hash];
2988 + for (e=l->next; e!=l; e=e->next) {
2989 + ms = list_entry(e, struct ip_masq, m_list);
2990 + if (d_addr == ms->daddr && d_port == ms->dport &&
2991 + s_port == ms->sport && s_addr == ms->saddr &&
2992 + protocol == ms->protocol) {
2994 + atomic_inc(&ms->refcnt);
3001 + IP_VS_DBG(7, "look/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
3002 + masq_proto_name(protocol),
3003 + NIPQUAD(s_addr), ntohs(s_port),
3004 + NIPQUAD(d_addr), ntohs(d_port),
3005 + ms?"hit":"not hit");
3012 + * Called by ip_vs_sched_persist to look for masq template.
3014 +static __inline__ struct ip_masq *ip_vs_in_get
3015 +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
3017 + struct ip_masq *ms;
3019 + read_lock(&__ip_masq_lock);
3020 + ms = __ip_vs_in_get(protocol, s_addr, s_port, d_addr, d_port);
3021 + read_unlock(&__ip_masq_lock);
3028 + * Returns hash value for virtual service
3030 +static __inline__ unsigned
3031 +ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
3033 + register unsigned porth = ntohs(port);
3035 + return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
3036 + & IP_VS_SVC_TAB_MASK;
3040 + * Returns hash value of fwmark for virtual service lookup
3042 +static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
3044 + return fwmark & IP_VS_SVC_TAB_MASK;
3048 + * Hashes ip_vs_service in the ip_vs_svc_table by <proto,addr,port>
3049 + * or in the ip_vs_svc_fwm_table by fwmark.
3050 + * Should be called with locked tables.
3051 + * Returns bool success.
3053 +int ip_vs_svc_hash(struct ip_vs_service *svc)
3057 + if (svc->flags & IP_VS_SVC_F_HASHED) {
3058 + IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
3059 + "called from %p\n", __builtin_return_address(0));
3063 + if (svc->fwmark == 0) {
3065 + * Hash by <protocol,addr,port> in ip_vs_svc_table
3067 + hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
3068 + list_add(&svc->s_list, &ip_vs_svc_table[hash]);
3071 + * Hash by fwmark in ip_vs_svc_fwm_table
3073 + hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
3074 + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
3077 + svc->flags |= IP_VS_SVC_F_HASHED;
3083 + * Unhashes ip_vs_service from ip_vs_svc_table/ip_vs_svc_fwm_table.
3084 + * Should be called with locked tables.
3085 + * Returns bool success.
3087 +int ip_vs_svc_unhash(struct ip_vs_service *svc)
3089 + if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
3090 + IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
3091 + "called from %p\n", __builtin_return_address(0));
3095 + if (svc->fwmark == 0) {
3097 + * Remove it from the ip_vs_svc_table table.
3099 + list_del(&svc->s_list);
3102 + * Remove it from the ip_vs_svc_fwm_table table.
3104 + list_del(&svc->f_list);
3107 + svc->flags &= ~IP_VS_SVC_F_HASHED;
3113 + * Lookup service by {proto,addr,port} in the service table.
3115 +static __inline__ struct ip_vs_service *
3116 +__ip_vs_lookup_service(__u16 protocol, __u32 vaddr, __u16 vport)
3119 + struct ip_vs_service *svc;
3120 + struct list_head *l,*e;
3123 + * Check for "full" addressed entries
3124 + * Note: as long as IP_VS_SVC_TAB_BITS is larger than zero,
3125 + * <TCP,addr,port> and <UDP,addr,port> have different hash
3126 + * keys, there is no need to do protocol checking.
3128 + hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
3130 + l = &ip_vs_svc_table[hash];
3131 + for (e=l->next; e!=l; e=e->next) {
3132 + svc = list_entry(e, struct ip_vs_service, s_list);
3133 + if ((svc->addr == vaddr)
3134 + && (svc->port == vport)) {
3145 + * Lookup service by fwmark in the service table.
3147 +static __inline__ struct ip_vs_service * __ip_vs_lookup_svc_fwm(__u32 fwmark)
3150 + struct ip_vs_service *svc;
3151 + struct list_head *l,*e;
3154 + * Check for fwmark-indexed entries
3156 + hash = ip_vs_svc_fwm_hashkey(fwmark);
3158 + l = &ip_vs_svc_fwm_table[hash];
3159 + for (e=l->next; e!=l; e=e->next) {
3160 + svc = list_entry(e, struct ip_vs_service, f_list);
3161 + if (svc->fwmark == fwmark) {
3170 +struct ip_vs_service *
3171 +ip_vs_lookup_service(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
3173 + struct ip_vs_service *svc;
3175 + read_lock(&__ip_vs_lock);
3179 + * Check the table hashed by fwmark first
3181 + svc = __ip_vs_lookup_svc_fwm(fwmark);
3187 + * Check the table hashed by <protocol,addr,port>
3188 + * first for "full" addressed entries
3190 + svc = __ip_vs_lookup_service(protocol, vaddr, vport);
3193 + && protocol == IPPROTO_TCP
3194 + && atomic_read(&ip_vs_ftpsvc_counter)
3195 + && (vport==FTPDATA || ntohs(vport)>=PROT_SOCK)){
3197 + * Check if ftp service entry exists, the packet
3198 + * might belong to FTP data connections.
3200 + svc = __ip_vs_lookup_service(protocol, vaddr, FTPPORT);
3204 + && atomic_read(&ip_vs_nullsvc_counter)) {
3206 + * Check if the catch-all port (port zero) exists
3208 + svc = __ip_vs_lookup_service(protocol, vaddr, 0);
3212 + read_unlock(&__ip_vs_lock);
3214 + IP_VS_DBG(5, "lookup_service fwm %d %s %u.%u.%u.%u:%d %s\n",
3216 + masq_proto_name(protocol),
3217 + NIPQUAD(vaddr), ntohs(vport),
3218 + svc?"hit":"not hit");
3225 + * Returns hash value for real service
3227 +static __inline__ unsigned
3228 +ip_vs_rs_hashkey(__u32 addr, __u16 port)
3230 + register unsigned porth = ntohs(port);
3232 + return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) & IP_VS_RTAB_MASK;
3236 + * Hashes ip_vs_dest in ip_vs_rtable by proto,addr,port.
3237 + * should be called with locked tables.
3238 + * returns bool success.
3240 +int ip_vs_rs_hash(struct ip_vs_dest *dest)
3244 + if (!list_empty(&dest->d_list)) {
3249 + * Hash by proto,addr,port,
3250 + * which are the parameters of the real service.
3252 + hash = ip_vs_rs_hashkey(dest->addr, dest->port);
3253 + list_add(&dest->d_list, &ip_vs_rtable[hash]);
3259 + * UNhashes ip_vs_dest from ip_vs_rtable.
3260 + * should be called with locked tables.
3261 + * returns bool success.
3263 +int ip_vs_rs_unhash(struct ip_vs_dest *dest)
3266 + * Remove it from the ip_vs_rtable table.
3268 + if (!list_empty(&dest->d_list)) {
3269 + list_del(&dest->d_list);
3270 + INIT_LIST_HEAD(&dest->d_list);
3277 + * Lookup real service by {proto,addr,port} in the real service table.
3279 +struct ip_vs_dest * __ip_vs_lookup_real_service(__u16 protocol,
3280 + __u32 daddr, __u16 dport)
3283 + struct ip_vs_dest *dest;
3284 + struct list_head *l,*e;
3287 + * Check for "full" addressed entries
3288 + * Return the first found entry
3290 + hash = ip_vs_rs_hashkey(daddr, dport);
3292 + l = &ip_vs_rtable[hash];
3293 + for (e=l->next; e!=l; e=e->next) {
3294 + dest = list_entry(e, struct ip_vs_dest, d_list);
3295 + if ((dest->addr == daddr)
3296 + && (dest->port == dport)
3297 + && ((dest->protocol == protocol) || dest->vfwmark)) {
3307 + * Lookup destination by {addr,port} in the given service
3309 +struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc,
3310 + __u32 daddr, __u16 dport)
3312 + struct ip_vs_dest *dest;
3313 + struct list_head *l, *e;
3315 + read_lock_bh(&__ip_vs_lock);
3318 + * Find the destination for the given service
3320 + l = &svc->destinations;
3321 + for (e=l->next; e!=l; e=e->next) {
3322 + dest = list_entry(e, struct ip_vs_dest, n_list);
3323 + if ((dest->addr == daddr) && (dest->port == dport)) {
3325 + read_unlock_bh(&__ip_vs_lock);
3330 + read_unlock_bh(&__ip_vs_lock);
3336 + * Lookup dest by {svc,addr,port} in the destination trash.
3337 + * Called by ip_vs_add_dest with the __ip_vs_lock.
3338 + * The destination trash is used to hold the destinations that are removed
3339 + * from the service table but are still referenced by some masq entries.
3340 + * The reason to add the destination trash is when the dest is temporarily
3341 + * down (either by administrator or by monitor program), the dest can be
3342 + * picked back from the trash, the remaining connections to the dest can
3343 + * continue, and the counting information of the dest is also useful for
3346 +struct ip_vs_dest * __ip_vs_get_trash_dest(struct ip_vs_service *svc,
3347 + __u32 daddr, __u16 dport)
3349 + struct ip_vs_dest *dest;
3350 + struct list_head *l, *e;
3353 + * Find the destination in trash
3355 + l = &ip_vs_dest_trash;
3356 + for (e=l->next; e!=l; e=e->next) {
3357 + dest = list_entry(e, struct ip_vs_dest, n_list);
3358 + IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%d still in trash, "
3361 + NIPQUAD(dest->addr), ntohs(dest->port),
3362 + atomic_read(&dest->refcnt));
3363 + if (dest->addr == daddr &&
3364 + dest->port == dport &&
3365 + dest->vfwmark == svc->fwmark &&
3367 + (dest->protocol == svc->protocol &&
3368 + dest->vaddr == svc->addr &&
3369 + dest->vport == svc->port))) {
3375 + * Try to purge the destination from trash if not referenced
3377 + if (atomic_read(&dest->refcnt) == 1) {
3378 + IP_VS_DBG(3, "Remove destination %u/%u.%u.%u.%u:%d "
3381 + NIPQUAD(dest->addr), ntohs(dest->port));
3383 + list_del(&dest->n_list);
3384 + kfree_s(dest, sizeof(*dest));
3392 + * Update a destination in the given service
3394 +void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
3395 + struct ip_masq_ctl *mctl)
3397 + struct ip_vs_user *mm = &mctl->u.vs_user;
3400 + * Set the weight and the flags
3402 + dest->weight = mm->weight;
3403 + dest->masq_flags = mm->masq_flags;
3405 + dest->masq_flags |= IP_MASQ_F_VS;
3406 + dest->masq_flags |= IP_MASQ_F_VS_INACTIVE;
3409 + * Check if local node and update the flags
3411 + if (inet_addr_type(mm->daddr) == RTN_LOCAL) {
3412 + dest->masq_flags = (dest->masq_flags & ~IP_MASQ_F_VS_FWD_MASK)
3413 + | IP_MASQ_F_VS_LOCALNODE;
3417 + * Set the IP_MASQ_F_VS_NO_OUTPUT flag if not masquerading
3419 + if ((dest->masq_flags & IP_MASQ_F_VS_FWD_MASK) != 0) {
3420 + dest->masq_flags |= IP_MASQ_F_VS_NO_OUTPUT;
3423 + * Put the real service in ip_vs_rtable if not present.
3424 + * For now only for NAT!
3426 + ip_vs_rs_hash(dest);
3431 + * Set the dest status flags
3433 + dest->flags |= IP_VS_DEST_F_AVAILABLE;
3439 + * Create a destination for the given service
3441 +struct ip_vs_dest *ip_vs_new_dest(struct ip_vs_service *svc,
3442 + struct ip_masq_ctl *mctl)
3444 + struct ip_vs_dest *dest;
3445 + struct ip_vs_user *mm = &mctl->u.vs_user;
3449 + dest = (struct ip_vs_dest*) kmalloc(sizeof(struct ip_vs_dest),
3451 + if (dest == NULL) {
3452 + IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
3455 + memset(dest, 0, sizeof(struct ip_vs_dest));
3457 + dest->protocol = svc->protocol;
3458 + dest->vaddr = svc->addr;
3459 + dest->vport = svc->port;
3460 + dest->vfwmark = svc->fwmark;
3461 + dest->addr = mm->daddr;
3462 + dest->port = mm->dport;
3464 + atomic_set(&dest->activeconns, 0);
3465 + atomic_set(&dest->inactconns, 0);
3466 + atomic_set(&dest->refcnt, 0);
3468 + INIT_LIST_HEAD(&dest->d_list);
3469 + dest->stats.lock = SPIN_LOCK_UNLOCKED;
3470 + __ip_vs_update_dest(svc, dest, mctl);
3479 + * Add a destination into an existing service
3481 +int ip_vs_add_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3483 + struct ip_vs_dest *dest;
3484 + struct ip_vs_user *mm = &mctl->u.vs_user;
3485 + __u32 daddr = mm->daddr;
3486 + __u16 dport = mm->dport;
3490 + if (mm->weight < 0) {
3491 + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
3496 + * Check if the dest already exists in the list
3498 + dest = ip_vs_lookup_dest(svc, daddr, dport);
3499 + if (dest != NULL) {
3500 + IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
3504 + write_lock_bh(&__ip_vs_lock);
3507 + * Check if the dest already exists in the trash and
3508 + * is from the same service
3510 + dest = __ip_vs_get_trash_dest(svc, daddr, dport);
3511 + if (dest != NULL) {
3512 + IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%d from trash, "
3513 + "refcnt=%d, service %u.%u.%u.%u:%d\n",
3514 + NIPQUAD(daddr), ntohs(dport),
3515 + atomic_read(&dest->refcnt),
3516 + NIPQUAD(dest->vaddr),
3517 + ntohs(dest->vport));
3520 + * Get the destination from the trash
3522 + list_del(&dest->n_list);
3523 + list_add(&dest->n_list, &svc->destinations);
3525 + __ip_vs_update_dest(svc, dest, mctl);
3527 + write_unlock_bh(&__ip_vs_lock);
3532 + * Allocate and initialize the dest structure
3534 + dest = ip_vs_new_dest(svc, mctl);
3535 + if (dest == NULL) {
3536 + write_unlock_bh(&__ip_vs_lock);
3537 + IP_VS_ERR("ip_vs_add_dest(): out of memory\n");
3542 + * Add the dest entry into the list
3544 + list_add(&dest->n_list, &svc->destinations);
3545 + atomic_inc(&dest->refcnt);
3547 + write_unlock_bh(&__ip_vs_lock);
3555 + * Edit a destination in the given service
3557 +int ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3559 + struct ip_vs_dest *dest;
3560 + struct ip_vs_user *mm = &mctl->u.vs_user;
3561 + __u32 daddr = mm->daddr;
3562 + __u16 dport = mm->dport;
3566 + if (mm->weight < 0) {
3567 + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
3572 + * Lookup the destination list
3574 + dest = ip_vs_lookup_dest(svc, daddr, dport);
3575 + if (dest == NULL) {
3576 + IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
3580 + write_lock_bh(&__ip_vs_lock);
3582 + __ip_vs_update_dest(svc, dest, mctl);
3584 + write_unlock_bh(&__ip_vs_lock);
3592 + * Delete a destination from the given service
3594 +void __ip_vs_del_dest(struct ip_vs_dest *dest)
3596 + dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
3599 + * Remove it from the d-linked destination list.
3601 + list_del(&dest->n_list);
3604 + * Remove it from the d-linked list with the real services.
3606 + ip_vs_rs_unhash(dest);
3609 + * Decrease the refcnt of the dest, and free the dest
3610 + * if nobody refers to it (refcnt=0). Otherwise, throw
3611 + * the destination into the trash.
3613 + if (atomic_dec_and_test(&dest->refcnt))
3614 + kfree_s(dest, sizeof(*dest));
3616 + IP_VS_DBG(3, "Move dest %u.%u.%u.%u:%d into trash, "
3618 + NIPQUAD(dest->addr), ntohs(dest->port),
3619 + atomic_read(&dest->refcnt));
3620 + list_add(&dest->n_list, &ip_vs_dest_trash);
3621 + atomic_inc(&dest->refcnt);
3625 +int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3627 + struct ip_vs_dest *dest;
3628 + struct ip_vs_user *mm = &mctl->u.vs_user;
3629 + __u32 daddr = mm->daddr;
3630 + __u16 dport = mm->dport;
3635 + * Lookup the destination list
3637 + dest = ip_vs_lookup_dest(svc, daddr, dport);
3638 + if (dest == NULL) {
3639 + IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
3643 + write_lock_bh(&__ip_vs_lock);
3646 + * Remove dest from the destination list
3648 + __ip_vs_del_dest(dest);
3651 +	 * Call the update_service function of its scheduler
3653 + svc->scheduler->update_service(svc);
3655 + write_unlock_bh(&__ip_vs_lock);
3664 + * Add a service into the service hash table
3666 +int ip_vs_add_service(struct ip_masq_ctl *mctl)
3668 + struct ip_vs_user *mm = &mctl->u.vs_user;
3669 + __u16 protocol = mm->protocol;
3670 + __u32 vaddr = mm->vaddr;
3671 + __u16 vport = mm->vport;
3672 + __u32 vfwmark = mm->vfwmark;
3675 + struct ip_vs_scheduler *sched;
3676 + struct ip_vs_service *svc;
3681 + * Lookup the scheduler, by 'mctl->m_tname'
3683 + sched = ip_vs_lookup_scheduler(mctl->m_tname);
3684 + if (sched == NULL) {
3685 + IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n",
3690 + write_lock_bh(&__ip_vs_lock);
3693 + * Check if the service already exists
3696 + svc = __ip_vs_lookup_service(protocol, vaddr, vport);
3698 + svc = __ip_vs_lookup_svc_fwm(vfwmark);
3700 + if (svc != NULL) {
3701 + IP_VS_DBG(1, "ip_vs_add_service: service already exists.\n");
3706 + svc = (struct ip_vs_service*)
3707 + kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
3708 + if (svc == NULL) {
3709 + IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
3713 + memset(svc, 0, sizeof(struct ip_vs_service));
3715 + svc->protocol = protocol;
3716 + svc->addr = vaddr;
3717 + svc->port = vport;
3718 + svc->fwmark = vfwmark;
3719 + svc->flags = mm->vs_flags;
3720 + svc->timeout = mm->timeout;
3721 + svc->netmask = mm->netmask;
3723 + INIT_LIST_HEAD(&svc->destinations);
3724 + svc->stats.lock = SPIN_LOCK_UNLOCKED;
3727 + * Bind the scheduler
3729 + ip_vs_bind_scheduler(svc, sched);
3732 + * Hash the service into the service table
3734 + ip_vs_svc_hash(svc);
3737 + * Update the virtual service counters
3739 + if (vport == FTPPORT)
3740 + atomic_inc(&ip_vs_ftpsvc_counter);
3741 + else if (vport == 0)
3742 + atomic_inc(&ip_vs_nullsvc_counter);
3745 + write_unlock_bh(&__ip_vs_lock);
3752 + * Edit a service and bind it with a new scheduler
3754 +int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3756 + struct ip_vs_user *mm = &mctl->u.vs_user;
3757 + struct ip_vs_scheduler *sched;
3762 + * Lookup the scheduler, by 'mctl->m_tname'
3764 + sched = ip_vs_lookup_scheduler(mctl->m_tname);
3765 + if (sched == NULL) {
3766 + IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n",
3771 + write_lock_bh(&__ip_vs_lock);
3774 + * Set the flags and timeout value
3776 + svc->flags = mm->vs_flags | IP_VS_SVC_F_HASHED;
3777 + svc->timeout = mm->timeout;
3778 + svc->netmask = mm->netmask;
3781 + * Unbind the old scheduler
3783 + ip_vs_unbind_scheduler(svc);
3786 + * Bind the new scheduler
3788 + ip_vs_bind_scheduler(svc, sched);
3790 + write_unlock_bh(&__ip_vs_lock);
3798 + * Delete a service from the service list
3800 +int __ip_vs_del_service(struct ip_vs_service *svc)
3802 + struct list_head *l;
3803 + struct ip_vs_dest *dest;
3806 + * Unbind scheduler
3808 + ip_vs_unbind_scheduler(svc);
3811 + * Unlink the whole destination list
3813 + l = &svc->destinations;
3814 + while (l->next != l) {
3815 + dest = list_entry(l->next, struct ip_vs_dest, n_list);
3816 + __ip_vs_del_dest(dest);
3820 + * Unhash it from the service table
3822 + if (ip_vs_svc_unhash(svc)) {
3824 + * Update the virtual service counters
3826 + if (svc->port == FTPPORT)
3827 + atomic_dec(&ip_vs_ftpsvc_counter);
3828 + else if (svc->port == 0)
3829 + atomic_dec(&ip_vs_nullsvc_counter);
3832 + * Free the service
3834 + kfree_s(svc, sizeof(struct ip_vs_service));
3839 +	 * Call the update_service function of its scheduler
3839 + svc->scheduler->update_service(svc);
3846 +int ip_vs_del_service(struct ip_vs_service *svc)
3853 + write_lock_bh(&__ip_vs_lock);
3855 + __ip_vs_del_service(svc);
3857 + write_unlock_bh(&__ip_vs_lock);
3864 + * Flush all the virtual services
3866 +int ip_vs_flush(void)
3869 + struct ip_vs_service *svc;
3870 + struct list_head *l;
3872 + write_lock_bh(&__ip_vs_lock);
3875 + * Flush the service table hashed by <protocol,addr,port>
3877 + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3878 + l = &ip_vs_svc_table[idx];
3879 + while (l->next != l) {
3880 + svc = list_entry(l->next,struct ip_vs_service,s_list);
3882 + if (__ip_vs_del_service(svc))
3888 + * Flush the service table hashed by fwmark
3890 + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3891 + l = &ip_vs_svc_fwm_table[idx];
3892 + while (l->next != l) {
3893 + svc = list_entry(l->next,struct ip_vs_service,f_list);
3895 + if (__ip_vs_del_service(svc))
3901 + write_unlock_bh(&__ip_vs_lock);
3907 + * Change the connection counter and the flags if the masq state changes
3908 + * Called by the masq_tcp_state function.
3910 +void ip_vs_set_state(struct ip_masq *ms, int new_state)
3912 + struct ip_vs_dest *dest = ms->dest;
3915 + (ms->flags & IP_MASQ_F_VS) && (new_state != ms->state)) {
3916 + if (!(ms->flags & IP_MASQ_F_VS_INACTIVE) &&
3917 + (new_state != IP_MASQ_S_ESTABLISHED)) {
3918 + atomic_dec(&dest->activeconns);
3919 + atomic_inc(&dest->inactconns);
3920 + ms->flags |= IP_MASQ_F_VS_INACTIVE;
3921 + } else if ((ms->flags & IP_MASQ_F_VS_INACTIVE) &&
3922 + (new_state == IP_MASQ_S_ESTABLISHED)) {
3923 + atomic_inc(&dest->activeconns);
3924 + atomic_dec(&dest->inactconns);
3925 + ms->flags &= ~IP_MASQ_F_VS_INACTIVE;
3928 + IP_VS_DBG(8, "Set-state masq fwd:%c s:%s c:%u.%u.%u.%u:%d "
3929 + "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d flg:%X cnt:%d\n",
3930 + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
3931 + NIPQUAD(ms->daddr), ntohs(ms->dport),
3932 + NIPQUAD(ms->maddr), ntohs(ms->mport),
3933 + NIPQUAD(ms->saddr), ntohs(ms->sport),
3934 + ms->flags, atomic_read(&ms->refcnt));
3940 + * Bind a masq entry with a virtual service destination
3941 + * Called when a new masq entry is created for VS.
3943 +void ip_vs_bind_masq(struct ip_masq *ms, struct ip_vs_dest *dest)
3945 + ms->flags |= dest->masq_flags;
3949 + * Increase the refcnt counter of the dest.
3951 + atomic_inc(&dest->refcnt);
3953 + IP_VS_DBG(9, "Bind-masq fwd:%c s:%s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
3954 + "d:%u.%u.%u.%u:%d flg:%X cnt:%d destcnt:%d\n",
3955 + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
3956 + NIPQUAD(ms->daddr), ntohs(ms->dport),
3957 + NIPQUAD(ms->maddr), ntohs(ms->mport),
3958 + NIPQUAD(ms->saddr), ntohs(ms->sport),
3959 + ms->flags, atomic_read(&ms->refcnt),
3960 + atomic_read(&dest->refcnt));
3965 + * Unbind a masq entry with its VS destination
3966 + * Called by the masq_expire function.
3968 +void ip_vs_unbind_masq(struct ip_masq *ms)
3970 + struct ip_vs_dest *dest = ms->dest;
3972 + IP_VS_DBG(9, "Unbind-masq fwd:%c s:%s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
3973 + "d:%u.%u.%u.%u:%d flg:%X cnt:%d destcnt:%d\n",
3974 + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
3975 + NIPQUAD(ms->daddr),ntohs(ms->dport),
3976 + NIPQUAD(ms->maddr),ntohs(ms->mport),
3977 + NIPQUAD(ms->saddr),ntohs(ms->sport),
3978 + ms->flags, atomic_read(&ms->refcnt),
3979 + atomic_read(&dest->refcnt));
3983 + * Decrease the inactconns or activeconns counter
3984 + * if it is not a masq template (ms->dport!=0).
3987 + if (ms->flags & IP_MASQ_F_VS_INACTIVE) {
3988 + atomic_dec(&dest->inactconns);
3990 + atomic_dec(&dest->activeconns);
3995 + * Decrease the refcnt of the dest, and free the dest
3996 + * if nobody refers to it (refcnt=0).
3998 + if (atomic_dec_and_test(&dest->refcnt))
3999 + kfree_s(dest, sizeof(*dest));
4005 + * Checking if the destination of a masq template is available.
4006 + * If available, return 1, otherwise return 0 and invalidate this
4009 +int ip_vs_check_template(struct ip_masq *mst)
4011 + struct ip_vs_dest *dest = mst->dest;
4014 + * Checking the dest server status.
4016 + if ((dest == NULL) ||
4017 + !(dest->flags & IP_VS_DEST_F_AVAILABLE)) {
4018 + IP_VS_DBG(9, "check_template: dest not available for prot %s "
4019 + "src %u.%u.%u.%u:%d dest %u.%u.%u.%u:%d -> %X:%X\n",
4020 + masq_proto_name(mst->protocol),
4021 + NIPQUAD(mst->daddr), ntohs(mst->dport),
4022 + NIPQUAD(mst->maddr), ntohs(mst->mport),
4023 + (dest!=NULL)? ntohl(dest->addr):0,
4024 + (dest!=NULL)? ntohs(dest->port):0);
4027 + * Invalidate the masq template
4029 + ip_vs_unhash(mst);
4030 + mst->sport = 65535;
4031 + mst->mport = 65535;
4036 + * Simply decrease the refcnt of the template,
4037 + * don't restart its timer.
4039 + atomic_dec(&mst->refcnt);
4047 + * IPVS persistent scheduling function
4048 + * It creates a masq entry according to its template if exists, or selects
4049 + * a server and creates a masq entry plus a template.
4052 +ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
4054 + struct ip_masq *ms = NULL;
4055 + struct ip_vs_dest *dest;
4056 + const __u16 *portp;
4057 + struct ip_masq *mst;
4058 + __u16 dport; /* destination port to forward */
4059 + __u32 snet; /* source network of the client, after masking */
4061 + portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
4063 + /* Mask saddr with the netmask to adjust template granularity */
4064 + snet = iph->saddr & svc->netmask;
4066 + IP_VS_DBG(6, "P-schedule: src %u.%u.%u.%u:%d dest %u.%u.%u.%u:%d "
4067 + "snet %u.%u.%u.%u/%u.%u.%u.%u\n",
4068 + NIPQUAD(iph->saddr), ntohs(portp[0]),
4069 + NIPQUAD(iph->daddr), ntohs(portp[1]),
4070 + NIPQUAD(snet), NIPQUAD(svc->netmask));
4073 + * As far as we know, FTP is a very complicated network protocol, and
4074 + * it uses control connection and data connections. For active FTP,
4075 +	 * the FTP server initializes the data connection to the client, its source port
4076 + * is often 20. For passive FTP, FTP server tells the clients the port
4077 + * that it passively listens to, and the client issues the data
4078 + * connection. In the tunneling or direct routing mode, the load
4079 + * balancer is on the client-to-server half of connection, the port
4080 + * number is unknown to the load balancer. So, a template masq like
4081 + * <daddr, 0, maddr, 0, saddr, 0> is created for persistent FTP
4082 + * service, and a template like <daddr, 0, maddr, mport, saddr, sport>
4083 + * is created for other persistent services.
4085 + if (portp[1] == svc->port) {
4086 + /* Check if a template already exists */
4087 + if (svc->port != FTPPORT)
4088 + mst = ip_vs_in_get(iph->protocol, snet, 0,
4089 + iph->daddr, portp[1]);
4091 + mst = ip_vs_in_get(iph->protocol, snet, 0,
4094 + if (!mst || !ip_vs_check_template(mst)) {
4096 + * No template found or the dest of the masq
4097 + * template is not available.
4099 + read_lock(&__ip_vs_lock);
4101 + dest = svc->scheduler->schedule(svc, iph);
4102 + if (dest == NULL) {
4103 + IP_VS_DBG(1, "P-schedule: no dest found.\n");
4104 + read_unlock(&__ip_vs_lock);
4109 + * Create a template like <protocol,daddr,0,
4110 + * maddr,mport,saddr,sport> for non-ftp service,
4111 + * and <protocol,daddr,0,maddr,0,saddr,0>
4112 + * for ftp service.
4114 + if (svc->port != FTPPORT)
4115 + mst = ip_masq_new_vs(iph->protocol,
4116 + iph->daddr, portp[1],
4117 + dest->addr, dest->port,
4121 + mst = ip_masq_new_vs(iph->protocol,
4126 + if (mst == NULL) {
4127 + IP_VS_ERR("ip_masq_new_vs template failed\n");
4128 + read_unlock(&__ip_vs_lock);
4133 + * Bind the template with dest and set timeout.
4135 + ip_vs_bind_masq(mst, dest);
4136 + mst->timeout = svc->timeout;
4138 + read_unlock(&__ip_vs_lock);
4141 + * Template found and its destination is available.
4146 + * Delete its timer so that it can be put back.
4148 + del_sltimer(&mst->timer);
4150 + dport = dest->port;
4153 + * Note: persistent fwmark-based services and persistent
4154 + * port zero service are handled here.
4155 + * fwmark template: <IPPROTO_IP,daddr,0,fwmark,0,saddr,0>
4156 + * port zero template: <protocol,daddr,0,maddr,0,saddr,0>
4159 + mst = ip_vs_in_get(IPPROTO_IP, snet, 0,
4160 + htonl(svc->fwmark), 0);
4162 + mst = ip_vs_in_get(iph->protocol,
4163 + snet, 0, iph->daddr, 0);
4165 + if (!mst || !ip_vs_check_template(mst)) {
4167 + * If it is not persistent port zero, return NULL.
4172 + read_lock(&__ip_vs_lock);
4174 + dest = svc->scheduler->schedule(svc, iph);
4175 + if (dest == NULL) {
4176 + IP_VS_DBG(1, "P-schedule: no dest found.\n");
4177 + read_unlock(&__ip_vs_lock);
4182 + * Create a template according to the service
4185 + mst = ip_masq_new_vs(IPPROTO_IP,
4186 + htonl(svc->fwmark), 0,
4191 + mst = ip_masq_new_vs(iph->protocol,
4196 + if (mst == NULL) {
4197 + IP_VS_ERR("ip_masq_new_vs template failed\n");
4198 + read_unlock(&__ip_vs_lock);
4203 + * Bind the template with dest and set timeout.
4205 + ip_vs_bind_masq(mst, dest);
4206 + mst->timeout = svc->timeout;
4207 + read_unlock(&__ip_vs_lock);
4212 + * Delete its timer so that it can be put back.
4214 + del_sltimer(&mst->timer);
4220 + * Create a new masq according to the template
4222 + ms = ip_masq_new_vs(iph->protocol,
4223 + iph->daddr, portp[1],
4224 + dest->addr, dport,
4225 + iph->saddr, portp[0],
4228 + IP_VS_ERR("ip_masq_new_vs failed\n");
4234 + * Bind the masq entry with the vs dest.
4236 + ip_vs_bind_masq(ms, dest);
4239 + * Increase the inactive connection counter
4240 + * because it is in Syn-Received
4241 + * state (inactive) when the masq is created.
4243 + atomic_inc(&dest->inactconns);
4248 + ip_masq_control_add(ms, mst);
4256 + * IPVS main scheduling function
4257 + * It selects a server according to the virtual service, and
4258 + * creates a masq entry.
4260 +struct ip_masq *ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
4262 + struct ip_masq *ms = NULL;
4263 + struct ip_vs_dest *dest;
4264 + const __u16 *portp;
4267 + * Persistent service
4269 + if (svc->flags & IP_VS_SVC_F_PERSISTENT)
4270 + return ip_vs_sched_persist(svc, iph);
4273 + * Non-persistent service
4275 + portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
4276 + if (!svc->fwmark && portp[1] != svc->port) {
4278 + IP_VS_ERR("Schedule: port zero only supported in persistent services, check your ipvs configuration\n");
4282 + read_lock(&__ip_vs_lock);
4284 + dest = svc->scheduler->schedule(svc, iph);
4285 + if (dest == NULL) {
4286 + IP_VS_DBG(1, "Schedule: no dest found.\n");
4287 + read_unlock(&__ip_vs_lock);
4292 + * Create a masquerading entry.
4294 + ms = ip_masq_new_vs(iph->protocol,
4295 + iph->daddr, portp[1],
4296 + dest->addr, dest->port?dest->port:portp[1],
4297 + iph->saddr, portp[0],
4300 + IP_VS_ERR("Schedule: ip_masq_new_vs failed\n");
4301 + read_unlock(&__ip_vs_lock);
4306 + * Bind the masq entry with the vs dest.
4308 + ip_vs_bind_masq(ms, dest);
4311 + * Increase the inactive connection counter because it is in
4312 + * Syn-Received state (inactive) when the masq is created.
4314 + atomic_inc(&dest->inactconns);
4316 + IP_VS_DBG(9, "Schedule masq fwd:%c s:%s c:%u.%u.%u.%u:%d "
4317 + "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d flg:%X cnt:%d\n",
4318 + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
4319 + NIPQUAD(ms->daddr),ntohs(ms->dport),
4320 + NIPQUAD(ms->maddr),ntohs(ms->mport),
4321 + NIPQUAD(ms->saddr),ntohs(ms->sport),
4322 + ms->flags, atomic_read(&ms->refcnt));
4324 + read_unlock(&__ip_vs_lock);
4331 + * Pass or drop the packet.
4332 + * Called by ip_fw_demasquerade, when the virtual service is available but
4333 + * no destination is available for a new connection.
4335 +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb)
4337 + struct iphdr *iph = skb->nh.iph;
4338 + __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
4341 + * When the virtual ftp service is presented, packets destined
4342 + * for other services on the VIP may get here (except services
4343 + * listed in the ipvs table), pass the packets, because it is
4344 + * not ipvs job to decide to drop the packets.
4346 + if ((svc->port == FTPPORT) && (portp[1] != FTPPORT))
4350 + * Notify the client that the destination is unreachable, and
4351 + * release the socket buffer.
4352 + * Since it is in IP layer, the TCP socket is not actually
4353 + * created, the TCP RST packet cannot be sent, instead that
4354 +	 * ICMP_PORT_UNREACH is sent here whether it is TCP or UDP. --WZ
4356 + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
4363 + * IPVS user control entry
4365 +int ip_vs_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
4367 + struct ip_vs_service *svc = NULL;
4368 + struct ip_vs_user *mm = &mctl->u.vs_user;
4369 + __u32 vaddr = mm->vaddr;
4370 + __u16 vport = mm->vport;
4371 + int proto_num = masq_proto_num(mm->protocol);
4374 + * Check the size of mctl, no overflow...
4376 + if (optlen != sizeof(*mctl))
4380 + * Flush all the virtual service...
4382 + if (mctl->m_cmd == IP_MASQ_CMD_FLUSH)
4383 + return ip_vs_flush();
4386 + * Check for valid protocol: TCP or UDP
4388 + if (mm->vfwmark == 0 && (proto_num < 0 || proto_num > 1)) {
4389 + IP_VS_INFO("vs_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s",
4390 + ntohs(mm->protocol),
4391 + NIPQUAD(vaddr), ntohs(vport), mctl->m_tname);
4396 + * Lookup the exact service by (protocol, vaddr, vport)
4398 + read_lock(&__ip_vs_lock);
4400 + if (mm->vfwmark == 0)
4401 + svc = __ip_vs_lookup_service(mm->protocol, vaddr, vport);
4403 + svc = __ip_vs_lookup_svc_fwm(mm->vfwmark);
4405 + read_unlock(&__ip_vs_lock);
4407 + switch (mctl->m_cmd) {
4408 + case IP_MASQ_CMD_ADD:
4412 + return ip_vs_add_service(mctl);
4414 + case IP_MASQ_CMD_SET:
4418 + return ip_vs_edit_service(svc, mctl);
4420 + case IP_MASQ_CMD_DEL:
4424 + return ip_vs_del_service(svc);
4426 + case IP_MASQ_CMD_ADD_DEST:
4430 + return ip_vs_add_dest(svc, mctl);
4432 + case IP_MASQ_CMD_SET_DEST:
4436 + return ip_vs_edit_dest(svc, mctl);
4438 + case IP_MASQ_CMD_DEL_DEST:
4442 + return ip_vs_del_dest(svc, mctl);
4448 +#ifdef CONFIG_SYSCTL
4450 +static int ip_vs_sysctl_defense_mode(ctl_table *ctl, int write,
4451 + struct file * filp,void *buffer, size_t *lenp)
4453 + int *valp = ctl->data;
4457 + ret = proc_dointvec(ctl, write, filp, buffer, lenp);
4458 + if (write && (*valp != val)) {
4459 + if ((*valp < 0) || (*valp > 3)) {
4460 + /* Restore the correct value */
4463 + update_defense_level();
4469 +ctl_table ipv4_vs_table[] = {
4470 +#ifdef CONFIG_IP_VS_DEBUG
4471 + {NET_IPV4_VS_DEBUG_LEVEL, "debug_level",
4472 + &sysctl_ip_vs_debug_level, sizeof(int), 0644, NULL,
4475 + {NET_IPV4_VS_AMEMTHRESH, "amemthresh",
4476 + &sysctl_ip_vs_amemthresh, sizeof(int), 0644, NULL,
4478 + {NET_IPV4_VS_AMDROPRATE, "am_droprate",
4479 + &sysctl_ip_vs_am_droprate, sizeof(int), 0644, NULL,
4481 + {NET_IPV4_VS_DROP_ENTRY, "drop_entry",
4482 + &sysctl_ip_vs_drop_entry, sizeof(int), 0644, NULL,
4483 + &ip_vs_sysctl_defense_mode},
4484 + {NET_IPV4_VS_DROP_PACKET, "drop_packet",
4485 + &sysctl_ip_vs_drop_packet, sizeof(int), 0644, NULL,
4486 + &ip_vs_sysctl_defense_mode},
4487 + {NET_IPV4_VS_SECURE_TCP, "secure_tcp",
4488 + &sysctl_ip_vs_secure_tcp, sizeof(int), 0644, NULL,
4489 + &ip_vs_sysctl_defense_mode},
4490 + {NET_IPV4_VS_TO_ES, "timeout_established",
4491 + &masq_timeout_table_dos.timeout[IP_MASQ_S_ESTABLISHED],
4492 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4493 + {NET_IPV4_VS_TO_SS, "timeout_synsent",
4494 + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYN_SENT],
4495 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4496 + {NET_IPV4_VS_TO_SR, "timeout_synrecv",
4497 + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYN_RECV],
4498 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4499 + {NET_IPV4_VS_TO_FW, "timeout_finwait",
4500 + &masq_timeout_table_dos.timeout[IP_MASQ_S_FIN_WAIT],
4501 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4502 + {NET_IPV4_VS_TO_TW, "timeout_timewait",
4503 + &masq_timeout_table_dos.timeout[IP_MASQ_S_TIME_WAIT],
4504 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4505 + {NET_IPV4_VS_TO_CL, "timeout_close",
4506 + &masq_timeout_table_dos.timeout[IP_MASQ_S_CLOSE],
4507 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4508 + {NET_IPV4_VS_TO_CW, "timeout_closewait",
4509 + &masq_timeout_table_dos.timeout[IP_MASQ_S_CLOSE_WAIT],
4510 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4511 + {NET_IPV4_VS_TO_LA, "timeout_lastack",
4512 + &masq_timeout_table_dos.timeout[IP_MASQ_S_LAST_ACK],
4513 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4514 + {NET_IPV4_VS_TO_LI, "timeout_listen",
4515 + &masq_timeout_table_dos.timeout[IP_MASQ_S_LISTEN],
4516 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4517 + {NET_IPV4_VS_TO_SA, "timeout_synack",
4518 + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYNACK],
4519 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4520 + {NET_IPV4_VS_TO_UDP, "timeout_udp",
4521 + &masq_timeout_table_dos.timeout[IP_MASQ_S_UDP],
4522 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4523 + {NET_IPV4_VS_TO_ICMP, "timeout_icmp",
4524 + &masq_timeout_table_dos.timeout[IP_MASQ_S_ICMP],
4525 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4530 +#ifdef CONFIG_PROC_FS
4532 + * Write the contents of the VS rule table to a PROCfs file.
4534 +static int ip_vs_procinfo(char *buf, char **start, off_t offset,
4535 + int length, int *eof, void *data)
4539 + char temp[64], temp2[32];
4541 + struct ip_vs_service *svc;
4542 + struct ip_vs_dest *dest;
4543 + struct list_head *l, *e, *p, *q;
4546 +	 * Note: since the length of the buffer is usually a multiple
4547 + * of 512, it is good to use fixed record of the divisor of 512,
4548 + * so that records won't be truncated at buffer boundary.
4551 + if (pos > offset) {
4553 + "IP Virtual Server version %d.%d.%d (size=%d)",
4554 + NVERSION(IP_VS_VERSION_CODE), IP_VS_TAB_SIZE);
4555 + len += sprintf(buf+len, "%-63s\n", temp);
4556 + len += sprintf(buf+len, "%-63s\n",
4557 + "Prot LocalAddress:Port Scheduler Flags");
4558 + len += sprintf(buf+len, "%-63s\n",
4559 + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn");
4562 + read_lock_bh(&__ip_vs_lock);
4564 + /* print the service table hashed by <protocol,addr,port> */
4565 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4566 + l = &ip_vs_svc_table[idx];
4567 + for (e=l->next; e!=l; e=e->next) {
4568 + svc = list_entry(e, struct ip_vs_service, s_list);
4570 + if (pos > offset) {
4571 + if (svc->flags & IP_VS_SVC_F_PERSISTENT)
4572 + sprintf(temp2, "persistent %d %08X",
4574 + ntohl(svc->netmask));
4578 + sprintf(temp, "%s %08X:%04X %s %s",
4579 + masq_proto_name(svc->protocol),
4582 + svc->scheduler->name, temp2);
4583 + len += sprintf(buf+len, "%-63s\n", temp);
4584 + if (len >= length)
4588 + p = &svc->destinations;
4589 + for (q=p->next; q!=p; q=q->next) {
4590 + dest = list_entry(q, struct ip_vs_dest, n_list);
4592 + if (pos <= offset)
4595 + " -> %08X:%04X %-7s %-6d %-10d %-10d",
4596 + ntohl(dest->addr),
4597 + ntohs(dest->port),
4598 + ip_vs_fwd_name(dest->masq_flags),
4600 + atomic_read(&dest->activeconns),
4601 + atomic_read(&dest->inactconns));
4602 + len += sprintf(buf+len, "%-63s\n", temp);
4603 + if (len >= length)
4609 + /* print the service table hashed by fwmark */
4610 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4611 + l = &ip_vs_svc_fwm_table[idx];
4612 + for (e=l->next; e!=l; e=e->next) {
4613 + svc = list_entry(e, struct ip_vs_service, f_list);
4615 + if (pos > offset) {
4616 + if (svc->flags & IP_VS_SVC_F_PERSISTENT)
4617 + sprintf(temp2, "persistent %d %08X",
4619 + ntohl(svc->netmask));
4623 + sprintf(temp, "FWM %08X %s %s",
4625 + svc->scheduler->name, temp2);
4626 + len += sprintf(buf+len, "%-63s\n", temp);
4627 + if (len >= length)
4631 + p = &svc->destinations;
4632 + for (q=p->next; q!=p; q=q->next) {
4633 + dest = list_entry(q, struct ip_vs_dest, n_list);
4635 + if (pos <= offset)
4638 + " -> %08X:%04X %-7s %-6d %-10d %-10d",
4639 + ntohl(dest->addr),
4640 + ntohs(dest->port),
4641 + ip_vs_fwd_name(dest->masq_flags),
4643 + atomic_read(&dest->activeconns),
4644 + atomic_read(&dest->inactconns));
4645 + len += sprintf(buf+len, "%-63s\n", temp);
4646 + if (len >= length)
4653 + read_unlock_bh(&__ip_vs_lock);
4655 + *start = buf+len-(pos-offset); /* Start of wanted data */
4664 +struct proc_dir_entry ip_vs_proc_entry = {
4665 + 0, /* dynamic inode */
4666 + 2, "vs", /* namelen and name */
4667 + S_IFREG | S_IRUGO, /* mode */
4668 + 1, 0, 0, 0, /* nlinks, owner, group, size */
4669 + &proc_net_inode_operations, /* operations */
4670 + NULL, /* get_info */
4671 + NULL, /* fill_inode */
4672 + NULL, NULL, NULL, /* next, parent, subdir */
4674 + &ip_vs_procinfo, /* function to generate proc data */
4679 + * Write the IPVS statistic information to a PROCfs file.
4681 +struct ip_vs_stats ip_vs_stats = {SPIN_LOCK_UNLOCKED, 0, 0};
4684 +ip_vs_stats_get_info(char *buf, char **start, off_t offset,
4685 + int length, int *eof, void *data)
4691 + struct ip_vs_service *svc;
4692 + struct ip_vs_dest *dest;
4693 + struct list_head *l, *e, *p, *q;
4696 + if (pos > offset) {
4697 + len += sprintf(buf+len, "%-63s\n",
4698 +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
4699 + "TotalConns InPkts OutPkts InBytes OutBytes");
4700 + spin_lock(&ip_vs_stats.lock);
4701 + sprintf(temp, " %8X %8X %8X %8X%08X %8X%08X",
4702 + ip_vs_stats.conns,
4703 + ip_vs_stats.inpkts,
4704 + ip_vs_stats.outpkts,
4705 + (__u32)(ip_vs_stats.inbytes >> 32),
4706 + (__u32)ip_vs_stats.inbytes,
4707 + (__u32)(ip_vs_stats.outbytes >> 32),
4708 + (__u32)ip_vs_stats.outbytes);
4709 + spin_unlock(&ip_vs_stats.lock);
4710 + len += sprintf(buf+len, "%-63s\n", temp);
4713 + read_lock_bh(&__ip_vs_lock);
4715 + /* print the service statistics */
4717 + if (pos > offset) {
4718 + len += sprintf(buf+len, "%-127s\n",
4719 + "\nVirtual Service\n"
4720 + "Pro VirtService Conns InPkts OutPkts InBytes OutBytes");
4723 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4724 + l = &ip_vs_svc_table[idx];
4725 + for (e=l->next; e!=l; e=e->next) {
4726 + svc = list_entry(e, struct ip_vs_service, s_list);
4728 + if (pos <= offset)
4730 + spin_lock(&svc->stats.lock);
4731 + sprintf(temp, "%3s %08X:%04X %8X %8X %8X %8X%08X %8X%08X",
4732 + masq_proto_name(svc->protocol),
4736 + svc->stats.inpkts,
4737 + svc->stats.outpkts,
4738 + (__u32)(svc->stats.inbytes >> 32),
4739 + (__u32)svc->stats.inbytes,
4740 + (__u32)(svc->stats.outbytes >> 32),
4741 + (__u32)svc->stats.outbytes);
4742 + spin_unlock(&svc->stats.lock);
4743 + len += sprintf(buf+len, "%-127s\n", temp);
4744 + if (pos >= offset+length)
4749 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4750 + l = &ip_vs_svc_fwm_table[idx];
4751 + for (e=l->next; e!=l; e=e->next) {
4752 + svc = list_entry(e, struct ip_vs_service, f_list);
4754 + if (pos <= offset)
4756 + spin_lock(&svc->stats.lock);
4757 + sprintf(temp, "FWM %08X %8X %8X %8X %8X%08X %8X%08X",
4760 + svc->stats.inpkts,
4761 + svc->stats.outpkts,
4762 + (__u32)(svc->stats.inbytes >> 32),
4763 + (__u32)svc->stats.inbytes,
4764 + (__u32)(svc->stats.outbytes >> 32),
4765 + (__u32)svc->stats.outbytes);
4766 + spin_unlock(&svc->stats.lock);
4767 + len += sprintf(buf+len, "%-127s\n", temp);
4768 + if (pos >= offset+length)
4773 + /* print the real server statistics */
4775 + if (pos > offset) {
4776 + len += sprintf(buf+len, "%-127s\n",
4777 + "\nReal Service\n"
4778 + "Pro VirtService RealService Conns InPkts OutPkts InBytes OutBytes");
4781 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4782 + l = &ip_vs_svc_table[idx];
4783 + for (e=l->next; e!=l; e=e->next) {
4784 + svc = list_entry(e, struct ip_vs_service, s_list);
4785 + p = &svc->destinations;
4786 + for (q=p->next; q!=p; q=q->next) {
4787 + dest = list_entry(q, struct ip_vs_dest, n_list);
4789 + if (pos <= offset)
4791 + spin_lock(&dest->stats.lock);
4793 + "%3s %08X:%04X %08X:%04X %8X %8X %8X %8X%08X %8X%08X",
4794 + masq_proto_name(svc->protocol),
4797 + ntohl(dest->addr),
4798 + ntohs(dest->port),
4799 + dest->stats.conns,
4800 + dest->stats.inpkts,
4801 + dest->stats.outpkts,
4802 + (__u32)(dest->stats.inbytes >> 32),
4803 + (__u32)dest->stats.inbytes,
4804 + (__u32)(dest->stats.outbytes >> 32),
4805 + (__u32)dest->stats.outbytes);
4806 + spin_unlock(&dest->stats.lock);
4807 + len += sprintf(buf+len, "%-127s\n", temp);
4808 + if (pos >= offset+length)
4814 + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4815 + l = &ip_vs_svc_fwm_table[idx];
4816 + for (e=l->next; e!=l; e=e->next) {
4817 + svc = list_entry(e, struct ip_vs_service, f_list);
4818 + p = &svc->destinations;
4819 + for (q=p->next; q!=p; q=q->next) {
4820 + dest = list_entry(q,struct ip_vs_dest,n_list);
4822 + if (pos <= offset)
4824 + spin_lock(&dest->stats.lock);
4826 + "FWM %08X %08X:%04X %8X %8X %8X %8X%08X %8X%08X",
4828 + ntohl(dest->addr),
4829 + ntohs(dest->port),
4830 + dest->stats.conns,
4831 + dest->stats.inpkts,
4832 + dest->stats.outpkts,
4833 + (__u32)(dest->stats.inbytes >> 32),
4834 + (__u32)dest->stats.inbytes,
4835 + (__u32)(dest->stats.outbytes >> 32),
4836 + (__u32)dest->stats.outbytes);
4837 + spin_unlock(&dest->stats.lock);
4838 + len += sprintf(buf+len, "%-127s\n", temp);
4839 + if (pos >= offset+length)
4845 + read_unlock_bh(&__ip_vs_lock);
4847 + *start = buf+len-(pos-offset); /* Start of wanted data */
4856 +struct proc_dir_entry ip_vs_stat_proc_entry = {
4857 + 0, /* dynamic inode */
4858 + 8, "vs_stats", /* namelen and name */
4859 + S_IFREG | S_IRUGO, /* mode */
4860 + 1, 0, 0, 0, /* nlinks, owner, group, size */
4861 + &proc_net_inode_operations, /* operations */
4862 + NULL, /* get_info */
4863 + NULL, /* fill_inode */
4864 + NULL, NULL, NULL, /* next, parent, subdir */
4866 + &ip_vs_stats_get_info, /* function to generate proc data */
4873 + * This function encapsulates the packet in a new IP header, its destination
4874 + * will be set to the daddr. Most code of this function is from ipip.c.
4876 + * It is called in the ip_vs_forward() function. The load balancer
4877 + * selects a real server from a cluster based on a scheduling algorithm,
4878 + * encapsulates the packet and forwards it to the selected server. All real
4879 + * servers are configured with "ifconfig tunl0 <Virtual IP Address> up".
4880 + * When the server receives the encapsulated packet, it decapsulates the
4881 + * packet, processes the request and return the reply packets directly to
4882 + * the client without passing the load balancer. This can greatly
4883 + * increase the scalability of virtual server.
4885 + * if succeeded, return 1; otherwise, return 0.
4888 +int ip_vs_tunnel_xmit(struct sk_buff *skb, __u32 daddr)
4890 + struct rtable *rt; /* Route to the other host */
4891 + struct device *tdev; /* Device to other host */
4892 + struct iphdr *old_iph = skb->nh.iph;
4893 + u8 tos = old_iph->tos;
4894 + u16 df = old_iph->frag_off;
4895 + struct iphdr *iph; /* Our new IP header */
4896 + int max_headroom; /* The extra header space needed */
4901 + if (skb->protocol != __constant_htons(ETH_P_IP)) {
4902 + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): protocol error, ETH_P_IP: %d, skb protocol: %d\n",
4903 + __constant_htons(ETH_P_IP),skb->protocol);
4907 + if (ip_route_output(&rt, dst, src, RT_TOS(tos), 0)) {
4908 + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): route error, dest: "
4909 + "%u.%u.%u.%u\n", NIPQUAD(dst));
4910 + goto tx_error_icmp;
4912 + tdev = rt->u.dst.dev;
4914 + mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
4917 + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): mtu less than 68\n");
4920 + if (skb->dst && mtu < skb->dst->pmtu)
4921 + skb->dst->pmtu = mtu;
4923 + df |= (old_iph->frag_off&__constant_htons(IP_DF));
4925 + if ((old_iph->frag_off&__constant_htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
4926 + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
4928 + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): frag needed\n");
4932 + skb->h.raw = skb->nh.raw;
4935 + * Okay, now see if we can stuff it in the buffer as-is.
4937 + max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
4939 + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
4940 + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
4944 + IP_VS_ERR("ip_vs_tunnel_xmit(): no memory for new_skb\n");
4951 + skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
4952 + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
4953 + dst_release(skb->dst);
4954 + skb->dst = &rt->u.dst;
4957 + * Push down and install the IPIP header.
4960 + iph = skb->nh.iph;
4962 + iph->ihl = sizeof(struct iphdr)>>2;
4963 + iph->frag_off = df;
4964 + iph->protocol = IPPROTO_IPIP;
4966 + iph->daddr = rt->rt_dst;
4967 + iph->saddr = rt->rt_src;
4968 + iph->ttl = old_iph->ttl;
4969 + iph->tot_len = htons(skb->len);
4970 + iph->id = htons(ip_id_count++);
4971 + ip_send_check(iph);
4973 + IPCB(skb)->flags |= IPSKB_REDIRECTED;
4974 + IPCB(skb)->flags |= IPSKB_MASQUERADED;
4980 + dst_link_failure(skb);
4990 +int ip_vs_dr_xmit(struct sk_buff *skb, __u32 daddr)
4992 + struct rtable *rt; /* Route to the other host */
4993 + struct iphdr *iph = skb->nh.iph;
4994 + u8 tos = iph->tos;
4997 + if (ip_route_output(&rt, daddr, 0, RT_TOS(tos), 0)) {
4998 + IP_VS_DBG(0, "ip_vs_dr_xmit(): route error, dest: %u.%u.%u.%u\n",
5000 + goto tx_error_icmp;
5003 + /* MTU checking */
5004 + mtu = rt->u.dst.pmtu;
5005 + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
5006 + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
5008 + IP_VS_DBG(0, "ip_vs_dr_xmit(): frag needed\n");
5012 + dst_release(skb->dst);
5013 + skb->dst = &rt->u.dst;
5015 + IPCB(skb)->flags |= IPSKB_REDIRECTED;
5016 + IPCB(skb)->flags |= IPSKB_MASQUERADED;
5022 + dst_link_failure(skb);
5030 + * Initialize IP virtual server
5032 +__initfunc(int ip_vs_init(void))
5037 + * Allocate the ip_vs_table and initialize its list head.
5038 + * Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable,
5039 + * ip_vs_schedulers and ip_vs_dest_trash.
5041 + if (!(ip_vs_table =
5042 + vmalloc(IP_VS_TAB_SIZE*sizeof(struct list_head)))) {
5045 + for(idx = 0; idx < IP_VS_TAB_SIZE; idx++) {
5046 + INIT_LIST_HEAD(&ip_vs_table[idx]);
5048 + IP_VS_INFO("Connection hash table configured "
5049 + "(size=%d, memory=%ldKbytes)\n",
5051 + (long) (IP_VS_TAB_SIZE*sizeof(struct list_head))/1024);
5053 + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
5054 + INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
5055 + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
5057 + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
5058 + INIT_LIST_HEAD(&ip_vs_rtable[idx]);
5060 + INIT_LIST_HEAD(&ip_vs_schedulers);
5061 + INIT_LIST_HEAD(&ip_vs_dest_trash);
5064 + * Hook the slow_timer handler in the system timer.
5066 + slow_timer.expires = jiffies+SLTIMER_PERIOD;
5067 + add_timer(&slow_timer);
5069 +#ifdef CONFIG_PROC_FS
5070 + ip_masq_proc_register(&ip_vs_proc_entry);
5071 + ip_masq_proc_register(&ip_vs_stat_proc_entry);
5074 +#ifdef CONFIG_IP_MASQUERADE_VS_RR
5077 +#ifdef CONFIG_IP_MASQUERADE_VS_WRR
5080 +#ifdef CONFIG_IP_MASQUERADE_VS_LC
5083 +#ifdef CONFIG_IP_MASQUERADE_VS_WLC
5086 +#ifdef CONFIG_IP_MASQUERADE_VS_LBLC
5087 + ip_vs_lblc_init();
5089 +#ifdef CONFIG_IP_MASQUERADE_VS_LBLCR
5090 + ip_vs_lblcr_init();
5094 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lblc.c linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_lblc.c
5095 --- linux-2.2.19/net/ipv4/ip_vs_lblc.c Thu Jan 1 08:00:00 1970
5096 +++ linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_lblc.c Fri Feb 2 18:49:08 2001
5099 + * IPVS: Locality-Based Least-Connection scheduling module
5103 + * Authors: Wensong Zhang <wensong@gnuchina.org>
5105 + * This program is free software; you can redistribute it and/or
5106 + * modify it under the terms of the GNU General Public License
5107 + * as published by the Free Software Foundation; either version
5108 + * 2 of the License, or (at your option) any later version.
5111 + * Martin Hamilton : fixed the terrible locking bugs
5112 + * *lock(tbl->lock) ==> *lock(&tbl->lock)
5113 + * Wensong Zhang : fixed the uninitialized tbl->lock bug
5114 + * Wensong Zhang : added doing full expiration check to
5115 + * collect stale entries of 24+ hours when
5116 + * no partial expire check in a half hour
5121 + * The lblc algorithm is as follows (pseudo code):
5123 + * if cachenode[dest_ip] is null then
5124 + * n, cachenode[dest_ip] <- {weighted least-conn node};
5126 + * n <- cachenode[dest_ip];
5127 + * if (n is dead) OR
5128 + * (n.conns>n.weight AND
5129 + * there is a node m with m.conns<m.weight/2) then
5130 + * n, cachenode[dest_ip] <- {weighted least-conn node};
5134 + * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
5135 + * me to write this module.
5138 +#include <linux/config.h>
5139 +#include <linux/module.h>
5141 +#include <linux/kmod.h>
5143 +#include <linux/types.h>
5144 +#include <linux/kernel.h>
5145 +#include <linux/errno.h>
5146 +#include <linux/vmalloc.h>
5147 +#include <net/ip_masq.h>
5148 +#ifdef CONFIG_IP_MASQUERADE_MOD
5149 +#include <net/ip_masq_mod.h>
5151 +#include <linux/sysctl.h>
5152 +#include <linux/proc_fs.h>
5153 +#include <linux/ip_fw.h>
5154 +#include <net/ip_vs.h>
5158 + * It is for garbage collection of stale IPVS lblc entries,
5159 + * when the table is full.
5161 +#define CHECK_EXPIRE_INTERVAL (60*HZ)
5162 +#define ENTRY_TIMEOUT (5*60*HZ)
5165 + * It is for full expiration check.
5166 + * When there is no partial expiration check (garbage collection)
5167 + * in a half hour, do a full expiration check to collect stale
5168 + * entries that haven't been touched for a day (by default).
5170 +#define COUNT_FOR_FULL_EXPIRATION 30
5171 +int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
5175 + * for IPVS lblc entry hash table
5177 +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
5178 +#define CONFIG_IP_VS_LBLC_TAB_BITS 10
5180 +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
5181 +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
5182 +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
5186 + * IPVS lblc entry represents an association between destination
5187 + * IP address and its destination server
5189 +struct ip_vs_lblc_entry {
5190 + struct list_head list;
5191 + __u32 addr; /* destination IP address */
5192 + struct ip_vs_dest *dest; /* real server (cache) */
5193 + unsigned long lastuse; /* last used time */
5198 + * IPVS lblc hash table
5200 +struct ip_vs_lblc_table {
5201 + rwlock_t lock; /* lock for this table */
5202 + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
5203 + atomic_t entries; /* number of entries */
5204 + int max_size; /* maximum size of entries */
5205 + struct timer_list periodic_timer; /* collect stale entries */
5206 + int rover; /* rover for expire check */
5207 + int counter; /* counter for no expire */
5213 + * IPVS LBLC sysctl table
5215 +struct ip_vs_lblc_sysctl_table {
5216 + struct ctl_table_header *sysctl_header;
5217 + ctl_table vs_vars[2];
5218 + ctl_table vs_dir[2];
5219 + ctl_table ipv4_dir[2];
5220 + ctl_table root_dir[2];
5224 +static struct ip_vs_lblc_sysctl_table lblc_sysctl_table = {
5226 + {{NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration",
5227 + &sysctl_ip_vs_lblc_expiration,
5228 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
5230 + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblc_sysctl_table.vs_vars},
5232 + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblc_sysctl_table.vs_dir},
5234 + {{CTL_NET, "net", NULL, 0, 0555, lblc_sysctl_table.ipv4_dir},
5240 + * new/free a ip_vs_lblc_entry, which is a mapping of a destination
5241 + * IP address to a server.
5243 +static inline struct ip_vs_lblc_entry *
5244 +ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
5246 + struct ip_vs_lblc_entry *en;
5248 + en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
5250 + IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
5254 + INIT_LIST_HEAD(&en->list);
5257 + atomic_inc(&dest->refcnt);
5264 +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
5266 + list_del(&en->list);
5267 + atomic_dec(&en->dest->refcnt);
5273 + * Returns hash value for IPVS LBLC entry
5275 +static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
5277 + return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
5282 + * Hash an entry in the ip_vs_lblc_table.
5283 + * returns bool success.
5286 +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
5290 + if (!list_empty(&en->list)) {
5291 + IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
5292 + "called from %p\n", __builtin_return_address(0));
5297 + * Hash by destination IP address
5299 + hash = ip_vs_lblc_hashkey(en->addr);
5301 + write_lock(&tbl->lock);
5302 + list_add(&en->list, &tbl->bucket[hash]);
5303 + atomic_inc(&tbl->entries);
5304 + write_unlock(&tbl->lock);
5312 + * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
5313 + * returns bool success.
5315 +static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
5316 + struct ip_vs_lblc_entry *en)
5318 + if (list_empty(&en->list)) {
5319 + IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
5320 + "called from %p\n", __builtin_return_address(0));
5325 + * Remove it from the table
5327 + write_lock(&tbl->lock);
5328 + list_del(&en->list);
5329 + INIT_LIST_HEAD(&en->list);
5330 + write_unlock(&tbl->lock);
5338 + * Get ip_vs_lblc_entry associated with supplied parameters.
5340 +static inline struct ip_vs_lblc_entry *
5341 +ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
5344 + struct ip_vs_lblc_entry *en;
5345 + struct list_head *l,*e;
5347 + hash = ip_vs_lblc_hashkey(addr);
5349 + read_lock(&tbl->lock);
5351 + l = &tbl->bucket[hash];
5352 + for (e=l->next; e!=l; e=e->next) {
5353 + en = list_entry(e, struct ip_vs_lblc_entry, list);
5354 + if (en->addr == addr) {
5356 + read_unlock(&tbl->lock);
5361 + read_unlock(&tbl->lock);
5368 + * Flush all the entries of the specified table.
5370 +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
5373 + struct list_head *l;
5374 + struct ip_vs_lblc_entry *en;
5376 + for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
5377 + write_lock(&tbl->lock);
5378 + for (l=&tbl->bucket[i]; l->next!=l; ) {
5379 + en = list_entry(l->next,
5380 + struct ip_vs_lblc_entry, list);
5381 + ip_vs_lblc_free(en);
5382 + atomic_dec(&tbl->entries);
5384 + write_unlock(&tbl->lock);
5389 +static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
5391 + unsigned long now = jiffies;
5393 + struct list_head *l, *e;
5394 + struct ip_vs_lblc_entry *en;
5396 + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
5397 + j = (j + 1) & IP_VS_LBLC_TAB_MASK;
5398 + e = l = &tbl->bucket[j];
5399 + write_lock(&tbl->lock);
5400 + while (e->next != l) {
5401 + en = list_entry(e->next,
5402 + struct ip_vs_lblc_entry, list);
5403 + if ((now - en->lastuse) <
5404 + sysctl_ip_vs_lblc_expiration) {
5408 + ip_vs_lblc_free(en);
5409 + atomic_dec(&tbl->entries);
5411 + write_unlock(&tbl->lock);
5418 + * Periodical timer handler for IPVS lblc table
5419 + * It is used to collect stale entries when the number of entries
5420 + * exceeds the maximum size of the table.
5422 + * Fixme: we probably need more complicated algorithm to collect
5423 + * entries that have not been used for a long time even
5424 + * if the number of entries doesn't exceed the maximum size
5426 + * The full expiration check is for this purpose now.
5428 +static void ip_vs_lblc_check_expire(unsigned long data)
5430 + struct ip_vs_lblc_table *tbl;
5431 + unsigned long now = jiffies;
5434 + struct list_head *l, *e;
5435 + struct ip_vs_lblc_entry *en;
5437 + tbl = (struct ip_vs_lblc_table *)data;
5439 + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
5440 + /* do full expiration check */
5441 + ip_vs_lblc_full_check(tbl);
5446 + if (atomic_read(&tbl->entries) < tbl->max_size) {
5451 + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
5452 + if (goal > tbl->max_size/2)
5453 + goal = tbl->max_size/2;
5455 + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
5456 + j = (j + 1) & IP_VS_LBLC_TAB_MASK;
5457 + e = l = &tbl->bucket[j];
5458 + write_lock(&tbl->lock);
5459 + while (e->next != l) {
5460 + en = list_entry(e->next,
5461 + struct ip_vs_lblc_entry, list);
5462 + if ((now - en->lastuse) < ENTRY_TIMEOUT) {
5466 + ip_vs_lblc_free(en);
5467 + atomic_dec(&tbl->entries);
5470 + write_unlock(&tbl->lock);
5477 + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
5481 +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
5484 + struct ip_vs_lblc_table *tbl;
5487 + * Allocate the ip_vs_lblc_table for this service
5489 + tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
5490 + if (tbl == NULL) {
5491 + IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
5494 + svc->sched_data = tbl;
5495 + IP_VS_DBG(0, "LBLC hash table (memory=%dbytes) allocated for "
5496 + "current service\n",
5497 + sizeof(struct ip_vs_lblc_table));
5500 + * Initialize the hash buckets
5502 + for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
5503 + INIT_LIST_HEAD(&tbl->bucket[i]);
5505 + tbl->lock = RW_LOCK_UNLOCKED;
5506 + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
5511 + * Hook periodic timer for garbage collection
5513 + init_timer(&tbl->periodic_timer);
5514 + tbl->periodic_timer.data = (unsigned long)tbl;
5515 + tbl->periodic_timer.function = ip_vs_lblc_check_expire;
5516 + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
5517 + add_timer(&tbl->periodic_timer);
5519 + MOD_INC_USE_COUNT;
5524 +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
5526 + struct ip_vs_lblc_table *tbl = svc->sched_data;
5528 + /* remove periodic timer */
5529 + del_timer(&tbl->periodic_timer);
5531 + /* got to clean up table entries here */
5532 + ip_vs_lblc_flush(tbl);
5534 + /* release the table itself */
5535 + kfree(svc->sched_data);
5536 + IP_VS_DBG(0, "LBLC hash table (memory=%dbytes) released\n",
5537 + sizeof(struct ip_vs_lblc_table));
5539 + MOD_DEC_USE_COUNT;
5544 +static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
5550 +static inline struct ip_vs_dest *
5551 +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
5553 + register struct list_head *l, *e;
5554 + struct ip_vs_dest *dest, *least;
5557 + l = &svc->destinations;
5562 + * We think the overhead of processing active connections is fifty
5563 + * times than that of inactive connections in average. (This fifty
5564 + * times might be not accurate, we will change it later.) We use
5565 + * the following formula to estimate the overhead:
5566 + * dest->activeconns*50 + dest->inactconns
5568 + * (dest overhead) / dest->weight
5570 + * Remember -- no floats in kernel mode!!!
5571 + * The comparison of h1*w2 > h2*w1 is equivalent to that of
5573 + * if every weight is larger than zero.
5575 + * The server with weight=0 is quiesced and will not receive any
5579 + for (e=l->next; e!=l; e=e->next) {
5580 + least = list_entry(e, struct ip_vs_dest, n_list);
5581 + if (least->weight > 0) {
5582 + loh = atomic_read(&least->activeconns) * 50
5583 + + atomic_read(&least->inactconns);
5590 + * Find the destination with the least load.
5593 + for (e=e->next; e!=l; e=e->next)
5595 + dest = list_entry(e, struct ip_vs_dest, n_list);
5596 + doh = atomic_read(&dest->activeconns) * 50
5597 + + atomic_read(&dest->inactconns);
5598 + if (loh * dest->weight > doh * least->weight)
5605 + IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
5606 + "activeconns %d refcnt %d weight %d overhead %d\n",
5607 + NIPQUAD(least->addr), ntohs(least->port),
5608 + atomic_read(&least->activeconns),
5609 + atomic_read(&least->refcnt), least->weight, loh);
5616 + * If this destination server is overloaded and there is a less loaded
5617 + * server, then return true.
5620 +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
5622 + if (atomic_read(&dest->activeconns) > dest->weight) {
5623 + register struct list_head *l, *e;
5624 + struct ip_vs_dest *d;
5626 + l = &svc->destinations;
5627 + for (e=l->next; e!=l; e=e->next) {
5628 + d = list_entry(e, struct ip_vs_dest, n_list);
5629 + if (atomic_read(&d->activeconns)*2 < d->weight) {
5639 + * Locality-Based (weighted) Least-Connection scheduling
5641 +static struct ip_vs_dest *
5642 +ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
5644 + struct ip_vs_dest *dest;
5645 + struct ip_vs_lblc_table *tbl;
5646 + struct ip_vs_lblc_entry *en;
5648 + IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
5650 + tbl = (struct ip_vs_lblc_table *)svc->sched_data;
5651 + en = ip_vs_lblc_get(tbl, iph->daddr);
5653 + dest = __ip_vs_wlc_schedule(svc, iph);
5654 + if (dest == NULL) {
5655 + IP_VS_DBG(1, "no destination available\n");
5658 + en = ip_vs_lblc_new(iph->daddr, dest);
5662 + ip_vs_lblc_hash(tbl, en);
5665 + if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
5666 + || dest->weight <= 0
5667 + || is_overloaded(dest, svc)) {
5668 + dest = __ip_vs_wlc_schedule(svc, iph);
5669 + if (dest == NULL) {
5670 + IP_VS_DBG(1, "no destination available\n");
5673 + atomic_dec(&en->dest->refcnt);
5674 + atomic_inc(&dest->refcnt);
5678 + en->lastuse = jiffies;
5680 + IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
5681 + "--> server %u.%u.%u.%u:%d\n",
5682 + NIPQUAD(en->addr),
5683 + NIPQUAD(dest->addr),
5684 + ntohs(dest->port));
5690 +static struct ip_vs_scheduler ip_vs_lblc_scheduler =
5693 + "lblc", /* name */
5694 + ATOMIC_INIT(0), /* refcnt */
5695 + ip_vs_lblc_init_svc, /* service initializer */
5696 + ip_vs_lblc_done_svc, /* service done */
5697 + ip_vs_lblc_update_svc, /* service updater */
5698 + ip_vs_lblc_schedule, /* select a server from the destination list */
5702 +__initfunc(int ip_vs_lblc_init(void))
5704 + IP_VS_INFO("Initializing LBLC scheduling\n");
5705 + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
5706 + lblc_sysctl_table.sysctl_header =
5707 + register_sysctl_table(lblc_sysctl_table.root_dir, 0);
5708 + return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
5715 +int init_module(void)
5717 + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
5719 + /* module initialization by 'request_module' */
5720 + if (register_ip_vs_scheduler(&ip_vs_lblc_scheduler) != 0)
5723 + lblc_sysctl_table.sysctl_header =
5724 + register_sysctl_table(lblc_sysctl_table.root_dir, 0);
5726 + IP_VS_INFO("LBLC scheduling module loaded.\n");
5731 +void cleanup_module(void)
5733 + /* module cleanup by 'release_module' */
5734 + if (unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler) != 0) {
5735 + IP_VS_INFO("cannot remove LBLC scheduling module\n");
5737 + IP_VS_INFO("LBLC scheduling module unloaded.\n");
5739 + unregister_sysctl_table(lblc_sysctl_table.sysctl_header);
5742 +#endif /* MODULE */
5743 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lblcr.c linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_lblcr.c
5744 --- linux-2.2.19/net/ipv4/ip_vs_lblcr.c Thu Jan 1 08:00:00 1970
5745 +++ linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_lblcr.c Tue Mar 27 17:37:00 2001
5748 + * IPVS: Locality-Based Least-Connection with Replication scheduler
5752 + * Authors: Wensong Zhang <wensong@gnuchina.org>
5754 + * This program is free software; you can redistribute it and/or
5755 + * modify it under the terms of the GNU General Public License
5756 + * as published by the Free Software Foundation; either version
5757 + * 2 of the License, or (at your option) any later version.
5760 + * Julian Anastasov : Added the missing (dest->weight>0)
5761 + * condition in the ip_vs_dest_set_max.
5766 + * The lblc/r algorithm is as follows (pseudo code):
5768 + * if serverSet[dest_ip] is null then
5769 + * n, serverSet[dest_ip] <- {weighted least-conn node};
5771 + * n <- {least-conn (alive) node in serverSet[dest_ip]};
5772 + * if (n is null) OR
5773 + * (n.conns>n.weight AND
5774 + * there is a node m with m.conns<m.weight/2) then
5775 + * n <- {weighted least-conn node};
5776 + * add n to serverSet[dest_ip];
5777 + * if |serverSet[dest_ip]| > 1 AND
5778 + * now - serverSet[dest_ip].lastMod > T then
5779 + * m <- {most conn node in serverSet[dest_ip]};
5780 + * remove m from serverSet[dest_ip];
5781 + * if serverSet[dest_ip] changed then
5782 + * serverSet[dest_ip].lastMod <- now;
5788 +#include <linux/config.h>
5789 +#include <linux/module.h>
5791 +#include <linux/kmod.h>
5793 +#include <linux/types.h>
5794 +#include <linux/kernel.h>
5795 +#include <linux/errno.h>
5796 +#include <linux/vmalloc.h>
5797 +#include <net/ip_masq.h>
5798 +#ifdef CONFIG_IP_MASQUERADE_MOD
5799 +#include <net/ip_masq_mod.h>
5801 +#include <linux/sysctl.h>
5802 +#include <linux/proc_fs.h>
5803 +#include <linux/ip_fw.h>
5804 +#include <net/ip_vs.h>
5808 + * It is for garbage collection of stale IPVS lblcr entries,
5809 + * when the table is full.
5811 +#define CHECK_EXPIRE_INTERVAL (60*HZ)
5812 +#define ENTRY_TIMEOUT (6*60*HZ)
5815 + * It is for full expiration check.
5816 + * When there is no partial expiration check (garbage collection)
5817 + * in a half hour, do a full expiration check to collect stale
5818 + * entries that haven't been touched for a day.
5820 +#define COUNT_FOR_FULL_EXPIRATION 30
5821 +int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
5825 + * for IPVS lblcr entry hash table
5827 +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
5828 +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
5830 +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
5831 +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
5832 +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
5836 + * IPVS destination set structure and operations
5838 +struct ip_vs_dest_list {
5839 + struct ip_vs_dest_list *next; /* list link */
5840 + struct ip_vs_dest *dest; /* destination server */
5843 +struct ip_vs_dest_set {
5844 + atomic_t size; /* set size */
5845 + unsigned long lastmod; /* last modified time */
5846 + struct ip_vs_dest_list *list; /* destination list */
5847 + rwlock_t lock; /* lock for this list */
5851 +static struct ip_vs_dest_list *
5852 +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
5854 + struct ip_vs_dest_list *e;
5856 + for (e=set->list; e!=NULL; e=e->next) {
5857 + if (e->dest == dest)
5858 + /* already existed */
5862 + e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
5864 + IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
5868 + atomic_inc(&dest->refcnt);
5871 + /* link it to the list */
5872 + write_lock(&set->lock);
5873 + if (set->list != NULL) {
5874 + e->next = set->list->next;
5880 + write_unlock(&set->lock);
5882 + atomic_inc(&set->size);
5883 + set->lastmod = jiffies;
5888 +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
5890 + struct ip_vs_dest_list *e, **ep;
5892 + write_lock(&set->lock);
5893 + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
5894 + if (e->dest == dest) {
5897 + atomic_dec(&set->size);
5898 + set->lastmod = jiffies;
5899 + atomic_dec(&e->dest->refcnt);
5905 + write_unlock(&set->lock);
5908 +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
5910 + struct ip_vs_dest_list *e, **ep;
5912 + write_lock(&set->lock);
5913 + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
5916 + * We don't kfree dest because it is referred either
5917 + * by its service or by the trash dest list.
5919 + atomic_dec(&e->dest->refcnt);
5922 + write_unlock(&set->lock);
5925 +/* get weighted least-connection node in the destination set */
5926 +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
5928 + register struct ip_vs_dest_list *e;
5929 + struct ip_vs_dest *dest, *least;
5935 + read_lock(&set->lock);
5936 + /* select the first destination server, whose weight > 0 */
5937 + for (e=set->list; e!=NULL; e=e->next) {
5939 + if ((least->weight > 0)
5940 + && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
5941 + loh = atomic_read(&least->activeconns) * 50
5942 + + atomic_read(&least->inactconns);
5946 + read_unlock(&set->lock);
5949 + /* find the destination with the weighted least load */
5951 + for (e=e->next; e!=NULL; e=e->next) {
5953 + doh = atomic_read(&dest->activeconns) * 50
5954 + + atomic_read(&dest->inactconns);
5955 + if ((loh*dest->weight > doh*least->weight)
5956 + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
5961 + read_unlock(&set->lock);
5963 + IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
5964 + "activeconns %d refcnt %d weight %d overhead %d\n",
5965 + NIPQUAD(least->addr), ntohs(least->port),
5966 + atomic_read(&least->activeconns),
5967 + atomic_read(&least->refcnt), least->weight, loh);
5972 +/* get weighted most-connection node in the destination set */
5973 +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
5975 + register struct ip_vs_dest_list *e;
5976 + struct ip_vs_dest *dest, *most;
5982 + read_lock(&set->lock);
5983 + /* select the first destination server, whose weight > 0 */
5984 + for (e=set->list; e!=NULL; e=e->next) {
5986 + if (most->weight > 0) {
5987 + moh = atomic_read(&most->activeconns) * 50
5988 + + atomic_read(&most->inactconns);
5992 + read_unlock(&set->lock);
5995 + /* find the destination with the weighted most load */
5997 + for (e=e->next; e!=NULL; e=e->next) {
5999 + doh = atomic_read(&dest->activeconns) * 50
6000 + + atomic_read(&dest->inactconns);
6001 + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
6002 + if (moh*dest->weight < doh*most->weight
6003 + && dest->weight > 0) {
6008 + read_unlock(&set->lock);
6010 + IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
6011 + "activeconns %d refcnt %d weight %d overhead %d\n",
6012 + NIPQUAD(most->addr), ntohs(most->port),
6013 + atomic_read(&most->activeconns),
6014 + atomic_read(&most->refcnt), most->weight, moh);
6020 + * IPVS lblcr entry represents an association between destination
6021 + * IP address and its destination server set
6023 +struct ip_vs_lblcr_entry {
6024 + struct list_head list;
6025 + __u32 addr; /* destination IP address */
6026 + struct ip_vs_dest_set set; /* destination server set */
6027 + unsigned long lastuse; /* last used time */
6032 + * IPVS lblcr hash table
6034 +struct ip_vs_lblcr_table {
6035 + rwlock_t lock; /* lock for this table */
6036 + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
6037 + atomic_t entries; /* number of entries */
6038 + int max_size; /* maximum size of entries */
6039 + struct timer_list periodic_timer; /* collect stale entries */
6040 + int rover; /* rover for expire check */
6041 + int counter; /* counter for no expire */
6046 + * IPVS LBLCR sysctl table
6048 +struct ip_vs_lblcr_sysctl_table {
6049 + struct ctl_table_header *sysctl_header;
6050 + ctl_table vs_vars[2];
6051 + ctl_table vs_dir[2];
6052 + ctl_table ipv4_dir[2];
6053 + ctl_table root_dir[2];
6057 +static struct ip_vs_lblcr_sysctl_table lblcr_sysctl_table = {
6059 + {{NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration",
6060 + &sysctl_ip_vs_lblcr_expiration,
6061 + sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
6063 + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblcr_sysctl_table.vs_vars},
6065 + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblcr_sysctl_table.vs_dir},
6067 + {{CTL_NET, "net", NULL, 0, 0555, lblcr_sysctl_table.ipv4_dir},
6073 + * new/free a ip_vs_lblcr_entry, which is a mapping of a destination
6074 + * IP address to a server.
6076 +static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
6078 + struct ip_vs_lblcr_entry *en;
6080 + en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
6082 + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
6086 + INIT_LIST_HEAD(&en->list);
6089 + /* initialize its dest set */
6090 + atomic_set(&(en->set.size), 0);
6091 + en->set.list = NULL;
6092 + en->set.lock = RW_LOCK_UNLOCKED;
6098 +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
6100 + list_del(&en->list);
6101 + ip_vs_dest_set_eraseall(&en->set);
6107 + * Returns hash value for IPVS LBLCR entry
6109 +static inline unsigned ip_vs_lblcr_hashkey(__u32 addr)
6111 + return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
6116 + * Hash an entry in the ip_vs_lblcr_table.
6117 + * returns bool success.
6120 +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
6124 + if (!list_empty(&en->list)) {
6125 + IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
6126 + "called from %p\n", __builtin_return_address(0));
6131 + * Hash by destination IP address
6133 + hash = ip_vs_lblcr_hashkey(en->addr);
6135 + write_lock(&tbl->lock);
6136 + list_add(&en->list, &tbl->bucket[hash]);
6137 + atomic_inc(&tbl->entries);
6138 + write_unlock(&tbl->lock);
6146 + * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
6147 + * returns bool success.
6149 +static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
6150 + struct ip_vs_lblcr_entry *en)
6152 + if (list_empty(&en->list)) {
6153 + IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
6154 + "called from %p\n", __builtin_return_address(0));
6159 + * Remove it from the table
6161 + write_lock(&tbl->lock);
6162 + list_del(&en->list);
6163 + INIT_LIST_HEAD(&en->list);
6164 + write_unlock(&tbl->lock);
6172 + * Get ip_vs_lblcr_entry associated with supplied parameters.
6174 +static inline struct ip_vs_lblcr_entry *
6175 +ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr)
6178 + struct ip_vs_lblcr_entry *en;
6179 + struct list_head *l,*e;
6181 + hash = ip_vs_lblcr_hashkey(addr);
6182 + l = &tbl->bucket[hash];
6184 + read_lock(&tbl->lock);
6186 + for (e=l->next; e!=l; e=e->next) {
6187 + en = list_entry(e, struct ip_vs_lblcr_entry, list);
6188 + if (en->addr == addr) {
6190 + read_unlock(&tbl->lock);
6195 + read_unlock(&tbl->lock);
6202 + * Flush all the entries of the specified table.
6204 +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
6207 + struct list_head *l;
6208 + struct ip_vs_lblcr_entry *en;
6210 + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6211 + write_lock(&tbl->lock);
6212 + for (l=&tbl->bucket[i]; l->next!=l; ) {
6213 + en = list_entry(l->next,
6214 + struct ip_vs_lblcr_entry, list);
6215 + ip_vs_lblcr_free(en);
6216 + atomic_dec(&tbl->entries);
6218 + write_unlock(&tbl->lock);
6223 +static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
6225 + unsigned long now = jiffies;
6227 + struct list_head *l, *e;
6228 + struct ip_vs_lblcr_entry *en;
6230 + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6231 + j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
6232 + e = l = &tbl->bucket[j];
6233 + write_lock(&tbl->lock);
6234 + while (e->next != l) {
6235 + en = list_entry(e->next,
6236 + struct ip_vs_lblcr_entry, list);
6237 + if ((now - en->lastuse) <
6238 + sysctl_ip_vs_lblcr_expiration) {
6242 + ip_vs_lblcr_free(en);
6243 + atomic_dec(&tbl->entries);
6245 + write_unlock(&tbl->lock);
6252 + * Periodical timer handler for IPVS lblcr table
6253 + * It is used to collect stale entries when the number of entries
6254 + * exceeds the maximum size of the table.
6256 + * Fixme: we probably need more complicated algorithm to collect
6257 + * entries that have not been used for a long time even
6258 + * if the number of entries doesn't exceed the maximum size
6260 + * The full expiration check is for this purpose now.
6262 +static void ip_vs_lblcr_check_expire(unsigned long data)
6264 + struct ip_vs_lblcr_table *tbl;
6265 + unsigned long now = jiffies;
6268 + struct list_head *l, *e;
6269 + struct ip_vs_lblcr_entry *en;
6271 + tbl = (struct ip_vs_lblcr_table *)data;
6273 + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
6274 + /* do full expiration check */
6275 + ip_vs_lblcr_full_check(tbl);
6280 + if (atomic_read(&tbl->entries) < tbl->max_size) {
6285 + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
6286 + if (goal > tbl->max_size/2)
6287 + goal = tbl->max_size/2;
6289 + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6290 + j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
6291 + e = l = &tbl->bucket[j];
6292 + write_lock(&tbl->lock);
6293 + while (e->next != l) {
6294 + en = list_entry(e->next,
6295 + struct ip_vs_lblcr_entry, list);
6296 + if ((now - en->lastuse) < ENTRY_TIMEOUT) {
6300 + ip_vs_lblcr_free(en);
6301 + atomic_dec(&tbl->entries);
6304 + write_unlock(&tbl->lock);
6311 + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
6315 +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
6318 + struct ip_vs_lblcr_table *tbl;
6321 + * Allocate the ip_vs_lblcr_table for this service
6323 + tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
6324 + if (tbl == NULL) {
6325 + IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
6328 + svc->sched_data = tbl;
6329 + IP_VS_DBG(0, "LBLCR hash table (memory=%dbytes) allocated for "
6330 + "current service\n",
6331 + sizeof(struct ip_vs_lblcr_table));
6334 + * Initialize the hash buckets
6336 + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6337 + INIT_LIST_HEAD(&tbl->bucket[i]);
6339 + tbl->lock = RW_LOCK_UNLOCKED;
6340 + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
6345 + * Hook periodic timer for garbage collection
6347 + init_timer(&tbl->periodic_timer);
6348 + tbl->periodic_timer.data = (unsigned long)tbl;
6349 + tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
6350 + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
6351 + add_timer(&tbl->periodic_timer);
6353 + MOD_INC_USE_COUNT;
6358 +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
6360 + struct ip_vs_lblcr_table *tbl = svc->sched_data;
6362 + /* remove periodic timer */
6363 + del_timer(&tbl->periodic_timer);
6365 + /* got to clean up table entries here */
6366 + ip_vs_lblcr_flush(tbl);
6368 + /* release the table itself */
6369 + kfree(svc->sched_data);
6370 + IP_VS_DBG(0, "LBLCR hash table (memory=%dbytes) released\n",
6371 + sizeof(struct ip_vs_lblcr_table));
6373 + MOD_DEC_USE_COUNT;
6378 +static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
6384 +static inline struct ip_vs_dest *
6385 +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
6387 + register struct list_head *l, *e;
6388 + struct ip_vs_dest *dest, *least;
6391 + l = &svc->destinations;
6396 + * We think the overhead of processing active connections is fifty
6397 + * times that of inactive connections on average. (This fifty
6398 + * times might not be accurate; we will change it later.) We use
6399 + * the following formula to estimate the overhead:
6400 + * dest->activeconns*50 + dest->inactconns
6402 + * (dest overhead) / dest->weight
6404 + * Remember -- no floats in kernel mode!!!
6405 + * The comparison of h1*w2 > h2*w1 is equivalent to that of
6407 + * if every weight is larger than zero.
6409 + * The server with weight=0 is quiesced and will not receive any
6413 + for (e=l->next; e!=l; e=e->next) {
6414 + least = list_entry(e, struct ip_vs_dest, n_list);
6415 + if (least->weight > 0) {
6416 + loh = atomic_read(&least->activeconns) * 50
6417 + + atomic_read(&least->inactconns);
6424 + * Find the destination with the least load.
6427 + for (e=e->next; e!=l; e=e->next) {
6428 + dest = list_entry(e, struct ip_vs_dest, n_list);
6429 + doh = atomic_read(&dest->activeconns) * 50
6430 + + atomic_read(&dest->inactconns);
6431 + if (loh*dest->weight > doh*least->weight) {
6437 + IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
6438 + "activeconns %d refcnt %d weight %d overhead %d\n",
6439 + NIPQUAD(least->addr), ntohs(least->port),
6440 + atomic_read(&least->activeconns),
6441 + atomic_read(&least->refcnt), least->weight, loh);
6448 + * If this destination server is overloaded and there is a less loaded
6449 + * server, then return true.
6452 +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
6454 + if (atomic_read(&dest->activeconns) > dest->weight) {
6455 + register struct list_head *l, *e;
6456 + struct ip_vs_dest *d;
6458 + l = &svc->destinations;
6459 + for (e=l->next; e!=l; e=e->next) {
6460 + d = list_entry(e, struct ip_vs_dest, n_list);
6461 + if (atomic_read(&d->activeconns)*2 < d->weight) {
6471 + * Locality-Based (weighted) Least-Connection scheduling
6473 +static struct ip_vs_dest *
6474 +ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
6476 + struct ip_vs_dest *dest;
6477 + struct ip_vs_lblcr_table *tbl;
6478 + struct ip_vs_lblcr_entry *en;
6480 + IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
6482 + tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
6483 + en = ip_vs_lblcr_get(tbl, iph->daddr);
6485 + dest = __ip_vs_wlc_schedule(svc, iph);
6486 + if (dest == NULL) {
6487 + IP_VS_DBG(1, "no destination available\n");
6490 + en = ip_vs_lblcr_new(iph->daddr);
6494 + ip_vs_dest_set_insert(&en->set, dest);
6495 + ip_vs_lblcr_hash(tbl, en);
6497 + dest = ip_vs_dest_set_min(&en->set);
6498 + if (!dest || is_overloaded(dest, svc)) {
6499 + dest = __ip_vs_wlc_schedule(svc, iph);
6500 + if (dest == NULL) {
6501 + IP_VS_DBG(1, "no destination available\n");
6504 + ip_vs_dest_set_insert(&en->set, dest);
6506 + if (atomic_read(&en->set.size) > 1 &&
6507 + jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
6508 + struct ip_vs_dest *m;
6509 + m = ip_vs_dest_set_max(&en->set);
6510 + if (m) ip_vs_dest_set_erase(&en->set, m);
6513 + en->lastuse = jiffies;
6515 + IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
6516 + "--> server %u.%u.%u.%u:%d\n",
6517 + NIPQUAD(en->addr),
6518 + NIPQUAD(dest->addr),
6519 + ntohs(dest->port));
6526 + * IPVS LBLCR Scheduler structure
6528 +static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
6531 + "lblcr", /* name */
6532 + ATOMIC_INIT(0), /* refcnt */
6533 + ip_vs_lblcr_init_svc, /* service initializer */
6534 + ip_vs_lblcr_done_svc, /* service done */
6535 + ip_vs_lblcr_update_svc, /* service updater */
6536 + ip_vs_lblcr_schedule, /* select a server from the destination list */
6540 +__initfunc(int ip_vs_lblcr_init(void))
6542 + IP_VS_INFO("Initializing LBLCR scheduling\n");
6543 + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
6544 + lblcr_sysctl_table.sysctl_header =
6545 + register_sysctl_table(lblcr_sysctl_table.root_dir, 0);
6546 + return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
6553 +int init_module(void)
6555 + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
6557 + /* module initialization by 'request_module' */
6558 + if (register_ip_vs_scheduler(&ip_vs_lblcr_scheduler) != 0)
6561 + lblcr_sysctl_table.sysctl_header =
6562 + register_sysctl_table(lblcr_sysctl_table.root_dir, 0);
6564 + IP_VS_INFO("LBLCR scheduling module loaded.\n");
6569 +void cleanup_module(void)
6571 + /* module cleanup by 'release_module' */
6572 + if (unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler) != 0) {
6573 + IP_VS_INFO("cannot remove LBLCR scheduling module\n");
6575 + IP_VS_INFO("LBLCR scheduling module unloaded.\n");
6577 + unregister_sysctl_table(lblcr_sysctl_table.sysctl_header);
6580 +#endif /* MODULE */
6581 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lc.c linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_lc.c
6582 --- linux-2.2.19/net/ipv4/ip_vs_lc.c Thu Jan 1 08:00:00 1970
6583 +++ linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_lc.c Fri Nov 24 10:02:53 2000
6586 + * IPVS: Least-Connection Scheduling module
6590 + * Authors: Wensong Zhang <wensong@iinchina.net>
6592 + * This program is free software; you can redistribute it and/or
6593 + * modify it under the terms of the GNU General Public License
6594 + * as published by the Free Software Foundation; either version
6595 + * 2 of the License, or (at your option) any later version.
6598 + * Wensong Zhang : added the ip_vs_lc_update_svc
6599 + * Wensong Zhang : added any dest with weight=0 is quiesced
6603 +#include <linux/config.h>
6604 +#include <linux/module.h>
6606 +#include <linux/kmod.h>
6608 +#include <linux/types.h>
6609 +#include <linux/kernel.h>
6610 +#include <linux/errno.h>
6611 +#include <net/ip_masq.h>
6612 +#ifdef CONFIG_IP_MASQUERADE_MOD
6613 +#include <net/ip_masq_mod.h>
6615 +#include <linux/ip_fw.h>
6616 +#include <net/ip_vs.h>
6619 +static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
6621 + MOD_INC_USE_COUNT;
6626 +static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
6628 + MOD_DEC_USE_COUNT;
6633 +static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
6640 + * Least Connection scheduling
6642 +static struct ip_vs_dest* ip_vs_lc_schedule(struct ip_vs_service *svc,
6643 + struct iphdr *iph)
6645 + struct list_head *l, *e;
6646 + struct ip_vs_dest *dest, *least;
6649 + IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
6651 + l = &svc->destinations;
6656 + * Simply select the server with the least number of
6657 + * (activeconns<<5) + inactconns
6658 + * Except whose weight is equal to zero.
6659 + * If the weight is equal to zero, it means that the server is
6660 + * quiesced, the existing connections to the server still get
6661 + * served, but no new connection is assigned to the server.
6664 + for (e=l->next; e!=l; e=e->next) {
6665 + least = list_entry (e, struct ip_vs_dest, n_list);
6666 + if (least->weight > 0) {
6667 + lac = (atomic_read(&least->activeconns) << 5)
6668 + + atomic_read(&least->inactconns);
6675 + * Find the destination with the least load.
6678 + for (e=e->next; e!=l; e=e->next) {
6679 + dest = list_entry(e, struct ip_vs_dest, n_list);
6680 + if (dest->weight == 0)
6682 + dac = (atomic_read(&dest->activeconns) << 5)
6683 + + atomic_read(&dest->inactconns);
6690 + IP_VS_DBG(6, "LC: server %d.%d.%d.%d:%d activeconns %d inactconns %d\n",
6691 + NIPQUAD(least->addr), ntohs(least->port),
6692 + atomic_read(&least->activeconns),
6693 + atomic_read(&least->inactconns));
6699 +static struct ip_vs_scheduler ip_vs_lc_scheduler = {
6702 + ATOMIC_INIT(0), /* refcnt */
6703 + ip_vs_lc_init_svc, /* service initializer */
6704 + ip_vs_lc_done_svc, /* service done */
6705 + ip_vs_lc_update_svc, /* service updater */
6706 + ip_vs_lc_schedule, /* select a server from the destination list */
6710 +__initfunc(int ip_vs_lc_init(void))
6712 + IP_VS_INFO("Initializing LC scheduling\n");
6713 + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
6714 + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
6721 +int init_module(void)
6723 + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
6725 + /* module initialization by 'request_module' */
6726 + if(register_ip_vs_scheduler(&ip_vs_lc_scheduler) != 0)
6729 + IP_VS_INFO("LC scheduling module loaded.\n");
6734 +void cleanup_module(void)
6736 + /* module cleanup by 'release_module' */
6737 + if(unregister_ip_vs_scheduler(&ip_vs_lc_scheduler) != 0)
6738 + IP_VS_INFO("cannot remove LC scheduling module\n");
6740 + IP_VS_INFO("LC scheduling module unloaded.\n");
6743 +#endif /* MODULE */
6744 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_rr.c linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_rr.c
6745 --- linux-2.2.19/net/ipv4/ip_vs_rr.c Thu Jan 1 08:00:00 1970
6746 +++ linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_rr.c Fri Nov 24 10:04:12 2000
6749 + * IPVS: Round-Robin Scheduling module
6753 + * Authors: Wensong Zhang <wensong@iinchina.net>
6754 + * Peter Kese <peter.kese@ijs.si>
6756 + * This program is free software; you can redistribute it and/or
6757 + * modify it under the terms of the GNU General Public License
6758 + * as published by the Free Software Foundation; either version
6759 + * 2 of the License, or (at your option) any later version.
6762 + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
6763 + * Julian Anastasov : fixed the NULL pointer access bug in debugging
6764 + * Wensong Zhang : changed some cosmetic things for debugging
6765 + * Wensong Zhang : changed for the d-linked destination list
6766 + * Wensong Zhang : added the ip_vs_rr_update_svc
6767 + * Wensong Zhang : added any dest with weight=0 is quiesced
6771 +#include <linux/config.h>
6772 +#include <linux/module.h>
6774 +#include <linux/kmod.h>
6776 +#include <linux/types.h>
6777 +#include <linux/kernel.h>
6778 +#include <linux/errno.h>
6779 +#include <net/ip_masq.h>
6780 +#ifdef CONFIG_IP_MASQUERADE_MOD
6781 +#include <net/ip_masq_mod.h>
6783 +#include <linux/ip_fw.h>
6784 +#include <net/ip_vs.h>
6787 +static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
6789 + svc->sched_data = &svc->destinations;
6790 + MOD_INC_USE_COUNT;
6795 +static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
6797 + MOD_DEC_USE_COUNT;
6802 +static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
6804 + svc->sched_data = &svc->destinations;
6810 + * Round-Robin Scheduling
6812 +static struct ip_vs_dest* ip_vs_rr_schedule(struct ip_vs_service *svc,
6813 + struct iphdr *iph)
6815 + register struct list_head *p, *q;
6816 + struct ip_vs_dest *dest;
6818 + IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
6820 + p = (struct list_head *)svc->sched_data;
6824 + if (q == &svc->destinations) {
6828 + dest = list_entry(q, struct ip_vs_dest, n_list);
6829 + if (dest->weight > 0)
6837 + svc->sched_data = q;
6838 + IP_VS_DBG(6, "RR: server %d.%d.%d.%d:%d "
6839 + "activeconns %d refcnt %d weight %d\n",
6840 + NIPQUAD(dest->addr), ntohs(dest->port),
6841 + atomic_read(&dest->activeconns),
6842 + atomic_read(&dest->refcnt), dest->weight);
6848 +static struct ip_vs_scheduler ip_vs_rr_scheduler = {
6851 + ATOMIC_INIT(0), /* refcnt */
6852 + ip_vs_rr_init_svc, /* service initializer */
6853 + ip_vs_rr_done_svc, /* service done */
6854 + ip_vs_rr_update_svc, /* service updater */
6855 + ip_vs_rr_schedule, /* select a server from the destination list */
6859 +__initfunc(int ip_vs_rr_init(void))
6861 + IP_VS_INFO("Initializing RR scheduling\n");
6862 + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
6863 + return register_ip_vs_scheduler(&ip_vs_rr_scheduler) ;
6870 +int init_module(void)
6872 + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
6874 + /* module initialization by 'request_module' */
6875 + if(register_ip_vs_scheduler(&ip_vs_rr_scheduler) != 0)
6878 + IP_VS_INFO("RR scheduling module loaded.\n");
6883 +void cleanup_module(void)
6885 + /* module cleanup by 'release_module' */
6886 + if(unregister_ip_vs_scheduler(&ip_vs_rr_scheduler) != 0)
6887 + IP_VS_INFO("cannot remove RR scheduling module\n");
6889 + IP_VS_INFO("RR scheduling module unloaded.\n");
6892 +#endif /* MODULE */
6893 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_wlc.c linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_wlc.c
6894 --- linux-2.2.19/net/ipv4/ip_vs_wlc.c Thu Jan 1 08:00:00 1970
6895 +++ linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_wlc.c Fri Nov 24 09:59:32 2000
6898 + * IPVS: Weighted Least-Connection Scheduling module
6902 + * Authors: Wensong Zhang <wensong@iinchina.net>
6903 + * Peter Kese <peter.kese@ijs.si>
6905 + * This program is free software; you can redistribute it and/or
6906 + * modify it under the terms of the GNU General Public License
6907 + * as published by the Free Software Foundation; either version
6908 + * 2 of the License, or (at your option) any later version.
6911 + * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
6912 + * Wensong Zhang : changed to use the inactconns in scheduling
6913 + * Wensong Zhang : changed some cosmetic things for debugging
6914 + * Wensong Zhang : changed for the d-linked destination list
6915 + * Wensong Zhang : added the ip_vs_wlc_update_svc
6916 + * Wensong Zhang : added any dest with weight=0 is quiesced
6920 +#include <linux/config.h>
6921 +#include <linux/module.h>
6923 +#include <linux/kmod.h>
6925 +#include <linux/types.h>
6926 +#include <linux/kernel.h>
6927 +#include <linux/errno.h>
6928 +#include <net/ip_masq.h>
6929 +#ifdef CONFIG_IP_MASQUERADE_MOD
6930 +#include <net/ip_masq_mod.h>
6932 +#include <linux/ip_fw.h>
6933 +#include <net/ip_vs.h>
6937 +ip_vs_wlc_init_svc(struct ip_vs_service *svc)
6939 + MOD_INC_USE_COUNT;
6945 +ip_vs_wlc_done_svc(struct ip_vs_service *svc)
6947 + MOD_DEC_USE_COUNT;
6953 +ip_vs_wlc_update_svc(struct ip_vs_service *svc)
6960 + * Weighted Least Connection scheduling
6962 +static struct ip_vs_dest *
6963 +ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
6965 + register struct list_head *l, *e;
6966 + struct ip_vs_dest *dest, *least;
6969 + IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
6971 + l = &svc->destinations;
6976 + * We think the overhead of processing active connections is fifty
6977 + * times than that of inactive conncetions in average. (This fifty
6978 + * times might be not accurate, we will change it later.) We use
6979 + * the following formula to estimate the overhead:
6980 + * dest->activeconns*50 + dest->inactconns
6982 + * (dest overhead) / dest->weight
6984 + * Remember -- no floats in kernel mode!!!
6985 + * The comparison of h1*w2 > h2*w1 is equivalent to that of
6987 + * if every weight is larger than zero.
6989 + * The server with weight=0 is quiesced and will not receive any
6993 + for (e=l->next; e!=l; e=e->next) {
6994 + least = list_entry(e, struct ip_vs_dest, n_list);
6995 + if (least->weight > 0) {
6996 + loh = atomic_read(&least->activeconns) * 50
6997 + + atomic_read(&least->inactconns);
7004 + * Find the destination with the least load.
7007 + for (e=e->next; e!=l; e=e->next) {
7008 + dest = list_entry(e, struct ip_vs_dest, n_list);
7009 + doh = atomic_read(&dest->activeconns) * 50
7010 + + atomic_read(&dest->inactconns);
7011 + if (loh * dest->weight > doh * least->weight) {
7017 + IP_VS_DBG(6, "WLC: server %d.%d.%d.%d:%d "
7018 + "activeconns %d refcnt %d weight %d overhead %d\n",
7019 + NIPQUAD(least->addr), ntohs(least->port),
7020 + atomic_read(&least->activeconns),
7021 + atomic_read(&least->refcnt), least->weight, loh);
7027 +static struct ip_vs_scheduler ip_vs_wlc_scheduler =
7031 + ATOMIC_INIT (0), /* refcnt */
7032 + ip_vs_wlc_init_svc, /* service initializer */
7033 + ip_vs_wlc_done_svc, /* service done */
7034 + ip_vs_wlc_update_svc, /* service updater */
7035 + ip_vs_wlc_schedule, /* select a server from the destination list */
7039 +__initfunc(int ip_vs_wlc_init (void))
7041 + IP_VS_INFO("Initializing WLC scheduling\n");
7042 + INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
7043 + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
7050 +int init_module(void)
7052 + INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
7054 + /* module initialization by 'request_module' */
7055 + if (register_ip_vs_scheduler(&ip_vs_wlc_scheduler) != 0)
7058 + IP_VS_INFO("WLC scheduling module loaded.\n");
7063 +void cleanup_module(void)
7065 + /* module cleanup by 'release_module' */
7066 + if (unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler) != 0)
7067 + IP_VS_INFO("cannot remove WLC scheduling module\n");
7069 + IP_VS_INFO("WLC scheduling module unloaded.\n");
7072 +#endif /* MODULE */
7073 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_wrr.c linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_wrr.c
7074 --- linux-2.2.19/net/ipv4/ip_vs_wrr.c Thu Jan 1 08:00:00 1970
7075 +++ linux-2.2.19-vs-1.0.7/net/ipv4/ip_vs_wrr.c Fri Nov 24 09:57:23 2000
7078 + * IPVS: Weighted Round-Robin Scheduling module
7082 + * Authors: Wensong Zhang <wensong@iinchina.net>
7084 + * This program is free software; you can redistribute it and/or
7085 + * modify it under the terms of the GNU General Public License
7086 + * as published by the Free Software Foundation; either version
7087 + * 2 of the License, or (at your option) any later version.
7090 + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
7091 + * Wensong Zhang : changed some cosmetic things for debugging
7092 + * Wensong Zhang : changed for the d-linked destination list
7093 + * Wensong Zhang : added the ip_vs_wrr_update_svc
7094 + * Julian Anastasov : return -ENOMEM instead of ENOMEM in the
7095 + * ip_vs_wrr_init_svc
7096 + * Julian Anastasov : fixed the bug of returning destination
7097 + * with weight 0 when all weights are zero
7101 +#include <linux/config.h>
7102 +#include <linux/module.h>
7104 +#include <linux/kmod.h>
7106 +#include <linux/types.h>
7107 +#include <linux/kernel.h>
7108 +#include <linux/errno.h>
7109 +#include <net/ip_masq.h>
7110 +#ifdef CONFIG_IP_MASQUERADE_MOD
7111 +#include <net/ip_masq_mod.h>
7113 +#include <linux/ip_fw.h>
7114 +#include <net/ip_vs.h>
7117 + * current destination pointer for weighted round-robin scheduling
7119 +struct ip_vs_wrr_mark {
7120 + struct list_head *cl; /* current list head */
7121 + int cw; /* current weight */
7125 +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
7128 + * Allocate the mark variable for WRR scheduling
7130 + svc->sched_data = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
7132 + if (svc->sched_data == NULL) {
7133 + IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
7136 + memset(svc->sched_data, 0, sizeof(struct ip_vs_wrr_mark));
7138 + ((struct ip_vs_wrr_mark*)svc->sched_data)->cl = &svc->destinations;
7140 + MOD_INC_USE_COUNT;
7145 +static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
7148 + * Release the mark variable
7150 + kfree_s(svc->sched_data, sizeof(struct ip_vs_wrr_mark));
7152 + MOD_DEC_USE_COUNT;
7157 +static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
7159 + ((struct ip_vs_wrr_mark*)svc->sched_data)->cl = &svc->destinations;
7165 + * Get the maximum weight of the service destinations.
7167 +int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
7169 + register struct list_head *l, *e;
7170 + struct ip_vs_dest *dest;
7173 + l = &svc->destinations;
7174 + for (e=l->next; e!=l; e=e->next) {
7175 + dest = list_entry(e, struct ip_vs_dest, n_list);
7176 + if (dest->weight > weight)
7177 + weight = dest->weight;
7185 + * Weighted Round-Robin Scheduling
7187 +static struct ip_vs_dest* ip_vs_wrr_schedule(struct ip_vs_service *svc,
7188 + struct iphdr *iph)
7190 + struct ip_vs_dest *dest;
7191 + struct ip_vs_wrr_mark *mark = svc->sched_data;
7193 + IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
7196 + * This loop will always terminate, because 0<mark->cw<max_weight,
7197 + * and at least one server has its weight equal to max_weight.
7200 + if (mark->cl == &svc->destinations) {
7201 + /* it is at the head of the destination list */
7203 + if (mark->cl == mark->cl->next)
7204 + /* no dest entry */
7207 + mark->cl = svc->destinations.next;
7209 + if (mark->cw <= 0) {
7210 + mark->cw = ip_vs_wrr_max_weight(svc);
7212 + * Still zero, which means no available servers.
7214 + if (mark->cw == 0) {
7215 + mark->cl = &svc->destinations;
7216 + IP_VS_INFO("ip_vs_wrr_schedule(): "
7217 + "no available servers\n");
7222 + else mark->cl = mark->cl->next;
7224 + if (mark->cl != &svc->destinations) {
7225 + /* not at the head of the list */
7226 + dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
7227 + if (dest->weight >= mark->cw)
7232 + IP_VS_DBG(6, "WRR: server %d.%d.%d.%d:%d "
7233 + "activeconns %d refcnt %d weight %d\n",
7234 + NIPQUAD(dest->addr), ntohs(dest->port),
7235 + atomic_read(&dest->activeconns),
7236 + atomic_read(&dest->refcnt), dest->weight);
7242 +static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
7245 + ATOMIC_INIT(0), /* refcnt */
7246 + ip_vs_wrr_init_svc, /* service initializer */
7247 + ip_vs_wrr_done_svc, /* service done */
7248 + ip_vs_wrr_update_svc, /* service updater */
7249 + ip_vs_wrr_schedule, /* select a server from the destination list */
7253 +__initfunc(int ip_vs_wrr_init(void))
7255 + IP_VS_INFO("Initializing WRR scheduling\n");
7256 + INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
7257 + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
7263 +int init_module(void)
7265 + INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
7267 + /* module initialization by 'request_module' */
7268 + if(register_ip_vs_scheduler(&ip_vs_wrr_scheduler) != 0)
7271 + IP_VS_INFO("WRR scheduling module loaded.\n");
7276 +void cleanup_module(void)
7278 + /* module cleanup by 'release_module' */
7279 + if(unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler) != 0)
7280 + IP_VS_INFO("cannot remove WRR scheduling module\n");
7282 + IP_VS_INFO("WRR scheduling module unloaded.\n");
7285 +#endif /* MODULE */
7286 diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/sysctl_net_ipv4.c linux-2.2.19-vs-1.0.7/net/ipv4/sysctl_net_ipv4.c
7287 --- linux-2.2.19/net/ipv4/sysctl_net_ipv4.c Tue Mar 27 09:33:49 2001
7288 +++ linux-2.2.19-vs-1.0.7/net/ipv4/sysctl_net_ipv4.c Tue Mar 27 09:32:21 2001
7290 struct ipv4_config ipv4_config;
7292 extern ctl_table ipv4_route_table[];
7293 +#ifdef CONFIG_IP_MASQUERADE_VS
7294 +extern ctl_table ipv4_vs_table[];
7297 #ifdef CONFIG_SYSCTL
7299 @@ -198,7 +201,10 @@
7300 {NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships",
7301 &sysctl_igmp_max_memberships, sizeof(int), 0644, NULL, &proc_dointvec},
7303 +#ifdef CONFIG_IP_MASQUERADE_VS
7304 + {NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table},
7310 #endif /* CONFIG_SYSCTL */