1diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/Documentation/Configure.help linux-2.2.19-vs-1.0.8/Documentation/Configure.help
2--- linux-2.2.19/Documentation/Configure.help Tue Mar 27 09:33:35 2001
3+++ linux-2.2.19-vs-1.0.8/Documentation/Configure.help Tue Mar 27 09:32:02 2001
4@@ -2807,6 +2807,118 @@
5 The module will be called ip_masq_markfw.o. If you want to compile
6 it as a module, say M here and read Documentation/modules.txt.
7
8+IP: masquerading virtual server support
9+CONFIG_IP_MASQUERADE_VS
10+ IP Virtual Server support will let you build a virtual server
11+ based on a cluster of two or more real servers. This option must
12+ be enabled for at least one of the clustered computers that will
13+ take care of intercepting incoming connections to a single IP
14+ address and scheduling them to real servers.
15+
16+ Three request dispatching techniques are implemented: virtual
17+ server via NAT, virtual server via tunneling, and virtual server
18+ via direct routing. The round-robin scheduling, the weighted
19+ round-robin scheduling, the weighted least-connection scheduling,
20+ the locality-based least-connection scheduling, or the
21+ locality-based least-connection with replication scheduling
22+ algorithm can be used to choose which server a connection is
23+ directed to, so that load is balanced among the servers.
24+ For more information and its administration program, please visit
25+ the following URL:
26+
27+ http://www.linuxvirtualserver.org/
28+ If you want this, say Y.
29+
30+IP virtual server debugging
31+CONFIG_IP_VS_DEBUG
32+ Say Y here if you want to get additional messages useful in
33+ debugging the IP virtual server code. You can change the debug
34+ level in /proc/sys/net/ipv4/vs/debug_level.
35+
36+IP masquerading VS table size (the Nth power of 2)
37+CONFIG_IP_MASQUERADE_VS_TAB_BITS
38+ Using a large IPVS hash table for the virtual server greatly reduces
39+ collisions in the hash table when there are hundreds of thousands
40+ of active connections.
41+
42+ Note that the table size must be a power of 2. The table size will
43+ be 2 raised to the number you enter; for example, the default is 12,
44+ so the table size is 2**12 = 4096. Don't make the number too small,
45+ or you will lose performance. You can adapt the table size to your
46+ virtual server application. As a rule of thumb, the table size
47+ should not be far less than the number of connections per second
48+ multiplied by the average time a connection stays in the table.
49+ For example, if your virtual server gets 200 connections per second
50+ and a connection stays in the masquerading table for 200 seconds on
51+ average, the table size should not be far less than 200x200; it is
52+ good to set it to 32768 (2**15).
53+
54+ Note also that each connection effectively occupies 128 bytes and
55+ each hash entry uses 8 bytes, so you can estimate how much memory
56+ your box needs.
57+
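(A worked example of the sizing rule above, using assumed numbers for
illustration rather than anything taken from the patch: at 300
connections per second with an average lifetime of 120 seconds, the
table holds about 300 x 120 = 36000 concurrent entries, so the next
power of 2 to choose is 2**16 = 65536. Estimated memory: 65536 x 8
bytes of hash entries plus 36000 x 128 bytes of connection entries,
roughly 5 MB.)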
58+IPVS: round-robin scheduling
59+CONFIG_IP_MASQUERADE_VS_RR
60+ The round-robin scheduling algorithm simply directs network
61+ connections to different real servers in a round-robin manner.
62+ If you want to compile it in kernel, say Y. If you want to compile
63+ it as a module, say M here and read Documentation/modules.txt.
64+
65+IPVS: weighted round-robin scheduling
66+CONFIG_IP_MASQUERADE_VS_WRR
67+ The weighted round-robin scheduling algorithm directs network
68+ connections to different real servers based on server weights
69+ in a round-robin manner. Servers with higher weights receive
70+ new connections before those with lower weights; servers with
71+ higher weights get more connections than those with lower
72+ weights, and servers with equal weights get an equal share.
73+ If you want to compile it in kernel, say Y. If you want to compile
74+ it as a module, say M here and read Documentation/modules.txt.
75+
76+IPVS: least-connection scheduling
77+CONFIG_IP_MASQUERADE_VS_LC
78+ The least-connection scheduling algorithm directs network
79+ connections to the server with the least number of active
80+ connections.
81+ If you want to compile it in kernel, say Y. If you want to compile
82+ it as a module, say M here and read Documentation/modules.txt.
83+
84+IPVS: weighted least-connection scheduling
85+CONFIG_IP_MASQUERADE_VS_WLC
86+ The weighted least-connection scheduling algorithm directs network
87+ connections to the server with the least active connections
88+ normalized by the server weight.
89+ If you want to compile it in kernel, say Y. If you want to compile
90+ it as a module, say M here and read Documentation/modules.txt.
91+
92+IPVS: locality-based least-connection scheduling
93+CONFIG_IP_MASQUERADE_VS_LBLC
94+ The locality-based least-connection scheduling algorithm is for
95+ destination IP load balancing. It is usually used in cache clusters.
96+ This algorithm usually directs packets destined for an IP address
97+ to its server if the server is alive and not overloaded. If the
98+ server is overloaded (its active connection count is larger than
99+ its weight) and there is a server at half of its load, then the
100+ weighted least-connection server is allocated to this IP address.
101+ If you want to compile it in kernel, say Y. If you want to compile
102+ it as a module, say M here and read Documentation/modules.txt.
103+
104+IPVS: locality-based least-connection with replication scheduling
105+CONFIG_IP_MASQUERADE_VS_LBLCR
106+ The locality-based least-connection with replication scheduling
107+ algorithm is also for destination IP load balancing. It is
108+ usually used in cache clusters. It differs from LBLC scheduling
109+ as follows: the load balancer maintains mappings from a target
110+ to a set of server nodes that can serve the target. Requests for
111+ a target are assigned to the least-connection node in the target's
112+ server set. If all the nodes in the server set are overloaded,
113+ it picks the least-connection node in the cluster and adds it
114+ to the server set for the target. If the server set has not been
115+ modified for the specified time, the most loaded node is removed
116+ from the server set, in order to avoid a high degree of replication.
117+ If you want to compile it in kernel, say Y. If you want to compile
118+ it as a module, say M here and read Documentation/modules.txt.
119+
120 IP: aliasing support
121 CONFIG_IP_ALIAS
122 Sometimes it is useful to give several IP addresses to a single
123diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/linux/ip_masq.h linux-2.2.19-vs-1.0.8/include/linux/ip_masq.h
124--- linux-2.2.19/include/linux/ip_masq.h Sat Oct 23 17:02:32 1999
125+++ linux-2.2.19-vs-1.0.8/include/linux/ip_masq.h Tue Dec 12 19:17:27 2000
126@@ -103,6 +103,27 @@
127
128 #define IP_MASQ_MFW_SCHED 0x01
129
130+/*
131+ * Virtual server stuff
132+ */
133+struct ip_vs_user {
134+ /* virtual service options */
135+ u_int16_t protocol;
136+ u_int32_t vaddr; /* virtual address */
137+ u_int16_t vport;
138+ u_int32_t vfwmark; /* firewall mark of virtual */
139+ unsigned vs_flags; /* virtual service flags */
140+ unsigned timeout; /* persistent timeout in ticks */
141+ u_int32_t netmask; /* persistent netmask */
142+
143+ /* destination specific options */
144+ u_int32_t daddr; /* real destination address */
145+ u_int16_t dport;
146+ unsigned masq_flags; /* destination flags */
147+ int weight; /* destination weight */
148+};
149+
150+
151 #define IP_FW_MASQCTL_MAX 256
152 #define IP_MASQ_TNAME_MAX 32
153
154@@ -115,6 +136,7 @@
155 struct ip_autofw_user autofw_user;
156 struct ip_mfw_user mfw_user;
157 struct ip_masq_user user;
158+ struct ip_vs_user vs_user;
159 unsigned char m_raw[IP_FW_MASQCTL_MAX];
160 } u;
161 };
162@@ -124,7 +146,9 @@
163 #define IP_MASQ_TARGET_CORE 1
164 #define IP_MASQ_TARGET_MOD 2 /* masq_mod is selected by "name" */
165 #define IP_MASQ_TARGET_USER 3
166-#define IP_MASQ_TARGET_LAST 4
167+#define IP_MASQ_TARGET_VS 4
168+#define IP_MASQ_TARGET_LAST 5
169+
170
171 #define IP_MASQ_CMD_NONE 0 /* just peek */
172 #define IP_MASQ_CMD_INSERT 1
173@@ -136,5 +160,9 @@
174 #define IP_MASQ_CMD_LIST 7 /* actually fake: done via /proc */
175 #define IP_MASQ_CMD_ENABLE 8
176 #define IP_MASQ_CMD_DISABLE 9
177+#define IP_MASQ_CMD_ADD_DEST 10 /* for adding dest in IPVS */
178+#define IP_MASQ_CMD_DEL_DEST 11 /* for deleting dest in IPVS */
179+#define IP_MASQ_CMD_SET_DEST 12 /* for setting dest in IPVS */
180
181 #endif /* _LINUX_IP_MASQ_H */
182+
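A hedged user-space sketch of how the new vs_user member and the
IP_MASQ_CMD_ADD_DEST command are expected to be driven (the helper
name, addresses and weight are assumptions for illustration; only the
structure, target and command names come from this header, and
IP_FW_MASQ_CTL is the 2.2 masq control socket option from
linux/ip_fw.h):

    #include <string.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <linux/ip_fw.h>
    #include <linux/ip_masq.h>

    /* add real server 192.168.1.2:80 to virtual service 10.0.0.1:80 */
    static int ipvs_add_dest(int sockfd)
    {
            struct ip_masq_ctl ctl;

            memset(&ctl, 0, sizeof(ctl));
            ctl.m_target = IP_MASQ_TARGET_VS;    /* dispatch to IPVS */
            ctl.m_cmd    = IP_MASQ_CMD_ADD_DEST;
            ctl.u.vs_user.protocol = IPPROTO_TCP;
            ctl.u.vs_user.vaddr    = inet_addr("10.0.0.1");
            ctl.u.vs_user.vport    = htons(80);
            ctl.u.vs_user.daddr    = inet_addr("192.168.1.2");
            ctl.u.vs_user.dport    = htons(80);
            ctl.u.vs_user.weight   = 1;
            return setsockopt(sockfd, IPPROTO_IP, IP_FW_MASQ_CTL,
                              &ctl, sizeof(ctl));
    }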
183diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/linux/sysctl.h linux-2.2.19-vs-1.0.8/include/linux/sysctl.h
184--- linux-2.2.19/include/linux/sysctl.h Tue Mar 27 09:33:48 2001
185+++ linux-2.2.19-vs-1.0.8/include/linux/sysctl.h Tue Mar 27 09:32:20 2001
186@@ -196,6 +196,7 @@
187 NET_IPV4_NEIGH=17,
188 NET_IPV4_ROUTE=18,
189 NET_IPV4_FIB_HASH=19,
190+ NET_IPV4_VS=20,
191
192 NET_IPV4_TCP_TIMESTAMPS=33,
193 NET_IPV4_TCP_WINDOW_SCALING=34,
194@@ -275,6 +276,32 @@
195 NET_IPV4_CONF_LOG_MARTIANS=11,
196 NET_IPV4_CONF_HIDDEN=12,
197 NET_IPV4_CONF_ARPFILTER=13
198+};
199+
200+/* /proc/sys/net/ipv4/vs */
201+
202+enum
203+{
204+ NET_IPV4_VS_AMEMTHRESH=1,
205+ NET_IPV4_VS_AMDROPRATE=2,
206+ NET_IPV4_VS_DROP_ENTRY=3,
207+ NET_IPV4_VS_DROP_PACKET=4,
208+ NET_IPV4_VS_SECURE_TCP=5,
209+ NET_IPV4_VS_TO_ES=6,
210+ NET_IPV4_VS_TO_SS=7,
211+ NET_IPV4_VS_TO_SR=8,
212+ NET_IPV4_VS_TO_FW=9,
213+ NET_IPV4_VS_TO_TW=10,
214+ NET_IPV4_VS_TO_CL=11,
215+ NET_IPV4_VS_TO_CW=12,
216+ NET_IPV4_VS_TO_LA=13,
217+ NET_IPV4_VS_TO_LI=14,
218+ NET_IPV4_VS_TO_SA=15,
219+ NET_IPV4_VS_TO_UDP=16,
220+ NET_IPV4_VS_TO_ICMP=17,
221+ NET_IPV4_VS_DEBUG_LEVEL=18,
222+ NET_IPV4_VS_LBLC_EXPIRE=19,
223+ NET_IPV4_VS_LBLCR_EXPIRE=20,
224 };
225
226 /* /proc/sys/net/ipv6 */
227diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip.h linux-2.2.19-vs-1.0.8/include/net/ip.h
228--- linux-2.2.19/include/net/ip.h Tue Mar 27 09:33:48 2001
229+++ linux-2.2.19-vs-1.0.8/include/net/ip.h Tue Mar 27 17:48:23 2001
230@@ -47,6 +47,9 @@
231 #define IPSKB_MASQUERADED 1
232 #define IPSKB_TRANSLATED 2
233 #define IPSKB_FORWARDED 4
234+#ifdef CONFIG_IP_MASQUERADE_VS
235+#define IPSKB_REDIRECTED 8
236+#endif
237 };
238
239 struct ipcm_cookie
240diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip_masq.h linux-2.2.19-vs-1.0.8/include/net/ip_masq.h
241--- linux-2.2.19/include/net/ip_masq.h Tue Mar 27 09:33:48 2001
242+++ linux-2.2.19-vs-1.0.8/include/net/ip_masq.h Wed Apr 18 16:17:59 2001
243@@ -12,8 +12,15 @@
244 #include <linux/ip.h>
245 #include <linux/skbuff.h>
246 #include <linux/list.h>
247+#ifdef CONFIG_SYSCTL
248+#include <linux/sysctl.h>
249+#endif
250 #endif /* __KERNEL__ */
251
252+#ifdef CONFIG_IP_MASQUERADE_VS
253+struct ip_vs_dest;
254+#endif
255+
256 /*
257 * This define affects the number of ports that can be handled
258 * by each of the protocol helper modules.
259@@ -66,10 +73,6 @@
260 #define IP_MASQ_MOD_CTL 0x00
261 #define IP_MASQ_USER_CTL 0x01
262
263-#ifdef __KERNEL__
264-
265-#define IP_MASQ_TAB_SIZE 256
266-
267 #define IP_MASQ_F_NO_DADDR 0x0001 /* no daddr yet */
268 #define IP_MASQ_F_NO_DPORT 0x0002 /* no dport set yet */
269 #define IP_MASQ_F_NO_SADDR 0x0004 /* no sport set yet */
270@@ -86,6 +89,22 @@
271 #define IP_MASQ_F_USER 0x2000 /* from uspace */
272 #define IP_MASQ_F_SIMPLE_HASH 0x8000 /* prevent s+d and m+d hashing */
273
274+#ifdef CONFIG_IP_MASQUERADE_VS
275+#define IP_MASQ_F_VS 0x00010000 /* virtual server related */
276+#define IP_MASQ_F_VS_NO_OUTPUT 0x00020000 /* output packets avoid masq */
277+#define IP_MASQ_F_VS_INACTIVE 0x00040000 /* not established */
278+#define IP_MASQ_F_VS_FWD_MASK 0x00700000 /* mask for the fwd method */
279+#define IP_MASQ_F_VS_LOCALNODE 0x00100000 /* local node destination */
280+#define IP_MASQ_F_VS_TUNNEL 0x00200000 /* packets will be tunneled */
281+#define IP_MASQ_F_VS_DROUTE 0x00400000 /* direct routing */
282+ /* masquerading otherwise */
283+#define IP_MASQ_VS_FWD(ms) (ms->flags & IP_MASQ_F_VS_FWD_MASK)
284+#endif /* CONFIG_IP_MASQUERADE_VS */
285+
286+#ifdef __KERNEL__
287+
288+#define IP_MASQ_TAB_SIZE 256
289+
290 /*
291 * Delta seq. info structure
292 * Each MASQ struct has 2 (output AND input seq. changes).
293@@ -114,9 +133,13 @@
294 struct ip_masq *control; /* Master control connection */
295 atomic_t n_control; /* Number of "controlled" masqs */
296 unsigned flags; /* status flags */
297- unsigned timeout; /* timeout */
298+ unsigned long timeout; /* timeout */
299 unsigned state; /* state info */
300 struct ip_masq_timeout_table *timeout_table;
301+#ifdef CONFIG_IP_MASQUERADE_VS
302+ struct ip_vs_dest *dest; /* real server */
303+ atomic_t in_pkts; /* incoming packet counter */
304+#endif /* CONFIG_IP_MASQUERADE_VS */
305 };
306
307 /*
308@@ -179,7 +202,7 @@
309 extern struct list_head ip_masq_d_table[IP_MASQ_TAB_SIZE];
310 extern const char * ip_masq_state_name(int state);
311 extern struct ip_masq_hook *ip_masq_user_hook;
312-extern u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope);
313+extern int ip_masq_select_addr(struct sk_buff *skb,__u32 *maddr);
314 /*
315 *
316 * IP_MASQ_APP: IP application masquerading definitions
317@@ -354,6 +377,10 @@
318 static const char *strProt[] = {"UDP","TCP","ICMP"};
319 int msproto = masq_proto_num(proto);
320
321+#ifdef CONFIG_IP_MASQUERADE_VS
322+ if (proto == IPPROTO_IP)
323+ return "IP ";
324+#endif /* CONFIG_IP_MASQUERADE_VS */
325 if (msproto<0||msproto>2) {
326 sprintf(buf, "IP_%d", proto);
327 return buf;
328@@ -372,6 +399,9 @@
329 IP_MASQ_S_CLOSE_WAIT,
330 IP_MASQ_S_LAST_ACK,
331 IP_MASQ_S_LISTEN,
332+#ifdef CONFIG_IP_MASQUERADE_VS
333+ IP_MASQ_S_SYNACK,
334+#endif
335 IP_MASQ_S_UDP,
336 IP_MASQ_S_ICMP,
337 IP_MASQ_S_LAST
338@@ -395,8 +425,33 @@
339
340 if (!mstim)
341 return;
342+ ms->timeout_table = NULL;
343 atomic_dec(&mstim->refcnt);
344 }
345+
346+#ifdef CONFIG_IP_MASQUERADE_VS
347+
348+extern struct ip_masq_timeout_table masq_timeout_table_dos;
349+extern void ip_masq_secure_tcp_set(int on);
350+
351+/*
352+ * This is a simple mechanism to drop packets when
353+ * we are overloaded. Just set ip_masq_drop_rate to 'n' and
354+ * we start dropping 1/n of the packets.
355+ */
356+
357+extern int ip_masq_drop_rate;
358+extern int ip_masq_drop_counter;
359+
360+static __inline__ int ip_masq_todrop(void)
361+{
362+ if (!ip_masq_drop_rate) return 0;
363+ if (--ip_masq_drop_counter > 0) return 0;
364+ ip_masq_drop_counter = ip_masq_drop_rate;
365+ return 1;
366+}
367+
368+#endif /* CONFIG_IP_MASQUERADE_VS */
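A hedged sketch of the intended call site for this drop mechanism
(assumed placement in the packet-input path; freeing the skb is left
to the caller in this sketch):

    if (ip_masq_todrop()) {
            kfree_skb(skb);         /* shed 1/n of incoming packets */
            return 0;
    }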
369
370 #endif /* __KERNEL__ */
371
372diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip_vs.h linux-2.2.19-vs-1.0.8/include/net/ip_vs.h
373--- linux-2.2.19/include/net/ip_vs.h Thu Jan 1 08:00:00 1970
374+++ linux-2.2.19-vs-1.0.8/include/net/ip_vs.h Tue Apr 24 18:07:00 2001
375@@ -0,0 +1,392 @@
376+/*
377+ * IP virtual server
378+ * data structure and functionality definitions
379+ */
380+
381+#include <linux/config.h>
382+
383+#ifndef _IP_VS_H
384+#define _IP_VS_H
385+
386+#define IP_VS_VERSION_CODE 0x010008
387+#define NVERSION(version) \
388+ (version >> 16) & 0xFF, \
389+ (version >> 8) & 0xFF, \
390+ version & 0xFF
391+
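An illustrative use of the version macros (an assumed call, showing
how NVERSION unpacks IP_VS_VERSION_CODE into major.minor.patch):

    printk(KERN_INFO "IPVS version %d.%d.%d\n",
           NVERSION(IP_VS_VERSION_CODE));   /* prints 1.0.8 */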
392+/*
393+ * Virtual Service Flags
394+ */
395+#define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */
396+#define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */
397+
398+/*
399+ * Destination Server Flags
400+ */
401+#define IP_VS_DEST_F_AVAILABLE 0x0001 /* Available tag */
402+
403+/*
404+ * The default IP_VS_TEMPLATE_TIMEOUT is a little larger than the
405+ * average connection time plus MASQUERADE_EXPIRE_TCP_FIN (2*60*HZ),
406+ * because the template won't be released until its controlled masq
407+ * entries have expired.
408+ * If IP_VS_TEMPLATE_TIMEOUT is too small, the template will expire
409+ * soon and be set to expire again and again, which adds overhead. If
410+ * it is too large, the same client will always visit the same server,
411+ * which will make dynamic load imbalance worse.
412+ */
413+#define IP_VS_TEMPLATE_TIMEOUT 6*60*HZ
414+
415+#ifdef __KERNEL__
416+
417+extern int ip_vs_forwarding_related_icmp(struct sk_buff *skb);
418+
419+#ifdef CONFIG_IP_VS_DEBUG
420+extern int ip_vs_get_debug_level(void);
421+#define IP_VS_DBG(level, msg...) \
422+ do { \
423+ if (level <= ip_vs_get_debug_level()) \
424+ printk(KERN_DEBUG "IPVS: " ## msg); \
425+ } while (0)
426+#else /* NO DEBUGGING at ALL */
427+#define IP_VS_DBG(level, msg...) do {} while (0)
428+#endif
429+
430+#define IP_VS_ERR(msg...) printk(KERN_ERR "IPVS: " ## msg )
431+#define IP_VS_INFO(msg...) printk(KERN_INFO "IPVS: " ## msg )
432+#define IP_VS_WARNING(msg...) \
433+ printk(KERN_WARNING "IPVS: " ## msg)
434+
435+#ifdef CONFIG_IP_VS_DEBUG
436+#define EnterFunction(level) \
437+ do { \
438+ if (level <= ip_vs_get_debug_level()) \
439+ printk(KERN_DEBUG "Enter: %s, %s line %i\n", \
440+ __FUNCTION__, __FILE__, __LINE__); \
441+ } while (0)
442+#define LeaveFunction(level) \
443+ do { \
444+ if (level <= ip_vs_get_debug_level()) \
445+ printk(KERN_DEBUG "Leave: %s, %s line %i\n", \
446+ __FUNCTION__, __FILE__, __LINE__); \
447+ } while (0)
448+#else
449+#define EnterFunction(level) do {} while (0)
450+#define LeaveFunction(level) do {} while (0)
451+#endif
452+
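A hedged sketch of how these logging macros are meant to be used in
the IPVS code (function, level and message are illustrative only):

    static void example_path(void)
    {
            EnterFunction(6);       /* logged when debug_level >= 6 */
            IP_VS_DBG(5, "no destination available\n");
            LeaveFunction(6);
    }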
453+
454+/*
455+ * IPVS statistics object
456+ */
457+struct ip_vs_stats
458+{
459+ spinlock_t lock; /* spin lock */
460+ __u32 conns; /* connections scheduled */
461+ __u32 inpkts; /* incoming packets */
462+ __u32 outpkts; /* outgoing packets */
463+ __u64 inbytes; /* incoming bytes */
464+ __u64 outbytes; /* outgoing bytes */
465+};
466+
467+
468+/*
469+ * The real server destination forwarding entry
470+ * with ip address, port
471+ */
472+struct ip_vs_dest {
473+ struct list_head n_list; /* for the dests in the service */
474+ struct list_head d_list; /* for table with all the dests */
475+
476+ __u32 addr; /* IP address of real server */
477+ __u16 port; /* port number of the service */
478+ unsigned flags; /* dest status flags */
479+ unsigned masq_flags; /* flags to copy to masq */
480+ atomic_t activeconns; /* active connections */
481+ atomic_t inactconns; /* inactive connections */
482+ atomic_t refcnt; /* reference counter */
483+ int weight; /* server weight */
484+ struct ip_vs_stats stats; /* statistics */
485+
486+ /* for virtual service */
487+ struct ip_vs_service *svc; /* service that it belongs to */
488+ __u16 protocol; /* which protocol (TCP/UDP) */
489+ __u32 vaddr; /* IP address for virtual service */
490+ __u16 vport; /* port number for the service */
491+ __u32 vfwmark; /* firewall mark of the service */
492+};
493+
494+
495+/*
496+ * The scheduler object
497+ */
498+struct ip_vs_scheduler {
499+ struct list_head n_list; /* d-linked list head */
500+ char *name; /* scheduler name */
501+ atomic_t refcnt; /* reference counter */
502+
503+ /* scheduler initializing service */
504+ int (*init_service)(struct ip_vs_service *svc);
505+ /* scheduling service finish */
506+ int (*done_service)(struct ip_vs_service *svc);
507+ /* scheduler updating service */
508+ int (*update_service)(struct ip_vs_service *svc);
509+
510+ /* selecting a server from the given service */
511+ struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc,
512+ struct iphdr *iph);
513+};
514+
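To make the callback interface concrete, here is a hedged minimal
scheduler sketch built against this struct (the "first available"
policy and every name in it are invented for illustration; the real
schedulers such as ip_vs_rr.c follow the same shape):

    static struct ip_vs_dest *
    ip_vs_first_schedule(struct ip_vs_service *svc, struct iphdr *iph)
    {
            if (list_empty(&svc->destinations))
                    return NULL;            /* no real server bound */
            return list_entry(svc->destinations.next,
                              struct ip_vs_dest, n_list);
    }

    static struct ip_vs_scheduler ip_vs_first_scheduler = {
            {0},                    /* n_list, linked at registration */
            "first",                /* name */
            ATOMIC_INIT(0),         /* refcnt */
            NULL,                   /* init_service */
            NULL,                   /* done_service */
            NULL,                   /* update_service */
            ip_vs_first_schedule,   /* schedule */
    };

    /* module init: register_ip_vs_scheduler(&ip_vs_first_scheduler); */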
515+
516+/*
517+ * The information about the virtual service offered to the net
518+ * and the forwarding entries
519+ */
520+struct ip_vs_service {
521+ struct list_head s_list; /* for normal service table */
522+ struct list_head f_list; /* for fwmark-based service table */
523+ atomic_t refcnt; /* reference counter */
524+
525+ __u16 protocol; /* which protocol (TCP/UDP) */
526+ __u32 addr; /* IP address for virtual service */
527+ __u16 port; /* port number for the service */
528+ __u32 fwmark; /* firewall mark of the service */
529+ unsigned flags; /* service status flags */
530+ unsigned timeout; /* persistent timeout in ticks */
531+ __u32 netmask; /* grouping granularity */
532+ struct list_head destinations; /* real server d-linked list */
533+ struct ip_vs_stats stats; /* statistics for the service */
534+
535+ /* for scheduling */
536+ struct ip_vs_scheduler *scheduler; /* bound scheduler object */
537+ void *sched_data; /* scheduler application data */
538+};
539+
540+
541+/*
542+ * IP Virtual Server masq entry hash table
543+ */
544+#define IP_VS_TAB_BITS CONFIG_IP_MASQUERADE_VS_TAB_BITS
545+#define IP_VS_TAB_SIZE (1 << IP_VS_TAB_BITS)
546+#define IP_VS_TAB_MASK (IP_VS_TAB_SIZE - 1)
547+extern struct list_head *ip_vs_table;
548+
549+/*
550+ * Hash and unhash functions
551+ */
552+extern int ip_vs_hash(struct ip_masq *ms);
553+extern int ip_vs_unhash(struct ip_masq *ms);
554+
555+/*
556+ * Registering/unregistering scheduler functions
557+ */
558+extern int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
559+extern int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
560+
561+/*
562+ * Lookup functions for the hash table (caller must lock table)
563+ */
564+extern struct ip_masq * __ip_vs_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port);
565+extern struct ip_masq * __ip_vs_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port);
566+
567+/*
568+ * Creating a masquerading entry for IPVS
569+ */
570+extern struct ip_masq * ip_masq_new_vs(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned flags);
571+
572+/*
573+ * IPVS data and functions
574+ */
575+extern rwlock_t __ip_vs_lock;
576+
577+extern void ip_vs_set_state(struct ip_masq *ms, int new_state);
578+extern void ip_vs_bind_masq(struct ip_masq *ms, struct ip_vs_dest *dest);
579+extern void ip_vs_unbind_masq(struct ip_masq *ms);
580+
581+extern int ip_vs_ctl(int optname, struct ip_masq_ctl *mctl, int optlen);
582+extern struct ip_vs_service *
583+ip_vs_lookup_service(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport);
584+extern struct ip_vs_service * ip_vs_lookup_svc_fwm(__u32 fwmark);
585+extern struct ip_vs_dest *
586+__ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport);
587+extern struct ip_vs_dest *ip_vs_lookup_dest(struct ip_vs_service *svc,
588+ __u32 daddr, __u16 dport);
589+extern struct ip_masq * ip_vs_schedule(struct ip_vs_service *svc,
590+ struct iphdr *iph);
591+extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb);
592+extern int ip_vs_tunnel_xmit(struct sk_buff *skb, __u32 daddr);
593+extern int ip_vs_dr_xmit(struct sk_buff *skb, __u32 daddr);
594+
595+/*
596+ * init function
597+ */
598+extern int ip_vs_init(void);
599+
600+/*
601+ * init function prototypes for scheduling modules
602+ * these functions are called when they are built into the kernel
603+ */
604+extern int ip_vs_rr_init(void);
605+extern int ip_vs_wrr_init(void);
606+extern int ip_vs_lc_init(void);
607+extern int ip_vs_wlc_init(void);
608+extern int ip_vs_lblc_init(void);
609+extern int ip_vs_lblcr_init(void);
610+
611+
612+/*
613+ * Slow timer functions for IPVS
614+ */
615+extern void add_sltimer(struct timer_list * timer);
616+extern int del_sltimer(struct timer_list * timer);
617+extern void mod_sltimer(struct timer_list *timer, unsigned long expires);
618+
619+
620+/*
621+ * IP Virtual Server statistics
622+ */
623+extern struct ip_vs_stats ip_vs_stats;
624+
625+extern __inline__ void
626+ip_vs_in_stats(struct ip_masq *ms, struct sk_buff *skb)
627+{
628+ struct ip_vs_dest *dest = ms->dest;
629+ if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
630+ spin_lock(&dest->stats.lock);
631+ dest->stats.inpkts++;
632+ dest->stats.inbytes += skb->len;
633+ spin_unlock(&dest->stats.lock);
634+
635+ spin_lock(&dest->svc->stats.lock);
636+ dest->svc->stats.inpkts++;
637+ dest->svc->stats.inbytes += skb->len;
638+ spin_unlock(&dest->svc->stats.lock);
639+
640+ spin_lock(&ip_vs_stats.lock);
641+ ip_vs_stats.inpkts++;
642+ ip_vs_stats.inbytes += skb->len;
643+ spin_unlock(&ip_vs_stats.lock);
644+ }
645+}
646+
647+
648+extern __inline__ void
649+ip_vs_out_stats(struct ip_masq *ms, struct sk_buff *skb)
650+{
651+ struct ip_vs_dest *dest = ms->dest;
652+ if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
653+ spin_lock(&dest->stats.lock);
654+ dest->stats.outpkts++;
655+ dest->stats.outbytes += skb->len;
656+ spin_unlock(&dest->stats.lock);
657+
658+ spin_lock(&dest->svc->stats.lock);
659+ dest->svc->stats.outpkts++;
660+ dest->svc->stats.outbytes += skb->len;
661+ spin_unlock(&dest->svc->stats.lock);
662+
663+ spin_lock(&ip_vs_stats.lock);
664+ ip_vs_stats.outpkts++;
665+ ip_vs_stats.outbytes += skb->len;
666+ spin_unlock(&ip_vs_stats.lock);
667+ }
668+}
669+
670+
671+extern __inline__ void
672+ip_vs_conn_stats(struct ip_masq *ms, struct ip_vs_service *svc)
673+{
674+ spin_lock(&ms->dest->stats.lock);
675+ ms->dest->stats.conns++;
676+ spin_unlock(&ms->dest->stats.lock);
677+
678+ spin_lock(&svc->stats.lock);
679+ svc->stats.conns++;
680+ spin_unlock(&svc->stats.lock);
681+
682+ spin_lock(&ip_vs_stats.lock);
683+ ip_vs_stats.conns++;
684+ spin_unlock(&ip_vs_stats.lock);
685+}
686+
687+
688+/*
689+ * ip_vs_fwd_tag returns the forwarding tag of the masq
690+ */
691+extern __inline__ char ip_vs_fwd_tag(struct ip_masq *ms)
692+{
693+ char fwd = 'M';
694+
695+ switch (IP_MASQ_VS_FWD(ms)) {
696+ case IP_MASQ_F_VS_LOCALNODE: fwd = 'L'; break;
697+ case IP_MASQ_F_VS_TUNNEL: fwd = 'T'; break;
698+ case IP_MASQ_F_VS_DROUTE: fwd = 'R'; break;
699+ }
700+ return fwd;
701+}
702+
703+
704+extern __inline__ char * ip_vs_fwd_name(unsigned masq_flags)
705+{
706+ char *fwd;
707+
708+ switch (masq_flags & IP_MASQ_F_VS_FWD_MASK) {
709+ case IP_MASQ_F_VS_LOCALNODE:
710+ fwd = "Local";
711+ break;
712+ case IP_MASQ_F_VS_TUNNEL:
713+ fwd = "Tunnel";
714+ break;
715+ case IP_MASQ_F_VS_DROUTE:
716+ fwd = "Route";
717+ break;
718+ default:
719+ fwd = "Masq";
720+ }
721+ return fwd;
722+}
723+
724+
725+/*
726+ * ip_vs_forward forwards the packet through tunneling, direct
727+ * routing or local node (passing to the upper layer).
728+ * Return values mean:
729+ * 0 skb must be passed to the upper layer
730+ * -1 skb must be released
731+ * -2 skb has been released
732+ */
733+extern __inline__ int ip_vs_forward(struct sk_buff *skb, struct ip_masq *ms)
734+{
735+ int ret = -1;
736+
737+ atomic_inc(&ms->in_pkts);
738+
739+ switch (IP_MASQ_VS_FWD(ms)) {
740+ case IP_MASQ_F_VS_TUNNEL:
741+ if (ip_vs_tunnel_xmit(skb, ms->saddr) == 0) {
742+ IP_VS_DBG(10, "tunneling failed.\n");
743+ } else {
744+ IP_VS_DBG(10, "tunneling succeeded.\n");
745+ }
746+ ret = -2;
747+ break;
748+
749+ case IP_MASQ_F_VS_DROUTE:
750+ if (ip_vs_dr_xmit(skb, ms->saddr) == 0) {
751+ IP_VS_DBG(10, "direct routing failed.\n");
752+ } else {
753+ IP_VS_DBG(10, "direct routing succeeded.\n");
754+ }
755+ ret = -2;
756+ break;
757+
758+ case IP_MASQ_F_VS_LOCALNODE:
759+ ret = 0;
760+ }
761+
762+ return ret;
763+}
764+
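A hedged sketch of the expected call pattern in the demasquerading
path (an assumed caller; the return value is propagated upwards so
ip_input() can tell whether the skb has already been consumed):

    if (IP_MASQ_VS_FWD(ms) != 0) {
            int ret = ip_vs_forward(skb, ms);   /* 0, -1 or -2 */
            __ip_masq_put(ms);
            return ret;     /* -2: skb already sent via tunnel/DR */
    }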
765+#endif /* __KERNEL__ */
766+
767+#endif /* _IP_VS_H */
768diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/Config.in linux-2.2.19-vs-1.0.8/net/ipv4/Config.in
769--- linux-2.2.19/net/ipv4/Config.in Sat Dec 16 23:10:12 2000
770+++ linux-2.2.19-vs-1.0.8/net/ipv4/Config.in Tue Dec 12 18:35:06 2000
771@@ -51,6 +51,17 @@
772 tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW
773 tristate 'IP: ip fwmark masq-forwarding support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_MFW
774 fi
775+ bool 'IP: masquerading virtual server support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_VS
776+ if [ "$CONFIG_IP_MASQUERADE_VS" = "y" ]; then
777+ bool ' IP virtual server debugging' CONFIG_IP_VS_DEBUG
778+ int ' IP masquerading VS table size (the Nth power of 2)' CONFIG_IP_MASQUERADE_VS_TAB_BITS 12
779+ tristate ' IPVS: round-robin scheduling' CONFIG_IP_MASQUERADE_VS_RR
780+ tristate ' IPVS: weighted round-robin scheduling' CONFIG_IP_MASQUERADE_VS_WRR
781+ tristate ' IPVS: least-connection scheduling' CONFIG_IP_MASQUERADE_VS_LC
782+ tristate ' IPVS: weighted least-connection scheduling' CONFIG_IP_MASQUERADE_VS_WLC
783+ tristate ' IPVS: locality-based least-connection scheduling' CONFIG_IP_MASQUERADE_VS_LBLC
784+ tristate ' IPVS: locality-based least-connection with replication scheduling' CONFIG_IP_MASQUERADE_VS_LBLCR
785+ fi
786 fi
787 fi
788 fi
789diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/Makefile linux-2.2.19-vs-1.0.8/net/ipv4/Makefile
790--- linux-2.2.19/net/ipv4/Makefile Tue Jan 5 07:31:34 1999
791+++ linux-2.2.19-vs-1.0.8/net/ipv4/Makefile Sat Dec 2 22:32:10 2000
792@@ -91,6 +91,58 @@
793
794 endif
795
796+ifeq ($(CONFIG_IP_MASQUERADE_VS),y)
797+ IPV4X_OBJS += ip_vs.o
798+
799+ ifeq ($(CONFIG_IP_MASQUERADE_VS_RR),y)
800+ IPV4_OBJS += ip_vs_rr.o
801+ else
802+ ifeq ($(CONFIG_IP_MASQUERADE_VS_RR),m)
803+ M_OBJS += ip_vs_rr.o
804+ endif
805+ endif
806+
807+ ifeq ($(CONFIG_IP_MASQUERADE_VS_WRR),y)
808+ IPV4_OBJS += ip_vs_wrr.o
809+ else
810+ ifeq ($(CONFIG_IP_MASQUERADE_VS_WRR),m)
811+ M_OBJS += ip_vs_wrr.o
812+ endif
813+ endif
814+
815+ ifeq ($(CONFIG_IP_MASQUERADE_VS_LC),y)
816+ IPV4_OBJS += ip_vs_lc.o
817+ else
818+ ifeq ($(CONFIG_IP_MASQUERADE_VS_LC),m)
819+ M_OBJS += ip_vs_lc.o
820+ endif
821+ endif
822+
823+ ifeq ($(CONFIG_IP_MASQUERADE_VS_WLC),y)
824+ IPV4_OBJS += ip_vs_wlc.o
825+ else
826+ ifeq ($(CONFIG_IP_MASQUERADE_VS_WLC),m)
827+ M_OBJS += ip_vs_wlc.o
828+ endif
829+ endif
830+
831+ ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLC),y)
832+ IPV4_OBJS += ip_vs_lblc.o
833+ else
834+ ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLC),m)
835+ M_OBJS += ip_vs_lblc.o
836+ endif
837+ endif
838+
839+ ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLCR),y)
840+ IPV4_OBJS += ip_vs_lblcr.o
841+ else
842+ ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLCR),m)
843+ M_OBJS += ip_vs_lblcr.o
844+ endif
845+ endif
846+endif
847+
848 M_OBJS += ip_masq_user.o
849 M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o ip_masq_quake.o
850 M_OBJS += ip_masq_vdolive.o ip_masq_cuseeme.o
851diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_forward.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_forward.c
852--- linux-2.2.19/net/ipv4/ip_forward.c Fri Jan 7 09:45:02 2000
853+++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_forward.c Fri Feb 2 15:38:28 2001
854@@ -41,6 +41,9 @@
855 #include <linux/ip_fw.h>
856 #ifdef CONFIG_IP_MASQUERADE
857 #include <net/ip_masq.h>
858+#ifdef CONFIG_IP_MASQUERADE_VS
859+#include <net/ip_vs.h>
860+#endif
861 #endif
862 #include <net/checksum.h>
863 #include <linux/route.h>
864@@ -103,6 +106,14 @@
865 }
866 #endif
867
868+#ifdef CONFIG_IP_MASQUERADE_VS
869+ if (iph->protocol == IPPROTO_ICMP &&
870+ !(IPCB(skb)->flags&IPSKB_MASQUERADED)) {
871+ /* Related ICMP packet for IPVS ? */
872+ fw_res = ip_vs_forwarding_related_icmp(skb);
873+ if (fw_res > 0) return ip_local_deliver(skb);
874+ }
875+#endif
876
877 #ifdef CONFIG_IP_TRANSPARENT_PROXY
878 if (ip_chksock(skb))
879diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_input.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_input.c
880--- linux-2.2.19/net/ipv4/ip_input.c Tue Mar 27 09:33:49 2001
881+++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_input.c Tue Mar 27 09:32:21 2001
882@@ -250,6 +250,15 @@
883 */
884 {
885 int ret;
886+
887+#ifdef CONFIG_IP_MASQUERADE_VS
888+ if((IPCB(skb)->flags&IPSKB_REDIRECTED)) {
889+ printk(KERN_DEBUG "ip_input(): ipvs recursion detected. Check ipvs configuration\n");
890+ kfree_skb(skb);
891+ return 0;
892+ }
893+#endif
894+
895 /*
896 * Some masq modules can re-inject packets if
897 * bad configured.
898@@ -262,6 +271,12 @@
899 }
900
901 ret = ip_fw_demasquerade(&skb);
902+#ifdef CONFIG_IP_MASQUERADE_VS
903+ if (ret == -2) {
904+ /* skb has already been released */
905+ return 0;
906+ }
907+#endif
908 if (ret < 0) {
909 kfree_skb(skb);
910 return 0;
911diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_masq.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_masq.c
912--- linux-2.2.19/net/ipv4/ip_masq.c Tue Mar 27 09:33:49 2001
913+++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_masq.c Wed Apr 18 19:58:48 2001
914@@ -50,7 +50,12 @@
915 * Kai Bankett : do not toss other IP protos in proto_doff()
916 * Dan Kegel : pointed correct NAT behavior for UDP streams
917 * Julian Anastasov : use daddr and dport as hash keys
918- *
919+ * Wensong Zhang : Added virtual server support
920+ * Peter Kese : added masq TCP state handling for input-only
921+ * Julian Anastasov : step to mSR after SYN in INPUT_ONLY table
922+ * Julian Anastasov : fixed huge expire bug for IPVS after bad checksum
923+ * Wensong Zhang : added server status checking for IPVS
924+ *
925 */
926
927 #include <linux/config.h>
928@@ -85,6 +90,10 @@
929 #include <linux/ip_fw.h>
930 #include <linux/ip_masq.h>
931
932+#ifdef CONFIG_IP_MASQUERADE_VS
933+#include <net/ip_vs.h>
934+#endif /* CONFIG_IP_MASQUERADE_VS */
935+
936 int sysctl_ip_masq_debug = 0;
937 int sysctl_ip_masq_udp_dloose = 0;
938
939@@ -98,6 +107,21 @@
940
941 struct ip_masq_hook *ip_masq_user_hook = NULL;
942
943+#ifdef CONFIG_IP_MASQUERADE_VS
944+/*
945+ * Use different state/timeout tables
946+ */
947+#ifndef IP_MASQ_MANY_STATE_TABLES
948+#define IP_MASQ_MANY_STATE_TABLES
949+#endif
950+
951+int ip_masq_drop_rate = 0;
952+int ip_masq_drop_counter = 0;
953+
954+#endif
955+
956+#ifndef CONFIG_IP_MASQUERADE_VS
957+
958 /*
959 * Timeout table[state]
960 */
961@@ -106,38 +130,104 @@
962 ATOMIC_INIT(0), /* refcnt */
963 0, /* scale */
964 {
965- 30*60*HZ, /* IP_MASQ_S_NONE, */
966- 15*60*HZ, /* IP_MASQ_S_ESTABLISHED, */
967- 2*60*HZ, /* IP_MASQ_S_SYN_SENT, */
968- 1*60*HZ, /* IP_MASQ_S_SYN_RECV, */
969- 2*60*HZ, /* IP_MASQ_S_FIN_WAIT, */
970- 2*60*HZ, /* IP_MASQ_S_TIME_WAIT, */
971- 10*HZ, /* IP_MASQ_S_CLOSE, */
972- 60*HZ, /* IP_MASQ_S_CLOSE_WAIT, */
973- 30*HZ, /* IP_MASQ_S_LAST_ACK, */
974- 2*60*HZ, /* IP_MASQ_S_LISTEN, */
975- 5*60*HZ, /* IP_MASQ_S_UDP, */
976- 1*60*HZ, /* IP_MASQ_S_ICMP, */
977- 2*HZ,/* IP_MASQ_S_LAST */
978+ [IP_MASQ_S_NONE] = 30*60*HZ,
979+ [IP_MASQ_S_ESTABLISHED] = 15*60*HZ,
980+ [IP_MASQ_S_SYN_SENT] = 2*60*HZ,
981+ [IP_MASQ_S_SYN_RECV] = 1*60*HZ,
982+ [IP_MASQ_S_FIN_WAIT] = 2*60*HZ,
983+ [IP_MASQ_S_TIME_WAIT] = 2*60*HZ,
984+ [IP_MASQ_S_CLOSE] = 10*HZ,
985+ [IP_MASQ_S_CLOSE_WAIT] = 60*HZ,
986+ [IP_MASQ_S_LAST_ACK] = 30*HZ,
987+ [IP_MASQ_S_LISTEN] = 2*60*HZ,
988+ [IP_MASQ_S_UDP] = 5*60*HZ,
989+ [IP_MASQ_S_ICMP] = 1*60*HZ,
990+ [IP_MASQ_S_LAST] = 2*HZ,
991 }, /* timeout */
992 };
993
994+#else /* CONFIG_IP_MASQUERADE_VS */
995+
996+/*
997+ * Timeout table[state]
998+ */
999+/* static int masq_timeout_table[IP_MASQ_S_LAST+1] = { */
1000+static struct ip_masq_timeout_table masq_timeout_table = {
1001+ ATOMIC_INIT(0), /* refcnt */
1002+ 0, /* scale */
1003+ {
1004+ [IP_MASQ_S_NONE] = 30*60*HZ,
1005+ [IP_MASQ_S_ESTABLISHED] = 15*60*HZ,
1006+ [IP_MASQ_S_SYN_SENT] = 2*60*HZ,
1007+ [IP_MASQ_S_SYN_RECV] = 1*60*HZ,
1008+ [IP_MASQ_S_FIN_WAIT] = 2*60*HZ,
1009+ [IP_MASQ_S_TIME_WAIT] = 2*60*HZ,
1010+ [IP_MASQ_S_CLOSE] = 10*HZ,
1011+ [IP_MASQ_S_CLOSE_WAIT] = 60*HZ,
1012+ [IP_MASQ_S_LAST_ACK] = 30*HZ,
1013+ [IP_MASQ_S_LISTEN] = 2*60*HZ,
1014+ [IP_MASQ_S_SYNACK] = 120*HZ,
1015+ [IP_MASQ_S_UDP] = 5*60*HZ,
1016+ [IP_MASQ_S_ICMP] = 1*60*HZ,
1017+ [IP_MASQ_S_LAST] = 2*HZ,
1018+ }, /* timeout */
1019+};
1020+
1021+
1022+struct ip_masq_timeout_table masq_timeout_table_dos = {
1023+ ATOMIC_INIT(0), /* refcnt */
1024+ 0, /* scale */
1025+ {
1026+ [IP_MASQ_S_NONE] = 15*60*HZ,
1027+ [IP_MASQ_S_ESTABLISHED] = 8*60*HZ,
1028+ [IP_MASQ_S_SYN_SENT] = 60*HZ,
1029+ [IP_MASQ_S_SYN_RECV] = 10*HZ,
1030+ [IP_MASQ_S_FIN_WAIT] = 60*HZ,
1031+ [IP_MASQ_S_TIME_WAIT] = 60*HZ,
1032+ [IP_MASQ_S_CLOSE] = 10*HZ,
1033+ [IP_MASQ_S_CLOSE_WAIT] = 60*HZ,
1034+ [IP_MASQ_S_LAST_ACK] = 30*HZ,
1035+ [IP_MASQ_S_LISTEN] = 2*60*HZ,
1036+ [IP_MASQ_S_SYNACK] = 100*HZ,
1037+ [IP_MASQ_S_UDP] = 3*60*HZ,
1038+ [IP_MASQ_S_ICMP] = 1*60*HZ,
1039+ [IP_MASQ_S_LAST] = 2*HZ,
1040+ }, /* timeout */
1041+};
1042+
1043+/*
1044+ * Timeout table to use for the VS entries
1045+ * If NULL we use the default table (masq_timeout_table).
1046+ * Under flood attack we switch to masq_timeout_table_dos
1047+ */
1048+
1049+struct ip_masq_timeout_table *ip_vs_timeout_table = &masq_timeout_table;
1050+
1051+#endif /* CONFIG_IP_MASQUERADE_VS */
1052+
1053+#ifdef CONFIG_IP_MASQUERADE_VS
1054+#define MASQUERADE_EXPIRE_RETRY(ms) (ms->timeout_table? ms->timeout_table->timeout[IP_MASQ_S_TIME_WAIT] : masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT])
1055+#else
1056 #define MASQUERADE_EXPIRE_RETRY masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT]
1057+#endif
1058
1059 static const char * state_name_table[IP_MASQ_S_LAST+1] = {
1060- "NONE", /* IP_MASQ_S_NONE, */
1061- "ESTABLISHED", /* IP_MASQ_S_ESTABLISHED, */
1062- "SYN_SENT", /* IP_MASQ_S_SYN_SENT, */
1063- "SYN_RECV", /* IP_MASQ_S_SYN_RECV, */
1064- "FIN_WAIT", /* IP_MASQ_S_FIN_WAIT, */
1065- "TIME_WAIT", /* IP_MASQ_S_TIME_WAIT, */
1066- "CLOSE", /* IP_MASQ_S_CLOSE, */
1067- "CLOSE_WAIT", /* IP_MASQ_S_CLOSE_WAIT, */
1068- "LAST_ACK", /* IP_MASQ_S_LAST_ACK, */
1069- "LISTEN", /* IP_MASQ_S_LISTEN, */
1070- "UDP", /* IP_MASQ_S_UDP, */
1071- "ICMP", /* IP_MASQ_S_ICMP, */
1072- "BUG!", /* IP_MASQ_S_LAST */
1073+ [IP_MASQ_S_NONE] = "NONE",
1074+ [IP_MASQ_S_ESTABLISHED] = "ESTABLISHED",
1075+ [IP_MASQ_S_SYN_SENT] = "SYN_SENT",
1076+ [IP_MASQ_S_SYN_RECV] = "SYN_RECV",
1077+ [IP_MASQ_S_FIN_WAIT] = "FIN_WAIT",
1078+ [IP_MASQ_S_TIME_WAIT] = "TIME_WAIT",
1079+ [IP_MASQ_S_CLOSE] = "CLOSE",
1080+ [IP_MASQ_S_CLOSE_WAIT] = "CLOSE_WAIT",
1081+ [IP_MASQ_S_LAST_ACK] = "LAST_ACK",
1082+ [IP_MASQ_S_LISTEN] = "LISTEN",
1083+#ifdef CONFIG_IP_MASQUERADE_VS
1084+ [IP_MASQ_S_SYNACK] = "SYNACK",
1085+#endif
1086+ [IP_MASQ_S_UDP] = "UDP",
1087+ [IP_MASQ_S_ICMP] = "ICMP",
1088+ [IP_MASQ_S_LAST] = "BUG!",
1089 };
1090
1091 #define mNO IP_MASQ_S_NONE
1092@@ -150,6 +240,9 @@
1093 #define mCW IP_MASQ_S_CLOSE_WAIT
1094 #define mLA IP_MASQ_S_LAST_ACK
1095 #define mLI IP_MASQ_S_LISTEN
1096+#ifdef CONFIG_IP_MASQUERADE_VS
1097+#define mSA IP_MASQ_S_SYNACK
1098+#endif
1099
1100 struct masq_tcp_states_t {
1101 int next_state[IP_MASQ_S_LAST]; /* should be _LAST_TCP */
1102@@ -159,46 +252,111 @@
1103 {
1104 if (state >= IP_MASQ_S_LAST)
1105 return "ERR!";
1106- return state_name_table[state];
1107+ return state_name_table[state] ? state_name_table[state] : "?";
1108 }
1109
1110+#ifndef CONFIG_IP_MASQUERADE_VS
1111+
1112 struct masq_tcp_states_t masq_tcp_states [] = {
1113 /* INPUT */
1114 /* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */
1115 /*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR }},
1116 /*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI }},
1117-/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI }},
1118+/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI }},
1119 /*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI }},
1120
1121 /* OUTPUT */
1122 /* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */
1123-/*syn*/ {{mSS, mES, mSS, mES, mSS, mSS, mSS, mSS, mSS, mLI }},
1124+/*syn*/ {{mSS, mES, mSS, mSR, mSS, mSS, mSS, mSS, mSS, mLI }},
1125 /*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI }},
1126-/*ack*/ {{mES, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mES }},
1127+/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES }},
1128 /*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL }},
1129 };
1130
1131-static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int output)
1132+#else /* CONFIG_IP_MASQUERADE_VS */
1133+
1134+struct masq_tcp_states_t masq_tcp_states [] = {
1135+/* INPUT */
1136+/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1137+/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSR }},
1138+/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI, mTW }},
1139+/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }},
1140+/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mSR }},
1141+
1142+/* OUTPUT */
1143+/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1144+/*syn*/ {{mSS, mES, mSS, mSR, mSS, mSS, mSS, mSS, mSS, mLI, mSR }},
1145+/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI, mTW }},
1146+/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES, mES }},
1147+/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL, mCL }},
1148+
1149+/* INPUT-ONLY */
1150+/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1151+/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSR }},
1152+/*fin*/ {{mCL, mFW, mSS, mTW, mFW, mTW, mCL, mCW, mLA, mLI, mTW }},
1153+/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }},
1154+/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }},
1155+};
1156+
1157+struct masq_tcp_states_t masq_tcp_states_dos [] = {
1158+/* INPUT */
1159+/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1160+/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSA }},
1161+/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI, mSA }},
1162+/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI, mSA }},
1163+/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }},
1164+
1165+/* OUTPUT */
1166+/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1167+/*syn*/ {{mSS, mES, mSS, mSA, mSS, mSS, mSS, mSS, mSS, mLI, mSA }},
1168+/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI, mTW }},
1169+/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES, mES }},
1170+/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL, mCL }},
1171+
1172+/* INPUT-ONLY */
1173+/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */
1174+/*syn*/ {{mSA, mES, mES, mSR, mSA, mSA, mSA, mSA, mSA, mSA, mSA }},
1175+/*fin*/ {{mCL, mFW, mSS, mTW, mFW, mTW, mCL, mCW, mLA, mLI, mTW }},
1176+/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }},
1177+/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }},
1178+};
1179+
1180+struct masq_tcp_states_t *ip_vs_state_table = masq_tcp_states;
1181+
1182+void ip_masq_secure_tcp_set(int on)
1183+{
1184+ if (on) {
1185+ ip_vs_state_table = masq_tcp_states_dos;
1186+ ip_vs_timeout_table = &masq_timeout_table_dos;
1187+ } else {
1188+ ip_vs_state_table = masq_tcp_states;
1189+ ip_vs_timeout_table = &masq_timeout_table;
1190+ }
1191+}
1192+
1193+#endif /* CONFIG_IP_MASQUERADE_VS */
1194+
1195+#define MASQ_STATE_INPUT 0
1196+#define MASQ_STATE_OUTPUT 4
1197+#define MASQ_STATE_INPUT_ONLY 8
1198+
1199+static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int state_off)
1200 {
1201 /*
1202- * [0-3]: input states, [4-7]: output.
1203+ * [0-3]: input states, [4-7]: output, [8-11] input only states.
1204 */
1205- if (output)
1206- output=4;
1207-
1208 if (th->rst)
1209- return output+3;
1210+ return state_off+3;
1211 if (th->syn)
1212- return output+0;
1213+ return state_off+0;
1214 if (th->fin)
1215- return output+1;
1216+ return state_off+1;
1217 if (th->ack)
1218- return output+2;
1219+ return state_off+2;
1220 return -1;
1221 }
1222
1223
1224-
1225 static int masq_set_state_timeout(struct ip_masq *ms, int state)
1226 {
1227 struct ip_masq_timeout_table *mstim = ms->timeout_table;
1228@@ -221,18 +379,34 @@
1229 return state;
1230 }
1231
1232-static int masq_tcp_state(struct ip_masq *ms, int output, struct tcphdr *th)
1233+static int masq_tcp_state(struct ip_masq *ms, int state_off, struct tcphdr *th)
1234 {
1235 int state_idx;
1236 int new_state = IP_MASQ_S_CLOSE;
1237
1238- if ((state_idx = masq_tcp_state_idx(th, output)) < 0) {
1239+#ifdef CONFIG_IP_MASQUERADE_VS
1240+ /*
1241+ * Update state offset to INPUT_ONLY if necessary
1242+ * or delete NO_OUTPUT flag if output packet detected
1243+ */
1244+ if (ms->flags & IP_MASQ_F_VS_NO_OUTPUT) {
1245+ if (state_off == MASQ_STATE_OUTPUT)
1246+ ms->flags &= ~IP_MASQ_F_VS_NO_OUTPUT;
1247+ else state_off = MASQ_STATE_INPUT_ONLY;
1248+ }
1249+#endif
1250+
1251+ if ((state_idx = masq_tcp_state_idx(th, state_off)) < 0) {
1252 IP_MASQ_DEBUG(1, "masq_state_idx(%d)=%d!!!\n",
1253- output, state_idx);
1254+ state_off, state_idx);
1255 goto tcp_state_out;
1256 }
1257
1258+#ifdef CONFIG_IP_MASQUERADE_VS
1259+ new_state = ip_vs_state_table[state_idx].next_state[ms->state];
1260+#else
1261 new_state = masq_tcp_states[state_idx].next_state[ms->state];
1262+#endif
1263
1264 tcp_state_out:
1265 if (new_state!=ms->state)
1266@@ -247,6 +421,15 @@
1267 ntohl(ms->daddr), ntohs(ms->dport),
1268 ip_masq_state_name(ms->state),
1269 ip_masq_state_name(new_state));
1270+
1271+#ifdef CONFIG_IP_MASQUERADE_VS
1272+ /*
1273+ * Increase/Decrease the active connection counter and
1274+ * set ms->flags according to ms->state and new_state.
1275+ */
1276+ ip_vs_set_state(ms, new_state);
1277+#endif /* CONFIG_IP_MASQUERADE_VS */
1278+
1279 return masq_set_state_timeout(ms, new_state);
1280 }
1281
1282@@ -254,7 +437,7 @@
1283 /*
1284 * Handle state transitions
1285 */
1286-static int masq_set_state(struct ip_masq *ms, int output, struct iphdr *iph, void *tp)
1287+static int masq_set_state(struct ip_masq *ms, int state_off, struct iphdr *iph, void *tp)
1288 {
1289 switch (iph->protocol) {
1290 case IPPROTO_ICMP:
1291@@ -262,7 +445,7 @@
1292 case IPPROTO_UDP:
1293 return masq_set_state_timeout(ms, IP_MASQ_S_UDP);
1294 case IPPROTO_TCP:
1295- return masq_tcp_state(ms, output, tp);
1296+ return masq_tcp_state(ms, state_off, tp);
1297 }
1298 return -1;
1299 }
1300@@ -361,6 +544,9 @@
1301
1302 EXPORT_SYMBOL(ip_masq_get_debug_level);
1303 EXPORT_SYMBOL(ip_masq_new);
1304+#ifdef CONFIG_IP_MASQUERADE_VS
1305+EXPORT_SYMBOL(ip_masq_new_vs);
1306+#endif /* CONFIG_IP_MASQUERADE_VS */
1307 EXPORT_SYMBOL(ip_masq_listen);
1308 EXPORT_SYMBOL(ip_masq_free_ports);
1309 EXPORT_SYMBOL(ip_masq_out_get);
1310@@ -423,9 +609,17 @@
1311 {
1312 if (tout) {
1313 ms->timer.expires = jiffies+tout;
1314+#ifdef CONFIG_IP_MASQUERADE_VS
1315+ add_sltimer(&ms->timer);
1316+#else
1317 add_timer(&ms->timer);
1318+#endif
1319 } else {
1320+#ifdef CONFIG_IP_MASQUERADE_VS
1321+ del_sltimer(&ms->timer);
1322+#else
1323 del_timer(&ms->timer);
1324+#endif
1325 }
1326 }
1327
1328@@ -741,6 +935,10 @@
1329 struct ip_masq *ms;
1330
1331 read_lock(&__ip_masq_lock);
1332+#ifdef CONFIG_IP_MASQUERADE_VS
1333+ ms = __ip_vs_out_get(protocol, s_addr, s_port, d_addr, d_port);
1334+ if (ms == NULL)
1335+#endif /* CONFIG_IP_MASQUERADE_VS */
1336 ms = __ip_masq_out_get(protocol, s_addr, s_port, d_addr, d_port);
1337 read_unlock(&__ip_masq_lock);
1338
1339@@ -754,7 +952,11 @@
1340 struct ip_masq *ms;
1341
1342 read_lock(&__ip_masq_lock);
1343- ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port);
1344+#ifdef CONFIG_IP_MASQUERADE_VS
1345+ ms = __ip_vs_in_get(protocol, s_addr, s_port, d_addr, d_port);
1346+ if (ms == NULL)
1347+#endif /* CONFIG_IP_MASQUERADE_VS */
1348+ ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port);
1349 read_unlock(&__ip_masq_lock);
1350
1351 if (ms)
1352@@ -791,7 +993,11 @@
1353 static void masq_expire(unsigned long data)
1354 {
1355 struct ip_masq *ms = (struct ip_masq *)data;
1356+#ifdef CONFIG_IP_MASQUERADE_VS
1357+ ms->timeout = MASQUERADE_EXPIRE_RETRY(ms);
1358+#else
1359 ms->timeout = MASQUERADE_EXPIRE_RETRY;
1360+#endif
1361
1362 /*
1363 * hey, I'm using it
1364@@ -826,6 +1032,15 @@
1365 if (ms->control)
1366 ip_masq_control_del(ms);
1367
1368+#ifdef CONFIG_IP_MASQUERADE_VS
1369+ if (ms->flags & IP_MASQ_F_VS) {
1370+ if (ip_vs_unhash(ms)) {
1371+ ip_vs_unbind_masq(ms);
1372+ ip_masq_unbind_app(ms);
1373+ }
1374+ }
1375+ else
1376+#endif /* CONFIG_IP_MASQUERADE_VS */
1377 if (ip_masq_unhash(ms)) {
1378 if (ms->flags&IP_MASQ_F_MPORT) {
1379 atomic_dec(&mport_count);
1380@@ -839,6 +1054,9 @@
1381 * refcnt==1 implies I'm the only one referrer
1382 */
1383 if (atomic_read(&ms->refcnt) == 1) {
1384+#ifdef IP_MASQ_MANY_STATE_TABLES
1385+ ip_masq_timeout_detach(ms);
1386+#endif
1387 kfree_s(ms,sizeof(*ms));
1388 sysctl_ip_always_defrag--;
1389 MOD_DEC_USE_COUNT;
1390@@ -1077,6 +1295,83 @@
1391 return NULL;
1392 }
1393
1394+
1395+#ifdef CONFIG_IP_MASQUERADE_VS
1396+/*
1397+ * Create a new masquerade entry for IPVS, all parameters {maddr,
1398+ * mport, saddr, sport, daddr, dport, mflags} are known. No need
1399+ * to allocate a free mport. And, hash it into the ip_vs_table.
1400+ *
1401+ * Be careful, it can be called from u-space
1402+ */
1403+
1404+struct ip_masq * ip_masq_new_vs(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags)
1405+{
1406+ struct ip_masq *ms;
1407+ static int n_fails = 0;
1408+ int prio;
1409+
1410+ prio = (mflags&IP_MASQ_F_USER) ? GFP_KERNEL : GFP_ATOMIC;
1411+
1412+ ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), prio);
1413+ if (ms == NULL) {
1414+ if (++n_fails < 5)
1415+ IP_VS_ERR("ip_masq_new_vs(proto=%s): no memory available.\n",
1416+ masq_proto_name(proto));
1417+ return NULL;
1418+ }
1419+ MOD_INC_USE_COUNT;
1420+
1421+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,2,14)
1422+ sysctl_ip_always_defrag++;
1423+#endif
1424+ memset(ms, 0, sizeof(*ms));
1425+ INIT_LIST_HEAD(&ms->s_list);
1426+ INIT_LIST_HEAD(&ms->m_list);
1427+ INIT_LIST_HEAD(&ms->d_list);
1428+ init_timer(&ms->timer);
1429+ ms->timer.data = (unsigned long)ms;
1430+ ms->timer.function = masq_expire;
1431+ ip_masq_timeout_attach(ms,ip_vs_timeout_table);
1432+ ms->protocol = proto;
1433+ ms->saddr = saddr;
1434+ ms->sport = sport;
1435+ ms->daddr = daddr;
1436+ ms->dport = dport;
1437+ ms->maddr = maddr;
1438+ ms->mport = mport;
1439+ ms->flags = mflags;
1440+ ms->app_data = NULL;
1441+ ms->control = NULL;
1442+
1443+ atomic_set(&ms->n_control,0);
1444+ atomic_set(&ms->refcnt,0);
1445+ atomic_set(&ms->in_pkts,0);
1446+
1447+ if (mflags & IP_MASQ_F_USER)
1448+ write_lock_bh(&__ip_masq_lock);
1449+ else
1450+ write_lock(&__ip_masq_lock);
1451+
1452+ /*
1453+ * Hash it in the ip_vs_table
1454+ */
1455+ ip_vs_hash(ms);
1456+
1457+ if (mflags & IP_MASQ_F_USER)
1458+ write_unlock_bh(&__ip_masq_lock);
1459+ else
1460+ write_unlock(&__ip_masq_lock);
1461+
1462+ ip_masq_bind_app(ms);
1463+ n_fails = 0;
1464+ atomic_inc(&ms->refcnt);
1465+ masq_set_state_timeout(ms, IP_MASQ_S_NONE);
1466+ return ms;
1467+}
1468+#endif /* CONFIG_IP_MASQUERADE_VS */
1469+
1470+
1471 /*
1472 * Get transport protocol data offset, check against size
1473 * return:
1474@@ -1153,25 +1448,20 @@
1475 return -1;
1476 }
1477
1478+#ifndef CONFIG_IP_MASQUERADE_VS
1479 /* Lets determine our maddr now, shall we? */
1480- if (maddr == 0) {
1481- struct rtable *rt;
1482- struct rtable *skb_rt = (struct rtable*)skb->dst;
1483- struct device *skb_dev = skb_rt->u.dst.dev;
1484-
1485- if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos)|RTO_CONN, skb_dev?skb_dev->ifindex:0)) {
1486- /* Fallback on old method */
1487- /* This really shouldn't happen... */
1488- maddr = inet_select_addr(skb_dev, skb_rt->rt_gateway, RT_SCOPE_UNIVERSE);
1489- } else {
1490- /* Route lookup succeeded */
1491- maddr = rt->rt_src;
1492- ip_rt_put(rt);
1493- }
1494+ if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1495+ return -1;
1496 }
1497+#endif
1498
1499 switch (iph->protocol) {
1500 case IPPROTO_ICMP:
1501+#ifdef CONFIG_IP_MASQUERADE_VS
1502+ if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1503+ return -1;
1504+ }
1505+#endif
1506 return(ip_fw_masq_icmp(skb_p, maddr));
1507 case IPPROTO_UDP:
1508 if (h.uh->check == 0)
1509@@ -1230,6 +1520,17 @@
1510
1511 ms = ip_masq_out_get_iph(iph);
1512 if (ms!=NULL) {
1513+#ifdef CONFIG_IP_MASQUERADE_VS
1514+ if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1515+ /*
1516+ * Drop this packet but don't
1517+ * start the timer from the beginning
1518+ */
1519+ __ip_masq_put(ms);
1520+ add_sltimer(&ms->timer);
1521+ return -1;
1522+ }
1523+#endif
1524
1525 /*
1526 * If sysctl !=0 and no pkt has been received yet
1527@@ -1280,6 +1581,33 @@
1528 ms->daddr = iph->daddr;
1529 }
1530 } else {
1531+#ifdef CONFIG_IP_MASQUERADE_VS
1532+ struct ip_vs_dest *dest;
1533+
1534+ /*
1535+ * Check if the packet is from our real service
1536+ */
1537+ read_lock(&__ip_vs_lock);
1538+ dest = __ip_vs_lookup_real_service(iph->protocol,
1539+ iph->saddr, h.portp[0]);
1540+ read_unlock(&__ip_vs_lock);
1541+ if (dest) {
1542+ /*
1543+ * Notify the real server: there is
1544+ * no existing entry if it is not RST packet
1545+ * or not TCP packet.
1546+ */
1547+ if (!h.th->rst || iph->protocol != IPPROTO_TCP)
1548+ icmp_send(skb, ICMP_DEST_UNREACH,
1549+ ICMP_PORT_UNREACH, 0);
1550+ return -1;
1551+ }
1552+
1553+ if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) {
1554+ return -1;
1555+ }
1556+#endif
1557+
1558 /*
1559 * Nope, not found, create a new entry for it
1560 */
1561@@ -1392,11 +1720,17 @@
1562 IP_MASQ_DEBUG(2, "O-routed from %08X:%04X with masq.addr %08X\n",
1563 ntohl(ms->maddr),ntohs(ms->mport),ntohl(maddr));
1564
1565- masq_set_state(ms, 1, iph, h.portp);
1566+#ifdef CONFIG_IP_MASQUERADE_VS
1567+ /* do the IPVS statistics */
1568+ if (ms->flags & IP_MASQ_F_VS)
1569+ ip_vs_out_stats(ms, skb);
1570+#endif
1571+
1572+ masq_set_state(ms, MASQ_STATE_OUTPUT, iph, h.portp);
1573 ip_masq_put(ms);
1574
1575 return 0;
1576- }
1577+}
1578
1579 /*
1580 * Restore original addresses and ports in the original IP
1581@@ -1438,6 +1772,12 @@
1582 ms = __ip_masq_out_get(iph->protocol,
1583 iph->daddr, portp[1],
1584 iph->saddr, portp[0]);
1585+#ifdef CONFIG_IP_MASQUERADE_VS
1586+ if (ms == NULL)
1587+ ms = __ip_vs_out_get(iph->protocol,
1588+ iph->daddr, portp[1],
1589+ iph->saddr, portp[0]);
1590+#endif /* CONFIG_IP_MASQUERADE_VS */
1591 read_unlock(&__ip_masq_lock);
1592 if (ms) {
1593 IP_MASQ_DEBUG(1, "Incoming frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n",
1594@@ -1459,6 +1799,12 @@
1595 ms = __ip_masq_in_get(iph->protocol,
1596 iph->daddr, portp[1],
1597 iph->saddr, portp[0]);
1598+#ifdef CONFIG_IP_MASQUERADE_VS
1599+ if (ms == NULL)
1600+ ms = __ip_vs_in_get(iph->protocol,
1601+ iph->daddr, portp[1],
1602+ iph->saddr, portp[0]);
1603+#endif /* CONFIG_IP_MASQUERADE_VS */
1604 read_unlock(&__ip_masq_lock);
1605 if (ms) {
1606 IP_MASQ_DEBUG(1, "Outgoing frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n",
1607@@ -1469,8 +1815,8 @@
1608 return 1;
1609 }
1610 return 0;
1611-
1612 }
1613+
1614 /*
1615 * Handle ICMP messages in forward direction.
1616 * Find any that might be relevant, check against existing connections,
1617@@ -1556,7 +1902,7 @@
1618 ntohs(icmp_id(icmph)),
1619 icmph->type);
1620
1621- masq_set_state(ms, 1, iph, icmph);
1622+ masq_set_state(ms, MASQ_STATE_OUTPUT, iph, icmph);
1623 ip_masq_put(ms);
1624
1625 return 1;
1626@@ -1684,11 +2030,28 @@
1627 pptr[1],
1628 ciph->saddr,
1629 pptr[0]);
1630+#ifdef CONFIG_IP_MASQUERADE_VS
1631+ if (ms == NULL) {
1632+ ms = __ip_vs_out_get(ciph->protocol,
1633+ ciph->daddr, pptr[1],
1634+ ciph->saddr, pptr[0]);
1635+ }
1636+#endif /* CONFIG_IP_MASQUERADE_VS */
1637 read_unlock(&__ip_masq_lock);
1638
1639 if (ms == NULL)
1640 return 0;
1641
1642+#ifdef CONFIG_IP_MASQUERADE_VS
1643+ if (IP_MASQ_VS_FWD(ms) != 0) {
1644+ IP_VS_INFO("shouldn't get here, because tun/dr is on the half connection\n");
1645+ }
1646+
1647+ /* do the IPVS statistics */
1648+ if (ms->flags & IP_MASQ_F_VS)
1649+ ip_vs_out_stats(ms, skb);
1650+#endif /* CONFIG_IP_MASQUERADE_VS */
1651+
1652 /* Now we do real damage to this packet...! */
1653 /* First change the source IP address, and recalc checksum */
1654 iph->saddr = ms->maddr;
1655@@ -1739,6 +2102,87 @@
1656 return skb;
1657 }
1658
1659+#ifdef CONFIG_IP_MASQUERADE_VS
1660+
1661+/*
1662+ * Check whether this ICMP packet in the FORWARD path is for
1663+ * related IPVS connection and needs to be delivered locally
1664+ */
1665+
1666+int ip_vs_forwarding_related_icmp(struct sk_buff *skb)
1667+{
1668+ struct iphdr *iph = skb->nh.iph;
1669+ struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2));
1670+ unsigned short size = ntohs(iph->tot_len) - (iph->ihl * 4);
1671+ struct iphdr *ciph; /* The ip header contained within the ICMP */
1672+ __u16 *pptr; /* port numbers from TCP/UDP contained header */
1673+ struct ip_masq *ms;
1674+ union ip_masq_tphdr h;
1675+ int doff;
1676+
1677+ /*
1678+ * PACKET_HOST only, see ip_forward
1679+ */
1680+
1681+ h.raw = (char*) iph + iph->ihl * 4;
1682+
1683+ doff = proto_doff(iph->protocol, h.raw, size);
1684+
1685+ if (doff <= 0) return 0;
1686+
1687+ IP_VS_DBG(10, "icmp fwd/rev (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n",
1688+ icmph->type, ntohs(icmp_id(icmph)),
1689+ NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
1690+
1691+ if ((icmph->type != ICMP_DEST_UNREACH) &&
1692+ (icmph->type != ICMP_SOURCE_QUENCH) &&
1693+ (icmph->type != ICMP_TIME_EXCEEDED))
1694+ return 0;
1695+
1696+ /*
1697+ * If we get here we have an ICMP error of one of the above 3 types
1698+ * Now find the contained IP header
1699+ */
1700+
1701+ ciph = (struct iphdr *) (icmph + 1);
1702+ size -= sizeof(struct icmphdr);
1703+ if (size < sizeof(struct iphdr)) return 0;
1704+
1705+ /* We are only interested in ICMPs generated from TCP or UDP packets */
1706+ if (ciph->protocol == IPPROTO_TCP) {
1707+ if (size < sizeof(struct tcphdr)) return 0;
1708+ }
1709+ else
1710+ if (ciph->protocol == IPPROTO_UDP) {
1711+ if (size < sizeof(struct udphdr)) return 0;
1712+ }
1713+ else return 0;
1714+
1715+ /* For now we don't verify that the checksum is correct */
1716+
1717+ /* This is pretty much what __ip_masq_in_get_iph() does,
1718+ except params are wrong way round */
1719+ pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
1720+
1721+ read_lock(&__ip_masq_lock);
1722+ ms = __ip_vs_in_get(ciph->protocol,
1723+ ciph->daddr,
1724+ pptr[1],
1725+ ciph->saddr,
1726+ pptr[0]);
1727+ read_unlock(&__ip_masq_lock);
1728+
1729+ if (!ms) return 0;
1730+ IP_VS_DBG(10, "Delivering locally ICMP for %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u to %u.%u.%u.%u\n",
1731+ NIPQUAD(ciph->daddr), ntohs(pptr[1]),
1732+ NIPQUAD(ciph->saddr), ntohs(pptr[0]),
1733+ NIPQUAD(ms->saddr));
1734+ __ip_masq_put(ms);
1735+
1736+ return 1;
1737+}
1738+#endif /* CONFIG_IP_MASQUERADE_VS */
1739+
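The helper above digs the quoted IP header and ports out of an ICMP error
before asking __ip_vs_in_get() whether the error belongs to a known IPVS
connection. A minimal user-space sketch of the same pointer walk (function
name and bounds checks are illustrative, not part of the patch):

#include <stdio.h>
#include <arpa/inet.h>
#include <linux/ip.h>      /* struct iphdr */
#include <linux/icmp.h>    /* struct icmphdr */

/* Sketch: find the ports of the datagram quoted inside an ICMP error.
 * buf points at the outer IP header, len is the bytes available. */
static int icmp_quoted_ports(const unsigned char *buf, int len,
                             unsigned short *sport, unsigned short *dport)
{
        const struct iphdr *iph = (const struct iphdr *)buf;
        const struct icmphdr *icmph =
                (const struct icmphdr *)(buf + iph->ihl * 4);
        const struct iphdr *ciph;   /* IP header inside the ICMP payload */
        const unsigned short *pptr; /* ports of the quoted TCP/UDP header */
        int size = ntohs(iph->tot_len) - iph->ihl * 4;

        if (len < (int)ntohs(iph->tot_len))
                return -1;          /* truncated capture */
        ciph = (const struct iphdr *)(icmph + 1);
        size -= sizeof(struct icmphdr);
        if (size < (int)(sizeof(struct iphdr) + 8))
                return -1;          /* RFC 792 quotes IP header + 8 bytes */

        pptr = (const unsigned short *)((const char *)ciph + ciph->ihl * 4);
        *sport = ntohs(pptr[0]);
        *dport = ntohs(pptr[1]);
        return 0;
}

As the code comment notes, the quoted packet runs the opposite way to the
error carrying it, which is why the lookup above hands ciph->daddr/pptr[1]
to __ip_vs_in_get() as the "source" pair.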
1740 /*
1741 * Handle ICMP messages in reverse (demasquerade) direction.
1742 * Find any that might be relevant, check against existing connections,
1743@@ -1812,7 +2256,7 @@
1744 ntohs(icmp_id(icmph)),
1745 icmph->type);
1746
1747- masq_set_state(ms, 0, iph, icmph);
1748+ masq_set_state(ms, MASQ_STATE_INPUT, iph, icmph);
1749 ip_masq_put(ms);
1750
1751 return 1;
1752@@ -1914,9 +2358,11 @@
1753 * *outgoing* so the ports are reversed (and addresses)
1754 */
1755 pptr = (__u16 *)&(((char *)ciph)[csize]);
1756+#ifndef CONFIG_IP_MASQUERADE_VS
1757 if (ntohs(pptr[0]) < PORT_MASQ_BEGIN ||
1758 ntohs(pptr[0]) > PORT_MASQ_END)
1759 return 0;
1760+#endif
1761
1762 /* Ensure the checksum is correct */
1763 if (ip_compute_csum((unsigned char *) icmph, len))
1764@@ -1927,7 +2373,6 @@
1765 return(-1);
1766 }
1767
1768-
1769 IP_MASQ_DEBUG(2, "Handling reverse ICMP for %08X:%04X -> %08X:%04X\n",
1770 ntohl(ciph->saddr), ntohs(pptr[0]),
1771 ntohl(ciph->daddr), ntohs(pptr[1]));
1772@@ -1935,6 +2380,14 @@
1773
1774 /* This is pretty much what __ip_masq_in_get_iph() does, except params are wrong way round */
1775 read_lock(&__ip_masq_lock);
1776+#ifdef CONFIG_IP_MASQUERADE_VS
1777+ ms = __ip_vs_in_get(ciph->protocol,
1778+ ciph->daddr,
1779+ pptr[1],
1780+ ciph->saddr,
1781+ pptr[0]);
1782+ if (ms == NULL)
1783+#endif /* CONFIG_IP_MASQUERADE_VS */
1784 ms = __ip_masq_in_get(ciph->protocol,
1785 ciph->daddr,
1786 pptr[1],
1787@@ -1945,10 +2398,23 @@
1788 if (ms == NULL)
1789 return 0;
1790
1791+#ifdef CONFIG_IP_MASQUERADE_VS
1792+ /* do the IPVS statistics */
1793+ if (ms->flags & IP_MASQ_F_VS)
1794+ ip_vs_in_stats(ms, skb);
1795+
1796+ if (IP_MASQ_VS_FWD(ms) != 0) {
1797+ int ret = ip_vs_forward(skb, ms);
1798+ __ip_masq_put(ms);
1799+ return ret;
1800+ }
1801+#endif /* CONFIG_IP_MASQUERADE_VS */
1802+
1803 if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) {
1804 __ip_masq_put(ms);
1805 return -1;
1806 }
1807+
1808 ciph = (struct iphdr *) (icmph + 1);
1809 pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]);
1810
1811@@ -1998,7 +2464,10 @@
1812 int csum = 0;
1813 int csum_ok = 0;
1814 __u32 maddr;
1815-
1816+#ifdef CONFIG_IP_MASQUERADE_VS
1817+ struct ip_vs_service *svc = NULL;
1818+#endif
1819+
1820 /*
1821 * Big tappo: only PACKET_HOST (nor loopback neither mcasts)
1822 * ... don't know why 1st test DOES NOT include 2nd (?)
1823@@ -2039,13 +2508,21 @@
1824 return(ip_fw_demasq_icmp(skb_p));
1825 case IPPROTO_TCP:
1826 case IPPROTO_UDP:
1827- /*
1828+ /*
1829 * Make sure packet is in the masq range
1830 * ... or some mod-ule relaxes input range
1831 * ... or there is still some `special' mport opened
1832 */
1833+#ifdef CONFIG_IP_MASQUERADE_VS
1834+ svc = ip_vs_lookup_service(skb->fwmark,
1835+ iph->protocol, maddr, h.portp[1]);
1836+ if (!svc &&
1837+ (ntohs(h.portp[1]) < PORT_MASQ_BEGIN
1838+ || ntohs(h.portp[1]) > PORT_MASQ_END)
1839+#else
1840 if ((ntohs(h.portp[1]) < PORT_MASQ_BEGIN
1841 || ntohs(h.portp[1]) > PORT_MASQ_END)
1842+#endif /* CONFIG_IP_MASQUERADE_VS */
1843 #ifdef CONFIG_IP_MASQUERADE_MOD
1844 && (ip_masq_mod_in_rule(skb, iph) != 1)
1845 #endif
1846@@ -2100,6 +2577,21 @@
1847
1848 ms = ip_masq_in_get_iph(iph);
1849
1850+#ifdef CONFIG_IP_MASQUERADE_VS
1851+ /*
1852+ * Checking the server status
1853+ */
1854+ if (ms && ms->dest && !(ms->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1855+ /*
1856+ * If the dest is not available, don't refresh the entry's
1857+ * timer; silently drop the packet.
1858+ */
1859+ add_sltimer(&ms->timer);
1860+ __ip_masq_put(ms);
1861+ return -1;
1862+ }
1863+#endif
1864+
1865 /*
1866 * Give additional modules a chance to create an entry
1867 */
1868@@ -2116,6 +2608,27 @@
1869 ip_masq_mod_in_update(skb, iph, ms);
1870 #endif
1871
1872+#ifdef CONFIG_IP_MASQUERADE_VS
1873+ if (!ms &&
1874+ (h.th->syn || (iph->protocol!=IPPROTO_TCP)) && svc) {
1875+ if (ip_masq_todrop()) {
1876+ /*
1877+ * It seems that we are very loaded.
1878+ * We have to drop this packet :(
1879+ */
1880+ return -1;
1881+ }
1882+ /*
1883+ * Let the virtual server select a real server
1884+ * for the incoming connection, and create a
1885+ * masquerading entry.
1886+ */
1887+ ms = ip_vs_schedule(svc, iph);
1888+ if (!ms)
1889+ return ip_vs_leave(svc, skb);
1890+ ip_vs_conn_stats(ms, svc);
1891+ }
1892+#endif /* CONFIG_IP_MASQUERADE_VS */
1893
1894 if (ms != NULL)
1895 {
1896@@ -2168,13 +2681,43 @@
1897
1898 }
1899 }
1900+
1901+#ifdef CONFIG_IP_MASQUERADE_VS
1902+ /* do the IPVS statistics */
1903+ if (ms->flags & IP_MASQ_F_VS)
1904+ ip_vs_in_stats(ms, skb);
1905+
1906+ if (IP_MASQ_VS_FWD(ms) != 0) {
1907+ int ret;
1908+
1909+ /*
1910+ * Sorry for setting the state of the masq entry so
1911+ * early, no matter whether the packet is forwarded
1912+ * successfully or not: ip_vs_forward may already
1913+ * have released the skb. Although this breaks the
1914+ * original semantics, it won't lead to serious
1915+ * errors. We look forward to fixing it under Rusty's
1916+ * netfilter framework, both for correctness and for
1917+ * modularization.
1918+ */
1919+ masq_set_state(ms, MASQ_STATE_INPUT, iph, h.portp);
1920+
1921+ ret = ip_vs_forward(skb, ms);
1922+ ip_masq_put(ms);
1923+ return ret;
1924+ }
1925+
1926+ IP_VS_DBG(10, "masquerading packet...\n");
1927+#endif /* CONFIG_IP_MASQUERADE_VS */
1928+
1929 if ((skb=masq_skb_cow(skb_p, &iph, &h.raw)) == NULL) {
1930 ip_masq_put(ms);
1931 return -1;
1932 }
1933+
1934 iph->daddr = ms->saddr;
1935 h.portp[1] = ms->sport;
1936-
1937+
1938 /*
1939 * Invalidate csum saving if tunnel has masq helper
1940 */
1941@@ -2231,15 +2774,28 @@
1942 h.uh->check = 0xFFFF;
1943 break;
1944 }
1945- ip_send_check(iph);
1946+ ip_send_check(iph);
1947
1948 IP_MASQ_DEBUG(2, "I-routed to %08X:%04X\n",ntohl(iph->daddr),ntohs(h.portp[1]));
1949
1950- masq_set_state (ms, 0, iph, h.portp);
1951+ masq_set_state(ms, MASQ_STATE_INPUT, iph, h.portp);
1952 ip_masq_put(ms);
1953
1954 return 1;
1955 }
1956+#ifdef CONFIG_IP_MASQUERADE_VS
1957+ if (svc) {
1958+ /*
1959+ * Drop the packet if it belongs to a virtual service but no
1960+ * entry was found or created. Also send an ICMP DEST_UNREACH
1961+ * to the client unless the packet is a TCP RST.
1962+ */
1963+ if (!h.th->rst || iph->protocol != IPPROTO_TCP) {
1964+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
1965+ }
1966+ return -1;
1967+ }
1968+#endif
1969
1970 /* sorry, all this trouble for a no-hit :) */
1971 return 0;
1972@@ -2350,7 +2906,6 @@
1973 len += sprintf(buffer+len, "%-127s\n", temp);
1974
1975 if(len >= length) {
1976-
1977 read_unlock_bh(&__ip_masq_lock);
1978 goto done;
1979 }
1980@@ -2358,9 +2913,52 @@
1981 read_unlock_bh(&__ip_masq_lock);
1982
1983 }
1984-done:
1985
1986+#ifdef CONFIG_IP_MASQUERADE_VS
1987+ for(idx = 0; idx < IP_VS_TAB_SIZE; idx++)
1988+ {
1989+ /*
1990+ * The lock is actually only needed in the next loop;
1991+ * we are called from user space, so bottom halves must be stopped.
1992+ */
1993+ read_lock_bh(&__ip_masq_lock);
1994
1995+ l = &ip_vs_table[idx];
1996+ for (e=l->next; e!=l; e=e->next) {
1997+ ms = list_entry(e, struct ip_masq, m_list);
1998+ pos += 128;
1999+ if (pos <= offset) {
2000+ len = 0;
2001+ continue;
2002+ }
2003+
2004+ /*
2005+ * We have locked the tables, no need to del/add timers
2006+ * nor cli() 8)
2007+ */
2008+
2009+ sprintf(temp,"%s %08X:%04X %08X:%04X %04X %08X %6d %6d %7lu",
2010+ masq_proto_name(ms->protocol),
2011+ ntohl(ms->saddr), ntohs(ms->sport),
2012+ ntohl(ms->daddr), ntohs(ms->dport),
2013+ ntohs(ms->mport),
2014+ ms->out_seq.init_seq,
2015+ ms->out_seq.delta,
2016+ ms->out_seq.previous_delta,
2017+ ms->timer.expires-jiffies);
2018+ len += sprintf(buffer+len, "%-127s\n", temp);
2019+
2020+ if(len >= length) {
2021+ read_unlock_bh(&__ip_masq_lock);
2022+ goto done;
2023+ }
2024+ }
2025+ read_unlock_bh(&__ip_masq_lock);
2026+
2027+ }
2028+#endif /* CONFIG_IP_MASQUERADE_VS */
2029+
2030+done:
2031 begin = len - (pos - offset);
2032 *start = buffer + begin;
2033 len -= begin;
2034@@ -2386,17 +2984,29 @@
2035 len, sizeof(struct ip_fw_masq));
2036 } else {
2037 masq = (struct ip_fw_masq *)m;
2038- if (masq->tcp_timeout)
2039+ if (masq->tcp_timeout) {
2040 masq_timeout_table.timeout[IP_MASQ_S_ESTABLISHED]
2041+#ifdef CONFIG_IP_MASQUERADE_VS
2042+ = masq_timeout_table_dos.timeout[IP_MASQ_S_ESTABLISHED]
2043+#endif
2044 = masq->tcp_timeout;
2045+ }
2046
2047- if (masq->tcp_fin_timeout)
2048+ if (masq->tcp_fin_timeout) {
2049 masq_timeout_table.timeout[IP_MASQ_S_FIN_WAIT]
2050+#ifdef CONFIG_IP_MASQUERADE_VS
2051+ = masq_timeout_table_dos.timeout[IP_MASQ_S_FIN_WAIT]
2052+#endif
2053 = masq->tcp_fin_timeout;
2054+ }
2055
2056- if (masq->udp_timeout)
2057+ if (masq->udp_timeout) {
2058 masq_timeout_table.timeout[IP_MASQ_S_UDP]
2059+#ifdef CONFIG_IP_MASQUERADE_VS
2060+ = masq_timeout_table_dos.timeout[IP_MASQ_S_UDP]
2061+#endif
2062 = masq->udp_timeout;
2063+ }
2064 ret = 0;
2065 }
2066 return ret;
2067@@ -2468,6 +3078,11 @@
2068 ret = ip_masq_mod_ctl(optname, &masq_ctl, optlen);
2069 break;
2070 #endif
2071+#ifdef CONFIG_IP_MASQUERADE_VS
2072+ case IP_MASQ_TARGET_VS:
2073+ ret = ip_vs_ctl(optname, &masq_ctl, optlen);
2074+ break;
2075+#endif
2076 }
2077
2078 /*
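With the IP_MASQ_TARGET_VS case above, user space (ipvsadm) reaches
ip_vs_ctl() through the same control path as the autofw/mfw targets. A
hedged sketch of the calling convention, assuming the 2.2-era
IP_FW_MASQ_CTL setsockopt used by ipmasqadm and the ip_masq_ctl layout
this patch extends (values are illustrative):

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ip_fw.h>        /* IP_FW_MASQ_CTL */
#include <linux/ip_masq.h>      /* struct ip_masq_ctl, IP_MASQ_TARGET_VS */

/* Sketch: hand a virtual-server control request to the kernel. */
static int vs_ctl_example(int cmd)
{
        struct ip_masq_ctl ctl;
        int sockfd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

        if (sockfd < 0)
                return -1;
        memset(&ctl, 0, sizeof(ctl));
        ctl.m_target = IP_MASQ_TARGET_VS;  /* dispatched to ip_vs_ctl() */
        ctl.m_cmd = cmd;                   /* add/edit/del service or dest */
        strncpy(ctl.m_tname, "wlc", sizeof(ctl.m_tname) - 1); /* scheduler */
        /* ... fill ctl.u.vs_user with protocol/vaddr/vport etc. ... */
        return setsockopt(sockfd, IPPROTO_IP, IP_FW_MASQ_CTL,
                          (char *)&ctl, sizeof(ctl));
}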
2079@@ -2529,12 +3144,25 @@
2080 }
2081 }
2082 #endif /* CONFIG_PROC_FS */
2083+
2084 /*
2085- * Wrapper over inet_select_addr()
2086+ * Determine maddr from skb
2087 */
2088-u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope)
2089+int ip_masq_select_addr(struct sk_buff *skb, __u32 *maddr)
2090 {
2091- return inet_select_addr(dev, dst, scope);
2092+ struct rtable *rt;
2093+ struct rtable *skb_rt = (struct rtable*)skb->dst;
2094+ struct device *skb_dev = skb_rt->u.dst.dev;
2095+ struct iphdr *iph = skb->nh.iph;
2096+
2097+ if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos)|RTO_CONN, skb_dev?skb_dev->ifindex:0)) {
2098+ return -1;
2099+ } else {
2100+ /* Route lookup succeeded */
2101+ *maddr = rt->rt_src;
2102+ ip_rt_put(rt);
2103+ return 0;
2104+ }
2105 }
2106
2107 /*
2108@@ -2587,7 +3215,7 @@
2109 (char *) IPPROTO_ICMP,
2110 ip_masq_user_info
2111 });
2112-#endif
2113+#endif /* CONFIG_PROC_FS */
2114 #ifdef CONFIG_IP_MASQUERADE_IPAUTOFW
2115 ip_autofw_init();
2116 #endif
2117@@ -2596,6 +3224,9 @@
2118 #endif
2119 #ifdef CONFIG_IP_MASQUERADE_MFW
2120 ip_mfw_init();
2121+#endif
2122+#ifdef CONFIG_IP_MASQUERADE_VS
2123+ ip_vs_init();
2124 #endif
2125 ip_masq_app_init();
2126
2127diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs.c
2128--- linux-2.2.19/net/ipv4/ip_vs.c Thu Jan 1 08:00:00 1970
2129+++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs.c Mon May 14 22:04:50 2001
2130@@ -0,0 +1,3015 @@
2131+/*
2132+ * IPVS An implementation of the IP virtual server support for the
2133+ * LINUX operating system. IPVS is now implemented as a part
2134+ * of IP masquerading code. IPVS can be used to build a
2135+ * high-performance and highly available server based on a
2136+ * cluster of servers.
2137+ *
2138+ * Version: $Id$
2139+ *
2140+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
2141+ * Peter Kese <peter.kese@ijs.si>
2142+ *
2143+ * This program is free software; you can redistribute it and/or
2144+ * modify it under the terms of the GNU General Public License
2145+ * as published by the Free Software Foundation; either version
2146+ * 2 of the License, or (at your option) any later version.
2147+ *
2148+ * Changes:
2149+ * Wensong Zhang : fixed the overflow bug in ip_vs_procinfo
2150+ * Wensong Zhang : added editing dest and service functions
2151+ * Wensong Zhang : changed the names of some functions
2152+ * Wensong Zhang : fixed the unlocking bug in ip_vs_del_dest
2153+ * Wensong Zhang : added a separate hash table for IPVS
2154+ * Wensong Zhang : added slow timer for IPVS masq entries
2155+ * Julian Anastasov : fixed the number of active connections
2156+ * Wensong Zhang : added persistent port
2157+ * Wensong Zhang : fixed the incorrect lookup in hash table
2158+ * Wensong Zhang : added server status checking
2159+ * Wensong Zhang : fixed the incorrect slow timer vector layout
2160+ * Wensong Zhang : fixed the sltimer added twice bug of mst
2161+ * Julian Anastasov : fixed the IP_MASQ_F_VS_INACTIVE cleared bug after editing dest
2162+ * Wensong Zhang : added the inactive connection counter
2163+ * Wensong Zhang : changed the body of ip_vs_schedule
2164+ * Julian Anastasov : fixed the unlocking bug in ip_vs_schedule
2165+ * Julian Anastasov : fixed the uncounting bug in creating masqs by template
2166+ * Wensong Zhang : changed some condition orders for a bit performance
2167+ * Julian Anastasov : don't touch counters in ip_vs_unbind_masq for templates
2168+ * Wensong Zhang : added the hash table for virtual services
2169+ * Wensong Zhang : changed destination lists to d-linked lists
2170+ * Wensong Zhang : changed the scheduler list to the d-linked list
2171+ * Wensong Zhang : added new persistent service handling
2172+ * Julian Anastasov : fixed the counting bug in ip_vs_unbind_masq again
2173+ * (don't touch counters for templates)
2174+ * Wensong Zhang : changed some IP_VS_ERR to IP_VS_DBG in the ip_vs_tunnel_xmit
2175+ * Wensong Zhang : added different timeout support for persistent svc
2176+ * Wensong Zhang : fixed the bug that persistent svc cannot be edited
2177+ * Julian Anastasov : removed extra read_unlock in __ip_vs_lookup_service
2178+ * Julian Anastasov : changed not to restart template timers if dest is unavailable
2179+ * Julian Anastasov : added the destination trash
2180+ * Wensong Zhang : added the update_service call in ip_vs_del_dest
2181+ * Wensong Zhang : added the ip_vs_leave function
2182+ * Lars Marowsky-Bree : added persistence granularity support
2183+ * Julian Anastasov : changed some cosmetic things for debugging
2184+ * Wensong Zhang : use vmalloc to allocate big ipvs hash table
2185+ * Wensong Zhang : changed the tunneling/direct routing methods a little
2186+ * Julian Anastasov : fixed the return bug of ip_vs_leave(-2 instead of -3)
2187+ * Roberto Nibali : fixed the undefined variable bug in the IP_VS_DBG of ip_vs_dr_xmit
2188+ * Julian Anastasov : changed ICMP_PROT_UNREACH to ICMP_PORT_UNREACH in ip_vs_leave
2189+ * Wensong Zhang : added port zero support for persistent services
2190+ * Wensong Zhang : fixed the bug that virtual ftp service blocks other services not listed in ipvs table
2191+ * Wensong Zhang : invalidate a persistent template when its dest is unavailable
2192+ * Julian Anastasov : changed two IP_VS_ERR calls to IP_VS_DBG
2193+ * Wensong Zhang : added random drop of syn entries
2194+ * Wensong Zhang : added random drop of UDP entries
2195+ * Julian Anastasov : added droprate defense against DoS attack
2196+ * Julian Anastasov : added secure_tcp defense against DoS attack
2197+ * Wensong Zhang : revisited dropentry defense against DoS attack
2198+ * Horms : added the fwmark service feature
2199+ * Wensong Zhang : changed to two service hash tables
2200+ * Julian Anastasov : corrected trash_dest lookup for both
2201+ * normal service and fwmark service
2202+ *
2203+ */
2204+
2205+#include <linux/config.h>
2206+#include <linux/module.h>
2207+#include <linux/types.h>
2208+#include <linux/kernel.h>
2209+#include <linux/errno.h>
2210+#include <linux/vmalloc.h>
2211+#include <linux/swap.h>
2212+#include <net/ip_masq.h>
2213+
2214+#include <linux/sysctl.h>
2215+#include <linux/ip_fw.h>
2216+#include <linux/ip_masq.h>
2217+#include <linux/proc_fs.h>
2218+
2219+#include <linux/inetdevice.h>
2220+#include <linux/ip.h>
2221+#include <net/icmp.h>
2222+#include <net/ip.h>
2223+#include <net/route.h>
2224+#include <net/ip_vs.h>
2225+
2226+#ifdef CONFIG_KMOD
2227+#include <linux/kmod.h>
2228+#endif
2229+
2230+EXPORT_SYMBOL(register_ip_vs_scheduler);
2231+EXPORT_SYMBOL(unregister_ip_vs_scheduler);
2232+EXPORT_SYMBOL(ip_vs_bind_masq);
2233+EXPORT_SYMBOL(ip_vs_unbind_masq);
2234+EXPORT_SYMBOL(ip_vs_lookup_dest);
2235+#ifdef CONFIG_IP_VS_DEBUG
2236+EXPORT_SYMBOL(ip_vs_get_debug_level);
2237+#endif
2238+
2239+int sysctl_ip_vs_drop_entry = 0;
2240+int sysctl_ip_vs_drop_packet = 0;
2241+int sysctl_ip_vs_secure_tcp = 0;
2242+int sysctl_ip_vs_amemthresh = 1024;
2243+int sysctl_ip_vs_am_droprate = 10;
2244+
2245+#ifdef CONFIG_IP_VS_DEBUG
2246+static int sysctl_ip_vs_debug_level = 0;
2247+
2248+int ip_vs_get_debug_level(void)
2249+{
2250+ return sysctl_ip_vs_debug_level;
2251+}
2252+#endif
2253+
2254+
2255+int ip_vs_dropentry = 0;
2256+
2257+static inline void update_defense_level(void)
2258+{
2259+ int ip_vs_amem = nr_free_pages+page_cache_size+(buffermem>>PAGE_SHIFT);
2260+ int nomem = (ip_vs_amem < sysctl_ip_vs_amemthresh);
2261+
2262+ /* drop_entry */
2263+ switch (sysctl_ip_vs_drop_entry) {
2264+ case 0:
2265+ ip_vs_dropentry = 0;
2266+ break;
2267+ case 1:
2268+ if (nomem) {
2269+ ip_vs_dropentry = 1;
2270+ sysctl_ip_vs_drop_entry = 2;
2271+ } else {
2272+ ip_vs_dropentry = 0;
2273+ }
2274+ break;
2275+ case 2:
2276+ if (nomem) {
2277+ ip_vs_dropentry = 1;
2278+ } else {
2279+ ip_vs_dropentry = 0;
2280+ sysctl_ip_vs_drop_entry = 1;
2281+ }
2282+ break;
2283+ case 3:
2284+ ip_vs_dropentry = 1;
2285+ break;
2286+ }
2287+
2288+ /* drop_packet */
2289+ switch (sysctl_ip_vs_drop_packet) {
2290+ case 0:
2291+ ip_masq_drop_rate = 0;
2292+ break;
2293+ case 1:
2294+ if (nomem) {
2295+ ip_masq_drop_rate = ip_masq_drop_counter
2296+ = sysctl_ip_vs_amemthresh /
2297+ (sysctl_ip_vs_amemthresh-ip_vs_amem);
2298+ sysctl_ip_vs_drop_packet = 2;
2299+ } else {
2300+ ip_masq_drop_rate = 0;
2301+ }
2302+ break;
2303+ case 2:
2304+ if (nomem) {
2305+ ip_masq_drop_rate = ip_masq_drop_counter
2306+ = sysctl_ip_vs_amemthresh /
2307+ (sysctl_ip_vs_amemthresh-ip_vs_amem);
2308+ } else {
2309+ ip_masq_drop_rate = 0;
2310+ sysctl_ip_vs_drop_packet = 1;
2311+ }
2312+ break;
2313+ case 3:
2314+ ip_masq_drop_rate = sysctl_ip_vs_am_droprate;
2315+ break;
2316+ }
2317+
2318+ /* secure_tcp */
2319+ switch (sysctl_ip_vs_secure_tcp) {
2320+ case 0:
2321+ ip_masq_secure_tcp_set(0);
2322+ break;
2323+ case 1:
2324+ if (nomem) {
2325+ ip_masq_secure_tcp_set(1);
2326+ sysctl_ip_vs_secure_tcp = 2;
2327+ } else {
2328+ ip_masq_secure_tcp_set(0);
2329+ }
2330+ break;
2331+ case 2:
2332+ if (nomem) {
2333+ ip_masq_secure_tcp_set(1);
2334+ } else {
2335+ ip_masq_secure_tcp_set(0);
2336+ sysctl_ip_vs_secure_tcp = 1;
2337+ }
2338+ break;
2339+ case 3:
2340+ ip_masq_secure_tcp_set(1);
2341+ break;
2342+ }
2343+}
2344+
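Each of the three sysctls above encodes the same small hysteresis
automaton: 0 = never, 1 = armed (switches itself to 2 when memory runs
low), 2 = active (falls back to 1 once memory recovers), 3 = always on.
The common pattern, pulled out as a hedged standalone sketch (names are
illustrative):

/* Generic form of the 0/1/2/3 defense-mode switch used above.
 * mode points at the sysctl cell; returns whether the defense is on. */
static int defense_update(int *mode, int nomem)
{
        switch (*mode) {
        case 0:                 /* never */
                return 0;
        case 1:                 /* armed: engage and remember it */
                if (nomem) {
                        *mode = 2;
                        return 1;
                }
                return 0;
        case 2:                 /* active: disengage when memory recovers */
                if (nomem)
                        return 1;
                *mode = 1;
                return 0;
        default:                /* 3: always on */
                return 1;
        }
}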
2345+
2346+static inline int todrop_entry(struct ip_masq *ms)
2347+{
2348+ /*
2349+ * The drop rate array needs tuning for real environments.
2350+ */
2351+ static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
2352+ static char todrop_counter[9] = {0};
2353+ int i;
2354+
2355+ if (ms->timeout+jiffies-ms->timer.expires < 60*HZ)
2356+ return 0;
2357+
2358+ i = atomic_read(&ms->in_pkts);
2359+ if (i > 8) return 0;
2360+
2361+ if (!todrop_rate[i]) return 0;
2362+ if (--todrop_counter[i] > 0) return 0;
2363+
2364+ todrop_counter[i] = todrop_rate[i];
2365+ return 1;
2366+}
2367+
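todrop_entry() never touches entries that changed state within the last 60
seconds, keeps anything that has seen more than eight input packets, and
otherwise drops an entry with i packets on every todrop_rate[i]-th
candidate scan. A runnable check of that arithmetic (a stand-alone rerun
of the same logic, illustrative only):

#include <stdio.h>

static char todrop_rate[9]    = {0, 1, 2, 3, 4, 5, 6, 7, 8};
static char todrop_counter[9] = {0};

static int todrop(int in_pkts)
{
        if (in_pkts > 8)
                return 0;               /* busy entries are kept */
        if (!todrop_rate[in_pkts])
                return 0;               /* in_pkts == 0: never dropped */
        if (--todrop_counter[in_pkts] > 0)
                return 0;
        todrop_counter[in_pkts] = todrop_rate[in_pkts];
        return 1;
}

int main(void)
{
        int i, drops = 0;

        for (i = 0; i < 100; i++)       /* 100 candidates, 4 packets each */
                drops += todrop(4);
        printf("dropped %d of 100\n", drops);   /* dropped 25 of 100 */
        return 0;
}

So under pressure, idle or barely-used entries are shed aggressively while
anything with nine or more input packets is always kept.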
2368+static inline void ip_vs_random_dropentry(void)
2369+{
2370+ int i;
2371+ struct ip_masq *ms;
2372+ struct list_head *l,*e;
2373+ struct ip_masq *mst;
2374+ void (*fn)(unsigned long);
2375+
2376+ /*
2377+ * Randomly scan 1/32 of the whole table every second
2378+ */
2379+ for (i=0; i < (IP_VS_TAB_SIZE>>5); i++) {
2380+ /*
2381+ * Lock is actually needed in this loop.
2382+ */
2383+ write_lock(&__ip_masq_lock);
2384+
2385+ l = &ip_vs_table[net_random()&IP_VS_TAB_MASK];
2386+ for (e=l->next; e!=l; e=e->next) {
2387+ ms = list_entry(e, struct ip_masq, m_list);
2388+ if (ms->dport == 0)
2389+ /* masq template */
2390+ continue;
2391+ switch(ms->state) {
2392+ case IP_MASQ_S_SYN_RECV:
2393+ case IP_MASQ_S_SYNACK:
2394+ break;
2395+
2396+ case IP_MASQ_S_ESTABLISHED:
2397+ case IP_MASQ_S_UDP:
2398+ if (todrop_entry(ms))
2399+ break;
2400+ continue;
2401+
2402+ default:
2403+ continue;
2404+ }
2405+
2406+ /*
2407+ * Drop the entry, and drop its mst if not referenced
2408+ */
2409+ write_unlock(&__ip_masq_lock);
2410+ IP_VS_DBG(4, "Drop masq\n");
2411+ mst = ms->control;
2412+ fn = (ms->timer).function;
2413+ del_sltimer(&ms->timer);
2414+ fn((unsigned long)ms);
2415+ if (mst && !atomic_read(&mst->n_control)) {
2416+ IP_VS_DBG(4, "Drop masq template\n");
2417+ del_sltimer(&mst->timer);
2418+ fn((unsigned long)mst);
2419+ }
2420+ write_lock(&__ip_masq_lock);
2421+ }
2422+ write_unlock(&__ip_masq_lock);
2423+ }
2424+}
2425+
2426+
2427+/*
2428+ * The following block implements slow timers for IPVS, most code is stolen
2429+ * from linux/kernel/sched.c
2430+ * The slow timer is used to avoid the overhead of cascading timers
2431+ * when lots of masq entries (>50,000) are present in the system.
2432+ */
2433+#define SHIFT_BITS 6
2434+#define TVN_BITS 8
2435+#define TVR_BITS 10
2436+#define TVN_SIZE (1 << TVN_BITS)
2437+#define TVR_SIZE (1 << TVR_BITS)
2438+#define TVN_MASK (TVN_SIZE - 1)
2439+#define TVR_MASK (TVR_SIZE - 1)
2440+
2441+struct sltimer_vec {
2442+ int index;
2443+ struct timer_list *vec[TVN_SIZE];
2444+};
2445+
2446+struct sltimer_vec_root {
2447+ int index;
2448+ struct timer_list *vec[TVR_SIZE];
2449+};
2450+
2451+static struct sltimer_vec sltv3 = { 0 };
2452+static struct sltimer_vec sltv2 = { 0 };
2453+static struct sltimer_vec_root sltv1 = { 0 };
2454+
2455+static struct sltimer_vec * const sltvecs[] = {
2456+ (struct sltimer_vec *)&sltv1, &sltv2, &sltv3
2457+};
2458+
2459+#define NOOF_SLTVECS (sizeof(sltvecs) / sizeof(sltvecs[0]))
2460+
2461+static unsigned long sltimer_jiffies = 0;
2462+
2463+static inline void insert_sltimer(struct timer_list *timer,
2464+ struct timer_list **vec, int idx)
2465+{
2466+ if ((timer->next = vec[idx]))
2467+ vec[idx]->prev = timer;
2468+ vec[idx] = timer;
2469+ timer->prev = (struct timer_list *)&vec[idx];
2470+}
2471+
2472+static inline void internal_add_sltimer(struct timer_list *timer)
2473+{
2474+ /*
2475+ * must be cli-ed when calling this
2476+ */
2477+ unsigned long expires = timer->expires;
2478+ unsigned long idx = (expires - sltimer_jiffies) >> SHIFT_BITS;
2479+
2480+ if (idx < TVR_SIZE) {
2481+ int i = (expires >> SHIFT_BITS) & TVR_MASK;
2482+ insert_sltimer(timer, sltv1.vec, i);
2483+ } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
2484+ int i = (expires >> (SHIFT_BITS+TVR_BITS)) & TVN_MASK;
2485+ insert_sltimer(timer, sltv2.vec, i);
2486+ } else if ((signed long) idx < 0) {
2487+ /*
2488+ * can happen if you add a timer with expires == jiffies,
2489+ * or you set a timer to go off in the past
2490+ */
2491+ insert_sltimer(timer, sltv1.vec, sltv1.index);
2492+ } else if (idx <= 0xffffffffUL) {
2493+ int i = (expires >> (SHIFT_BITS+TVR_BITS+TVN_BITS)) & TVN_MASK;
2494+ insert_sltimer(timer, sltv3.vec, i);
2495+ } else {
2496+ /* Can only get here on architectures with 64-bit jiffies */
2497+ timer->next = timer->prev = timer;
2498+ }
2499+}
2500+
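With SHIFT_BITS 6 the slow-timer wheel advances once per 64 jiffies;
sltv1 resolves the next TVR_SIZE (1024) ticks exactly, while sltv2 and
sltv3 hold coarser buckets that are cascaded down as their turn comes. A
runnable sketch of the slot arithmetic (the past-expiry and 64-bit
branches handled above are omitted):

#include <stdio.h>

#define SHIFT_BITS 6
#define TVN_BITS 8
#define TVR_BITS 10
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK ((1 << TVN_BITS) - 1)
#define TVR_MASK (TVR_SIZE - 1)

/* Which vector and slot would a given expiry land in? */
static void where(unsigned long expires, unsigned long sltimer_jiffies)
{
        unsigned long idx = (expires - sltimer_jiffies) >> SHIFT_BITS;

        if (idx < TVR_SIZE)
                printf("sltv1 slot %lu\n",
                       (expires >> SHIFT_BITS) & TVR_MASK);
        else if (idx < 1UL << (TVR_BITS + TVN_BITS))
                printf("sltv2 slot %lu\n",
                       (expires >> (SHIFT_BITS + TVR_BITS)) & TVN_MASK);
        else
                printf("sltv3 slot %lu\n",
                       (expires >> (SHIFT_BITS + TVR_BITS + TVN_BITS))
                       & TVN_MASK);
}

int main(void)
{
        where(100 * 64, 0);     /* 100 ticks out  -> sltv1 slot 100 */
        where(5000 * 64, 0);    /* 5000 ticks out -> sltv2 slot 4   */
        return 0;
}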
2501+rwlock_t sltimerlist_lock = RW_LOCK_UNLOCKED;
2502+
2503+void add_sltimer(struct timer_list *timer)
2504+{
2505+ write_lock(&sltimerlist_lock);
2506+ if (timer->prev)
2507+ goto bug;
2508+ internal_add_sltimer(timer);
2509+out:
2510+ write_unlock(&sltimerlist_lock);
2511+ return;
2512+
2513+bug:
2514+ printk("bug: kernel sltimer added twice at %p.\n",
2515+ __builtin_return_address(0));
2516+ goto out;
2517+}
2518+
2519+static inline int detach_sltimer(struct timer_list *timer)
2520+{
2521+ struct timer_list *prev = timer->prev;
2522+ if (prev) {
2523+ struct timer_list *next = timer->next;
2524+ prev->next = next;
2525+ if (next)
2526+ next->prev = prev;
2527+ return 1;
2528+ }
2529+ return 0;
2530+}
2531+
2532+void mod_sltimer(struct timer_list *timer, unsigned long expires)
2533+{
2534+ write_lock(&sltimerlist_lock);
2535+ timer->expires = expires;
2536+ detach_sltimer(timer);
2537+ internal_add_sltimer(timer);
2538+ write_unlock(&sltimerlist_lock);
2539+}
2540+
2541+int del_sltimer(struct timer_list * timer)
2542+{
2543+ int ret;
2544+
2545+ write_lock(&sltimerlist_lock);
2546+ ret = detach_sltimer(timer);
2547+ timer->next = timer->prev = 0;
2548+ write_unlock(&sltimerlist_lock);
2549+ return ret;
2550+}
2551+
2552+
2553+static inline void cascade_sltimers(struct sltimer_vec *tv)
2554+{
2555+ /*
2556+ * cascade all the timers from tv up one level
2557+ */
2558+ struct timer_list *timer;
2559+ timer = tv->vec[tv->index];
2560+ /*
2561+ * We are removing _all_ timers from the list, so we don't have to
2562+ * detach them individually, just clear the list afterwards.
2563+ */
2564+ while (timer) {
2565+ struct timer_list *tmp = timer;
2566+ timer = timer->next;
2567+ internal_add_sltimer(tmp);
2568+ }
2569+ tv->vec[tv->index] = NULL;
2570+ tv->index = (tv->index + 1) & TVN_MASK;
2571+}
2572+
2573+static inline void run_sltimer_list(void)
2574+{
2575+ write_lock(&sltimerlist_lock);
2576+ while ((long)(jiffies - sltimer_jiffies) >= 0) {
2577+ struct timer_list *timer;
2578+ if (!sltv1.index) {
2579+ int n = 1;
2580+ do {
2581+ cascade_sltimers(sltvecs[n]);
2582+ } while (sltvecs[n]->index == 1 && ++n < NOOF_SLTVECS);
2583+ }
2584+ while ((timer = sltv1.vec[sltv1.index])) {
2585+ void (*fn)(unsigned long) = timer->function;
2586+ unsigned long data = timer->data;
2587+ detach_sltimer(timer);
2588+ timer->next = timer->prev = NULL;
2589+ write_unlock(&sltimerlist_lock);
2590+ fn(data);
2591+ write_lock(&sltimerlist_lock);
2592+ }
2593+ sltimer_jiffies += 1<<SHIFT_BITS;
2594+ sltv1.index = (sltv1.index + 1) & TVR_MASK;
2595+ }
2596+ write_unlock(&sltimerlist_lock);
2597+}
2598+
2599+static void sltimer_handler(unsigned long data);
2600+
2601+struct timer_list slow_timer = {
2602+ NULL, NULL,
2603+ 0, 0,
2604+ sltimer_handler,
2605+};
2606+
2607+/*
2608+ * Slow timer handler is activated every second
2609+ */
2610+#define SLTIMER_PERIOD 1*HZ
2611+
2612+void sltimer_handler(unsigned long data)
2613+{
2614+ run_sltimer_list();
2615+
2616+ update_defense_level();
2617+ if (ip_vs_dropentry)
2618+ ip_vs_random_dropentry();
2619+
2620+ mod_timer(&slow_timer, (jiffies + SLTIMER_PERIOD));
2621+}
2622+
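The exported surface (add/mod/del) deliberately mirrors the regular
kernel timer API, so masq entries can be moved onto the slow wheel
without touching their struct timer_list. A hedged usage sketch in the
style of the surrounding code (masq_expire stands in for whatever expiry
handler the entry uses):

/* Sketch: put a masq entry on the slow-timer wheel. */
static void ms_start_sltimer(struct ip_masq *ms, unsigned long timeout)
{
        ms->timer.data     = (unsigned long)ms;
        ms->timer.function = masq_expire;       /* illustrative handler */
        ms->timer.expires  = jiffies + timeout;
        add_sltimer(&ms->timer);
}

/* On activity:  mod_sltimer(&ms->timer, jiffies + ms->timeout);  */
/* On teardown:  del_sltimer(&ms->timer);                         */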
2623+
2624+/*
2625+ * The port number of FTP service (in network order).
2626+ */
2627+#define FTPPORT __constant_htons(21)
2628+#define FTPDATA __constant_htons(20)
2629+
2630+/*
2631+ * Lock for IPVS
2632+ */
2633+rwlock_t __ip_vs_lock = RW_LOCK_UNLOCKED;
2634+
2635+/*
2636+ * Hash table: for input and output packet lookups of IPVS
2637+ */
2638+#define IP_MASQ_NTABLES 3
2639+
2640+struct list_head *ip_vs_table;
2641+
2642+/*
2643+ * Hash table: for virtual service lookups
2644+ */
2645+#define IP_VS_SVC_TAB_BITS 8
2646+#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
2647+#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
2648+
2649+/* the service table hashed by <protocol, addr, port> */
2650+struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
2651+/* the service table hashed by fwmark */
2652+struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
2653+
2654+/*
2655+ * Hash table: for real service lookups
2656+ */
2657+#define IP_VS_RTAB_BITS 4
2658+#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
2659+#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
2660+
2661+struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
2662+
2663+/*
2664+ * IPVS scheduler list
2665+ */
2666+struct list_head ip_vs_schedulers;
2667+
2668+/*
2669+ * Trash for destinations
2670+ */
2671+struct list_head ip_vs_dest_trash;
2672+
2673+/*
2674+ * FTP & NULL virtual service counters
2675+ */
2676+atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
2677+atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
2678+
2679+/*
2680+ * Register a scheduler in the scheduler list
2681+ */
2682+int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
2683+{
2684+ if (!scheduler) {
2685+ IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
2686+ return -EINVAL;
2687+ }
2688+
2689+ if (!scheduler->name) {
2690+ IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
2691+ return -EINVAL;
2692+ }
2693+
2694+ if (scheduler->n_list.next != &scheduler->n_list) {
2695+ IP_VS_ERR("register_ip_vs_scheduler(): scheduler already linked\n");
2696+ return -EINVAL;
2697+ }
2698+
2699+ /*
2700+ * Add it into the d-linked scheduler list
2701+ */
2702+ list_add(&scheduler->n_list, &ip_vs_schedulers);
2703+
2704+ return 0;
2705+}
2706+
2707+
2708+/*
2709+ * Unregister a scheduler in the scheduler list
2710+ */
2711+int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
2712+{
2713+ if (!scheduler) {
2714+ IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
2715+ return -EINVAL;
2716+ }
2717+
2718+ /*
2719+ * Only allow unregistration if it is not referenced
2720+ */
2721+ if (atomic_read(&scheduler->refcnt)) {
2722+ IP_VS_ERR("unregister_ip_vs_scheduler(): is in use by %d guys. failed\n",
2723+ atomic_read(&scheduler->refcnt));
2724+ return -EINVAL;
2725+ }
2726+
2727+ if (scheduler->n_list.next == &scheduler->n_list) {
2728+ IP_VS_ERR("unregister_ip_vs_scheduler(): scheduler is not in the list. failed\n");
2729+ return -EINVAL;
2730+ }
2731+
2732+ /*
2733+ * Remove it from the d-linked scheduler list
2734+ */
2735+ list_del(&scheduler->n_list);
2736+
2737+ return 0;
2738+}
2739+
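register_ip_vs_scheduler()/unregister_ip_vs_scheduler() define the
plug-in boundary used by the ip_vs_rr/wrr/wlc/lblc modules. A minimal
hedged sketch of such a module, assuming the ip_vs_scheduler callbacks
referenced in this file (init_service/done_service/update_service) plus
the schedule hook that ip_vs_schedule() invokes:

/* Sketch of a trivial "first destination" scheduler module. */
static struct ip_vs_dest *
ip_vs_first_schedule(struct ip_vs_service *svc, struct iphdr *iph)
{
        if (list_empty(&svc->destinations))
                return NULL;
        return list_entry(svc->destinations.next,
                          struct ip_vs_dest, n_list);
}

static int ip_vs_first_update(struct ip_vs_service *svc)
{
        return 0;       /* ip_vs_del_dest() calls this unconditionally */
}

static struct ip_vs_scheduler ip_vs_first_scheduler = {
        {0, 0},                 /* n_list: self-linked before use */
        "first",                /* name given to ipvsadm */
        ATOMIC_INIT(0),         /* refcnt */
        NULL,                   /* init_service */
        NULL,                   /* done_service */
        ip_vs_first_update,     /* update_service */
        ip_vs_first_schedule,   /* select a destination */
};

int init_module(void)           /* 2.2-era module entry points */
{
        INIT_LIST_HEAD(&ip_vs_first_scheduler.n_list);
        return register_ip_vs_scheduler(&ip_vs_first_scheduler);
}

void cleanup_module(void)
{
        unregister_ip_vs_scheduler(&ip_vs_first_scheduler);
}

Note that register_ip_vs_scheduler() insists n_list is self-linked, hence
the INIT_LIST_HEAD() before registering.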
2740+
2741+/*
2742+ * Bind a service with a scheduler
2743+ * Must be called with __ip_vs_lock held; returns bool.
2744+ */
2745+int ip_vs_bind_scheduler(struct ip_vs_service *svc,
2746+ struct ip_vs_scheduler *scheduler)
2747+{
2748+ if (svc == NULL) {
2749+ IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
2750+ return -EINVAL;
2751+ }
2752+ if (scheduler == NULL) {
2753+ IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
2754+ return -EINVAL;
2755+ }
2756+
2757+ svc->scheduler = scheduler;
2758+ atomic_inc(&scheduler->refcnt);
2759+
2760+ if(scheduler->init_service)
2761+ if(scheduler->init_service(svc) != 0) {
2762+ IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
2763+ return -EINVAL;
2764+ }
2765+
2766+ return 0;
2767+}
2768+
2769+
2770+/*
2771+ * Unbind a service with its scheduler
2772+ * Must be called with __ip_vs_lock held; returns bool.
2773+ */
2774+int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
2775+{
2776+ struct ip_vs_scheduler *sched;
2777+
2778+ if (svc == NULL) {
2779+ IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
2780+ return -EINVAL;
2781+ }
2782+
2783+ sched = svc->scheduler;
2784+ if (sched == NULL) {
2785+ IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
2786+ return -EINVAL;
2787+ }
2788+
2789+ if(sched->done_service)
2790+ if(sched->done_service(svc) != 0) {
2791+ IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
2792+ return -EINVAL;
2793+ }
2794+
2795+ atomic_dec(&sched->refcnt);
2796+ svc->scheduler = NULL;
2797+
2798+ return 0;
2799+}
2800+
2801+
2802+/*
2803+ * Get scheduler in the scheduler list by name
2804+ */
2805+struct ip_vs_scheduler * ip_vs_sched_getbyname(const char *sched_name)
2806+{
2807+ struct ip_vs_scheduler *sched;
2808+ struct list_head *l, *e;
2809+
2810+ IP_VS_DBG(6, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
2811+ sched_name);
2812+
2813+ read_lock_bh(&__ip_vs_lock);
2814+
2815+ l = &ip_vs_schedulers;
2816+ for (e=l->next; e!=l; e=e->next) {
2817+ sched = list_entry(e, struct ip_vs_scheduler, n_list);
2818+ if (strcmp(sched_name, sched->name)==0) {
2819+ /* HIT */
2820+ read_unlock_bh(&__ip_vs_lock);
2821+ return sched;
2822+ }
2823+ }
2824+
2825+ read_unlock_bh(&__ip_vs_lock);
2826+ return NULL;
2827+}
2828+
2829+
2830+/*
2831+ * Lookup scheduler and try to load it if it doesn't exist
2832+ */
2833+struct ip_vs_scheduler * ip_vs_lookup_scheduler(const char *sched_name)
2834+{
2835+ struct ip_vs_scheduler *sched;
2836+
2837+ /*
2838+ * Search for the scheduler by sched_name
2839+ */
2840+ sched = ip_vs_sched_getbyname(sched_name);
2841+
2842+ /*
2843+ * If scheduler not found, load the module and search again
2844+ */
2845+ if (sched == NULL) {
2846+ char module_name[IP_MASQ_TNAME_MAX+8];
2847+ sprintf(module_name,"ip_vs_%s",sched_name);
2848+#ifdef CONFIG_KMOD
2849+ request_module(module_name);
2850+#endif /* CONFIG_KMOD */
2851+ sched = ip_vs_sched_getbyname(sched_name);
2852+ }
2853+
2854+ return sched;
2855+}
2856+
2857+
2858+/*
2859+ * Returns hash value for IPVS masq entry
2860+ */
2861+
2862+static __inline__ unsigned
2863+ip_vs_hash_key(unsigned proto, __u32 addr, __u16 port)
2864+{
2865+ unsigned addrh = ntohl(addr);
2866+
2867+ return (proto^addrh^(addrh>>IP_VS_TAB_BITS)^ntohs(port))
2868+ & IP_VS_TAB_MASK;
2869+}
2870+
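The key XOR-folds the upper address bits down so hosts differing only in
their high octets still spread across the table. A runnable check of the
folding, assuming the default IP_VS_TAB_BITS of 12 (the table size is
configurable via CONFIG_IP_MASQUERADE_VS_TAB_BITS):

#include <stdio.h>
#include <arpa/inet.h>

#define IP_VS_TAB_BITS 12               /* default config value */
#define IP_VS_TAB_SIZE (1 << IP_VS_TAB_BITS)
#define IP_VS_TAB_MASK (IP_VS_TAB_SIZE - 1)

/* Same folding as ip_vs_hash_key(), taking host-order inputs. */
static unsigned vs_hash_key(unsigned proto, unsigned addr_h,
                            unsigned short port_h)
{
        return (proto ^ addr_h ^ (addr_h >> IP_VS_TAB_BITS) ^ port_h)
                & IP_VS_TAB_MASK;
}

int main(void)
{
        unsigned a = ntohl(inet_addr("192.168.1.7"));

        printf("TCP 192.168.1.7:1025 -> bucket %u of %u\n",
               vs_hash_key(6, a, 1025), IP_VS_TAB_SIZE);
        return 0;
}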
2871+
2872+/*
2873+ * Hashes ip_masq in ip_vs_table by proto,addr,port.
2874+ * should be called with locked tables.
2875+ * returns bool success.
2876+ */
2877+int ip_vs_hash(struct ip_masq *ms)
2878+{
2879+ unsigned hash;
2880+
2881+ if (ms->flags & IP_MASQ_F_HASHED) {
2882+ IP_VS_ERR("ip_vs_hash(): request for already hashed, "
2883+ "called from %p\n", __builtin_return_address(0));
2884+ return 0;
2885+ }
2886+
2887+ /*
2888+ * Note: ip_masq_put sets the masq expire timer only when
2889+ * refcnt==IP_MASQ_NTABLES; without this adjustment the
2890+ * masq entry would never expire.
2891+ */
2892+ atomic_add(IP_MASQ_NTABLES, &ms->refcnt);
2893+
2894+ /*
2895+ * Hash by proto,d{addr,port},
2896+ * which are client address and port in IPVS.
2897+ */
2898+ hash = ip_vs_hash_key(ms->protocol, ms->daddr, ms->dport);
2899+ list_add(&ms->m_list, &ip_vs_table[hash]);
2900+
2901+ ms->flags |= IP_MASQ_F_HASHED;
2902+ return 1;
2903+}
2904+
2905+
2906+/*
2907+ * Unhashes ip_masq from ip_vs_table.
2908+ * should be called with locked tables.
2909+ * returns bool success.
2910+ */
2911+int ip_vs_unhash(struct ip_masq *ms)
2912+{
2913+ if (!(ms->flags & IP_MASQ_F_HASHED)) {
2914+ IP_VS_ERR("ip_vs_unhash(): request for unhash flagged, "
2915+ "called from %p\n", __builtin_return_address(0));
2916+ return 0;
2917+ }
2918+
2919+ /*
2920+ * Remove it from the list and decrease its reference counter.
2921+ */
2922+ list_del(&ms->m_list);
2923+ atomic_sub(IP_MASQ_NTABLES, &ms->refcnt);
2924+
2925+ ms->flags &= ~IP_MASQ_F_HASHED;
2926+ return 1;
2927+}
2928+
2929+
2930+/*
2931+ * Gets ip_masq associated with supplied parameters in the ip_vs_table.
2932+ * Called for pkts coming from OUTside-to-INside.
2933+ * s_addr, s_port: pkt source address (foreign host)
2934+ * d_addr, d_port: pkt dest address (load balancer)
2935+ * Caller must lock tables
2936+ */
2937+struct ip_masq * __ip_vs_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
2938+{
2939+ unsigned hash;
2940+ struct ip_masq *ms;
2941+ struct list_head *l,*e;
2942+
2943+ hash = ip_vs_hash_key(protocol, s_addr, s_port);
2944+
2945+ l = &ip_vs_table[hash];
2946+ for (e=l->next; e!=l; e=e->next) {
2947+ ms = list_entry(e, struct ip_masq, m_list);
2948+ if (s_addr==ms->daddr && s_port==ms->dport &&
2949+ d_port==ms->mport && d_addr==ms->maddr &&
2950+ protocol==ms->protocol) {
2951+ /* HIT */
2952+ atomic_inc(&ms->refcnt);
2953+ goto out;
2954+ }
2955+ }
2956+ ms = NULL;
2957+
2958+ out:
2959+ IP_VS_DBG(7, "look/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
2960+ masq_proto_name(protocol),
2961+ NIPQUAD(s_addr), ntohs(s_port),
2962+ NIPQUAD(d_addr), ntohs(d_port),
2963+ ms?"hit":"not hit");
2964+
2965+ return ms;
2966+}
2967+
2968+
2969+/*
2970+ * Gets ip_masq associated with supplied parameters in the ip_vs_table.
2971+ * Called for pkts coming from inside-to-OUTside.
2972+ * s_addr, s_port: pkt source address (inside host)
2973+ * d_addr, d_port: pkt dest address (foreign host)
2974+ * Caller must lock tables
2975+ */
2976+struct ip_masq * __ip_vs_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
2977+{
2978+ unsigned hash;
2979+ struct ip_masq *ms;
2980+ struct list_head *l,*e;
2981+
2982+ /*
2983+ * Check for "full" addressed entries
2984+ */
2985+ hash = ip_vs_hash_key(protocol, d_addr, d_port);
2986+
2987+ l = &ip_vs_table[hash];
2988+ for (e=l->next; e!=l; e=e->next) {
2989+ ms = list_entry(e, struct ip_masq, m_list);
2990+ if (d_addr == ms->daddr && d_port == ms->dport &&
2991+ s_port == ms->sport && s_addr == ms->saddr &&
2992+ protocol == ms->protocol) {
2993+ /* HIT */
2994+ atomic_inc(&ms->refcnt);
2995+ goto out;
2996+ }
2997+ }
2998+ ms = NULL;
2999+
3000+ out:
3001+ IP_VS_DBG(7, "look/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
3002+ masq_proto_name(protocol),
3003+ NIPQUAD(s_addr), ntohs(s_port),
3004+ NIPQUAD(d_addr), ntohs(d_port),
3005+ ms?"hit":"not hit");
3006+
3007+ return ms;
3008+}
3009+
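The direction of these matches is easy to misread: for IPVS entries the
client ends up in the d-fields and the virtual address in the m-fields.
A summary of the mapping used by the two lookups above and by the /proc
dump earlier in this patch:

/*
 * Packet fields (client -> virtual service)   IPVS ip_masq entry
 *   saddr/sport  (client)                ==    ms->daddr / ms->dport
 *   daddr/dport  (virtual address)       ==    ms->maddr / ms->mport
 *   chosen real server                         ms->saddr / ms->sport
 */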
3010+
3011+/*
3012+ * Called by ip_vs_sched_persist to look for masq template.
3013+ */
3014+static __inline__ struct ip_masq *ip_vs_in_get
3015+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
3016+{
3017+ struct ip_masq *ms;
3018+
3019+ read_lock(&__ip_masq_lock);
3020+ ms = __ip_vs_in_get(protocol, s_addr, s_port, d_addr, d_port);
3021+ read_unlock(&__ip_masq_lock);
3022+
3023+ return ms;
3024+}
3025+
3026+
3027+/*
3028+ * Returns hash value for virtual service
3029+ */
3030+static __inline__ unsigned
3031+ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
3032+{
3033+ register unsigned porth = ntohs(port);
3034+
3035+ return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
3036+ & IP_VS_SVC_TAB_MASK;
3037+}
3038+
3039+/*
3040+ * Returns hash value of fwmark for virtual service lookup
3041+ */
3042+static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
3043+{
3044+ return fwmark & IP_VS_SVC_TAB_MASK;
3045+}
3046+
3047+/*
3048+ * Hashes ip_vs_service in the ip_vs_svc_table by <proto,addr,port>
3049+ * or in the ip_vs_svc_fwm_table by fwmark.
3050+ * Should be called with locked tables.
3051+ * Returns bool success.
3052+ */
3053+int ip_vs_svc_hash(struct ip_vs_service *svc)
3054+{
3055+ unsigned hash;
3056+
3057+ if (svc->flags & IP_VS_SVC_F_HASHED) {
3058+ IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
3059+ "called from %p\n", __builtin_return_address(0));
3060+ return 0;
3061+ }
3062+
3063+ if (svc->fwmark == 0) {
3064+ /*
3065+ * Hash by <protocol,addr,port> in ip_vs_svc_table
3066+ */
3067+ hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
3068+ list_add(&svc->s_list, &ip_vs_svc_table[hash]);
3069+ } else {
3070+ /*
3071+ * Hash by fwmark in ip_vs_svc_fwm_table
3072+ */
3073+ hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
3074+ list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
3075+ }
3076+
3077+ svc->flags |= IP_VS_SVC_F_HASHED;
3078+ atomic_inc(&svc->refcnt);
3079+ return 1;
3080+}
3081+
3082+
3083+/*
3084+ * Unhashes ip_vs_service from ip_vs_svc_table/ip_vs_svc_fwm_table.
3085+ * Should be called with locked tables.
3086+ * Returns bool success.
3087+ */
3088+int ip_vs_svc_unhash(struct ip_vs_service *svc)
3089+{
3090+ if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
3091+ IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
3092+ "called from %p\n", __builtin_return_address(0));
3093+ return 0;
3094+ }
3095+
3096+ if (svc->fwmark == 0) {
3097+ /*
3098+ * Remove it from the ip_vs_svc_table table.
3099+ */
3100+ list_del(&svc->s_list);
3101+ } else {
3102+ /*
3103+ * Remove it from the ip_vs_svc_fwm_table table.
3104+ */
3105+ list_del(&svc->f_list);
3106+ }
3107+
3108+ svc->flags &= ~IP_VS_SVC_F_HASHED;
3109+ atomic_dec(&svc->refcnt);
3110+ return 1;
3111+}
3112+
3113+
3114+/*
3115+ * Lookup service by {proto,addr,port} in the service table.
3116+ */
3117+static __inline__ struct ip_vs_service *
3118+__ip_vs_lookup_service(__u16 protocol, __u32 vaddr, __u16 vport)
3119+{
3120+ unsigned hash;
3121+ struct ip_vs_service *svc;
3122+ struct list_head *l,*e;
3123+
3124+ /*
3125+ * Check for "full" addressed entries
3126+ * Note: as long as IP_VS_SVC_TAB_BITS is larger than zero,
3127+ * <TCP,addr,port> and <UDP,addr,port> have different hash
3128+ * keys, so there is no need to do protocol checking.
3129+ */
3130+ hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
3131+
3132+ l = &ip_vs_svc_table[hash];
3133+ for (e=l->next; e!=l; e=e->next) {
3134+ svc = list_entry(e, struct ip_vs_service, s_list);
3135+ if ((svc->addr == vaddr)
3136+ && (svc->port == vport)) {
3137+ /* HIT */
3138+ return svc;
3139+ }
3140+ }
3141+
3142+ return NULL;
3143+}
3144+
3145+
3146+/*
3147+ * Lookup service by fwmark in the service table.
3148+ */
3149+static __inline__ struct ip_vs_service * __ip_vs_lookup_svc_fwm(__u32 fwmark)
3150+{
3151+ unsigned hash;
3152+ struct ip_vs_service *svc;
3153+ struct list_head *l,*e;
3154+
3155+ /*
3156+ * Check for fwmark-indexed entries
3157+ */
3158+ hash = ip_vs_svc_fwm_hashkey(fwmark);
3159+
3160+ l = &ip_vs_svc_fwm_table[hash];
3161+ for (e=l->next; e!=l; e=e->next) {
3162+ svc = list_entry(e, struct ip_vs_service, f_list);
3163+ if (svc->fwmark == fwmark) {
3164+ /* HIT */
3165+ return svc;
3166+ }
3167+ }
3168+
3169+ return NULL;
3170+}
3171+
3172+struct ip_vs_service *
3173+ip_vs_lookup_service(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
3174+{
3175+ struct ip_vs_service *svc;
3176+
3177+ read_lock(&__ip_vs_lock);
3178+
3179+ if (fwmark) {
3180+ /*
3181+ * Check the table hashed by fwmark first
3182+ */
3183+ svc = __ip_vs_lookup_svc_fwm(fwmark);
3184+ if (svc)
3185+ goto out;
3186+ }
3187+
3188+ /*
3189+ * Check the table hashed by <protocol,addr,port>
3190+ * first for "full" addressed entries
3191+ */
3192+ svc = __ip_vs_lookup_service(protocol, vaddr, vport);
3193+
3194+ if (svc == NULL
3195+ && protocol == IPPROTO_TCP
3196+ && atomic_read(&ip_vs_ftpsvc_counter)
3197+ && (vport==FTPDATA || ntohs(vport)>=PROT_SOCK)){
3198+ /*
3199+ * Check if ftp service entry exists, the packet
3200+ * might belong to FTP data connections.
3201+ */
3202+ svc = __ip_vs_lookup_service(protocol, vaddr, FTPPORT);
3203+ }
3204+
3205+ if (svc == NULL
3206+ && atomic_read(&ip_vs_nullsvc_counter)) {
3207+ /*
3208+ * Check if the catch-all port (port zero) exists
3209+ */
3210+ svc = __ip_vs_lookup_service(protocol, vaddr, 0);
3211+ }
3212+
3213+ out:
3214+ read_unlock(&__ip_vs_lock);
3215+
3216+ IP_VS_DBG(5, "lookup_service fwm %d %s %u.%u.%u.%u:%d %s\n",
3217+ fwmark,
3218+ masq_proto_name(protocol),
3219+ NIPQUAD(vaddr), ntohs(vport),
3220+ svc?"hit":"not hit");
3221+
3222+ return svc;
3223+}
3224+
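The fallback order matters for FTP: a data connection arriving for the
virtual address matches no exact <proto,addr,port> entry, so as long as
an FTP service is configured the lookup retries with FTPPORT before
finally trying the port-zero catch-all. A hedged illustration (helper
name and setup are hypothetical):

/* Illustrative only: resolving an FTP data packet in the demasq path,
 * assuming a virtual FTP service on vaddr:21 and no firewall mark. */
static struct ip_vs_service *lookup_ftp_data(__u32 vaddr)
{
        /* exact <TCP,vaddr,20> misses; ip_vs_ftpsvc_counter is non-zero
         * and 20 == FTPDATA, so <TCP,vaddr,21> is tried next and hits;
         * a port-zero catch-all would be the last resort. */
        return ip_vs_lookup_service(0, IPPROTO_TCP, vaddr,
                                    __constant_htons(20));
}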
3225+
3226+/*
3227+ * Bind a destination with a service
3228+ */
3229+static inline void
3230+__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
3231+{
3232+ atomic_inc(&svc->refcnt);
3233+ dest->svc = svc;
3234+}
3235+
3236+/*
3237+ * Unbind a destination with its service
3238+ */
3239+static inline void
3240+__ip_vs_unbind_svc(struct ip_vs_dest *dest)
3241+{
3242+ struct ip_vs_service *svc = dest->svc;
3243+
3244+ dest->svc = NULL;
3245+ if (atomic_dec_and_test(&svc->refcnt)) {
3246+ IP_VS_DBG(2, "release svc %s %u.%u.%u.%u:%d\n",
3247+ masq_proto_name(svc->protocol),
3248+ NIPQUAD(svc->addr), ntohs(svc->port));
3249+ kfree_s(svc, sizeof(struct ip_vs_service));
3250+ }
3251+}
3252+
3253+
3254+/*
3255+ * Returns hash value for real service
3256+ */
3257+static __inline__ unsigned
3258+ip_vs_rs_hashkey(__u32 addr, __u16 port)
3259+{
3260+ register unsigned porth = ntohs(port);
3261+
3262+ return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) & IP_VS_RTAB_MASK;
3263+}
3264+
3265+/*
3266+ * Hashes ip_vs_dest in ip_vs_rtable by proto,addr,port.
3267+ * should be called with locked tables.
3268+ * returns bool success.
3269+ */
3270+int ip_vs_rs_hash(struct ip_vs_dest *dest)
3271+{
3272+ unsigned hash;
3273+
3274+ if (!list_empty(&dest->d_list)) {
3275+ return 0;
3276+ }
3277+
3278+ /*
3279+ * Hash by proto,addr,port,
3280+ * which are the parameters of the real service.
3281+ */
3282+ hash = ip_vs_rs_hashkey(dest->addr, dest->port);
3283+ list_add(&dest->d_list, &ip_vs_rtable[hash]);
3284+
3285+ return 1;
3286+}
3287+
3288+/*
3289+ * UNhashes ip_vs_dest from ip_vs_rtable.
3290+ * should be called with locked tables.
3291+ * returns bool success.
3292+ */
3293+int ip_vs_rs_unhash(struct ip_vs_dest *dest)
3294+{
3295+ /*
3296+ * Remove it from the ip_vs_rtable table.
3297+ */
3298+ if (!list_empty(&dest->d_list)) {
3299+ list_del(&dest->d_list);
3300+ INIT_LIST_HEAD(&dest->d_list);
3301+ }
3302+
3303+ return 1;
3304+}
3305+
3306+/*
3307+ * Lookup real service by {proto,addr,port} in the real service table.
3308+ */
3309+struct ip_vs_dest * __ip_vs_lookup_real_service(__u16 protocol,
3310+ __u32 daddr, __u16 dport)
3311+{
3312+ unsigned hash;
3313+ struct ip_vs_dest *dest;
3314+ struct list_head *l,*e;
3315+
3316+ /*
3317+ * Check for "full" addressed entries
3318+ * Return the first found entry
3319+ */
3320+ hash = ip_vs_rs_hashkey(daddr, dport);
3321+
3322+ l = &ip_vs_rtable[hash];
3323+ for (e=l->next; e!=l; e=e->next) {
3324+ dest = list_entry(e, struct ip_vs_dest, d_list);
3325+ if ((dest->addr == daddr)
3326+ && (dest->port == dport)
3327+ && ((dest->protocol == protocol) || dest->vfwmark)) {
3328+ /* HIT */
3329+ return dest;
3330+ }
3331+ }
3332+
3333+ return NULL;
3334+}
3335+
3336+/*
3337+ * Lookup destination by {addr,port} in the given service
3338+ */
3339+struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc,
3340+ __u32 daddr, __u16 dport)
3341+{
3342+ struct ip_vs_dest *dest;
3343+ struct list_head *l, *e;
3344+
3345+ read_lock_bh(&__ip_vs_lock);
3346+
3347+ /*
3348+ * Find the destination for the given service
3349+ */
3350+ l = &svc->destinations;
3351+ for (e=l->next; e!=l; e=e->next) {
3352+ dest = list_entry(e, struct ip_vs_dest, n_list);
3353+ if ((dest->addr == daddr) && (dest->port == dport)) {
3354+ /* HIT */
3355+ read_unlock_bh(&__ip_vs_lock);
3356+ return dest;
3357+ }
3358+ }
3359+
3360+ read_unlock_bh(&__ip_vs_lock);
3361+ return NULL;
3362+}
3363+
3364+
3365+/*
3366+ * Lookup dest by {svc,addr,port} in the destination trash.
3367+ * Called by ip_vs_add_dest with the __ip_vs_lock.
3368+ * The destination trash is used to hold the destinations that are removed
3369+ * from the service table but are still referenced by some masq entries.
3370+ * The destination trash exists because a dest may be temporarily taken
3371+ * down (either by the administrator or by a monitor program); it can
3372+ * then be picked back from the trash, the remaining connections to it
3373+ * can continue, and its counters stay useful for scheduling.
3375+ */
3376+struct ip_vs_dest * __ip_vs_get_trash_dest(struct ip_vs_service *svc,
3377+ __u32 daddr, __u16 dport)
3378+{
3379+ struct ip_vs_dest *dest;
3380+ struct list_head *l, *e;
3381+
3382+ /*
3383+ * Find the destination in trash
3384+ */
3385+ l = &ip_vs_dest_trash;
3386+ for (e=l->next; e!=l; e=e->next) {
3387+ dest = list_entry(e, struct ip_vs_dest, n_list);
3388+ IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%d still in trash, "
3389+ "refcnt=%d\n",
3390+ dest->vfwmark,
3391+ NIPQUAD(dest->addr), ntohs(dest->port),
3392+ atomic_read(&dest->refcnt));
3393+ if (dest->addr == daddr &&
3394+ dest->port == dport &&
3395+ dest->vfwmark == svc->fwmark &&
3396+ (svc->fwmark ||
3397+ (dest->protocol == svc->protocol &&
3398+ dest->vaddr == svc->addr &&
3399+ dest->vport == svc->port))) {
3400+ /* HIT */
3401+ return dest;
3402+ }
3403+
3404+ /*
3405+ * Try to purge the destination from trash if not referenced
3406+ */
3407+ if (atomic_read(&dest->refcnt) == 1) {
3408+ IP_VS_DBG(3, "Remove destination %u/%u.%u.%u.%u:%d "
3409+ "from trash\n",
3410+ dest->vfwmark,
3411+ NIPQUAD(dest->addr), ntohs(dest->port));
3412+ e = e->prev;
3413+ list_del(&dest->n_list);
3414+ __ip_vs_unbind_svc(dest);
3415+ kfree_s(dest, sizeof(*dest));
3416+ }
3417+ }
3418+ return NULL;
3419+}
3420+
3421+
3422+/*
3423+ * Update a destination in the given service
3424+ */
3425+void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
3426+ struct ip_masq_ctl *mctl)
3427+{
3428+ struct ip_vs_user *mm = &mctl->u.vs_user;
3429+
3430+ /*
3431+ * Set the weight and the flags
3432+ */
3433+ dest->weight = mm->weight;
3434+ dest->masq_flags = mm->masq_flags;
3435+
3436+ dest->masq_flags |= IP_MASQ_F_VS;
3437+ dest->masq_flags |= IP_MASQ_F_VS_INACTIVE;
3438+
3439+ /*
3440+ * Check if local node and update the flags
3441+ */
3442+ if (inet_addr_type(mm->daddr) == RTN_LOCAL) {
3443+ dest->masq_flags = (dest->masq_flags & ~IP_MASQ_F_VS_FWD_MASK)
3444+ | IP_MASQ_F_VS_LOCALNODE;
3445+ }
3446+
3447+ /*
3448+ * Set the IP_MASQ_F_VS_NO_OUTPUT flag if not masquerading
3449+ */
3450+ if ((dest->masq_flags & IP_MASQ_F_VS_FWD_MASK) != 0) {
3451+ dest->masq_flags |= IP_MASQ_F_VS_NO_OUTPUT;
3452+ } else {
3453+ /*
3454+ * Put the real service in ip_vs_rtable if not present.
3455+ * For now only for NAT!
3456+ */
3457+ ip_vs_rs_hash(dest);
3458+ }
3459+
3460+
3461+ /* bind the service */
3462+ if (!dest->svc) {
3463+ __ip_vs_bind_svc(dest, svc);
3464+ } else {
3465+ if (dest->svc != svc) {
3466+ __ip_vs_unbind_svc(dest);
3467+ __ip_vs_bind_svc(dest, svc);
3468+ }
3469+ }
3470+
3471+ /*
3472+ * Set the dest status flags
3473+ */
3474+ dest->flags |= IP_VS_DEST_F_AVAILABLE;
3475+}
3476+
3477+
3478+/*
3479+ * Create a destination for the given service
3480+ */
3481+struct ip_vs_dest *ip_vs_new_dest(struct ip_vs_service *svc,
3482+ struct ip_masq_ctl *mctl)
3483+{
3484+ struct ip_vs_dest *dest;
3485+ struct ip_vs_user *mm = &mctl->u.vs_user;
3486+
3487+ EnterFunction(2);
3488+
3489+ dest = (struct ip_vs_dest*) kmalloc(sizeof(struct ip_vs_dest),
3490+ GFP_ATOMIC);
3491+ if (dest == NULL) {
3492+ IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
3493+ return NULL;
3494+ }
3495+ memset(dest, 0, sizeof(struct ip_vs_dest));
3496+
3497+ dest->protocol = svc->protocol;
3498+ dest->vaddr = svc->addr;
3499+ dest->vport = svc->port;
3500+ dest->vfwmark = svc->fwmark;
3501+ dest->addr = mm->daddr;
3502+ dest->port = mm->dport;
3503+
3504+ atomic_set(&dest->activeconns, 0);
3505+ atomic_set(&dest->inactconns, 0);
3506+ atomic_set(&dest->refcnt, 0);
3507+
3508+ INIT_LIST_HEAD(&dest->d_list);
3509+ dest->stats.lock = SPIN_LOCK_UNLOCKED;
3510+ __ip_vs_update_dest(svc, dest, mctl);
3511+
3512+ LeaveFunction(2);
3513+
3514+ return dest;
3515+}
3516+
3517+
3518+/*
3519+ * Add a destination into an existing service
3520+ */
3521+int ip_vs_add_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3522+{
3523+ struct ip_vs_dest *dest;
3524+ struct ip_vs_user *mm = &mctl->u.vs_user;
3525+ __u32 daddr = mm->daddr;
3526+ __u16 dport = mm->dport;
3527+
3528+ EnterFunction(2);
3529+
3530+ if (mm->weight < 0) {
3531+ IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
3532+ return -ERANGE;
3533+ }
3534+
3535+ /*
3536+ * Check if the dest already exists in the list
3537+ */
3538+ dest = ip_vs_lookup_dest(svc, daddr, dport);
3539+ if (dest != NULL) {
3540+ IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
3541+ return -EEXIST;
3542+ }
3543+
3544+ write_lock_bh(&__ip_vs_lock);
3545+
3546+ /*
3547+ * Check if the dest already exists in the trash and
3548+ * is from the same service
3549+ */
3550+ dest = __ip_vs_get_trash_dest(svc, daddr, dport);
3551+ if (dest != NULL) {
3552+ IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%d from trash, "
3553+ "refcnt=%d, service %u.%u.%u.%u:%d\n",
3554+ NIPQUAD(daddr), ntohs(dport),
3555+ atomic_read(&dest->refcnt),
3556+ NIPQUAD(dest->vaddr),
3557+ ntohs(dest->vport));
3558+
3559+ /*
3560+ * Get the destination from the trash
3561+ */
3562+ list_del(&dest->n_list);
3563+ list_add(&dest->n_list, &svc->destinations);
3564+
3565+ __ip_vs_update_dest(svc, dest, mctl);
3566+
3567+ write_unlock_bh(&__ip_vs_lock);
3568+ return 0;
3569+ }
3570+
3571+ /*
3572+ * Allocate and initialize the dest structure
3573+ */
3574+ dest = ip_vs_new_dest(svc, mctl);
3575+ if (dest == NULL) {
3576+ write_unlock_bh(&__ip_vs_lock);
3577+ IP_VS_ERR("ip_vs_add_dest(): out of memory\n");
3578+ return -ENOMEM;
3579+ }
3580+
3581+ /*
3582+ * Add the dest entry into the list
3583+ */
3584+ list_add(&dest->n_list, &svc->destinations);
3585+ atomic_inc(&dest->refcnt);
3586+
3587+ write_unlock_bh(&__ip_vs_lock);
3588+
3589+ LeaveFunction(2);
3590+ return 0;
3591+}
3592+
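ip_vs_add_dest() consumes the vs_user member of the control block; only
the fields read above matter for an add. A hedged sketch of the
kernel-internal caller's side (the user-space path goes through the
IP_MASQ_TARGET_VS setsockopt case earlier in this patch; values are
illustrative):

/* Sketch: the vs_user fields that ip_vs_add_dest() actually reads. */
static void fill_dest_request(struct ip_masq_ctl *mctl)
{
        struct ip_vs_user *mm = &mctl->u.vs_user;

        mm->daddr      = in_aton("10.0.0.2");   /* real server address */
        mm->dport      = htons(80);             /* real server port */
        mm->weight     = 1;                     /* rejected if negative */
        mm->masq_flags = 0;                     /* 0 = NAT; tunnel/droute
                                                 * set bits within
                                                 * IP_MASQ_F_VS_FWD_MASK */
}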
3593+
3594+/*
3595+ * Edit a destination in the given service
3596+ */
3597+int ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3598+{
3599+ struct ip_vs_dest *dest;
3600+ struct ip_vs_user *mm = &mctl->u.vs_user;
3601+ __u32 daddr = mm->daddr;
3602+ __u16 dport = mm->dport;
3603+
3604+ EnterFunction(2);
3605+
3606+ if (mm->weight < 0) {
3607+ IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
3608+ return -ERANGE;
3609+ }
3610+
3611+ /*
3612+ * Lookup the destination list
3613+ */
3614+ dest = ip_vs_lookup_dest(svc, daddr, dport);
3615+ if (dest == NULL) {
3616+ IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
3617+ return -ENOENT;
3618+ }
3619+
3620+ write_lock_bh(&__ip_vs_lock);
3621+
3622+ __ip_vs_update_dest(svc, dest, mctl);
3623+
3624+ write_unlock_bh(&__ip_vs_lock);
3625+
3626+ LeaveFunction(2);
3627+ return 0;
3628+}
3629+
3630+
3631+/*
3632+ * Delete a destination from the given service
3633+ */
3634+void __ip_vs_del_dest(struct ip_vs_dest *dest)
3635+{
3636+ dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
3637+
3638+ /*
3639+ * Remove it from the d-linked destination list.
3640+ */
3641+ list_del(&dest->n_list);
3642+
3643+ /*
3644+ * Remove it from the d-linked list with the real services.
3645+ */
3646+ ip_vs_rs_unhash(dest);
3647+
3648+ /*
3649+ * Decrease the refcnt of the dest, and free the dest
3650+ * if nobody refers to it (refcnt=0). Otherwise, throw
3651+ * the destination into the trash.
3652+ */
3653+ if (atomic_dec_and_test(&dest->refcnt)) {
3654+ /* simply decrease svc->refcnt here, let the caller check
3655+ and release the service if nobody refers to it.
3656+ Only user context can release destination and service,
3657+ and only one user context can update a virtual service at a
3658+ time, so the operation here is OK */
3659+ atomic_dec(&dest->svc->refcnt);
3660+ kfree_s(dest, sizeof(*dest));
3661+ } else {
3662+ IP_VS_DBG(3, "Move dest %u.%u.%u.%u:%d into trash, "
3663+ "refcnt=%d\n",
3664+ NIPQUAD(dest->addr), ntohs(dest->port),
3665+ atomic_read(&dest->refcnt));
3666+ list_add(&dest->n_list, &ip_vs_dest_trash);
3667+ atomic_inc(&dest->refcnt);
3668+ }
3669+}
3670+
3671+int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3672+{
3673+ struct ip_vs_dest *dest;
3674+ struct ip_vs_user *mm = &mctl->u.vs_user;
3675+ __u32 daddr = mm->daddr;
3676+ __u16 dport = mm->dport;
3677+
3678+ EnterFunction(2);
3679+
3680+ /*
3681+ * Lookup the destination list
3682+ */
3683+ dest = ip_vs_lookup_dest(svc, daddr, dport);
3684+ if (dest == NULL) {
3685+ IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
3686+ return -ENOENT;
3687+ }
3688+
3689+ write_lock_bh(&__ip_vs_lock);
3690+
3691+ /*
3692+ * Remove dest from the destination list
3693+ */
3694+ __ip_vs_del_dest(dest);
3695+
3696+ /*
3697+ * Called the update_service function of its scheduler
3698+ */
3699+ svc->scheduler->update_service(svc);
3700+
3701+ write_unlock_bh(&__ip_vs_lock);
3702+
3703+ LeaveFunction(2);
3704+
3705+ return 0;
3706+}
3707+
3708+
3709+/*
3710+ * Add a service into the service hash table
3711+ */
3712+int ip_vs_add_service(struct ip_masq_ctl *mctl)
3713+{
3714+ struct ip_vs_user *mm = &mctl->u.vs_user;
3715+ __u16 protocol = mm->protocol;
3716+ __u32 vaddr = mm->vaddr;
3717+ __u16 vport = mm->vport;
3718+ __u32 vfwmark = mm->vfwmark;
3719+
3720+ int ret = 0;
3721+ struct ip_vs_scheduler *sched;
3722+ struct ip_vs_service *svc;
3723+
3724+ EnterFunction(2);
3725+
3726+ /*
3727+ * Lookup the scheduler, by 'mctl->m_tname'
3728+ */
3729+ sched = ip_vs_lookup_scheduler(mctl->m_tname);
3730+ if (sched == NULL) {
3731+ IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n",
3732+ mctl->m_tname);
3733+ return -ENOENT;
3734+ }
3735+
3736+ write_lock_bh(&__ip_vs_lock);
3737+
3738+ /*
3739+ * Check if the service already exists
3740+ */
3741+ if (vfwmark == 0)
3742+ svc = __ip_vs_lookup_service(protocol, vaddr, vport);
3743+ else
3744+ svc = __ip_vs_lookup_svc_fwm(vfwmark);
3745+
3746+ if (svc != NULL) {
3747+ IP_VS_DBG(1, "ip_vs_add_service: service already exists.\n");
3748+ ret = -EEXIST;
3749+ goto out;
3750+ }
3751+
3752+ svc = (struct ip_vs_service*)
3753+ kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
3754+ if (svc == NULL) {
3755+ IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
3756+ ret = -ENOMEM;
3757+ goto out;
3758+ }
3759+ memset(svc, 0, sizeof(struct ip_vs_service));
3760+
3761+ svc->protocol = protocol;
3762+ svc->addr = vaddr;
3763+ svc->port = vport;
3764+ svc->fwmark = vfwmark;
3765+ svc->flags = mm->vs_flags;
3766+ svc->timeout = mm->timeout;
3767+ svc->netmask = mm->netmask;
3768+
3769+ INIT_LIST_HEAD(&svc->destinations);
3770+ atomic_set(&svc->refcnt, 0);
3771+ svc->stats.lock = SPIN_LOCK_UNLOCKED;
3772+
3773+ /*
3774+ * Bind the scheduler
3775+ */
3776+ ip_vs_bind_scheduler(svc, sched);
3777+
3778+ /*
3779+ * Hash the service into the service table
3780+ */
3781+ ip_vs_svc_hash(svc);
3782+
3783+ /*
3784+ * Update the virtual service counters
3785+ */
3786+ if (vport == FTPPORT)
3787+ atomic_inc(&ip_vs_ftpsvc_counter);
3788+ else if (vport == 0)
3789+ atomic_inc(&ip_vs_nullsvc_counter);
3790+
3791+ out:
3792+ write_unlock_bh(&__ip_vs_lock);
3793+ LeaveFunction(2);
3794+ return ret;
3795+}
3796+
3797+
3798+/*
3799+ * Edit a service and bind it with a new scheduler
3800+ */
3801+int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_masq_ctl *mctl)
3802+{
3803+ struct ip_vs_user *mm = &mctl->u.vs_user;
3804+ struct ip_vs_scheduler *sched;
3805+
3806+ EnterFunction(2);
3807+
3808+ /*
3809+ * Lookup the scheduler, by 'mctl->m_tname'
3810+ */
3811+ sched = ip_vs_lookup_scheduler(mctl->m_tname);
3812+ if (sched == NULL) {
3813+ IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n",
3814+ mctl->m_tname);
3815+ return -ENOENT;
3816+ }
3817+
3818+ write_lock_bh(&__ip_vs_lock);
3819+
3820+ /*
3821+ * Set the flags and timeout value
3822+ */
3823+ svc->flags = mm->vs_flags | IP_VS_SVC_F_HASHED;
3824+ svc->timeout = mm->timeout;
3825+ svc->netmask = mm->netmask;
3826+
3827+ /*
3828+ * Unbind the old scheduler
3829+ */
3830+ ip_vs_unbind_scheduler(svc);
3831+
3832+ /*
3833+ * Bind the new scheduler
3834+ */
3835+ ip_vs_bind_scheduler(svc, sched);
3836+
3837+ write_unlock_bh(&__ip_vs_lock);
3838+
3839+ LeaveFunction(2);
3840+ return 0;
3841+}
3842+
3843+
3844+/*
3845+ * Delete a service from the service list
3846+ */
3847+int __ip_vs_del_service(struct ip_vs_service *svc)
3848+{
3849+ struct list_head *l;
3850+ struct ip_vs_dest *dest;
3851+
3852+ /*
3853+ * Unbind scheduler
3854+ */
3855+ ip_vs_unbind_scheduler(svc);
3856+
3857+ /*
3858+ * Unlink the whole destination list
3859+ */
3860+ l = &svc->destinations;
3861+ while (l->next != l) {
3862+ dest = list_entry(l->next, struct ip_vs_dest, n_list);
3863+ __ip_vs_del_dest(dest);
3864+ }
3865+
3866+ /*
3867+ * Unhash it from the service table
3868+ */
3869+ if (ip_vs_svc_unhash(svc)) {
3870+ /*
3871+ * Update the virtual service counters
3872+ */
3873+ if (svc->port == FTPPORT)
3874+ atomic_dec(&ip_vs_ftpsvc_counter);
3875+ else if (svc->port == 0)
3876+ atomic_dec(&ip_vs_nullsvc_counter);
3877+
3878+ /*
3879+ * Free the service if nobody refers to it
3880+ */
3881+ if (atomic_read(&svc->refcnt) == 0) {
3882+ IP_VS_DBG(2, "release svc %s %u.%u.%u.%u:%d\n",
3883+ masq_proto_name(svc->protocol),
3884+ NIPQUAD(svc->addr), ntohs(svc->port));
3885+ kfree_s(svc, sizeof(struct ip_vs_service));
3886+ }
3887+ } else {
3888+ /*
3889+	 * Call the update_service function of its scheduler
3890+ */
3891+ svc->scheduler->update_service(svc);
3892+ return -EPERM;
3893+ }
3894+
3895+ return 0;
3896+}
3897+
3898+int ip_vs_del_service(struct ip_vs_service *svc)
3899+{
3900+ EnterFunction(2);
3901+
3902+ if (svc == NULL)
3903+ return -EEXIST;
3904+
3905+ write_lock_bh(&__ip_vs_lock);
3906+
3907+ __ip_vs_del_service(svc);
3908+
3909+ write_unlock_bh(&__ip_vs_lock);
3910+ LeaveFunction(2);
3911+ return 0;
3912+}
3913+
3914+
3915+/*
3916+ * Flush all the virtual services
3917+ */
3918+int ip_vs_flush(void)
3919+{
3920+ int idx;
3921+ struct ip_vs_service *svc;
3922+ struct list_head *l;
3923+
3924+ write_lock_bh(&__ip_vs_lock);
3925+
3926+ /*
3927+ * Flush the service table hashed by <protocol,addr,port>
3928+ */
3929+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3930+ l = &ip_vs_svc_table[idx];
3931+ while (l->next != l) {
3932+ svc = list_entry(l->next,struct ip_vs_service,s_list);
3933+
3934+ if (__ip_vs_del_service(svc))
3935+ goto out;
3936+ }
3937+ }
3938+
3939+ /*
3940+ * Flush the service table hashed by fwmark
3941+ */
3942+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3943+ l = &ip_vs_svc_fwm_table[idx];
3944+ while (l->next != l) {
3945+ svc = list_entry(l->next,struct ip_vs_service,f_list);
3946+
3947+ if (__ip_vs_del_service(svc))
3948+ goto out;
3949+ }
3950+ }
3951+
3952+ out:
3953+ write_unlock_bh(&__ip_vs_lock);
3954+ return 0;
3955+}
3956+
3957+
3958+/*
3959+ * Change the connection counter and the flags if the masq state changes
3960+ * Called by the masq_tcp_state function.
3961+ */
3962+void ip_vs_set_state(struct ip_masq *ms, int new_state)
3963+{
3964+ struct ip_vs_dest *dest = ms->dest;
3965+
3966+ if (dest &&
3967+ (ms->flags & IP_MASQ_F_VS) && (new_state != ms->state)) {
3968+ if (!(ms->flags & IP_MASQ_F_VS_INACTIVE) &&
3969+ (new_state != IP_MASQ_S_ESTABLISHED)) {
3970+ atomic_dec(&dest->activeconns);
3971+ atomic_inc(&dest->inactconns);
3972+ ms->flags |= IP_MASQ_F_VS_INACTIVE;
3973+ } else if ((ms->flags & IP_MASQ_F_VS_INACTIVE) &&
3974+ (new_state == IP_MASQ_S_ESTABLISHED)) {
3975+ atomic_inc(&dest->activeconns);
3976+ atomic_dec(&dest->inactconns);
3977+ ms->flags &= ~IP_MASQ_F_VS_INACTIVE;
3978+ }
3979+
3980+ IP_VS_DBG(8, "Set-state masq fwd:%c s:%s c:%u.%u.%u.%u:%d "
3981+ "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d flg:%X cnt:%d\n",
3982+ ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
3983+ NIPQUAD(ms->daddr), ntohs(ms->dport),
3984+ NIPQUAD(ms->maddr), ntohs(ms->mport),
3985+ NIPQUAD(ms->saddr), ntohs(ms->sport),
3986+ ms->flags, atomic_read(&ms->refcnt));
3987+ }
3988+}
3989+
3990+
3991+/*
3992+ * Bind a masq entry with a virtual service destination
3993+ * Called when a new masq entry is created for VS.
3994+ */
3995+void ip_vs_bind_masq(struct ip_masq *ms, struct ip_vs_dest *dest)
3996+{
3997+ ms->flags |= dest->masq_flags;
3998+ ms->dest = dest;
3999+
4000+ /*
4001+ * Increase the refcnt counter of the dest.
4002+ */
4003+ atomic_inc(&dest->refcnt);
4004+
4005+ IP_VS_DBG(9, "Bind-masq fwd:%c s:%s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
4006+ "d:%u.%u.%u.%u:%d flg:%X cnt:%d destcnt:%d\n",
4007+ ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
4008+ NIPQUAD(ms->daddr), ntohs(ms->dport),
4009+ NIPQUAD(ms->maddr), ntohs(ms->mport),
4010+ NIPQUAD(ms->saddr), ntohs(ms->sport),
4011+ ms->flags, atomic_read(&ms->refcnt),
4012+ atomic_read(&dest->refcnt));
4013+}
4014+
4015+
4016+/*
4017+ * Unbind a masq entry with its VS destination
4018+ * Called by the masq_expire function.
4019+ */
4020+void ip_vs_unbind_masq(struct ip_masq *ms)
4021+{
4022+ struct ip_vs_dest *dest = ms->dest;
4023+
4024+ IP_VS_DBG(9, "Unbind-masq fwd:%c s:%s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
4025+ "d:%u.%u.%u.%u:%d flg:%X cnt:%d destcnt:%d\n",
4026+ ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
4027+ NIPQUAD(ms->daddr),ntohs(ms->dport),
4028+ NIPQUAD(ms->maddr),ntohs(ms->mport),
4029+ NIPQUAD(ms->saddr),ntohs(ms->sport),
4030+ ms->flags, atomic_read(&ms->refcnt),
4031+ atomic_read(&dest->refcnt));
4032+
4033+ if (dest) {
4034+ /*
4035+ * Decrease the inactconns or activeconns counter
4036+ * if it is not a masq template (ms->dport!=0).
4037+ */
4038+ if (ms->dport) {
4039+ if (ms->flags & IP_MASQ_F_VS_INACTIVE) {
4040+ atomic_dec(&dest->inactconns);
4041+ } else {
4042+ atomic_dec(&dest->activeconns);
4043+ }
4044+ }
4045+
4046+ /*
4047+ * Simply decrease the refcnt of the dest, because the
4048+		 * dest will be either in the service's destination list
4049+ * or in the trash.
4050+ */
4051+ atomic_dec(&dest->refcnt);
4052+ }
4053+}
4054+
4055+
4056+/*
4057+ * Check whether the destination of a masq template is available.
4058+ * If available, return 1; otherwise invalidate the masq template
4059+ * and return 0.
4060+ */
4061+int ip_vs_check_template(struct ip_masq *mst)
4062+{
4063+ struct ip_vs_dest *dest = mst->dest;
4064+
4065+ /*
4066+ * Checking the dest server status.
4067+ */
4068+ if ((dest == NULL) ||
4069+ !(dest->flags & IP_VS_DEST_F_AVAILABLE)) {
4070+ IP_VS_DBG(9, "check_template: dest not available for prot %s "
4071+ "src %u.%u.%u.%u:%d dest %u.%u.%u.%u:%d -> %X:%X\n",
4072+ masq_proto_name(mst->protocol),
4073+ NIPQUAD(mst->daddr), ntohs(mst->dport),
4074+ NIPQUAD(mst->maddr), ntohs(mst->mport),
4075+ (dest!=NULL)? ntohl(dest->addr):0,
4076+ (dest!=NULL)? ntohs(dest->port):0);
4077+
4078+ /*
4079+ * Invalidate the masq template
4080+ */
4081+ ip_vs_unhash(mst);
4082+ mst->sport = 65535;
4083+ mst->mport = 65535;
4084+ mst->dport = 0;
4085+ ip_vs_hash(mst);
4086+
4087+ /*
4088+ * Simply decrease the refcnt of the template,
4089+ * don't restart its timer.
4090+ */
4091+ atomic_dec(&mst->refcnt);
4092+ return 0;
4093+ }
4094+ return 1;
4095+}
4096+
4097+
4098+/*
4099+ * IPVS persistent scheduling function
4100+ * It creates a masq entry according to its template if one exists, or selects
4101+ * a server and creates a masq entry plus a template.
4102+ */
4103+struct ip_masq *
4104+ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
4105+{
4106+ struct ip_masq *ms = NULL;
4107+ struct ip_vs_dest *dest;
4108+ const __u16 *portp;
4109+ struct ip_masq *mst;
4110+ __u16 dport; /* destination port to forward */
4111+ __u32 snet; /* source network of the client, after masking */
4112+
4113+ portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
4114+
4115+ /* Mask saddr with the netmask to adjust template granularity */
4116+ snet = iph->saddr & svc->netmask;
4117+
4118+ IP_VS_DBG(6, "P-schedule: src %u.%u.%u.%u:%d dest %u.%u.%u.%u:%d "
4119+ "snet %u.%u.%u.%u/%u.%u.%u.%u\n",
4120+ NIPQUAD(iph->saddr), ntohs(portp[0]),
4121+ NIPQUAD(iph->daddr), ntohs(portp[1]),
4122+ NIPQUAD(snet), NIPQUAD(svc->netmask));
4123+
4124+ /*
4125+	 * FTP is a complicated network protocol: it uses a control
4126+	 * connection and separate data connections. For active FTP, the
4127+	 * FTP server initiates the data connection to the client, usually
4128+	 * from source port 20. For passive FTP, the FTP server tells the
4129+	 * client which port it passively listens on, and the client issues
4130+	 * the data connection. In the tunneling or direct routing mode,
4131+	 * the load balancer only sees the client-to-server half of the
4132+	 * connection, so the data port is unknown to it. Therefore a
4133+	 * template masq like <daddr, 0, maddr, 0, saddr, 0> is created for
4134+	 * a persistent FTP service, and a template like <daddr, 0, maddr,
4135+	 * mport, saddr, sport> is created for other persistent services.
4136+ */
4137+ if (portp[1] == svc->port) {
4138+ /* Check if a template already exists */
4139+ if (svc->port != FTPPORT)
4140+ mst = ip_vs_in_get(iph->protocol, snet, 0,
4141+ iph->daddr, portp[1]);
4142+ else
4143+ mst = ip_vs_in_get(iph->protocol, snet, 0,
4144+ iph->daddr, 0);
4145+
4146+ if (!mst || !ip_vs_check_template(mst)) {
4147+ /*
4148+ * No template found or the dest of the masq
4149+ * template is not available.
4150+ */
4151+ read_lock(&__ip_vs_lock);
4152+
4153+ dest = svc->scheduler->schedule(svc, iph);
4154+ if (dest == NULL) {
4155+ IP_VS_DBG(1, "P-schedule: no dest found.\n");
4156+ read_unlock(&__ip_vs_lock);
4157+ return NULL;
4158+ }
4159+
4160+ /*
4161+ * Create a template like <protocol,daddr,0,
4162+ * maddr,mport,saddr,sport> for non-ftp service,
4163+ * and <protocol,daddr,0,maddr,0,saddr,0>
4164+ * for ftp service.
4165+ */
4166+ if (svc->port != FTPPORT)
4167+ mst = ip_masq_new_vs(iph->protocol,
4168+ iph->daddr, portp[1],
4169+ dest->addr, dest->port,
4170+ snet, 0,
4171+ 0);
4172+ else
4173+ mst = ip_masq_new_vs(iph->protocol,
4174+ iph->daddr, 0,
4175+ dest->addr, 0,
4176+ snet, 0,
4177+ 0);
4178+ if (mst == NULL) {
4179+ IP_VS_ERR("ip_masq_new_vs template failed\n");
4180+ read_unlock(&__ip_vs_lock);
4181+ return NULL;
4182+ }
4183+
4184+ /*
4185+ * Bind the template with dest and set timeout.
4186+ */
4187+ ip_vs_bind_masq(mst, dest);
4188+ mst->timeout = svc->timeout;
4189+
4190+ read_unlock(&__ip_vs_lock);
4191+ } else {
4192+ /*
4193+ * Template found and its destination is available.
4194+ */
4195+ dest = mst->dest;
4196+
4197+ /*
4198+ * Delete its timer so that it can be put back.
4199+ */
4200+ del_sltimer(&mst->timer);
4201+ }
4202+ dport = dest->port;
4203+ } else {
4204+ /*
4205+ * Note: persistent fwmark-based services and persistent
4206+ * port zero service are handled here.
4207+ * fwmark template: <IPPROTO_IP,daddr,0,fwmark,0,saddr,0>
4208+ * port zero template: <protocol,daddr,0,maddr,0,saddr,0>
4209+ */
4210+ if (svc->fwmark)
4211+ mst = ip_vs_in_get(IPPROTO_IP, snet, 0,
4212+ htonl(svc->fwmark), 0);
4213+ else
4214+ mst = ip_vs_in_get(iph->protocol,
4215+ snet, 0, iph->daddr, 0);
4216+
4217+ if (!mst || !ip_vs_check_template(mst)) {
4218+ /*
4219+ * If it is not persistent port zero, return NULL.
4220+ */
4221+ if (svc->port)
4222+ return NULL;
4223+
4224+ read_lock(&__ip_vs_lock);
4225+
4226+ dest = svc->scheduler->schedule(svc, iph);
4227+ if (dest == NULL) {
4228+ IP_VS_DBG(1, "P-schedule: no dest found.\n");
4229+ read_unlock(&__ip_vs_lock);
4230+ return NULL;
4231+ }
4232+
4233+ /*
4234+ * Create a template according to the service
4235+ */
4236+ if (svc->fwmark)
4237+ mst = ip_masq_new_vs(IPPROTO_IP,
4238+ htonl(svc->fwmark), 0,
4239+ dest->addr, 0,
4240+ snet, 0,
4241+ 0);
4242+ else
4243+ mst = ip_masq_new_vs(iph->protocol,
4244+ iph->daddr, 0,
4245+ dest->addr, 0,
4246+ snet, 0,
4247+ 0);
4248+ if (mst == NULL) {
4249+ IP_VS_ERR("ip_masq_new_vs template failed\n");
4250+ read_unlock(&__ip_vs_lock);
4251+ return NULL;
4252+ }
4253+
4254+ /*
4255+ * Bind the template with dest and set timeout.
4256+ */
4257+ ip_vs_bind_masq(mst, dest);
4258+ mst->timeout = svc->timeout;
4259+ read_unlock(&__ip_vs_lock);
4260+ } else {
4261+ dest = mst->dest;
4262+
4263+ /*
4264+ * Delete its timer so that it can be put back.
4265+ */
4266+ del_sltimer(&mst->timer);
4267+ }
4268+ dport = portp[1];
4269+ }
4270+
4271+ /*
4272+ * Create a new masq according to the template
4273+ */
4274+ ms = ip_masq_new_vs(iph->protocol,
4275+ iph->daddr, portp[1],
4276+ dest->addr, dport,
4277+ iph->saddr, portp[0],
4278+ 0);
4279+ if (ms == NULL) {
4280+ IP_VS_ERR("ip_masq_new_vs failed\n");
4281+ ip_masq_put(mst);
4282+ return NULL;
4283+ }
4284+
4285+ /*
4286+ * Bind the masq entry with the vs dest.
4287+ */
4288+ ip_vs_bind_masq(ms, dest);
4289+
4290+ /*
4291+ * Increase the inactive connection counter
4292+ * because it is in Syn-Received
4293+ * state (inactive) when the masq is created.
4294+ */
4295+ atomic_inc(&dest->inactconns);
4296+
4297+ /*
4298+ * Add its control
4299+ */
4300+ ip_masq_control_add(ms, mst);
4301+
4302+ ip_masq_put(mst);
4303+ return ms;
4304+}
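
The four template shapes looked up and created above reduce to a small decision. This standalone sketch only restates which tuple a persistent lookup keys on; FTPPORT is port 21, byte order is simplified to host order, and a real fwmark service has svc_port == 0, so its packets fall into the fwmark/port-zero branch:

	#include <stdio.h>

	enum { FTPPORT = 21 };

	static const char *template_shape(unsigned fwmark,
					  unsigned short svc_port,
					  unsigned short pkt_dport)
	{
		if (pkt_dport == svc_port && svc_port != 0)
			return svc_port == FTPPORT
				? "<proto, snet, 0, vaddr, 0>"	/* persistent FTP */
				: "<proto, snet, 0, vaddr, vport>";
		if (fwmark)
			return "<IPPROTO_IP, snet, 0, fwmark, 0>";
		return "<proto, snet, 0, vaddr, 0>";	/* persistent port zero */
	}

	int main(void)
	{
		printf("http:   %s\n", template_shape(0, 80, 80));
		printf("ftp:    %s\n", template_shape(0, FTPPORT, FTPPORT));
		printf("fwmark: %s\n", template_shape(1, 0, 8080));
		return 0;
	}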
4305+
4306+
4307+/*
4308+ * IPVS main scheduling function
4309+ * It selects a server according to the virtual service, and
4310+ * creates a masq entry.
4311+ */
4312+struct ip_masq *ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
4313+{
4314+ struct ip_masq *ms = NULL;
4315+ struct ip_vs_dest *dest;
4316+ const __u16 *portp;
4317+
4318+ /*
4319+ * Persistent service
4320+ */
4321+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
4322+ return ip_vs_sched_persist(svc, iph);
4323+
4324+ /*
4325+ * Non-persistent service
4326+ */
4327+ portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
4328+ if (!svc->fwmark && portp[1] != svc->port) {
4329+ if (!svc->port)
4330+ IP_VS_ERR("Schedule: port zero only supported in persistent services, check your ipvs configuration\n");
4331+ return NULL;
4332+ }
4333+
4334+ read_lock(&__ip_vs_lock);
4335+
4336+ dest = svc->scheduler->schedule(svc, iph);
4337+ if (dest == NULL) {
4338+ IP_VS_DBG(1, "Schedule: no dest found.\n");
4339+ read_unlock(&__ip_vs_lock);
4340+ return NULL;
4341+ }
4342+
4343+ /*
4344+ * Create a masquerading entry.
4345+ */
4346+ ms = ip_masq_new_vs(iph->protocol,
4347+ iph->daddr, portp[1],
4348+ dest->addr, dest->port?dest->port:portp[1],
4349+ iph->saddr, portp[0],
4350+ 0);
4351+ if (ms == NULL) {
4352+ IP_VS_ERR("Schedule: ip_masq_new_vs failed\n");
4353+ read_unlock(&__ip_vs_lock);
4354+ return NULL;
4355+ }
4356+
4357+ /*
4358+ * Bind the masq entry with the vs dest.
4359+ */
4360+ ip_vs_bind_masq(ms, dest);
4361+
4362+ /*
4363+ * Increase the inactive connection counter because it is in
4364+ * Syn-Received state (inactive) when the masq is created.
4365+ */
4366+ atomic_inc(&dest->inactconns);
4367+
4368+ IP_VS_DBG(9, "Schedule masq fwd:%c s:%s c:%u.%u.%u.%u:%d "
4369+ "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d flg:%X cnt:%d\n",
4370+ ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state),
4371+ NIPQUAD(ms->daddr),ntohs(ms->dport),
4372+ NIPQUAD(ms->maddr),ntohs(ms->mport),
4373+ NIPQUAD(ms->saddr),ntohs(ms->sport),
4374+ ms->flags, atomic_read(&ms->refcnt));
4375+
4376+ read_unlock(&__ip_vs_lock);
4377+
4378+ return ms;
4379+}
4380+
4381+
4382+/*
4383+ * Pass or drop the packet.
4384+ * Called by ip_fw_demasquerade, when the virtual service is available but
4385+ * no destination is available for a new connection.
4386+ */
4387+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb)
4388+{
4389+ struct iphdr *iph = skb->nh.iph;
4390+ __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
4391+
4392+ /*
4393+	 * When a virtual FTP service is present, packets destined for
4394+	 * other services on the VIP (except services listed in the IPVS
4395+	 * table) may get here. Pass these packets along, because it is
4396+	 * not IPVS's job to decide to drop them.
4397+ */
4398+ if ((svc->port == FTPPORT) && (portp[1] != FTPPORT))
4399+ return 0;
4400+
4401+ /*
4402+ * Notify the client that the destination is unreachable, and
4403+ * release the socket buffer.
4404+	 * Since this code runs at the IP layer, no TCP socket has
4405+	 * actually been created and a TCP RST cannot be sent; instead,
4406+	 * ICMP_PORT_UNREACH is sent for both TCP and UDP. --WZ
4407+ */
4408+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
4409+ kfree_skb(skb);
4410+ return -2;
4411+}
4412+
4413+
4414+/*
4415+ * IPVS user control entry
4416+ */
4417+int ip_vs_ctl(int optname, struct ip_masq_ctl *mctl, int optlen)
4418+{
4419+ struct ip_vs_service *svc = NULL;
4420+ struct ip_vs_user *mm = &mctl->u.vs_user;
4421+ __u32 vaddr = mm->vaddr;
4422+ __u16 vport = mm->vport;
4423+ int proto_num = masq_proto_num(mm->protocol);
4424+
4425+ /*
4426+ * Check the size of mctl, no overflow...
4427+ */
4428+ if (optlen != sizeof(*mctl))
4429+ return -EINVAL;
4430+
4431+ /*
4432+ * Flush all the virtual service...
4433+ */
4434+ if (mctl->m_cmd == IP_MASQ_CMD_FLUSH)
4435+ return ip_vs_flush();
4436+
4437+ /*
4438+ * Check for valid protocol: TCP or UDP
4439+ */
4440+ if (mm->vfwmark == 0 && (proto_num < 0 || proto_num > 1)) {
4441+ IP_VS_INFO("vs_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s",
4442+ ntohs(mm->protocol),
4443+ NIPQUAD(vaddr), ntohs(vport), mctl->m_tname);
4444+ return -EFAULT;
4445+ }
4446+
4447+ /*
4448+ * Lookup the exact service by (protocol, vaddr, vport)
4449+ */
4450+ read_lock(&__ip_vs_lock);
4451+
4452+ if (mm->vfwmark == 0)
4453+ svc = __ip_vs_lookup_service(mm->protocol, vaddr, vport);
4454+ else
4455+ svc = __ip_vs_lookup_svc_fwm(mm->vfwmark);
4456+
4457+ read_unlock(&__ip_vs_lock);
4458+
4459+ switch (mctl->m_cmd) {
4460+ case IP_MASQ_CMD_ADD:
4461+ if (svc != NULL)
4462+ return -EEXIST;
4463+
4464+ return ip_vs_add_service(mctl);
4465+
4466+ case IP_MASQ_CMD_SET:
4467+ if (svc == NULL)
4468+ return -ESRCH;
4469+
4470+ return ip_vs_edit_service(svc, mctl);
4471+
4472+ case IP_MASQ_CMD_DEL:
4473+ if (svc == NULL)
4474+ return -ESRCH;
4475+ else
4476+ return ip_vs_del_service(svc);
4477+
4478+ case IP_MASQ_CMD_ADD_DEST:
4479+ if (svc == NULL)
4480+ return -ESRCH;
4481+ else
4482+ return ip_vs_add_dest(svc, mctl);
4483+
4484+ case IP_MASQ_CMD_SET_DEST:
4485+ if (svc == NULL)
4486+ return -ESRCH;
4487+ else
4488+ return ip_vs_edit_dest(svc, mctl);
4489+
4490+ case IP_MASQ_CMD_DEL_DEST:
4491+ if (svc == NULL)
4492+ return -ESRCH;
4493+ else
4494+ return ip_vs_del_dest(svc, mctl);
4495+ }
4496+ return -EINVAL;
4497+}
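
For reference, user space reaches ip_vs_ctl() through the masquerading control setsockopt, which is how ipvsadm for 2.2 kernels drives it. A hedged sketch follows; the IP_FW_MASQ_CTL constant and the m_target dispatch live outside this hunk, so treat them (and the header paths) as assumptions here:

	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>
	#include <linux/types.h>
	#include <linux/ip_fw.h>	/* IP_FW_MASQ_CTL (assumed) */
	#include <net/ip_masq.h>	/* struct ip_masq_ctl; a kernel header,
					 * shipped with ipvsadm for user space */

	/* Add a TCP virtual service scheduled by "wlc" (sketch). */
	static int add_vs(int sockfd, __u32 vaddr, __u16 vport)
	{
		struct ip_masq_ctl ctl;

		memset(&ctl, 0, sizeof(ctl));
		ctl.m_target = IP_MASQ_TARGET_VS;	/* route to ip_vs_ctl() */
		ctl.m_cmd = IP_MASQ_CMD_ADD;
		strncpy(ctl.m_tname, "wlc", sizeof(ctl.m_tname) - 1);
		ctl.u.vs_user.protocol = IPPROTO_TCP;
		ctl.u.vs_user.vaddr = vaddr;	/* network byte order */
		ctl.u.vs_user.vport = vport;
		return setsockopt(sockfd, IPPROTO_IP, IP_FW_MASQ_CTL,
				  (char *)&ctl, sizeof(ctl));
	}

	int main(void)
	{
		int sockfd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); /* root */

		if (sockfd < 0)
			return 1;
		return add_vs(sockfd, inet_addr("192.168.0.100"),
			      htons(80)) ? 1 : 0;
	}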
4498+
4499+
4500+#ifdef CONFIG_SYSCTL
4501+
4502+static int ip_vs_sysctl_defense_mode(ctl_table *ctl, int write,
4503+ struct file * filp,void *buffer, size_t *lenp)
4504+{
4505+ int *valp = ctl->data;
4506+ int val = *valp;
4507+ int ret;
4508+
4509+ ret = proc_dointvec(ctl, write, filp, buffer, lenp);
4510+ if (write && (*valp != val)) {
4511+ if ((*valp < 0) || (*valp > 3)) {
4512+ /* Restore the correct value */
4513+ *valp = val;
4514+ } else {
4515+ update_defense_level();
4516+ }
4517+ }
4518+ return ret;
4519+}
4520+
4521+ctl_table ipv4_vs_table[] = {
4522+#ifdef CONFIG_IP_VS_DEBUG
4523+ {NET_IPV4_VS_DEBUG_LEVEL, "debug_level",
4524+ &sysctl_ip_vs_debug_level, sizeof(int), 0644, NULL,
4525+ &proc_dointvec},
4526+#endif
4527+ {NET_IPV4_VS_AMEMTHRESH, "amemthresh",
4528+ &sysctl_ip_vs_amemthresh, sizeof(int), 0644, NULL,
4529+ &proc_dointvec},
4530+ {NET_IPV4_VS_AMDROPRATE, "am_droprate",
4531+ &sysctl_ip_vs_am_droprate, sizeof(int), 0644, NULL,
4532+ &proc_dointvec},
4533+ {NET_IPV4_VS_DROP_ENTRY, "drop_entry",
4534+ &sysctl_ip_vs_drop_entry, sizeof(int), 0644, NULL,
4535+ &ip_vs_sysctl_defense_mode},
4536+ {NET_IPV4_VS_DROP_PACKET, "drop_packet",
4537+ &sysctl_ip_vs_drop_packet, sizeof(int), 0644, NULL,
4538+ &ip_vs_sysctl_defense_mode},
4539+ {NET_IPV4_VS_SECURE_TCP, "secure_tcp",
4540+ &sysctl_ip_vs_secure_tcp, sizeof(int), 0644, NULL,
4541+ &ip_vs_sysctl_defense_mode},
4542+ {NET_IPV4_VS_TO_ES, "timeout_established",
4543+ &masq_timeout_table_dos.timeout[IP_MASQ_S_ESTABLISHED],
4544+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4545+ {NET_IPV4_VS_TO_SS, "timeout_synsent",
4546+ &masq_timeout_table_dos.timeout[IP_MASQ_S_SYN_SENT],
4547+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4548+ {NET_IPV4_VS_TO_SR, "timeout_synrecv",
4549+ &masq_timeout_table_dos.timeout[IP_MASQ_S_SYN_RECV],
4550+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4551+ {NET_IPV4_VS_TO_FW, "timeout_finwait",
4552+ &masq_timeout_table_dos.timeout[IP_MASQ_S_FIN_WAIT],
4553+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4554+ {NET_IPV4_VS_TO_TW, "timeout_timewait",
4555+ &masq_timeout_table_dos.timeout[IP_MASQ_S_TIME_WAIT],
4556+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4557+ {NET_IPV4_VS_TO_CL, "timeout_close",
4558+ &masq_timeout_table_dos.timeout[IP_MASQ_S_CLOSE],
4559+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4560+ {NET_IPV4_VS_TO_CW, "timeout_closewait",
4561+ &masq_timeout_table_dos.timeout[IP_MASQ_S_CLOSE_WAIT],
4562+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4563+ {NET_IPV4_VS_TO_LA, "timeout_lastack",
4564+ &masq_timeout_table_dos.timeout[IP_MASQ_S_LAST_ACK],
4565+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4566+ {NET_IPV4_VS_TO_LI, "timeout_listen",
4567+ &masq_timeout_table_dos.timeout[IP_MASQ_S_LISTEN],
4568+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4569+ {NET_IPV4_VS_TO_SA, "timeout_synack",
4570+ &masq_timeout_table_dos.timeout[IP_MASQ_S_SYNACK],
4571+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4572+ {NET_IPV4_VS_TO_UDP, "timeout_udp",
4573+ &masq_timeout_table_dos.timeout[IP_MASQ_S_UDP],
4574+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4575+ {NET_IPV4_VS_TO_ICMP, "timeout_icmp",
4576+ &masq_timeout_table_dos.timeout[IP_MASQ_S_ICMP],
4577+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
4578+ {0}
4579+};
4580+#endif
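
These defense knobs appear under /proc/sys/net/ipv4/vs/. A small userland example of turning on the drop_entry strategy; any value outside 0..3 is silently reverted by the handler above, and the strategy semantics are defined by update_defense_level():

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/net/ipv4/vs/drop_entry", "w");

		if (!f)
			return 1;
		fprintf(f, "1\n");	/* 0 disables; 1..3 pick a strategy */
		return fclose(f) ? 1 : 0;
	}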
4581+
4582+#ifdef CONFIG_PROC_FS
4583+/*
4584+ * Write the contents of the VS rule table to a PROCfs file.
4585+ */
4586+static int ip_vs_procinfo(char *buf, char **start, off_t offset,
4587+ int length, int *eof, void *data)
4588+{
4589+ int len=0;
4590+ off_t pos=0;
4591+ char temp[64], temp2[32];
4592+ int idx;
4593+ struct ip_vs_service *svc;
4594+ struct ip_vs_dest *dest;
4595+ struct list_head *l, *e, *p, *q;
4596+
4597+ /*
4598+	 * Note: since the buffer length is usually a multiple of 512,
4599+	 * it is good to use fixed-size records whose size divides 512,
4600+	 * so that records won't be truncated at a buffer boundary.
4601+ */
4602+ pos = 192;
4603+ if (pos > offset) {
4604+ sprintf(temp,
4605+ "IP Virtual Server version %d.%d.%d (size=%d)",
4606+ NVERSION(IP_VS_VERSION_CODE), IP_VS_TAB_SIZE);
4607+ len += sprintf(buf+len, "%-63s\n", temp);
4608+ len += sprintf(buf+len, "%-63s\n",
4609+ "Prot LocalAddress:Port Scheduler Flags");
4610+ len += sprintf(buf+len, "%-63s\n",
4611+ " -> RemoteAddress:Port Forward Weight ActiveConn InActConn");
4612+ }
4613+
4614+ read_lock_bh(&__ip_vs_lock);
4615+
4616+ /* print the service table hashed by <protocol,addr,port> */
4617+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4618+ l = &ip_vs_svc_table[idx];
4619+ for (e=l->next; e!=l; e=e->next) {
4620+ svc = list_entry(e, struct ip_vs_service, s_list);
4621+ pos += 64;
4622+ if (pos > offset) {
4623+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
4624+ sprintf(temp2, "persistent %d %08X",
4625+ svc->timeout,
4626+ ntohl(svc->netmask));
4627+ else
4628+ temp2[0] = '\0';
4629+
4630+ sprintf(temp, "%s %08X:%04X %s %s",
4631+ masq_proto_name(svc->protocol),
4632+ ntohl(svc->addr),
4633+ ntohs(svc->port),
4634+ svc->scheduler->name, temp2);
4635+ len += sprintf(buf+len, "%-63s\n", temp);
4636+ if (len >= length)
4637+ goto done;
4638+ }
4639+
4640+ p = &svc->destinations;
4641+ for (q=p->next; q!=p; q=q->next) {
4642+ dest = list_entry(q, struct ip_vs_dest, n_list);
4643+ pos += 64;
4644+ if (pos <= offset)
4645+ continue;
4646+ sprintf(temp,
4647+ " -> %08X:%04X %-7s %-6d %-10d %-10d",
4648+ ntohl(dest->addr),
4649+ ntohs(dest->port),
4650+ ip_vs_fwd_name(dest->masq_flags),
4651+ dest->weight,
4652+ atomic_read(&dest->activeconns),
4653+ atomic_read(&dest->inactconns));
4654+ len += sprintf(buf+len, "%-63s\n", temp);
4655+ if (len >= length)
4656+ goto done;
4657+ }
4658+ }
4659+ }
4660+
4661+ /* print the service table hashed by fwmark */
4662+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4663+ l = &ip_vs_svc_fwm_table[idx];
4664+ for (e=l->next; e!=l; e=e->next) {
4665+ svc = list_entry(e, struct ip_vs_service, f_list);
4666+ pos += 64;
4667+ if (pos > offset) {
4668+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
4669+ sprintf(temp2, "persistent %d %08X",
4670+ svc->timeout,
4671+ ntohl(svc->netmask));
4672+ else
4673+ temp2[0] = '\0';
4674+
4675+ sprintf(temp, "FWM %08X %s %s",
4676+ svc->fwmark,
4677+ svc->scheduler->name, temp2);
4678+ len += sprintf(buf+len, "%-63s\n", temp);
4679+ if (len >= length)
4680+ goto done;
4681+ }
4682+
4683+ p = &svc->destinations;
4684+ for (q=p->next; q!=p; q=q->next) {
4685+ dest = list_entry(q, struct ip_vs_dest, n_list);
4686+ pos += 64;
4687+ if (pos <= offset)
4688+ continue;
4689+ sprintf(temp,
4690+ " -> %08X:%04X %-7s %-6d %-10d %-10d",
4691+ ntohl(dest->addr),
4692+ ntohs(dest->port),
4693+ ip_vs_fwd_name(dest->masq_flags),
4694+ dest->weight,
4695+ atomic_read(&dest->activeconns),
4696+ atomic_read(&dest->inactconns));
4697+ len += sprintf(buf+len, "%-63s\n", temp);
4698+ if (len >= length)
4699+ goto done;
4700+ }
4701+ }
4702+ }
4703+
4704+ done:
4705+ read_unlock_bh(&__ip_vs_lock);
4706+
4707+ *start = buf+len-(pos-offset); /* Start of wanted data */
4708+ len = pos-offset;
4709+ if (len > length)
4710+ len = length;
4711+ if (len < 0)
4712+ len = 0;
4713+ return len;
4714+}
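
The fixed 64-byte records ("%-63s\n") are what make the divisor-of-512 note above work: 512 % 64 == 0, so a record can never straddle a proc read-buffer boundary. A quick standalone check:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[80];
		int n = sprintf(buf, "%-63s\n", "TCP 0A000001:0050 wlc");

		/* 63 padded characters plus '\n' is always 64 bytes */
		printf("record length = %d, 512 %% 64 = %d\n", n, 512 % 64);
		return !(n == 64 && strlen(buf) == 64);
	}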
4715+
4716+struct proc_dir_entry ip_vs_proc_entry = {
4717+ 0, /* dynamic inode */
4718+ 2, "vs", /* namelen and name */
4719+ S_IFREG | S_IRUGO, /* mode */
4720+ 1, 0, 0, 0, /* nlinks, owner, group, size */
4721+ &proc_net_inode_operations, /* operations */
4722+ NULL, /* get_info */
4723+ NULL, /* fill_inode */
4724+ NULL, NULL, NULL, /* next, parent, subdir */
4725+ NULL, /* data */
4726+ &ip_vs_procinfo, /* function to generate proc data */
4727+};
4728+
4729+
4730+/*
4731+ * Write the IPVS statistic information to a PROCfs file.
4732+ */
4733+struct ip_vs_stats ip_vs_stats = {SPIN_LOCK_UNLOCKED, 0, 0};
4734+
4735+static int
4736+ip_vs_stats_get_info(char *buf, char **start, off_t offset,
4737+ int length, int *eof, void *data)
4738+{
4739+ int idx;
4740+ int len=0;
4741+ off_t pos=0;
4742+ char temp[128];
4743+ struct ip_vs_service *svc;
4744+ struct ip_vs_dest *dest;
4745+ struct list_head *l, *e, *p, *q;
4746+
4747+ pos += 128;
4748+ if (pos > offset) {
4749+ len += sprintf(buf+len, "%-63s\n",
4750+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
4751+ "TotalConns InPkts OutPkts InBytes OutBytes");
4752+ spin_lock(&ip_vs_stats.lock);
4753+ sprintf(temp, " %8X %8X %8X %8X%08X %8X%08X",
4754+ ip_vs_stats.conns,
4755+ ip_vs_stats.inpkts,
4756+ ip_vs_stats.outpkts,
4757+ (__u32)(ip_vs_stats.inbytes >> 32),
4758+ (__u32)ip_vs_stats.inbytes,
4759+ (__u32)(ip_vs_stats.outbytes >> 32),
4760+ (__u32)ip_vs_stats.outbytes);
4761+ spin_unlock(&ip_vs_stats.lock);
4762+ len += sprintf(buf+len, "%-63s\n", temp);
4763+ }
4764+
4765+ read_lock_bh(&__ip_vs_lock);
4766+
4767+ /* print the service statistics */
4768+ pos += 128;
4769+ if (pos > offset) {
4770+ len += sprintf(buf+len, "%-127s\n",
4771+ "\nVirtual Service\n"
4772+ "Pro VirtService Conns InPkts OutPkts InBytes OutBytes");
4773+ }
4774+
4775+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4776+ l = &ip_vs_svc_table[idx];
4777+ for (e=l->next; e!=l; e=e->next) {
4778+ svc = list_entry(e, struct ip_vs_service, s_list);
4779+ pos += 128;
4780+ if (pos <= offset)
4781+ continue;
4782+ spin_lock(&svc->stats.lock);
4783+ sprintf(temp, "%3s %08X:%04X %8X %8X %8X %8X%08X %8X%08X",
4784+ masq_proto_name(svc->protocol),
4785+ ntohl(svc->addr),
4786+ ntohs(svc->port),
4787+ svc->stats.conns,
4788+ svc->stats.inpkts,
4789+ svc->stats.outpkts,
4790+ (__u32)(svc->stats.inbytes >> 32),
4791+ (__u32)svc->stats.inbytes,
4792+ (__u32)(svc->stats.outbytes >> 32),
4793+ (__u32)svc->stats.outbytes);
4794+ spin_unlock(&svc->stats.lock);
4795+ len += sprintf(buf+len, "%-127s\n", temp);
4796+ if (pos >= offset+length)
4797+ goto done;
4798+ }
4799+ }
4800+
4801+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4802+ l = &ip_vs_svc_fwm_table[idx];
4803+ for (e=l->next; e!=l; e=e->next) {
4804+ svc = list_entry(e, struct ip_vs_service, f_list);
4805+ pos += 128;
4806+ if (pos <= offset)
4807+ continue;
4808+ spin_lock(&svc->stats.lock);
4809+ sprintf(temp, "FWM %08X %8X %8X %8X %8X%08X %8X%08X",
4810+ svc->fwmark,
4811+ svc->stats.conns,
4812+ svc->stats.inpkts,
4813+ svc->stats.outpkts,
4814+ (__u32)(svc->stats.inbytes >> 32),
4815+ (__u32)svc->stats.inbytes,
4816+ (__u32)(svc->stats.outbytes >> 32),
4817+ (__u32)svc->stats.outbytes);
4818+ spin_unlock(&svc->stats.lock);
4819+ len += sprintf(buf+len, "%-127s\n", temp);
4820+ if (pos >= offset+length)
4821+ goto done;
4822+ }
4823+ }
4824+
4825+ /* print the real server statistics */
4826+ pos += 128;
4827+ if (pos > offset) {
4828+ len += sprintf(buf+len, "%-127s\n",
4829+ "\nReal Service\n"
4830+ "Pro VirtService RealService Conns InPkts OutPkts InBytes OutBytes");
4831+ }
4832+
4833+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4834+ l = &ip_vs_svc_table[idx];
4835+ for (e=l->next; e!=l; e=e->next) {
4836+ svc = list_entry(e, struct ip_vs_service, s_list);
4837+ p = &svc->destinations;
4838+ for (q=p->next; q!=p; q=q->next) {
4839+ dest = list_entry(q, struct ip_vs_dest, n_list);
4840+ pos += 128;
4841+ if (pos <= offset)
4842+ continue;
4843+ spin_lock(&dest->stats.lock);
4844+ sprintf(temp,
4845+ "%3s %08X:%04X %08X:%04X %8X %8X %8X %8X%08X %8X%08X",
4846+ masq_proto_name(svc->protocol),
4847+ ntohl(svc->addr),
4848+ ntohs(svc->port),
4849+ ntohl(dest->addr),
4850+ ntohs(dest->port),
4851+ dest->stats.conns,
4852+ dest->stats.inpkts,
4853+ dest->stats.outpkts,
4854+ (__u32)(dest->stats.inbytes >> 32),
4855+ (__u32)dest->stats.inbytes,
4856+ (__u32)(dest->stats.outbytes >> 32),
4857+ (__u32)dest->stats.outbytes);
4858+ spin_unlock(&dest->stats.lock);
4859+ len += sprintf(buf+len, "%-127s\n", temp);
4860+ if (pos >= offset+length)
4861+ goto done;
4862+ }
4863+ }
4864+ }
4865+
4866+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
4867+ l = &ip_vs_svc_fwm_table[idx];
4868+ for (e=l->next; e!=l; e=e->next) {
4869+ svc = list_entry(e, struct ip_vs_service, f_list);
4870+ p = &svc->destinations;
4871+ for (q=p->next; q!=p; q=q->next) {
4872+ dest = list_entry(q,struct ip_vs_dest,n_list);
4873+ pos += 128;
4874+ if (pos <= offset)
4875+ continue;
4876+ spin_lock(&dest->stats.lock);
4877+ sprintf(temp,
4878+ "FWM %08X %08X:%04X %8X %8X %8X %8X%08X %8X%08X",
4879+ svc->fwmark,
4880+ ntohl(dest->addr),
4881+ ntohs(dest->port),
4882+ dest->stats.conns,
4883+ dest->stats.inpkts,
4884+ dest->stats.outpkts,
4885+ (__u32)(dest->stats.inbytes >> 32),
4886+ (__u32)dest->stats.inbytes,
4887+ (__u32)(dest->stats.outbytes >> 32),
4888+ (__u32)dest->stats.outbytes);
4889+ spin_unlock(&dest->stats.lock);
4890+ len += sprintf(buf+len, "%-127s\n", temp);
4891+ if (pos >= offset+length)
4892+ goto done;
4893+ }
4894+ }
4895+ }
4896+ done:
4897+ read_unlock_bh(&__ip_vs_lock);
4898+
4899+ *start = buf+len-(pos-offset); /* Start of wanted data */
4900+ len = pos-offset;
4901+ if (len > length)
4902+ len = length;
4903+ if (len < 0)
4904+ len = 0;
4905+ return len;
4906+}
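
The "%8X%08X" pairs above print each 64-bit byte counter as two 32-bit halves, high word first, which sidesteps any 64-bit format specifier in the kernel's sprintf. The same split in miniature:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long inbytes = 0x123456789ULL; /* sample counter */

		/* high 32 bits, then low 32 bits zero-padded to 8 digits */
		printf("%8X%08X\n",
		       (unsigned)(inbytes >> 32), (unsigned)inbytes);
		return 0;
	}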
4907+
4908+struct proc_dir_entry ip_vs_stat_proc_entry = {
4909+ 0, /* dynamic inode */
4910+ 8, "vs_stats", /* namelen and name */
4911+ S_IFREG | S_IRUGO, /* mode */
4912+ 1, 0, 0, 0, /* nlinks, owner, group, size */
4913+ &proc_net_inode_operations, /* operations */
4914+ NULL, /* get_info */
4915+ NULL, /* fill_inode */
4916+ NULL, NULL, NULL, /* next, parent, subdir */
4917+ NULL, /* data */
4918+ &ip_vs_stats_get_info, /* function to generate proc data */
4919+};
4920+
4921+#endif
4922+
4923+
4924+/*
4925+ * This function encapsulates the packet in a new IP header whose
4926+ * destination is set to daddr. Most of this code is from ipip.c.
4927+ * Usage:
4928+ * It is called in the ip_vs_forward() function. The load balancer
4929+ * selects a real server from a cluster based on a scheduling algorithm,
4930+ * encapsulates the packet and forwards it to the selected server. All real
4931+ * servers are configured with "ifconfig tunl0 <Virtual IP Address> up".
4932+ * When the server receives the encapsulated packet, it decapsulates the
4933+ * packet, processes the request and returns the reply packets directly
4934+ * to the client without passing through the load balancer. This can
4935+ * greatly increase the scalability of the virtual server.
4936+ * Returns:
4937+ * if succeeded, return 1; otherwise, return 0.
4938+ */
4939+
4940+int ip_vs_tunnel_xmit(struct sk_buff *skb, __u32 daddr)
4941+{
4942+ struct rtable *rt; /* Route to the other host */
4943+ struct device *tdev; /* Device to other host */
4944+ struct iphdr *old_iph = skb->nh.iph;
4945+ u8 tos = old_iph->tos;
4946+ u16 df = old_iph->frag_off;
4947+ struct iphdr *iph; /* Our new IP header */
4948+ int max_headroom; /* The extra header space needed */
4949+ u32 dst = daddr;
4950+ u32 src = 0;
4951+ int mtu;
4952+
4953+ if (skb->protocol != __constant_htons(ETH_P_IP)) {
4954+ IP_VS_DBG(0, "ip_vs_tunnel_xmit(): protocol error, ETH_P_IP: %d, skb protocol: %d\n",
4955+ __constant_htons(ETH_P_IP),skb->protocol);
4956+ goto tx_error;
4957+ }
4958+
4959+ if (ip_route_output(&rt, dst, src, RT_TOS(tos), 0)) {
4960+ IP_VS_DBG(0, "ip_vs_tunnel_xmit(): route error, dest: "
4961+ "%u.%u.%u.%u\n", NIPQUAD(dst));
4962+ goto tx_error_icmp;
4963+ }
4964+ tdev = rt->u.dst.dev;
4965+
4966+ mtu = rt->u.dst.pmtu - sizeof(struct iphdr);
4967+ if (mtu < 68) {
4968+ ip_rt_put(rt);
4969+ IP_VS_DBG(0, "ip_vs_tunnel_xmit(): mtu less than 68\n");
4970+ goto tx_error;
4971+ }
4972+ if (skb->dst && mtu < skb->dst->pmtu)
4973+ skb->dst->pmtu = mtu;
4974+
4975+ df |= (old_iph->frag_off&__constant_htons(IP_DF));
4976+
4977+ if ((old_iph->frag_off&__constant_htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
4978+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
4979+ ip_rt_put(rt);
4980+ IP_VS_DBG(0, "ip_vs_tunnel_xmit(): frag needed\n");
4981+ goto tx_error;
4982+ }
4983+
4984+ skb->h.raw = skb->nh.raw;
4985+
4986+ /*
4987+ * Okay, now see if we can stuff it in the buffer as-is.
4988+ */
4989+ max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
4990+
4991+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
4992+ struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
4993+ if (!new_skb) {
4994+ ip_rt_put(rt);
4995+ kfree_skb(skb);
4996+ IP_VS_ERR("ip_vs_tunnel_xmit(): no memory for new_skb\n");
4997+ return 0;
4998+ }
4999+ kfree_skb(skb);
5000+ skb = new_skb;
5001+ }
5002+
5003+ skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
5004+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
5005+ dst_release(skb->dst);
5006+ skb->dst = &rt->u.dst;
5007+
5008+ /*
5009+ * Push down and install the IPIP header.
5010+ */
5011+
5012+ iph = skb->nh.iph;
5013+ iph->version = 4;
5014+ iph->ihl = sizeof(struct iphdr)>>2;
5015+ iph->frag_off = df;
5016+ iph->protocol = IPPROTO_IPIP;
5017+ iph->tos = tos;
5018+ iph->daddr = rt->rt_dst;
5019+ iph->saddr = rt->rt_src;
5020+ iph->ttl = old_iph->ttl;
5021+ iph->tot_len = htons(skb->len);
5022+ iph->id = htons(ip_id_count++);
5023+ ip_send_check(iph);
5024+
5025+ IPCB(skb)->flags |= IPSKB_REDIRECTED;
5026+ IPCB(skb)->flags |= IPSKB_MASQUERADED;
5027+
5028+ ip_send(skb);
5029+ return 1;
5030+
5031+ tx_error_icmp:
5032+ dst_link_failure(skb);
5033+ tx_error:
5034+ kfree_skb(skb);
5035+ return 0;
5036+}
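
What ip_vs_tunnel_xmit() builds is plain IPIP (IP protocol 4): a second IP header pushed in front of the untouched original datagram, with the balancer as outer source and the real server as outer destination. A minimal sketch of just the outer-header fill, with illustrative addresses; checksum, id and routing are omitted, and this is not kernel code:

	#include <stdio.h>
	#include <string.h>
	#include <netinet/in.h>
	#include <netinet/ip.h>
	#include <arpa/inet.h>

	static void encap(struct iphdr *outer, const struct iphdr *inner,
			  in_addr_t saddr, in_addr_t daddr, int total_len)
	{
		memset(outer, 0, sizeof(*outer));
		outer->version  = 4;
		outer->ihl      = sizeof(*outer) >> 2;
		outer->tos      = inner->tos;	/* inherited from inner */
		outer->ttl      = inner->ttl;
		outer->protocol = IPPROTO_IPIP;	/* protocol 4 */
		outer->saddr    = saddr;	/* load balancer */
		outer->daddr    = daddr;	/* chosen real server */
		outer->tot_len  = htons(total_len);
	}

	int main(void)
	{
		struct iphdr inner = { 0 }, outer;

		inner.version = 4; inner.ihl = 5; inner.ttl = 64;
		inner.tot_len = htons(40);
		encap(&outer, &inner, inet_addr("10.1.1.1"),
		      inet_addr("10.1.1.2"), sizeof(outer) + 40);
		printf("outer: proto=%d tot_len=%d\n",
		       outer.protocol, ntohs(outer.tot_len));
		return 0;
	}

Because the real server owns the VIP on tunl0, the inner destination is local after decapsulation and the reply goes straight back to the client.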
5037+
5038+
5039+/*
5040+ * Direct Routing
5041+ */
5042+int ip_vs_dr_xmit(struct sk_buff *skb, __u32 daddr)
5043+{
5044+ struct rtable *rt; /* Route to the other host */
5045+ struct iphdr *iph = skb->nh.iph;
5046+ u8 tos = iph->tos;
5047+ int mtu;
5048+
5049+ if (ip_route_output(&rt, daddr, 0, RT_TOS(tos), 0)) {
5050+ IP_VS_DBG(0, "ip_vs_dr_xmit(): route error, dest: %u.%u.%u.%u\n",
5051+ NIPQUAD(daddr));
5052+ goto tx_error_icmp;
5053+ }
5054+
5055+ /* MTU checking */
5056+ mtu = rt->u.dst.pmtu;
5057+ if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
5058+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
5059+ ip_rt_put(rt);
5060+ IP_VS_DBG(0, "ip_vs_dr_xmit(): frag needed\n");
5061+ goto tx_error;
5062+ }
5063+
5064+ dst_release(skb->dst);
5065+ skb->dst = &rt->u.dst;
5066+
5067+ IPCB(skb)->flags |= IPSKB_REDIRECTED;
5068+ IPCB(skb)->flags |= IPSKB_MASQUERADED;
5069+
5070+ ip_send(skb);
5071+ return 1;
5072+
5073+ tx_error_icmp:
5074+ dst_link_failure(skb);
5075+ tx_error:
5076+ kfree_skb(skb);
5077+ return 0;
5078+}
5079+
5080+
5081+/*
5082+ * Initialize IP virtual server
5083+ */
5084+__initfunc(int ip_vs_init(void))
5085+{
5086+ int idx;
5087+
5088+ /*
5089+	 * Allocate the ip_vs_table and initialize its list heads.
5090+	 * Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable,
5091+ * ip_vs_schedulers and ip_vs_dest_trash.
5092+ */
5093+ if (!(ip_vs_table =
5094+ vmalloc(IP_VS_TAB_SIZE*sizeof(struct list_head)))) {
5095+ return -ENOMEM;
5096+ }
5097+ for(idx = 0; idx < IP_VS_TAB_SIZE; idx++) {
5098+ INIT_LIST_HEAD(&ip_vs_table[idx]);
5099+ }
5100+ IP_VS_INFO("Connection hash table configured "
5101+ "(size=%d, memory=%ldKbytes)\n",
5102+ IP_VS_TAB_SIZE,
5103+ (long) (IP_VS_TAB_SIZE*sizeof(struct list_head))/1024);
5104+
5105+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
5106+ INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
5107+ INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
5108+ }
5109+ for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
5110+ INIT_LIST_HEAD(&ip_vs_rtable[idx]);
5111+ }
5112+ INIT_LIST_HEAD(&ip_vs_schedulers);
5113+ INIT_LIST_HEAD(&ip_vs_dest_trash);
5114+
5115+ /*
5116+ * Hook the slow_timer handler in the system timer.
5117+ */
5118+ slow_timer.expires = jiffies+SLTIMER_PERIOD;
5119+ add_timer(&slow_timer);
5120+
5121+#ifdef CONFIG_PROC_FS
5122+ ip_masq_proc_register(&ip_vs_proc_entry);
5123+ ip_masq_proc_register(&ip_vs_stat_proc_entry);
5124+#endif
5125+
5126+#ifdef CONFIG_IP_MASQUERADE_VS_RR
5127+ ip_vs_rr_init();
5128+#endif
5129+#ifdef CONFIG_IP_MASQUERADE_VS_WRR
5130+ ip_vs_wrr_init();
5131+#endif
5132+#ifdef CONFIG_IP_MASQUERADE_VS_LC
5133+ ip_vs_lc_init();
5134+#endif
5135+#ifdef CONFIG_IP_MASQUERADE_VS_WLC
5136+ ip_vs_wlc_init();
5137+#endif
5138+#ifdef CONFIG_IP_MASQUERADE_VS_LBLC
5139+ ip_vs_lblc_init();
5140+#endif
5141+#ifdef CONFIG_IP_MASQUERADE_VS_LBLCR
5142+ ip_vs_lblcr_init();
5143+#endif
5144+ return 0;
5145+}
5146diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lblc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblc.c
5147--- linux-2.2.19/net/ipv4/ip_vs_lblc.c Thu Jan 1 08:00:00 1970
5148+++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblc.c Fri Feb 2 18:49:08 2001
5149@@ -0,0 +1,645 @@
5150+/*
5151+ * IPVS: Locality-Based Least-Connection scheduling module
5152+ *
5153+ * Version: $Id$
5154+ *
5155+ * Authors: Wensong Zhang <wensong@gnuchina.org>
5156+ *
5157+ * This program is free software; you can redistribute it and/or
5158+ * modify it under the terms of the GNU General Public License
5159+ * as published by the Free Software Foundation; either version
5160+ * 2 of the License, or (at your option) any later version.
5161+ *
5162+ * Changes:
5163+ * Martin Hamilton : fixed the terrible locking bugs
5164+ * *lock(tbl->lock) ==> *lock(&tbl->lock)
5165+ *     Wensong Zhang            :     fixed the uninitialized tbl->lock bug
5166+ * Wensong Zhang : added doing full expiration check to
5167+ * collect stale entries of 24+ hours when
5168+ * no partial expire check in a half hour
5169+ *
5170+ */
5171+
5172+/*
5173+ * The lblc algorithm is as follows (pseudo code):
5174+ *
5175+ * if cachenode[dest_ip] is null then
5176+ * n, cachenode[dest_ip] <- {weighted least-conn node};
5177+ * else
5178+ * n <- cachenode[dest_ip];
5179+ * if (n is dead) OR
5180+ * (n.conns>n.weight AND
5181+ * there is a node m with m.conns<m.weight/2) then
5182+ * n, cachenode[dest_ip] <- {weighted least-conn node};
5183+ *
5184+ * return n;
5185+ *
5186+ * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
5187+ * me to write this module.
5188+ */
5189+
5190+#include <linux/config.h>
5191+#include <linux/module.h>
5192+#ifdef CONFIG_KMOD
5193+#include <linux/kmod.h>
5194+#endif
5195+#include <linux/types.h>
5196+#include <linux/kernel.h>
5197+#include <linux/errno.h>
5198+#include <linux/vmalloc.h>
5199+#include <net/ip_masq.h>
5200+#ifdef CONFIG_IP_MASQUERADE_MOD
5201+#include <net/ip_masq_mod.h>
5202+#endif
5203+#include <linux/sysctl.h>
5204+#include <linux/proc_fs.h>
5205+#include <linux/ip_fw.h>
5206+#include <net/ip_vs.h>
5207+
5208+
5209+/*
5210+ * These control garbage collection of stale IPVS lblc entries
5211+ * when the table is full.
5212+ */
5213+#define CHECK_EXPIRE_INTERVAL (60*HZ)
5214+#define ENTRY_TIMEOUT (5*60*HZ)
5215+
5216+/*
5217+ * These control the full expiration check.
5218+ * When there has been no partial expiration check (garbage collection)
5219+ * within half an hour, do a full expiration check to collect stale
5220+ * entries that haven't been touched for a day (by default).
5221+ */
5222+#define COUNT_FOR_FULL_EXPIRATION 30
5223+int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
5224+
5225+
5226+/*
5227+ * for IPVS lblc entry hash table
5228+ */
5229+#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
5230+#define CONFIG_IP_VS_LBLC_TAB_BITS 10
5231+#endif
5232+#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
5233+#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
5234+#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
5235+
5236+
5237+/*
5238+ * IPVS lblc entry represents an association between destination
5239+ * IP address and its destination server
5240+ */
5241+struct ip_vs_lblc_entry {
5242+ struct list_head list;
5243+ __u32 addr; /* destination IP address */
5244+ struct ip_vs_dest *dest; /* real server (cache) */
5245+ unsigned long lastuse; /* last used time */
5246+};
5247+
5248+
5249+/*
5250+ * IPVS lblc hash table
5251+ */
5252+struct ip_vs_lblc_table {
5253+ rwlock_t lock; /* lock for this table */
5254+ struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
5255+ atomic_t entries; /* number of entries */
5256+ int max_size; /* maximum size of entries */
5257+ struct timer_list periodic_timer; /* collect stale entries */
5258+ int rover; /* rover for expire check */
5259+ int counter; /* counter for no expire */
5260+};
5261+
5262+
5263+
5264+/*
5265+ * IPVS LBLC sysctl table
5266+ */
5267+struct ip_vs_lblc_sysctl_table {
5268+ struct ctl_table_header *sysctl_header;
5269+ ctl_table vs_vars[2];
5270+ ctl_table vs_dir[2];
5271+ ctl_table ipv4_dir[2];
5272+ ctl_table root_dir[2];
5273+};
5274+
5275+
5276+static struct ip_vs_lblc_sysctl_table lblc_sysctl_table = {
5277+ NULL,
5278+ {{NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration",
5279+ &sysctl_ip_vs_lblc_expiration,
5280+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
5281+ {0}},
5282+ {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblc_sysctl_table.vs_vars},
5283+ {0}},
5284+ {{NET_IPV4, "ipv4", NULL, 0, 0555, lblc_sysctl_table.vs_dir},
5285+ {0}},
5286+ {{CTL_NET, "net", NULL, 0, 0555, lblc_sysctl_table.ipv4_dir},
5287+ {0}}
5288+};
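
Registering root_dir above materializes /proc/sys/net/ipv4/vs/lblc_expiration through the net -> ipv4 -> vs chain. Because the handler is proc_dointvec_jiffies, user space reads and writes seconds while the kernel stores jiffies, so the 24*60*60*HZ default reads back as 86400. For example:

	#include <stdio.h>

	int main(void)
	{
		int secs;
		FILE *f = fopen("/proc/sys/net/ipv4/vs/lblc_expiration", "r");

		if (!f || fscanf(f, "%d", &secs) != 1)
			return 1;	/* table not registered or unreadable */
		printf("lblc entries expire after %d seconds\n", secs);
		fclose(f);
		return 0;
	}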
5289+
5290+
5291+/*
5292+ *	Create/free an ip_vs_lblc_entry, which is a mapping of a destination
5293+ * IP address to a server.
5294+ */
5295+static inline struct ip_vs_lblc_entry *
5296+ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
5297+{
5298+ struct ip_vs_lblc_entry *en;
5299+
5300+ en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
5301+ if (en == NULL) {
5302+ IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
5303+ return NULL;
5304+ }
5305+
5306+ INIT_LIST_HEAD(&en->list);
5307+ en->addr = daddr;
5308+
5309+ atomic_inc(&dest->refcnt);
5310+ en->dest = dest;
5311+
5312+ return en;
5313+}
5314+
5315+
5316+static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
5317+{
5318+ list_del(&en->list);
5319+ atomic_dec(&en->dest->refcnt);
5320+ kfree(en);
5321+}
5322+
5323+
5324+/*
5325+ * Returns hash value for IPVS LBLC entry
5326+ */
5327+static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
5328+{
5329+ return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
5330+}
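
The constant 2654435761 (0x9e3779b1) is a prime near 2^32/phi, the 32-bit golden-ratio constant commonly used in Linux hash functions: multiplying by it mixes all the address bits, and the mask keeps the low IP_VS_LBLC_TAB_BITS bits as the bucket index. In miniature (using host byte order directly, where the kernel code first applies ntohl):

	#include <stdio.h>

	int main(void)
	{
		unsigned int addr = 0xC0A80001u;	/* 192.168.0.1 */
		unsigned int bucket =
			(addr * 2654435761u) & ((1u << 10) - 1); /* TAB_BITS=10 */

		printf("bucket = %u of 1024\n", bucket);
		return 0;
	}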
5331+
5332+
5333+/*
5334+ * Hash an entry in the ip_vs_lblc_table.
5335+ * returns bool success.
5336+ */
5337+static int
5338+ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
5339+{
5340+ unsigned hash;
5341+
5342+ if (!list_empty(&en->list)) {
5343+ IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
5344+ "called from %p\n", __builtin_return_address(0));
5345+ return 0;
5346+ }
5347+
5348+ /*
5349+ * Hash by destination IP address
5350+ */
5351+ hash = ip_vs_lblc_hashkey(en->addr);
5352+
5353+ write_lock(&tbl->lock);
5354+ list_add(&en->list, &tbl->bucket[hash]);
5355+ atomic_inc(&tbl->entries);
5356+ write_unlock(&tbl->lock);
5357+
5358+ return 1;
5359+}
5360+
5361+
5362+#if 0000
5363+/*
5364+ * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
5365+ * returns bool success.
5366+ */
5367+static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
5368+ struct ip_vs_lblc_entry *en)
5369+{
5370+ if (list_empty(&en->list)) {
5371+ IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
5372+ "called from %p\n", __builtin_return_address(0));
5373+ return 0;
5374+ }
5375+
5376+ /*
5377+ * Remove it from the table
5378+ */
5379+ write_lock(&tbl->lock);
5380+ list_del(&en->list);
5381+ INIT_LIST_HEAD(&en->list);
5382+ write_unlock(&tbl->lock);
5383+
5384+ return 1;
5385+}
5386+#endif
5387+
5388+
5389+/*
5390+ * Get ip_vs_lblc_entry associated with supplied parameters.
5391+ */
5392+static inline struct ip_vs_lblc_entry *
5393+ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
5394+{
5395+ unsigned hash;
5396+ struct ip_vs_lblc_entry *en;
5397+ struct list_head *l,*e;
5398+
5399+ hash = ip_vs_lblc_hashkey(addr);
5400+
5401+ read_lock(&tbl->lock);
5402+
5403+ l = &tbl->bucket[hash];
5404+ for (e=l->next; e!=l; e=e->next) {
5405+ en = list_entry(e, struct ip_vs_lblc_entry, list);
5406+ if (en->addr == addr) {
5407+ /* HIT */
5408+ read_unlock(&tbl->lock);
5409+ return en;
5410+ }
5411+ }
5412+
5413+ read_unlock(&tbl->lock);
5414+
5415+ return NULL;
5416+}
5417+
5418+
5419+/*
5420+ * Flush all the entries of the specified table.
5421+ */
5422+static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
5423+{
5424+ int i;
5425+ struct list_head *l;
5426+ struct ip_vs_lblc_entry *en;
5427+
5428+ for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
5429+ write_lock(&tbl->lock);
5430+ for (l=&tbl->bucket[i]; l->next!=l; ) {
5431+ en = list_entry(l->next,
5432+ struct ip_vs_lblc_entry, list);
5433+ ip_vs_lblc_free(en);
5434+ atomic_dec(&tbl->entries);
5435+ }
5436+ write_unlock(&tbl->lock);
5437+ }
5438+}
5439+
5440+
5441+static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
5442+{
5443+ unsigned long now = jiffies;
5444+ int i, j;
5445+ struct list_head *l, *e;
5446+ struct ip_vs_lblc_entry *en;
5447+
5448+ for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
5449+ j = (j + 1) & IP_VS_LBLC_TAB_MASK;
5450+ e = l = &tbl->bucket[j];
5451+ write_lock(&tbl->lock);
5452+ while (e->next != l) {
5453+ en = list_entry(e->next,
5454+ struct ip_vs_lblc_entry, list);
5455+ if ((now - en->lastuse) <
5456+ sysctl_ip_vs_lblc_expiration) {
5457+ e = e->next;
5458+ continue;
5459+ }
5460+ ip_vs_lblc_free(en);
5461+ atomic_dec(&tbl->entries);
5462+ }
5463+ write_unlock(&tbl->lock);
5464+ }
5465+ tbl->rover = j;
5466+}
5467+
5468+
5469+/*
5470+ * Periodic timer handler for the IPVS lblc table
5471+ * It is used to collect stale entries when the number of entries
5472+ * exceeds the maximum size of the table.
5473+ *
5474+ * Fixme: we probably need a more complicated algorithm to collect
5475+ * entries that have not been used for a long time even
5476+ * if the number of entries doesn't exceed the maximum size
5477+ * of the table.
5478+ * The full expiration check is for this purpose now.
5479+ */
5480+static void ip_vs_lblc_check_expire(unsigned long data)
5481+{
5482+ struct ip_vs_lblc_table *tbl;
5483+ unsigned long now = jiffies;
5484+ int goal;
5485+ int i, j;
5486+ struct list_head *l, *e;
5487+ struct ip_vs_lblc_entry *en;
5488+
5489+ tbl = (struct ip_vs_lblc_table *)data;
5490+
5491+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
5492+ /* do full expiration check */
5493+ ip_vs_lblc_full_check(tbl);
5494+ tbl->counter = 1;
5495+ goto out;
5496+ }
5497+
5498+ if (atomic_read(&tbl->entries) < tbl->max_size) {
5499+ tbl->counter++;
5500+ goto out;
5501+ }
5502+
5503+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
5504+ if (goal > tbl->max_size/2)
5505+ goal = tbl->max_size/2;
5506+
5507+ for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
5508+ j = (j + 1) & IP_VS_LBLC_TAB_MASK;
5509+ e = l = &tbl->bucket[j];
5510+ write_lock(&tbl->lock);
5511+ while (e->next != l) {
5512+ en = list_entry(e->next,
5513+ struct ip_vs_lblc_entry, list);
5514+ if ((now - en->lastuse) < ENTRY_TIMEOUT) {
5515+ e = e->next;
5516+ continue;
5517+ }
5518+ ip_vs_lblc_free(en);
5519+ atomic_dec(&tbl->entries);
5520+ goal--;
5521+ }
5522+ write_unlock(&tbl->lock);
5523+ if (goal <= 0)
5524+ break;
5525+ }
5526+ tbl->rover = j;
5527+
5528+ out:
5529+ mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
5530+}
5531+
5532+
5533+static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
5534+{
5535+ int i;
5536+ struct ip_vs_lblc_table *tbl;
5537+
5538+ /*
5539+ * Allocate the ip_vs_lblc_table for this service
5540+ */
5541+ tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
5542+ if (tbl == NULL) {
5543+ IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
5544+ return -ENOMEM;
5545+ }
5546+ svc->sched_data = tbl;
5547+ IP_VS_DBG(0, "LBLC hash table (memory=%dbytes) allocated for "
5548+ "current service\n",
5549+ sizeof(struct ip_vs_lblc_table));
5550+
5551+ /*
5552+ * Initialize the hash buckets
5553+ */
5554+ for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
5555+ INIT_LIST_HEAD(&tbl->bucket[i]);
5556+ }
5557+ tbl->lock = RW_LOCK_UNLOCKED;
5558+ tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
5559+ tbl->rover = 0;
5560+ tbl->counter = 1;
5561+
5562+ /*
5563+ * Hook periodic timer for garbage collection
5564+ */
5565+ init_timer(&tbl->periodic_timer);
5566+ tbl->periodic_timer.data = (unsigned long)tbl;
5567+ tbl->periodic_timer.function = ip_vs_lblc_check_expire;
5568+ tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
5569+ add_timer(&tbl->periodic_timer);
5570+
5571+ MOD_INC_USE_COUNT;
5572+ return 0;
5573+}
5574+
5575+
5576+static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
5577+{
5578+ struct ip_vs_lblc_table *tbl = svc->sched_data;
5579+
5580+ /* remove periodic timer */
5581+ del_timer(&tbl->periodic_timer);
5582+
5583+ /* got to clean up table entries here */
5584+ ip_vs_lblc_flush(tbl);
5585+
5586+ /* release the table itself */
5587+ kfree(svc->sched_data);
5588+ IP_VS_DBG(0, "LBLC hash table (memory=%dbytes) released\n",
5589+ sizeof(struct ip_vs_lblc_table));
5590+
5591+ MOD_DEC_USE_COUNT;
5592+ return 0;
5593+}
5594+
5595+
5596+static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
5597+{
5598+ return 0;
5599+}
5600+
5601+
5602+static inline struct ip_vs_dest *
5603+__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
5604+{
5605+ register struct list_head *l, *e;
5606+ struct ip_vs_dest *dest, *least;
5607+ int loh, doh;
5608+
5609+ l = &svc->destinations;
5610+ if (l == l->next)
5611+ return NULL;
5612+
5613+ /*
5614+	 * We estimate the overhead of processing an active connection to
5615+	 * be fifty times that of an inactive connection on average. (This
5616+	 * factor of fifty may not be accurate; we will tune it later.) We use
5617+ * the following formula to estimate the overhead:
5618+ * dest->activeconns*50 + dest->inactconns
5619+ * and the load:
5620+ * (dest overhead) / dest->weight
5621+ *
5622+ * Remember -- no floats in kernel mode!!!
5623+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
5624+ * h1/w1 > h2/w2
5625+ * if every weight is larger than zero.
5626+ *
5627+ * The server with weight=0 is quiesced and will not receive any
5628+ * new connection.
5629+ */
5630+
5631+ for (e=l->next; e!=l; e=e->next) {
5632+ least = list_entry(e, struct ip_vs_dest, n_list);
5633+ if (least->weight > 0) {
5634+ loh = atomic_read(&least->activeconns) * 50
5635+ + atomic_read(&least->inactconns);
5636+ goto nextstage;
5637+ }
5638+ }
5639+ return NULL;
5640+
5641+ /*
5642+ * Find the destination with the least load.
5643+ */
5644+ nextstage:
5645+ for (e=e->next; e!=l; e=e->next)
5646+ {
5647+ dest = list_entry(e, struct ip_vs_dest, n_list);
5648+ doh = atomic_read(&dest->activeconns) * 50
5649+ + atomic_read(&dest->inactconns);
5650+ if (loh * dest->weight > doh * least->weight)
5651+ {
5652+ least = dest;
5653+ loh = doh;
5654+ }
5655+ }
5656+
5657+ IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
5658+ "activeconns %d refcnt %d weight %d overhead %d\n",
5659+ NIPQUAD(least->addr), ntohs(least->port),
5660+ atomic_read(&least->activeconns),
5661+ atomic_read(&least->refcnt), least->weight, loh);
5662+
5663+ return least;
5664+}
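
The h1*w2 > h2*w1 trick above is the standard way to compare two overhead/weight ratios without integer division; it is valid whenever both weights are positive, and a zero-weight (quiesced) server automatically loses the comparison. A worked example:

	#include <stdio.h>

	int main(void)
	{
		int loh = 100, lw = 2;	/* current least: overhead/weight = 50 */
		int doh = 120, dw = 3;	/* candidate:     overhead/weight = 40 */

		/* loh/lw > doh/dw  <=>  loh*dw > doh*lw   (lw, dw > 0) */
		if (loh * dw > doh * lw)
			printf("candidate wins: 120/3 < 100/2\n");
		return 0;
	}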
5665+
5666+
5667+/*
5668+ * If this destination server is overloaded and there is a less loaded
5669+ * server, then return true.
5670+ */
5671+static inline int
5672+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
5673+{
5674+ if (atomic_read(&dest->activeconns) > dest->weight) {
5675+ register struct list_head *l, *e;
5676+ struct ip_vs_dest *d;
5677+
5678+ l = &svc->destinations;
5679+ for (e=l->next; e!=l; e=e->next) {
5680+ d = list_entry(e, struct ip_vs_dest, n_list);
5681+ if (atomic_read(&d->activeconns)*2 < d->weight) {
5682+ return 1;
5683+ }
5684+ }
5685+ }
5686+ return 0;
5687+}
5688+
5689+
5690+/*
5691+ * Locality-Based (weighted) Least-Connection scheduling
5692+ */
5693+static struct ip_vs_dest *
5694+ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
5695+{
5696+ struct ip_vs_dest *dest;
5697+ struct ip_vs_lblc_table *tbl;
5698+ struct ip_vs_lblc_entry *en;
5699+
5700+ IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
5701+
5702+ tbl = (struct ip_vs_lblc_table *)svc->sched_data;
5703+ en = ip_vs_lblc_get(tbl, iph->daddr);
5704+ if (en == NULL) {
5705+ dest = __ip_vs_wlc_schedule(svc, iph);
5706+ if (dest == NULL) {
5707+ IP_VS_DBG(1, "no destination available\n");
5708+ return NULL;
5709+ }
5710+ en = ip_vs_lblc_new(iph->daddr, dest);
5711+ if (en == NULL) {
5712+ return NULL;
5713+ }
5714+ ip_vs_lblc_hash(tbl, en);
5715+ } else {
5716+ dest = en->dest;
5717+ if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
5718+ || dest->weight <= 0
5719+ || is_overloaded(dest, svc)) {
5720+ dest = __ip_vs_wlc_schedule(svc, iph);
5721+ if (dest == NULL) {
5722+ IP_VS_DBG(1, "no destination available\n");
5723+ return NULL;
5724+ }
5725+ atomic_dec(&en->dest->refcnt);
5726+ atomic_inc(&dest->refcnt);
5727+ en->dest = dest;
5728+ }
5729+ }
5730+ en->lastuse = jiffies;
5731+
5732+ IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
5733+ "--> server %u.%u.%u.%u:%d\n",
5734+ NIPQUAD(en->addr),
5735+ NIPQUAD(dest->addr),
5736+ ntohs(dest->port));
5737+
5738+ return dest;
5739+}
5740+
5741+
5742+static struct ip_vs_scheduler ip_vs_lblc_scheduler =
5743+{
5744+ {0}, /* n_list */
5745+ "lblc", /* name */
5746+ ATOMIC_INIT(0), /* refcnt */
5747+ ip_vs_lblc_init_svc, /* service initializer */
5748+ ip_vs_lblc_done_svc, /* service done */
5749+ ip_vs_lblc_update_svc, /* service updater */
5750+ ip_vs_lblc_schedule, /* select a server from the destination list */
5751+};
5752+
5753+
5754+__initfunc(int ip_vs_lblc_init(void))
5755+{
5756+ IP_VS_INFO("Initializing LBLC scheduling\n");
5757+ INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
5758+ lblc_sysctl_table.sysctl_header =
5759+ register_sysctl_table(lblc_sysctl_table.root_dir, 0);
5760+ return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
5761+}
5762+
5763+
5764+#ifdef MODULE
5765+EXPORT_NO_SYMBOLS;
5766+
5767+int init_module(void)
5768+{
5769+ INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
5770+
5771+ /* module initialization by 'request_module' */
5772+ if (register_ip_vs_scheduler(&ip_vs_lblc_scheduler) != 0)
5773+ return -EIO;
5774+
5775+ lblc_sysctl_table.sysctl_header =
5776+ register_sysctl_table(lblc_sysctl_table.root_dir, 0);
5777+
5778+ IP_VS_INFO("LBLC scheduling module loaded.\n");
5779+
5780+ return 0;
5781+}
5782+
5783+void cleanup_module(void)
5784+{
5785+ /* module cleanup by 'release_module' */
5786+ if (unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler) != 0) {
5787+ IP_VS_INFO("cannot remove LBLC scheduling module\n");
5788+ } else {
5789+ IP_VS_INFO("LBLC scheduling module unloaded.\n");
5790+ }
5791+ unregister_sysctl_table(lblc_sysctl_table.sysctl_header);
5792+}
5793+
5794+#endif /* MODULE */
5795diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lblcr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblcr.c
5796--- linux-2.2.19/net/ipv4/ip_vs_lblcr.c Thu Jan 1 08:00:00 1970
5797+++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblcr.c Tue Mar 27 17:37:00 2001
5798@@ -0,0 +1,834 @@
5799+/*
5800+ * IPVS: Locality-Based Least-Connection with Replication scheduler
5801+ *
5802+ * Version: $Id$
5803+ *
5804+ * Authors: Wensong Zhang <wensong@gnuchina.org>
5805+ *
5806+ * This program is free software; you can redistribute it and/or
5807+ * modify it under the terms of the GNU General Public License
5808+ * as published by the Free Software Foundation; either version
5809+ * 2 of the License, or (at your option) any later version.
5810+ *
5811+ * Changes:
5812+ * Julian Anastasov : Added the missing (dest->weight>0)
5813+ * condition in the ip_vs_dest_set_max.
5814+ *
5815+ */
5816+
5817+/*
5818+ * The lblc/r algorithm is as follows (pseudo code):
5819+ *
5820+ * if serverSet[dest_ip] is null then
5821+ * n, serverSet[dest_ip] <- {weighted least-conn node};
5822+ * else
5823+ * n <- {least-conn (alive) node in serverSet[dest_ip]};
5824+ * if (n is null) OR
5825+ * (n.conns>n.weight AND
5826+ * there is a node m with m.conns<m.weight/2) then
5827+ * n <- {weighted least-conn node};
5828+ * add n to serverSet[dest_ip];
5829+ * if |serverSet[dest_ip]| > 1 AND
5830+ * now - serverSet[dest_ip].lastMod > T then
5831+ * m <- {most conn node in serverSet[dest_ip]};
5832+ * remove m from serverSet[dest_ip];
5833+ * if serverSet[dest_ip] changed then
5834+ * serverSet[dest_ip].lastMod <- now;
5835+ *
5836+ * return n;
5837+ *
5838+ */
5839+
5840+#include <linux/config.h>
5841+#include <linux/module.h>
5842+#ifdef CONFIG_KMOD
5843+#include <linux/kmod.h>
5844+#endif
5845+#include <linux/types.h>
5846+#include <linux/kernel.h>
5847+#include <linux/errno.h>
5848+#include <linux/vmalloc.h>
5849+#include <net/ip_masq.h>
5850+#ifdef CONFIG_IP_MASQUERADE_MOD
5851+#include <net/ip_masq_mod.h>
5852+#endif
5853+#include <linux/sysctl.h>
5854+#include <linux/proc_fs.h>
5855+#include <linux/ip_fw.h>
5856+#include <net/ip_vs.h>
5857+
5858+
5859+/*
5860+ * It is for garbage collection of stale IPVS lblcr entries
5861+ * when the table is full.
5862+ */
5863+#define CHECK_EXPIRE_INTERVAL (60*HZ)
5864+#define ENTRY_TIMEOUT (6*60*HZ)
5865+
5866+/*
5867+ * It is for the full expiration check.
5868+ * If no partial expiration check (garbage collection) has run
5869+ * for half an hour, do a full expiration check to collect stale
5870+ * entries that haven't been touched for a day.
5871+ */
5872+#define COUNT_FOR_FULL_EXPIRATION 30
5873+int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
5874+
5875+
5876+/*
5877+ * for IPVS lblcr entry hash table
5878+ */
5879+#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
5880+#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
5881+#endif
5882+#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
5883+#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
5884+#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
5885+
5886+
5887+/*
5888+ * IPVS destination set structure and operations
5889+ */
5890+struct ip_vs_dest_list {
5891+ struct ip_vs_dest_list *next; /* list link */
5892+ struct ip_vs_dest *dest; /* destination server */
5893+};
5894+
5895+struct ip_vs_dest_set {
5896+ atomic_t size; /* set size */
5897+ unsigned long lastmod; /* last modified time */
5898+ struct ip_vs_dest_list *list; /* destination list */
5899+ rwlock_t lock; /* lock for this list */
5900+};
5901+
5902+
5903+static struct ip_vs_dest_list *
5904+ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
5905+{
5906+ struct ip_vs_dest_list *e;
5907+
5908+ for (e=set->list; e!=NULL; e=e->next) {
5909+ if (e->dest == dest)
5910+			/* already exists */
5911+ return NULL;
5912+ }
5913+
5914+ e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
5915+ if (e == NULL) {
5916+ IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
5917+ return NULL;
5918+ }
5919+
5920+ atomic_inc(&dest->refcnt);
5921+ e->dest = dest;
5922+
5923+ /* link it to the list */
5924+ write_lock(&set->lock);
5925+ if (set->list != NULL) {
5926+		e->next = set->list;	/* prepend; keep the old head linked */
5927+ set->list = e;
5928+ } else {
5929+ e->next = NULL;
5930+ set->list = e;
5931+ }
5932+ write_unlock(&set->lock);
5933+
5934+ atomic_inc(&set->size);
5935+ set->lastmod = jiffies;
5936+ return e;
5937+}
5938+
5939+static void
5940+ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
5941+{
5942+ struct ip_vs_dest_list *e, **ep;
5943+
5944+ write_lock(&set->lock);
5945+ for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
5946+ if (e->dest == dest) {
5947+ /* HIT */
5948+ *ep = e->next;
5949+ atomic_dec(&set->size);
5950+ set->lastmod = jiffies;
5951+ atomic_dec(&e->dest->refcnt);
5952+ kfree(e);
5953+ break;
5954+ }
5955+ ep = &e->next;
5956+ }
5957+ write_unlock(&set->lock);
5958+}
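The erase loop above walks a pointer-to-pointer (ep), so *ep is always the link that points at the current node and removing the head needs no special case. A self-contained sketch of the same idiom, with hypothetical types and values:

#include <stdio.h>
#include <stdlib.h>

struct node { int val; struct node *next; };

static void unlink_val(struct node **ep, int val)
{
	struct node *e;

	for (e = *ep; e != NULL; e = *ep) {
		if (e->val == val) {
			*ep = e->next;	/* splice out through the previous link */
			free(e);
			break;
		}
		ep = &e->next;
	}
}

int main(void)
{
	struct node *head = NULL, **pp = &head;
	struct node *n;
	int i;

	for (i = 1; i <= 3; i++) {
		*pp = malloc(sizeof(**pp));
		(*pp)->val = i;
		(*pp)->next = NULL;
		pp = &(*pp)->next;
	}
	unlink_val(&head, 2);
	for (n = head; n != NULL; n = n->next)
		printf("%d ", n->val);	/* prints: 1 3 */
	printf("\n");
	return 0;
}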
5959+
5960+static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
5961+{
5962+ struct ip_vs_dest_list *e, **ep;
5963+
5964+ write_lock(&set->lock);
5965+ for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
5966+ *ep = e->next;
5967+ /*
5968+		 * We don't kfree dest because it is referred to either
5969+ * by its service or by the trash dest list.
5970+ */
5971+ atomic_dec(&e->dest->refcnt);
5972+ kfree(e);
5973+ }
5974+ write_unlock(&set->lock);
5975+}
5976+
5977+/* get weighted least-connection node in the destination set */
5978+static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
5979+{
5980+ register struct ip_vs_dest_list *e;
5981+ struct ip_vs_dest *dest, *least;
5982+ int loh, doh;
5983+
5984+ if (set == NULL)
5985+ return NULL;
5986+
5987+ read_lock(&set->lock);
5988+	/* select the first destination server whose weight > 0 */
5989+ for (e=set->list; e!=NULL; e=e->next) {
5990+ least = e->dest;
5991+ if ((least->weight > 0)
5992+ && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
5993+ loh = atomic_read(&least->activeconns) * 50
5994+ + atomic_read(&least->inactconns);
5995+ goto nextstage;
5996+ }
5997+ }
5998+ read_unlock(&set->lock);
5999+ return NULL;
6000+
6001+ /* find the destination with the weighted least load */
6002+ nextstage:
6003+ for (e=e->next; e!=NULL; e=e->next) {
6004+ dest = e->dest;
6005+ doh = atomic_read(&dest->activeconns) * 50
6006+ + atomic_read(&dest->inactconns);
6007+ if ((loh*dest->weight > doh*least->weight)
6008+ && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
6009+ least = dest;
6010+ loh = doh;
6011+ }
6012+ }
6013+ read_unlock(&set->lock);
6014+
6015+ IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
6016+ "activeconns %d refcnt %d weight %d overhead %d\n",
6017+ NIPQUAD(least->addr), ntohs(least->port),
6018+ atomic_read(&least->activeconns),
6019+ atomic_read(&least->refcnt), least->weight, loh);
6020+ return least;
6021+}
6022+
6023+
6024+/* get weighted most-connection node in the destination set */
6025+static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
6026+{
6027+ register struct ip_vs_dest_list *e;
6028+ struct ip_vs_dest *dest, *most;
6029+ int moh, doh;
6030+
6031+ if (set == NULL)
6032+ return NULL;
6033+
6034+ read_lock(&set->lock);
6035+	/* select the first destination server whose weight > 0 */
6036+ for (e=set->list; e!=NULL; e=e->next) {
6037+ most = e->dest;
6038+ if (most->weight > 0) {
6039+ moh = atomic_read(&most->activeconns) * 50
6040+ + atomic_read(&most->inactconns);
6041+ goto nextstage;
6042+ }
6043+ }
6044+ read_unlock(&set->lock);
6045+ return NULL;
6046+
6047+ /* find the destination with the weighted most load */
6048+ nextstage:
6049+ for (e=e->next; e!=NULL; e=e->next) {
6050+ dest = e->dest;
6051+ doh = atomic_read(&dest->activeconns) * 50
6052+ + atomic_read(&dest->inactconns);
6053+ /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
6054+ if (moh*dest->weight < doh*most->weight
6055+ && dest->weight > 0) {
6056+ most = dest;
6057+ moh = doh;
6058+ }
6059+ }
6060+ read_unlock(&set->lock);
6061+
6062+ IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
6063+ "activeconns %d refcnt %d weight %d overhead %d\n",
6064+ NIPQUAD(most->addr), ntohs(most->port),
6065+ atomic_read(&most->activeconns),
6066+ atomic_read(&most->refcnt), most->weight, moh);
6067+ return most;
6068+}
6069+
6070+
6071+/*
6072+ * An IPVS lblcr entry represents an association between a destination
6073+ * IP address and its destination server set
6074+ */
6075+struct ip_vs_lblcr_entry {
6076+ struct list_head list;
6077+ __u32 addr; /* destination IP address */
6078+ struct ip_vs_dest_set set; /* destination server set */
6079+ unsigned long lastuse; /* last used time */
6080+};
6081+
6082+
6083+/*
6084+ * IPVS lblcr hash table
6085+ */
6086+struct ip_vs_lblcr_table {
6087+ rwlock_t lock; /* lock for this table */
6088+ struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
6089+ atomic_t entries; /* number of entries */
6090+ int max_size; /* maximum size of entries */
6091+ struct timer_list periodic_timer; /* collect stale entries */
6092+ int rover; /* rover for expire check */
6093+ int counter; /* counter for no expire */
6094+};
6095+
6096+
6097+/*
6098+ * IPVS LBLCR sysctl table
6099+ */
6100+struct ip_vs_lblcr_sysctl_table {
6101+ struct ctl_table_header *sysctl_header;
6102+ ctl_table vs_vars[2];
6103+ ctl_table vs_dir[2];
6104+ ctl_table ipv4_dir[2];
6105+ ctl_table root_dir[2];
6106+};
6107+
6108+
6109+static struct ip_vs_lblcr_sysctl_table lblcr_sysctl_table = {
6110+ NULL,
6111+ {{NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration",
6112+ &sysctl_ip_vs_lblcr_expiration,
6113+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
6114+ {0}},
6115+ {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblcr_sysctl_table.vs_vars},
6116+ {0}},
6117+ {{NET_IPV4, "ipv4", NULL, 0, 0555, lblcr_sysctl_table.vs_dir},
6118+ {0}},
6119+ {{CTL_NET, "net", NULL, 0, 0555, lblcr_sysctl_table.ipv4_dir},
6120+ {0}}
6121+};
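The four nested ctl_table arrays chain root_dir -> ipv4_dir -> vs_dir -> vs_vars, so the single tunable surfaces as /proc/sys/net/ipv4/vs/lblcr_expiration; proc_dointvec_jiffies converts between seconds in /proc and jiffies in the kernel. A hedged user-space read, assuming a running kernel with this patch applied:

#include <stdio.h>

int main(void)
{
	/* path produced by the nested ctl_table chain above */
	FILE *f = fopen("/proc/sys/net/ipv4/vs/lblcr_expiration", "r");
	int seconds;

	if (f != NULL && fscanf(f, "%d", &seconds) == 1)
		printf("lblcr expiration: %d seconds\n", seconds);
	if (f != NULL)
		fclose(f);
	return 0;
}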
6122+
6123+
6124+/*
6125+ * new/free an ip_vs_lblcr_entry, which is a mapping of a destination
6126+ * IP address to a destination server set.
6127+ */
6128+static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
6129+{
6130+ struct ip_vs_lblcr_entry *en;
6131+
6132+ en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
6133+ if (en == NULL) {
6134+ IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
6135+ return NULL;
6136+ }
6137+
6138+ INIT_LIST_HEAD(&en->list);
6139+ en->addr = daddr;
6140+
6141+	/* initialize its dest set */
6142+ atomic_set(&(en->set.size), 0);
6143+ en->set.list = NULL;
6144+ en->set.lock = RW_LOCK_UNLOCKED;
6145+
6146+ return en;
6147+}
6148+
6149+
6150+static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
6151+{
6152+ list_del(&en->list);
6153+ ip_vs_dest_set_eraseall(&en->set);
6154+ kfree(en);
6155+}
6156+
6157+
6158+/*
6159+ * Returns hash value for IPVS LBLCR entry
6160+ */
6161+static inline unsigned ip_vs_lblcr_hashkey(__u32 addr)
6162+{
6163+ return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
6164+}
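2654435761 is 2^32 divided by the golden ratio, the classic multiplicative hash constant; the low IP_VS_LBLCR_TAB_BITS bits of the product pick the bucket. A standalone sketch with a hypothetical address (the low bits of the product are the same on 32- and 64-bit machines, since overflow never affects them):

#include <stdio.h>

#define TAB_BITS 10			/* CONFIG_IP_VS_LBLCR_TAB_BITS default */
#define TAB_SIZE (1 << TAB_BITS)	/* 1024 buckets */
#define TAB_MASK (TAB_SIZE - 1)		/* 0x3ff */

int main(void)
{
	unsigned long addr = 0xc0a80001UL;	/* 192.168.0.1, host byte order */
	unsigned hash = (unsigned)((addr * 2654435761UL) & TAB_MASK);

	printf("bucket %u of %d\n", hash, TAB_SIZE);	/* bucket 433 of 1024 */
	return 0;
}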
6165+
6166+
6167+/*
6168+ * Hash an entry in the ip_vs_lblcr_table.
6169+ * returns bool success.
6170+ */
6171+static int
6172+ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
6173+{
6174+ unsigned hash;
6175+
6176+ if (!list_empty(&en->list)) {
6177+		IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed entry, "
6178+ "called from %p\n", __builtin_return_address(0));
6179+ return 0;
6180+ }
6181+
6182+ /*
6183+ * Hash by destination IP address
6184+ */
6185+ hash = ip_vs_lblcr_hashkey(en->addr);
6186+
6187+ write_lock(&tbl->lock);
6188+ list_add(&en->list, &tbl->bucket[hash]);
6189+ atomic_inc(&tbl->entries);
6190+ write_unlock(&tbl->lock);
6191+
6192+ return 1;
6193+}
6194+
6195+
6196+#if 0000
6197+/*
6198+ * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
6199+ * returns bool success.
6200+ */
6201+static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
6202+ struct ip_vs_lblcr_entry *en)
6203+{
6204+ if (list_empty(&en->list)) {
6205+ IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
6206+ "called from %p\n", __builtin_return_address(0));
6207+ return 0;
6208+ }
6209+
6210+ /*
6211+ * Remove it from the table
6212+ */
6213+ write_lock(&tbl->lock);
6214+ list_del(&en->list);
6215+ INIT_LIST_HEAD(&en->list);
6216+ write_unlock(&tbl->lock);
6217+
6218+ return 1;
6219+}
6220+#endif
6221+
6222+
6223+/*
6224+ * Get ip_vs_lblcr_entry associated with supplied parameters.
6225+ */
6226+static inline struct ip_vs_lblcr_entry *
6227+ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr)
6228+{
6229+ unsigned hash;
6230+ struct ip_vs_lblcr_entry *en;
6231+ struct list_head *l,*e;
6232+
6233+ hash = ip_vs_lblcr_hashkey(addr);
6234+ l = &tbl->bucket[hash];
6235+
6236+ read_lock(&tbl->lock);
6237+
6238+ for (e=l->next; e!=l; e=e->next) {
6239+ en = list_entry(e, struct ip_vs_lblcr_entry, list);
6240+ if (en->addr == addr) {
6241+ /* HIT */
6242+ read_unlock(&tbl->lock);
6243+ return en;
6244+ }
6245+ }
6246+
6247+ read_unlock(&tbl->lock);
6248+
6249+ return NULL;
6250+}
6251+
6252+
6253+/*
6254+ * Flush all the entries of the specified table.
6255+ */
6256+static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
6257+{
6258+ int i;
6259+ struct list_head *l;
6260+ struct ip_vs_lblcr_entry *en;
6261+
6262+ for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6263+ write_lock(&tbl->lock);
6264+ for (l=&tbl->bucket[i]; l->next!=l; ) {
6265+ en = list_entry(l->next,
6266+ struct ip_vs_lblcr_entry, list);
6267+ ip_vs_lblcr_free(en);
6268+ atomic_dec(&tbl->entries);
6269+ }
6270+ write_unlock(&tbl->lock);
6271+ }
6272+}
6273+
6274+
6275+static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
6276+{
6277+ unsigned long now = jiffies;
6278+ int i, j;
6279+ struct list_head *l, *e;
6280+ struct ip_vs_lblcr_entry *en;
6281+
6282+ for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6283+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
6284+ e = l = &tbl->bucket[j];
6285+ write_lock(&tbl->lock);
6286+ while (e->next != l) {
6287+ en = list_entry(e->next,
6288+ struct ip_vs_lblcr_entry, list);
6289+ if ((now - en->lastuse) <
6290+ sysctl_ip_vs_lblcr_expiration) {
6291+ e = e->next;
6292+ continue;
6293+ }
6294+ ip_vs_lblcr_free(en);
6295+ atomic_dec(&tbl->entries);
6296+ }
6297+ write_unlock(&tbl->lock);
6298+ }
6299+ tbl->rover = j;
6300+}
6301+
6302+
6303+/*
6304+ * Periodical timer handler for IPVS lblcr table
6305+ * It is used to collect stale entries when the number of entries
6306+ * exceeds the maximum size of the table.
6307+ *
6308+ * Fixme: we probably need a more complicated algorithm to collect
6309+ * entries that have not been used for a long time even
6310+ * if the number of entries doesn't exceed the maximum size
6311+ * of the table.
6312+ * The full expiration check is for this purpose now.
6313+ */
6314+static void ip_vs_lblcr_check_expire(unsigned long data)
6315+{
6316+ struct ip_vs_lblcr_table *tbl;
6317+ unsigned long now = jiffies;
6318+ int goal;
6319+ int i, j;
6320+ struct list_head *l, *e;
6321+ struct ip_vs_lblcr_entry *en;
6322+
6323+ tbl = (struct ip_vs_lblcr_table *)data;
6324+
6325+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
6326+ /* do full expiration check */
6327+ ip_vs_lblcr_full_check(tbl);
6328+ tbl->counter = 1;
6329+ goto out;
6330+ }
6331+
6332+ if (atomic_read(&tbl->entries) < tbl->max_size) {
6333+ tbl->counter++;
6334+ goto out;
6335+ }
6336+
6337+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
6338+ if (goal > tbl->max_size/2)
6339+ goal = tbl->max_size/2;
6340+
6341+ for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6342+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
6343+ e = l = &tbl->bucket[j];
6344+ write_lock(&tbl->lock);
6345+ while (e->next != l) {
6346+ en = list_entry(e->next,
6347+ struct ip_vs_lblcr_entry, list);
6348+ if ((now - en->lastuse) < ENTRY_TIMEOUT) {
6349+ e = e->next;
6350+ continue;
6351+ }
6352+ ip_vs_lblcr_free(en);
6353+ atomic_dec(&tbl->entries);
6354+ goal--;
6355+ }
6356+ write_unlock(&tbl->lock);
6357+ if (goal <= 0)
6358+ break;
6359+ }
6360+ tbl->rover = j;
6361+
6362+ out:
6363+ mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
6364+}
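The goal arithmetic above tries to collect the overshoot plus a third of slack, but never more than half of max_size in one timer run. Worked through with hypothetical numbers:

#include <stdio.h>

int main(void)
{
	int max_size = 16 * 1024;	/* IP_VS_LBLCR_TAB_SIZE*16 with 10 bits */
	int entries = 20000;		/* hypothetical current entry count */
	int goal = (entries - max_size) * 4 / 3;

	if (goal > max_size / 2)
		goal = max_size / 2;	/* cap: purge at most half per run */

	/* prints: entries 20000, max 16384 -> try to free 4821 */
	printf("entries %d, max %d -> try to free %d\n",
	       entries, max_size, goal);
	return 0;
}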
6365+
6366+
6367+static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
6368+{
6369+ int i;
6370+ struct ip_vs_lblcr_table *tbl;
6371+
6372+ /*
6373+ * Allocate the ip_vs_lblcr_table for this service
6374+ */
6375+ tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
6376+ if (tbl == NULL) {
6377+ IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
6378+ return -ENOMEM;
6379+ }
6380+ svc->sched_data = tbl;
6381+ IP_VS_DBG(0, "LBLCR hash table (memory=%dbytes) allocated for "
6382+ "current service\n",
6383+ sizeof(struct ip_vs_lblcr_table));
6384+
6385+ /*
6386+ * Initialize the hash buckets
6387+ */
6388+ for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
6389+ INIT_LIST_HEAD(&tbl->bucket[i]);
6390+ }
6391+ tbl->lock = RW_LOCK_UNLOCKED;
6392+ tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
6393+ tbl->rover = 0;
6394+ tbl->counter = 1;
6395+
6396+ /*
6397+ * Hook periodic timer for garbage collection
6398+ */
6399+ init_timer(&tbl->periodic_timer);
6400+ tbl->periodic_timer.data = (unsigned long)tbl;
6401+ tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
6402+ tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
6403+ add_timer(&tbl->periodic_timer);
6404+
6405+ MOD_INC_USE_COUNT;
6406+ return 0;
6407+}
6408+
6409+
6410+static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
6411+{
6412+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
6413+
6414+ /* remove periodic timer */
6415+ del_timer(&tbl->periodic_timer);
6416+
6417+ /* got to clean up table entries here */
6418+ ip_vs_lblcr_flush(tbl);
6419+
6420+ /* release the table itself */
6421+ kfree(svc->sched_data);
6422+ IP_VS_DBG(0, "LBLCR hash table (memory=%dbytes) released\n",
6423+ sizeof(struct ip_vs_lblcr_table));
6424+
6425+ MOD_DEC_USE_COUNT;
6426+ return 0;
6427+}
6428+
6429+
6430+static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
6431+{
6432+ return 0;
6433+}
6434+
6435+
6436+static inline struct ip_vs_dest *
6437+__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
6438+{
6439+ register struct list_head *l, *e;
6440+ struct ip_vs_dest *dest, *least;
6441+ int loh, doh;
6442+
6443+ l = &svc->destinations;
6444+ if (l == l->next)
6445+ return NULL;
6446+
6447+ /*
6448+	 * We consider the overhead of processing active connections to be
6449+	 * fifty times that of inactive connections on average. (This factor
6450+	 * of fifty may not be accurate; we will tune it later.) We use
6451+ * the following formula to estimate the overhead:
6452+ * dest->activeconns*50 + dest->inactconns
6453+ * and the load:
6454+ * (dest overhead) / dest->weight
6455+ *
6456+ * Remember -- no floats in kernel mode!!!
6457+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
6458+ * h1/w1 > h2/w2
6459+ * if every weight is larger than zero.
6460+ *
6461+ * The server with weight=0 is quiesced and will not receive any
6462+ * new connection.
6463+ */
6464+
6465+ for (e=l->next; e!=l; e=e->next) {
6466+ least = list_entry(e, struct ip_vs_dest, n_list);
6467+ if (least->weight > 0) {
6468+ loh = atomic_read(&least->activeconns) * 50
6469+ + atomic_read(&least->inactconns);
6470+ goto nextstage;
6471+ }
6472+ }
6473+ return NULL;
6474+
6475+ /*
6476+ * Find the destination with the least load.
6477+ */
6478+ nextstage:
6479+ for (e=e->next; e!=l; e=e->next) {
6480+ dest = list_entry(e, struct ip_vs_dest, n_list);
6481+ doh = atomic_read(&dest->activeconns) * 50
6482+ + atomic_read(&dest->inactconns);
6483+ if (loh*dest->weight > doh*least->weight) {
6484+ least = dest;
6485+ loh = doh;
6486+ }
6487+ }
6488+
6489+ IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
6490+ "activeconns %d refcnt %d weight %d overhead %d\n",
6491+ NIPQUAD(least->addr), ntohs(least->port),
6492+ atomic_read(&least->activeconns),
6493+ atomic_read(&least->refcnt), least->weight, loh);
6494+
6495+ return least;
6496+}
6497+
6498+
6499+/*
6500+ * If this destination server is overloaded and there is a less loaded
6501+ * server, then return true.
6502+ */
6503+static inline int
6504+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
6505+{
6506+ if (atomic_read(&dest->activeconns) > dest->weight) {
6507+ register struct list_head *l, *e;
6508+ struct ip_vs_dest *d;
6509+
6510+ l = &svc->destinations;
6511+ for (e=l->next; e!=l; e=e->next) {
6512+ d = list_entry(e, struct ip_vs_dest, n_list);
6513+ if (atomic_read(&d->activeconns)*2 < d->weight) {
6514+ return 1;
6515+ }
6516+ }
6517+ }
6518+ return 0;
6519+}
6520+
6521+
6522+/*
6523+ * Locality-Based (weighted) Least-Connection with Replication scheduling
6524+ */
6525+static struct ip_vs_dest *
6526+ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
6527+{
6528+ struct ip_vs_dest *dest;
6529+ struct ip_vs_lblcr_table *tbl;
6530+ struct ip_vs_lblcr_entry *en;
6531+
6532+ IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
6533+
6534+ tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
6535+ en = ip_vs_lblcr_get(tbl, iph->daddr);
6536+ if (en == NULL) {
6537+ dest = __ip_vs_wlc_schedule(svc, iph);
6538+ if (dest == NULL) {
6539+ IP_VS_DBG(1, "no destination available\n");
6540+ return NULL;
6541+ }
6542+ en = ip_vs_lblcr_new(iph->daddr);
6543+ if (en == NULL) {
6544+ return NULL;
6545+ }
6546+ ip_vs_dest_set_insert(&en->set, dest);
6547+ ip_vs_lblcr_hash(tbl, en);
6548+ } else {
6549+ dest = ip_vs_dest_set_min(&en->set);
6550+ if (!dest || is_overloaded(dest, svc)) {
6551+ dest = __ip_vs_wlc_schedule(svc, iph);
6552+ if (dest == NULL) {
6553+ IP_VS_DBG(1, "no destination available\n");
6554+ return NULL;
6555+ }
6556+ ip_vs_dest_set_insert(&en->set, dest);
6557+ }
6558+ if (atomic_read(&en->set.size) > 1 &&
6559+ jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
6560+ struct ip_vs_dest *m;
6561+ m = ip_vs_dest_set_max(&en->set);
6562+ if (m) ip_vs_dest_set_erase(&en->set, m);
6563+ }
6564+ }
6565+ en->lastuse = jiffies;
6566+
6567+ IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
6568+ "--> server %u.%u.%u.%u:%d\n",
6569+ NIPQUAD(en->addr),
6570+ NIPQUAD(dest->addr),
6571+ ntohs(dest->port));
6572+
6573+ return dest;
6574+}
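Note the prune step: once the set holds more than one server and has not changed for sysctl_ip_vs_lblcr_expiration ticks, the busiest member is dropped, shrinking the replica set back when load subsides. The comparison "jiffies - lastmod > expiration" on unsigned longs stays correct across counter wrap, as this user-space sketch with invented values shows:

#include <stdio.h>

int main(void)
{
	unsigned long hz = 100;				/* hypothetical HZ */
	unsigned long expiration = 24 * 60 * 60 * hz;	/* sysctl default: a day */
	unsigned long now = 1000;			/* jiffies just after wrap */
	unsigned long lastmod = (unsigned long)-5000;	/* set before the wrap */

	if (now - lastmod > expiration)
		printf("prune the most loaded server from the set\n");
	else	/* prints: set unchanged; modified 6000 ticks ago */
		printf("set unchanged; modified %lu ticks ago\n", now - lastmod);
	return 0;
}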
6575+
6576+
6577+/*
6578+ * IPVS LBLCR Scheduler structure
6579+ */
6580+static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
6581+{
6582+ {0}, /* n_list */
6583+ "lblcr", /* name */
6584+ ATOMIC_INIT(0), /* refcnt */
6585+ ip_vs_lblcr_init_svc, /* service initializer */
6586+ ip_vs_lblcr_done_svc, /* service done */
6587+ ip_vs_lblcr_update_svc, /* service updater */
6588+ ip_vs_lblcr_schedule, /* select a server from the destination list */
6589+};
6590+
6591+
6592+__initfunc(int ip_vs_lblcr_init(void))
6593+{
6594+ IP_VS_INFO("Initializing LBLCR scheduling\n");
6595+ INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
6596+ lblcr_sysctl_table.sysctl_header =
6597+ register_sysctl_table(lblcr_sysctl_table.root_dir, 0);
6598+ return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
6599+}
6600+
6601+
6602+#ifdef MODULE
6603+EXPORT_NO_SYMBOLS;
6604+
6605+int init_module(void)
6606+{
6607+ INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
6608+
6609+ /* module initialization by 'request_module' */
6610+ if (register_ip_vs_scheduler(&ip_vs_lblcr_scheduler) != 0)
6611+ return -EIO;
6612+
6613+ lblcr_sysctl_table.sysctl_header =
6614+ register_sysctl_table(lblcr_sysctl_table.root_dir, 0);
6615+
6616+ IP_VS_INFO("LBLCR scheduling module loaded.\n");
6617+
6618+ return 0;
6619+}
6620+
6621+void cleanup_module(void)
6622+{
6623+ /* module cleanup by 'release_module' */
6624+ if (unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler) != 0) {
6625+ IP_VS_INFO("cannot remove LBLCR scheduling module\n");
6626+ } else {
6627+ IP_VS_INFO("LBLCR scheduling module unloaded.\n");
6628+ }
6629+ unregister_sysctl_table(lblcr_sysctl_table.sysctl_header);
6630+}
6631+
6632+#endif /* MODULE */
6633diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lc.c
6634--- linux-2.2.19/net/ipv4/ip_vs_lc.c Thu Jan 1 08:00:00 1970
6635+++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lc.c Fri Nov 24 10:02:53 2000
6636@@ -0,0 +1,159 @@
6637+/*
6638+ * IPVS: Least-Connection Scheduling module
6639+ *
6640+ * Version: $Id$
6641+ *
6642+ * Authors: Wensong Zhang <wensong@iinchina.net>
6643+ *
6644+ * This program is free software; you can redistribute it and/or
6645+ * modify it under the terms of the GNU General Public License
6646+ * as published by the Free Software Foundation; either version
6647+ * 2 of the License, or (at your option) any later version.
6648+ *
6649+ * Changes:
6650+ * Wensong Zhang : added the ip_vs_lc_update_svc
6651+ * Wensong Zhang : added any dest with weight=0 is quiesced
6652+ *
6653+ */
6654+
6655+#include <linux/config.h>
6656+#include <linux/module.h>
6657+#ifdef CONFIG_KMOD
6658+#include <linux/kmod.h>
6659+#endif
6660+#include <linux/types.h>
6661+#include <linux/kernel.h>
6662+#include <linux/errno.h>
6663+#include <net/ip_masq.h>
6664+#ifdef CONFIG_IP_MASQUERADE_MOD
6665+#include <net/ip_masq_mod.h>
6666+#endif
6667+#include <linux/ip_fw.h>
6668+#include <net/ip_vs.h>
6669+
6670+
6671+static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
6672+{
6673+ MOD_INC_USE_COUNT;
6674+ return 0;
6675+}
6676+
6677+
6678+static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
6679+{
6680+ MOD_DEC_USE_COUNT;
6681+ return 0;
6682+}
6683+
6684+
6685+static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
6686+{
6687+ return 0;
6688+}
6689+
6690+
6691+/*
6692+ * Least Connection scheduling
6693+ */
6694+static struct ip_vs_dest* ip_vs_lc_schedule(struct ip_vs_service *svc,
6695+ struct iphdr *iph)
6696+{
6697+ struct list_head *l, *e;
6698+ struct ip_vs_dest *dest, *least;
6699+ int lac, dac;
6700+
6701+ IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
6702+
6703+ l = &svc->destinations;
6704+ if (l == l->next)
6705+ return NULL;
6706+
6707+ /*
6708+	 * Simply select the server with the smallest value of
6709+	 *        (activeconns<<5) + inactconns
6710+	 * skipping any server whose weight is zero.
6711+	 * A weight of zero means that the server is quiesced: its
6712+	 * existing connections still get served, but no new connection
6713+	 * is assigned to it.
6714+ */
6715+
6716+ for (e=l->next; e!=l; e=e->next) {
6717+ least = list_entry (e, struct ip_vs_dest, n_list);
6718+ if (least->weight > 0) {
6719+ lac = (atomic_read(&least->activeconns) << 5)
6720+ + atomic_read(&least->inactconns);
6721+ goto nextstage;
6722+ }
6723+ }
6724+ return NULL;
6725+
6726+ /*
6727+ * Find the destination with the least load.
6728+ */
6729+ nextstage:
6730+ for (e=e->next; e!=l; e=e->next) {
6731+ dest = list_entry(e, struct ip_vs_dest, n_list);
6732+ if (dest->weight == 0)
6733+ continue;
6734+ dac = (atomic_read(&dest->activeconns) << 5)
6735+ + atomic_read(&dest->inactconns);
6736+ if (dac < lac) {
6737+ least = dest;
6738+ lac = dac;
6739+ }
6740+ }
6741+
6742+ IP_VS_DBG(6, "LC: server %d.%d.%d.%d:%d activeconns %d inactconns %d\n",
6743+ NIPQUAD(least->addr), ntohs(least->port),
6744+ atomic_read(&least->activeconns),
6745+ atomic_read(&least->inactconns));
6746+
6747+ return least;
6748+}
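Unlike the *50 factor in the weighted schedulers, LC weighs active connections by a cheap shift: (activeconns<<5) is activeconns*32. A trivial check with made-up counters:

#include <stdio.h>

int main(void)
{
	int active = 7, inactive = 100;	/* hypothetical counters */

	/* (7 << 5) + 100 = 7*32 + 100 = 324 */
	printf("lc overhead: %d\n", (active << 5) + inactive);
	return 0;
}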
6749+
6750+
6751+static struct ip_vs_scheduler ip_vs_lc_scheduler = {
6752+ {0}, /* n_list */
6753+ "lc", /* name */
6754+ ATOMIC_INIT(0), /* refcnt */
6755+ ip_vs_lc_init_svc, /* service initializer */
6756+ ip_vs_lc_done_svc, /* service done */
6757+ ip_vs_lc_update_svc, /* service updater */
6758+ ip_vs_lc_schedule, /* select a server from the destination list */
6759+};
6760+
6761+
6762+__initfunc(int ip_vs_lc_init(void))
6763+{
6764+ IP_VS_INFO("Initializing LC scheduling\n");
6765+ INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
6766+	return register_ip_vs_scheduler(&ip_vs_lc_scheduler);
6767+}
6768+
6769+
6770+#ifdef MODULE
6771+EXPORT_NO_SYMBOLS;
6772+
6773+int init_module(void)
6774+{
6775+ INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
6776+
6777+ /* module initialization by 'request_module' */
6778+	if (register_ip_vs_scheduler(&ip_vs_lc_scheduler) != 0)
6779+ return -EIO;
6780+
6781+ IP_VS_INFO("LC scheduling module loaded.\n");
6782+
6783+ return 0;
6784+}
6785+
6786+void cleanup_module(void)
6787+{
6788+ /* module cleanup by 'release_module' */
6789+	if (unregister_ip_vs_scheduler(&ip_vs_lc_scheduler) != 0)
6790+ IP_VS_INFO("cannot remove LC scheduling module\n");
6791+ else
6792+ IP_VS_INFO("LC scheduling module unloaded.\n");
6793+}
6794+
6795+#endif /* MODULE */
6796diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_rr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_rr.c
6797--- linux-2.2.19/net/ipv4/ip_vs_rr.c Thu Jan 1 08:00:00 1970
6798+++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_rr.c Fri Nov 24 10:04:12 2000
6799@@ -0,0 +1,145 @@
6800+/*
6801+ * IPVS: Round-Robin Scheduling module
6802+ *
6803+ * Version: $Id$
6804+ *
6805+ * Authors: Wensong Zhang <wensong@iinchina.net>
6806+ * Peter Kese <peter.kese@ijs.si>
6807+ *
6808+ * This program is free software; you can redistribute it and/or
6809+ * modify it under the terms of the GNU General Public License
6810+ * as published by the Free Software Foundation; either version
6811+ * 2 of the License, or (at your option) any later version.
6812+ *
6813+ * Fixes/Changes:
6814+ * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
6815+ * Julian Anastasov : fixed the NULL pointer access bug in debugging
6816+ * Wensong Zhang : changed some cosmetic things for debugging
6817+ * Wensong Zhang : changed for the d-linked destination list
6818+ * Wensong Zhang : added the ip_vs_rr_update_svc
6819+ * Wensong Zhang : added any dest with weight=0 is quiesced
6820+ *
6821+ */
6822+
6823+#include <linux/config.h>
6824+#include <linux/module.h>
6825+#ifdef CONFIG_KMOD
6826+#include <linux/kmod.h>
6827+#endif
6828+#include <linux/types.h>
6829+#include <linux/kernel.h>
6830+#include <linux/errno.h>
6831+#include <net/ip_masq.h>
6832+#ifdef CONFIG_IP_MASQUERADE_MOD
6833+#include <net/ip_masq_mod.h>
6834+#endif
6835+#include <linux/ip_fw.h>
6836+#include <net/ip_vs.h>
6837+
6838+
6839+static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
6840+{
6841+ svc->sched_data = &svc->destinations;
6842+ MOD_INC_USE_COUNT;
6843+ return 0;
6844+}
6845+
6846+
6847+static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
6848+{
6849+ MOD_DEC_USE_COUNT;
6850+ return 0;
6851+}
6852+
6853+
6854+static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
6855+{
6856+ svc->sched_data = &svc->destinations;
6857+ return 0;
6858+}
6859+
6860+
6861+/*
6862+ * Round-Robin Scheduling
6863+ */
6864+static struct ip_vs_dest* ip_vs_rr_schedule(struct ip_vs_service *svc,
6865+ struct iphdr *iph)
6866+{
6867+ register struct list_head *p, *q;
6868+ struct ip_vs_dest *dest;
6869+
6870+ IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
6871+
6872+ p = (struct list_head *)svc->sched_data;
6873+ p = p->next;
6874+ q = p;
6875+ do {
6876+ if (q == &svc->destinations) {
6877+ q = q->next;
6878+ continue;
6879+ }
6880+ dest = list_entry(q, struct ip_vs_dest, n_list);
6881+ if (dest->weight > 0)
6882+ /* HIT */
6883+ goto out;
6884+ q = q->next;
6885+ } while (q != p);
6886+ return NULL;
6887+
6888+ out:
6889+ svc->sched_data = q;
6890+ IP_VS_DBG(6, "RR: server %d.%d.%d.%d:%d "
6891+ "activeconns %d refcnt %d weight %d\n",
6892+ NIPQUAD(dest->addr), ntohs(dest->port),
6893+ atomic_read(&dest->activeconns),
6894+ atomic_read(&dest->refcnt), dest->weight);
6895+
6896+ return dest;
6897+}
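The scheduler resumes from the last pick (sched_data), skips the list-head sentinel, and stops at the first server with nonzero weight, so quiesced servers drop out of the rotation without being unlinked. A user-space re-enactment over a hypothetical ring of three servers:

#include <stdio.h>

int main(void)
{
	const char *name[] = { "A", "B", "C" };
	int weight[] = { 1, 0, 1 };	/* B is quiesced */
	int n = 3, cur = 0, i, turn;

	for (turn = 0; turn < 4; turn++) {
		for (i = 0; i < n; i++) {	/* at most one full lap */
			cur = (cur + 1) % n;
			if (weight[cur] > 0)
				break;
		}
		printf("%s ", name[cur]);
	}
	printf("\n");	/* prints: C A C A */
	return 0;
}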
6898+
6899+
6900+static struct ip_vs_scheduler ip_vs_rr_scheduler = {
6901+ {0}, /* n_list */
6902+ "rr", /* name */
6903+ ATOMIC_INIT(0), /* refcnt */
6904+ ip_vs_rr_init_svc, /* service initializer */
6905+ ip_vs_rr_done_svc, /* service done */
6906+ ip_vs_rr_update_svc, /* service updater */
6907+ ip_vs_rr_schedule, /* select a server from the destination list */
6908+};
6909+
6910+
6911+__initfunc(int ip_vs_rr_init(void))
6912+{
6913+ IP_VS_INFO("Initializing RR scheduling\n");
6914+ INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
6915+	return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
6916+}
6917+
6918+
6919+#ifdef MODULE
6920+EXPORT_NO_SYMBOLS;
6921+
6922+int init_module(void)
6923+{
6924+ INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
6925+
6926+ /* module initialization by 'request_module' */
6927+	if (register_ip_vs_scheduler(&ip_vs_rr_scheduler) != 0)
6928+ return -EIO;
6929+
6930+ IP_VS_INFO("RR scheduling module loaded.\n");
6931+
6932+ return 0;
6933+}
6934+
6935+void cleanup_module(void)
6936+{
6937+ /* module cleanup by 'release_module' */
6938+	if (unregister_ip_vs_scheduler(&ip_vs_rr_scheduler) != 0)
6939+ IP_VS_INFO("cannot remove RR scheduling module\n");
6940+ else
6941+ IP_VS_INFO("RR scheduling module unloaded.\n");
6942+}
6943+
6944+#endif /* MODULE */
6945diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_wlc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wlc.c
6946--- linux-2.2.19/net/ipv4/ip_vs_wlc.c Thu Jan 1 08:00:00 1970
6947+++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wlc.c Fri Nov 24 09:59:32 2000
6948@@ -0,0 +1,176 @@
6949+/*
6950+ * IPVS: Weighted Least-Connection Scheduling module
6951+ *
6952+ * Version: $Id$
6953+ *
6954+ * Authors: Wensong Zhang <wensong@iinchina.net>
6955+ * Peter Kese <peter.kese@ijs.si>
6956+ *
6957+ * This program is free software; you can redistribute it and/or
6958+ * modify it under the terms of the GNU General Public License
6959+ * as published by the Free Software Foundation; either version
6960+ * 2 of the License, or (at your option) any later version.
6961+ *
6962+ * Changes:
6963+ * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
6964+ * Wensong Zhang : changed to use the inactconns in scheduling
6965+ * Wensong Zhang : changed some cosmetic things for debugging
6966+ * Wensong Zhang : changed for the d-linked destination list
6967+ * Wensong Zhang : added the ip_vs_wlc_update_svc
6968+ * Wensong Zhang : added any dest with weight=0 is quiesced
6969+ *
6970+ */
6971+
6972+#include <linux/config.h>
6973+#include <linux/module.h>
6974+#ifdef CONFIG_KMOD
6975+#include <linux/kmod.h>
6976+#endif
6977+#include <linux/types.h>
6978+#include <linux/kernel.h>
6979+#include <linux/errno.h>
6980+#include <net/ip_masq.h>
6981+#ifdef CONFIG_IP_MASQUERADE_MOD
6982+#include <net/ip_masq_mod.h>
6983+#endif
6984+#include <linux/ip_fw.h>
6985+#include <net/ip_vs.h>
6986+
6987+
6988+static int
6989+ip_vs_wlc_init_svc(struct ip_vs_service *svc)
6990+{
6991+ MOD_INC_USE_COUNT;
6992+ return 0;
6993+}
6994+
6995+
6996+static int
6997+ip_vs_wlc_done_svc(struct ip_vs_service *svc)
6998+{
6999+ MOD_DEC_USE_COUNT;
7000+ return 0;
7001+}
7002+
7003+
7004+static int
7005+ip_vs_wlc_update_svc(struct ip_vs_service *svc)
7006+{
7007+ return 0;
7008+}
7009+
7010+
7011+/*
7012+ * Weighted Least Connection scheduling
7013+ */
7014+static struct ip_vs_dest *
7015+ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
7016+{
7017+ register struct list_head *l, *e;
7018+ struct ip_vs_dest *dest, *least;
7019+ int loh, doh;
7020+
7021+ IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
7022+
7023+ l = &svc->destinations;
7024+ if (l == l->next)
7025+ return NULL;
7026+
7027+ /*
7028+	 * We consider the overhead of processing active connections to be
7029+	 * fifty times that of inactive connections on average. (This factor
7030+	 * of fifty may not be accurate; we will tune it later.) We use
7031+ * the following formula to estimate the overhead:
7032+ * dest->activeconns*50 + dest->inactconns
7033+ * and the load:
7034+ * (dest overhead) / dest->weight
7035+ *
7036+ * Remember -- no floats in kernel mode!!!
7037+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
7038+ * h1/w1 > h2/w2
7039+ * if every weight is larger than zero.
7040+ *
7041+ * The server with weight=0 is quiesced and will not receive any
7042+ * new connection.
7043+ */
7044+
7045+ for (e=l->next; e!=l; e=e->next) {
7046+ least = list_entry(e, struct ip_vs_dest, n_list);
7047+ if (least->weight > 0) {
7048+ loh = atomic_read(&least->activeconns) * 50
7049+ + atomic_read(&least->inactconns);
7050+ goto nextstage;
7051+ }
7052+ }
7053+ return NULL;
7054+
7055+ /*
7056+ * Find the destination with the least load.
7057+ */
7058+ nextstage:
7059+ for (e=e->next; e!=l; e=e->next) {
7060+ dest = list_entry(e, struct ip_vs_dest, n_list);
7061+ doh = atomic_read(&dest->activeconns) * 50
7062+ + atomic_read(&dest->inactconns);
7063+ if (loh * dest->weight > doh * least->weight) {
7064+ least = dest;
7065+ loh = doh;
7066+ }
7067+ }
7068+
7069+ IP_VS_DBG(6, "WLC: server %d.%d.%d.%d:%d "
7070+ "activeconns %d refcnt %d weight %d overhead %d\n",
7071+ NIPQUAD(least->addr), ntohs(least->port),
7072+ atomic_read(&least->activeconns),
7073+ atomic_read(&least->refcnt), least->weight, loh);
7074+
7075+ return least;
7076+}
7077+
7078+
7079+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
7080+{
7081+ {0}, /* n_list */
7082+ "wlc", /* name */
7083+ ATOMIC_INIT (0), /* refcnt */
7084+ ip_vs_wlc_init_svc, /* service initializer */
7085+ ip_vs_wlc_done_svc, /* service done */
7086+ ip_vs_wlc_update_svc, /* service updater */
7087+ ip_vs_wlc_schedule, /* select a server from the destination list */
7088+};
7089+
7090+
7091+__initfunc(int ip_vs_wlc_init (void))
7092+{
7093+ IP_VS_INFO("Initializing WLC scheduling\n");
7094+ INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
7095+ return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
7096+}
7097+
7098+
7099+#ifdef MODULE
7100+EXPORT_NO_SYMBOLS;
7101+
7102+int init_module(void)
7103+{
7104+ INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
7105+
7106+ /* module initialization by 'request_module' */
7107+ if (register_ip_vs_scheduler(&ip_vs_wlc_scheduler) != 0)
7108+ return -EIO;
7109+
7110+ IP_VS_INFO("WLC scheduling module loaded.\n");
7111+
7112+ return 0;
7113+}
7114+
7115+void cleanup_module(void)
7116+{
7117+ /* module cleanup by 'release_module' */
7118+ if (unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler) != 0)
7119+ IP_VS_INFO("cannot remove WLC scheduling module\n");
7120+ else
7121+ IP_VS_INFO("WLC scheduling module unloaded.\n");
7122+}
7123+
7124+#endif /* MODULE */
7125diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_wrr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wrr.c
7126--- linux-2.2.19/net/ipv4/ip_vs_wrr.c Thu Jan 1 08:00:00 1970
7127+++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wrr.c Fri Nov 24 09:57:23 2000
7128@@ -0,0 +1,209 @@
7129+/*
7130+ * IPVS: Weighted Round-Robin Scheduling module
7131+ *
7132+ * Version: $Id$
7133+ *
7134+ * Authors: Wensong Zhang <wensong@iinchina.net>
7135+ *
7136+ * This program is free software; you can redistribute it and/or
7137+ * modify it under the terms of the GNU General Public License
7138+ * as published by the Free Software Foundation; either version
7139+ * 2 of the License, or (at your option) any later version.
7140+ *
7141+ * Changes:
7142+ * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
7143+ * Wensong Zhang : changed some cosmetic things for debugging
7144+ * Wensong Zhang : changed for the d-linked destination list
7145+ * Wensong Zhang : added the ip_vs_wrr_update_svc
7146+ * Julian Anastasov : return -ENOMEM instead of ENOMEM in the
7147+ * ip_vs_wrr_init_svc
7148+ * Julian Anastasov : fixed the bug of returning destination
7149+ * with weight 0 when all weights are zero
7150+ *
7151+ */
7152+
7153+#include <linux/config.h>
7154+#include <linux/module.h>
7155+#ifdef CONFIG_KMOD
7156+#include <linux/kmod.h>
7157+#endif
7158+#include <linux/types.h>
7159+#include <linux/kernel.h>
7160+#include <linux/errno.h>
7161+#include <net/ip_masq.h>
7162+#ifdef CONFIG_IP_MASQUERADE_MOD
7163+#include <net/ip_masq_mod.h>
7164+#endif
7165+#include <linux/ip_fw.h>
7166+#include <net/ip_vs.h>
7167+
7168+/*
7169+ * current destination pointer for weighted round-robin scheduling
7170+ */
7171+struct ip_vs_wrr_mark {
7172+ struct list_head *cl; /* current list head */
7173+ int cw; /* current weight */
7174+};
7175+
7176+
7177+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
7178+{
7179+ /*
7180+ * Allocate the mark variable for WRR scheduling
7181+ */
7182+ svc->sched_data = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
7183+
7184+ if (svc->sched_data == NULL) {
7185+ IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
7186+ return -ENOMEM;
7187+ }
7188+ memset(svc->sched_data, 0, sizeof(struct ip_vs_wrr_mark));
7189+
7190+ ((struct ip_vs_wrr_mark*)svc->sched_data)->cl = &svc->destinations;
7191+
7192+ MOD_INC_USE_COUNT;
7193+ return 0;
7194+}
7195+
7196+
7197+static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
7198+{
7199+ /*
7200+ * Release the mark variable
7201+ */
7202+ kfree_s(svc->sched_data, sizeof(struct ip_vs_wrr_mark));
7203+
7204+ MOD_DEC_USE_COUNT;
7205+ return 0;
7206+}
7207+
7208+
7209+static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
7210+{
7211+ ((struct ip_vs_wrr_mark*)svc->sched_data)->cl = &svc->destinations;
7212+ return 0;
7213+}
7214+
7215+
7216+/*
7217+ * Get the maximum weight of the service destinations.
7218+ */
7219+int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
7220+{
7221+ register struct list_head *l, *e;
7222+ struct ip_vs_dest *dest;
7223+ int weight = 0;
7224+
7225+ l = &svc->destinations;
7226+ for (e=l->next; e!=l; e=e->next) {
7227+ dest = list_entry(e, struct ip_vs_dest, n_list);
7228+ if (dest->weight > weight)
7229+ weight = dest->weight;
7230+ }
7231+
7232+ return weight;
7233+}
7234+
7235+
7236+/*
7237+ * Weighted Round-Robin Scheduling
7238+ */
7239+static struct ip_vs_dest* ip_vs_wrr_schedule(struct ip_vs_service *svc,
7240+ struct iphdr *iph)
7241+{
7242+ struct ip_vs_dest *dest;
7243+ struct ip_vs_wrr_mark *mark = svc->sched_data;
7244+
7245+ IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
7246+
7247+ /*
7248+	 * This loop will always terminate, because 0 < mark->cw <= max_weight,
7249+ * and at least one server has its weight equal to max_weight.
7250+ */
7251+ while (1) {
7252+ if (mark->cl == &svc->destinations) {
7253+ /* it is at the head of the destination list */
7254+
7255+ if (mark->cl == mark->cl->next)
7256+ /* no dest entry */
7257+ return NULL;
7258+
7259+ mark->cl = svc->destinations.next;
7260+ mark->cw--;
7261+ if (mark->cw <= 0) {
7262+ mark->cw = ip_vs_wrr_max_weight(svc);
7263+ /*
7264+				 * Still zero, which means no available servers.
7265+ */
7266+ if (mark->cw == 0) {
7267+ mark->cl = &svc->destinations;
7268+ IP_VS_INFO("ip_vs_wrr_schedule(): "
7269+ "no available servers\n");
7270+ return NULL;
7271+ }
7272+ }
7273+ }
7274+ else mark->cl = mark->cl->next;
7275+
7276+ if (mark->cl != &svc->destinations) {
7277+ /* not at the head of the list */
7278+ dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
7279+ if (dest->weight >= mark->cw)
7280+ break;
7281+ }
7282+ }
7283+
7284+ IP_VS_DBG(6, "WRR: server %d.%d.%d.%d:%d "
7285+ "activeconns %d refcnt %d weight %d\n",
7286+ NIPQUAD(dest->addr), ntohs(dest->port),
7287+ atomic_read(&dest->activeconns),
7288+ atomic_read(&dest->refcnt), dest->weight);
7289+
7290+ return dest;
7291+}
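The loop above lowers the current-weight threshold cw by one on each pass over the list and takes any server whose weight reaches it, which spreads picks in proportion to the weights. A user-space re-enactment with a hypothetical ring of three servers (index -1 standing in for the list head):

#include <stdio.h>

int main(void)
{
	const char *name[] = { "A", "B", "C" };
	int weight[] = { 4, 3, 2 };
	int n = 3, max = 4;		/* ip_vs_wrr_max_weight() analogue */
	int cl = -1, cw = 0, picks;

	for (picks = 0; picks < 9; picks++) {
		for (;;) {
			if (cl == -1) {		/* passed the list head */
				cl = 0;
				if (--cw <= 0)
					cw = max;
			} else if (++cl == n) {	/* wrap back to the head */
				cl = -1;
			}
			if (cl != -1 && weight[cl] >= cw)
				break;
		}
		printf("%s ", name[cl]);
	}
	printf("\n");	/* prints: A A B A B C A B C */
	return 0;
}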
7292+
7293+
7294+static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
7295+ {0}, /* n_list */
7296+ "wrr", /* name */
7297+ ATOMIC_INIT(0), /* refcnt */
7298+ ip_vs_wrr_init_svc, /* service initializer */
7299+ ip_vs_wrr_done_svc, /* service done */
7300+ ip_vs_wrr_update_svc, /* service updater */
7301+ ip_vs_wrr_schedule, /* select a server from the destination list */
7302+};
7303+
7304+
7305+__initfunc(int ip_vs_wrr_init(void))
7306+{
7307+ IP_VS_INFO("Initializing WRR scheduling\n");
7308+ INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
7309+	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
7310+}
7311+
7312+#ifdef MODULE
7313+EXPORT_NO_SYMBOLS;
7314+
7315+int init_module(void)
7316+{
7317+ INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
7318+
7319+ /* module initialization by 'request_module' */
7320+	if (register_ip_vs_scheduler(&ip_vs_wrr_scheduler) != 0)
7321+ return -EIO;
7322+
7323+ IP_VS_INFO("WRR scheduling module loaded.\n");
7324+
7325+ return 0;
7326+}
7327+
7328+void cleanup_module(void)
7329+{
7330+ /* module cleanup by 'release_module' */
7331+	if (unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler) != 0)
7332+ IP_VS_INFO("cannot remove WRR scheduling module\n");
7333+ else
7334+ IP_VS_INFO("WRR scheduling module unloaded.\n");
7335+}
7336+
7337+#endif /* MODULE */
7338diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/sysctl_net_ipv4.c linux-2.2.19-vs-1.0.8/net/ipv4/sysctl_net_ipv4.c
7339--- linux-2.2.19/net/ipv4/sysctl_net_ipv4.c Tue Mar 27 09:33:49 2001
7340+++ linux-2.2.19-vs-1.0.8/net/ipv4/sysctl_net_ipv4.c Tue Mar 27 09:32:21 2001
7341@@ -69,6 +69,9 @@
7342 struct ipv4_config ipv4_config;
7343
7344 extern ctl_table ipv4_route_table[];
7345+#ifdef CONFIG_IP_MASQUERADE_VS
7346+extern ctl_table ipv4_vs_table[];
7347+#endif
7348
7349 #ifdef CONFIG_SYSCTL
7350
7351@@ -198,7 +201,10 @@
7352 {NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships",
7353 &sysctl_igmp_max_memberships, sizeof(int), 0644, NULL, &proc_dointvec},
7354 #endif
7355+#ifdef CONFIG_IP_MASQUERADE_VS
7356+ {NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table},
7357+#endif
7358 {0}
7359 };
7360-
7361+
7362 #endif /* CONFIG_SYSCTL */