]>
Commit | Line | Data |
---|---|---|
cb4dd8b9 | 1 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/Documentation/Configure.help linux-2.2.19-vs-1.0.8/Documentation/Configure.help |
2 | --- linux-2.2.19/Documentation/Configure.help Tue Mar 27 09:33:35 2001 | |
3 | +++ linux-2.2.19-vs-1.0.8/Documentation/Configure.help Tue Mar 27 09:32:02 2001 | |
4 | @@ -2807,6 +2807,118 @@ | |
5 | The module will be called ip_masq_markfw.o. If you want to compile | |
6 | it as a module, say M here and read Documentation/modules.txt. | |
7 | ||
8 | +IP: masquerading virtual server support | |
9 | +CONFIG_IP_MASQUERADE_VS | |
10 | + IP Virtual Server support will let you build a virtual server | |
11 | + based on cluster of two or more real servers. This option must | |
12 | + be enabled for at least one of the clustered computers that will | |
13 | + take care of intercepting incoming connections to a single IP | |
14 | + address and scheduling them to real servers. | |
15 | + | |
16 | + Three request dispatching techniques are implemented, they are | |
17 | + virtual server via NAT, virtual server via tunneling and virtual | |
18 | + server via direct routing. The round-robin scheduling, the weighted | |
19 | + round-robin scheduling, the weighted least-connection scheduling, | |
20 | + the locality-based least-connection scheduling, or the | |
21 | + locality-based least-connection with replication scheduling | |
22 | + algorithm can be used to choose which server the connection is | |
23 | + directed to, thus load balancing can be achieved among the servers. | |
24 | + For more information and its administration program, please visit | |
25 | + the following URL: | |
26 | + | |
27 | + http://www.linuxvirtualserver.org/ | |
28 | + If you want this, say Y. | |
29 | + | |
30 | +IP virtual server debugging | |
31 | +CONFIG_IP_VS_DEBUG | |
32 | + Say Y here if you want to get additional messages useful in | |
33 | + debugging the IP virtual server code. You can change the debug | |
34 | + level in /proc/sys/net/ipv4/vs/debug_level | |
35 | + | |
36 | +IP masquerading VS table size (the Nth power of 2) | |
37 | +CONFIG_IP_MASQUERADE_VS_TAB_BITS | |
38 | + Using a big ipvs hash table for virtual server will greatly reduce | |
39 | + conflicts in the ipvs hash table when there are hundreds of thousands | |
40 | + of active connections. | |
41 | + | |
42 | + Note the table size must be power of 2. The table size will be the | |
43 | + value of 2 to the power of your input number. For example, the default | |
44 | + number is 12, so the table size is 4096. Don't input the number too | |
45 | + small, otherwise you will lose performance on it. You can adapt the | |
46 | + table size yourself, according to your virtual server application. It | |
47 | + is good to set the table size not far less than the number of | |
48 | + connections per second multiplying average lasting time of connection | |
49 | + in the table. For example, your virtual server gets 200 connections | |
50 | + per second, the connection lasts for 200 seconds on average in the | |
51 | + masquerading table, the table size should be not far less than | |
52 | + 200x200, it is good to set the table size 32768 (2**15). | |
53 | + | |
54 | + Another note that each connection occupies 128 bytes effectively and | |
55 | + each hash entry uses 8 bytes, so you can estimate how much memory is | |
56 | + needed for your box. | |
57 | + | |
58 | +IPVS: round-robin scheduling | |
59 | +CONFIG_IP_MASQUERADE_VS_RR | |
60 | + The round-robin scheduling algorithm simply directs network | |
61 | + connections to different real servers in a round-robin manner. | |
62 | + If you want to compile it in kernel, say Y. If you want to compile | |
63 | + it as a module, say M here and read Documentation/modules.txt. | |
64 | + | |
65 | +IPVS: weighted round-robin scheduling | |
66 | +CONFIG_IP_MASQUERADE_VS_WRR | |
67 | + The weighted round-robin scheduling algorithm directs network | |
68 | + connections to different real servers based on server weights | |
69 | + in a round-robin manner. Servers with higher weights receive | |
70 | + new connections first than those with less weights, and servers | |
71 | + with higher weights get more connections than those with less | |
72 | + weights and servers with equal weights get equal connections. | |
73 | + If you want to compile it in kernel, say Y. If you want to compile | |
74 | + it as a module, say M here and read Documentation/modules.txt. | |
75 | + | |
76 | +IPVS: least-connection scheduling | |
77 | +CONFIG_IP_MASQUERADE_VS_LC | |
78 | + The least-connection scheduling algorithm directs network | |
79 | + connections to the server with the least number of active | |
80 | + connections. | |
81 | + If you want to compile it in kernel, say Y. If you want to compile | |
82 | + it as a module, say M here and read Documentation/modules.txt. | |
83 | + | |
84 | +IPVS: weighted least-connection scheduling | |
85 | +CONFIG_IP_MASQUERADE_VS_WLC | |
86 | + The weighted least-connection scheduling algorithm directs network | |
87 | + connections to the server with the least active connections | |
88 | + normalized by the server weight. | |
89 | + If you want to compile it in kernel, say Y. If you want to compile | |
90 | + it as a module, say M here and read Documentation/modules.txt. | |
91 | + | |
92 | +IPVS: locality-based least-connection scheduling | |
93 | +CONFIG_IP_MASQUERADE_VS_LBLC | |
94 | + The locality-based least-connection scheduling algorithm is for | |
95 | + destination IP load balancing. It is usually used in cache cluster. | |
96 | + This algorithm usually directs packet destined for an IP address to | |
97 | + its server if the server is alive and under load. If the server is | |
98 | + overloaded (its active connection numbers is larger than its weight) | |
99 | + and there is a server in its half load, then allocate the weighted | |
100 | + least-connection server to this IP address. | |
101 | + If you want to compile it in kernel, say Y. If you want to compile | |
102 | + it as a module, say M here and read Documentation/modules.txt. | |
103 | + | |
104 | +IPVS: locality-based least-connection with replication scheduling | |
105 | +CONFIG_IP_MASQUERADE_VS_LBLCR | |
106 | + The locality-based least-connection with replication scheduling | |
107 | + algorithm is also for destination IP load balancing. It is | |
108 | + usually used in cache cluster. It differs from the LBLC scheduling | |
109 | + as follows: the load balancer maintains mappings from a target | |
110 | + to a set of server nodes that can serve the target. Requests for | |
111 | + a target are assigned to the least-connection node in the target's | |
112 | + server set. If all the nodes in the server set are overloaded, | |
113 | + it picks up a least-connection node in the cluster and adds it | |
114 | + in the server set for the target. If the server set has not been | |
115 | + modified for the specified time, the most loaded node is removed | |
116 | + from the server set, in order to avoid high degree of replication. | |
117 | + If you want to compile it in kernel, say Y. If you want to compile | |
118 | + it as a module, say M here and read Documentation/modules.txt. | |
119 | + | |
120 | IP: aliasing support | |
121 | CONFIG_IP_ALIAS | |
122 | Sometimes it is useful to give several IP addresses to a single | |
123 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/linux/ip_masq.h linux-2.2.19-vs-1.0.8/include/linux/ip_masq.h | |
124 | --- linux-2.2.19/include/linux/ip_masq.h Sat Oct 23 17:02:32 1999 | |
125 | +++ linux-2.2.19-vs-1.0.8/include/linux/ip_masq.h Tue Dec 12 19:17:27 2000 | |
126 | @@ -103,6 +103,27 @@ | |
127 | ||
128 | #define IP_MASQ_MFW_SCHED 0x01 | |
129 | ||
130 | +/* | |
131 | + * Virtual server stuff | |
132 | + */ | |
133 | +struct ip_vs_user { | |
134 | + /* virtual service options */ | |
135 | + u_int16_t protocol; | |
136 | + u_int32_t vaddr; /* virtual address */ | |
137 | + u_int16_t vport; | |
138 | + u_int32_t vfwmark; /* firewall mark of virtual */ | |
139 | + unsigned vs_flags; /* virtual service flags */ | |
140 | + unsigned timeout; /* persistent timeout in ticks */ | |
141 | + u_int32_t netmask; /* persistent netmask */ | |
142 | + | |
143 | + /* destination specific options */ | |
144 | + u_int32_t daddr; /* real destination address */ | |
145 | + u_int16_t dport; | |
146 | + unsigned masq_flags; /* destination flags */ | |
147 | + int weight; /* destination weight */ | |
148 | +}; | |
149 | + | |
150 | + | |
151 | #define IP_FW_MASQCTL_MAX 256 | |
152 | #define IP_MASQ_TNAME_MAX 32 | |
153 | ||
154 | @@ -115,6 +136,7 @@ | |
155 | struct ip_autofw_user autofw_user; | |
156 | struct ip_mfw_user mfw_user; | |
157 | struct ip_masq_user user; | |
158 | + struct ip_vs_user vs_user; | |
159 | unsigned char m_raw[IP_FW_MASQCTL_MAX]; | |
160 | } u; | |
161 | }; | |
162 | @@ -124,7 +146,9 @@ | |
163 | #define IP_MASQ_TARGET_CORE 1 | |
164 | #define IP_MASQ_TARGET_MOD 2 /* masq_mod is selected by "name" */ | |
165 | #define IP_MASQ_TARGET_USER 3 | |
166 | -#define IP_MASQ_TARGET_LAST 4 | |
167 | +#define IP_MASQ_TARGET_VS 4 | |
168 | +#define IP_MASQ_TARGET_LAST 5 | |
169 | + | |
170 | ||
171 | #define IP_MASQ_CMD_NONE 0 /* just peek */ | |
172 | #define IP_MASQ_CMD_INSERT 1 | |
173 | @@ -136,5 +160,9 @@ | |
174 | #define IP_MASQ_CMD_LIST 7 /* actually fake: done via /proc */ | |
175 | #define IP_MASQ_CMD_ENABLE 8 | |
176 | #define IP_MASQ_CMD_DISABLE 9 | |
177 | +#define IP_MASQ_CMD_ADD_DEST 10 /* for adding dest in IPVS */ | |
178 | +#define IP_MASQ_CMD_DEL_DEST 11 /* for deleting dest in IPVS */ | |
179 | +#define IP_MASQ_CMD_SET_DEST 12 /* for setting dest in IPVS */ | |
180 | ||
181 | #endif /* _LINUX_IP_MASQ_H */ | |
182 | + | |
183 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/linux/sysctl.h linux-2.2.19-vs-1.0.8/include/linux/sysctl.h | |
184 | --- linux-2.2.19/include/linux/sysctl.h Tue Mar 27 09:33:48 2001 | |
185 | +++ linux-2.2.19-vs-1.0.8/include/linux/sysctl.h Tue Mar 27 09:32:20 2001 | |
186 | @@ -196,6 +196,7 @@ | |
187 | NET_IPV4_NEIGH=17, | |
188 | NET_IPV4_ROUTE=18, | |
189 | NET_IPV4_FIB_HASH=19, | |
190 | + NET_IPV4_VS=20, | |
191 | ||
192 | NET_IPV4_TCP_TIMESTAMPS=33, | |
193 | NET_IPV4_TCP_WINDOW_SCALING=34, | |
194 | @@ -275,6 +276,32 @@ | |
195 | NET_IPV4_CONF_LOG_MARTIANS=11, | |
196 | NET_IPV4_CONF_HIDDEN=12, | |
197 | NET_IPV4_CONF_ARPFILTER=13 | |
198 | +}; | |
199 | + | |
200 | +/* /proc/sys/net/ipv4/vs */ | |
201 | + | |
202 | +enum | |
203 | +{ | |
204 | + NET_IPV4_VS_AMEMTHRESH=1, | |
205 | + NET_IPV4_VS_AMDROPRATE=2, | |
206 | + NET_IPV4_VS_DROP_ENTRY=3, | |
207 | + NET_IPV4_VS_DROP_PACKET=4, | |
208 | + NET_IPV4_VS_SECURE_TCP=5, | |
209 | + NET_IPV4_VS_TO_ES=6, | |
210 | + NET_IPV4_VS_TO_SS=7, | |
211 | + NET_IPV4_VS_TO_SR=8, | |
212 | + NET_IPV4_VS_TO_FW=9, | |
213 | + NET_IPV4_VS_TO_TW=10, | |
214 | + NET_IPV4_VS_TO_CL=11, | |
215 | + NET_IPV4_VS_TO_CW=12, | |
216 | + NET_IPV4_VS_TO_LA=13, | |
217 | + NET_IPV4_VS_TO_LI=14, | |
218 | + NET_IPV4_VS_TO_SA=15, | |
219 | + NET_IPV4_VS_TO_UDP=16, | |
220 | + NET_IPV4_VS_TO_ICMP=17, | |
221 | + NET_IPV4_VS_DEBUG_LEVEL=18, | |
222 | + NET_IPV4_VS_LBLC_EXPIRE=19, | |
223 | + NET_IPV4_VS_LBLCR_EXPIRE=20, | |
224 | }; | |
225 | ||
226 | /* /proc/sys/net/ipv6 */ | |
227 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip.h linux-2.2.19-vs-1.0.8/include/net/ip.h | |
228 | --- linux-2.2.19/include/net/ip.h Tue Mar 27 09:33:48 2001 | |
229 | +++ linux-2.2.19-vs-1.0.8/include/net/ip.h Tue Mar 27 17:48:23 2001 | |
230 | @@ -47,6 +47,9 @@ | |
231 | #define IPSKB_MASQUERADED 1 | |
232 | #define IPSKB_TRANSLATED 2 | |
233 | #define IPSKB_FORWARDED 4 | |
234 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
235 | +#define IPSKB_REDIRECTED 8 | |
236 | +#endif | |
237 | }; | |
238 | ||
239 | struct ipcm_cookie | |
240 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip_masq.h linux-2.2.19-vs-1.0.8/include/net/ip_masq.h | |
241 | --- linux-2.2.19/include/net/ip_masq.h Tue Mar 27 09:33:48 2001 | |
242 | +++ linux-2.2.19-vs-1.0.8/include/net/ip_masq.h Wed Apr 18 16:17:59 2001 | |
243 | @@ -12,8 +12,15 @@ | |
244 | #include <linux/ip.h> | |
245 | #include <linux/skbuff.h> | |
246 | #include <linux/list.h> | |
247 | +#ifdef CONFIG_SYSCTL | |
248 | +#include <linux/sysctl.h> | |
249 | +#endif | |
250 | #endif /* __KERNEL__ */ | |
251 | ||
252 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
253 | +struct ip_vs_dest; | |
254 | +#endif | |
255 | + | |
256 | /* | |
257 | * This define affects the number of ports that can be handled | |
258 | * by each of the protocol helper modules. | |
259 | @@ -66,10 +73,6 @@ | |
260 | #define IP_MASQ_MOD_CTL 0x00 | |
261 | #define IP_MASQ_USER_CTL 0x01 | |
262 | ||
263 | -#ifdef __KERNEL__ | |
264 | - | |
265 | -#define IP_MASQ_TAB_SIZE 256 | |
266 | - | |
267 | #define IP_MASQ_F_NO_DADDR 0x0001 /* no daddr yet */ | |
268 | #define IP_MASQ_F_NO_DPORT 0x0002 /* no dport set yet */ | |
269 | #define IP_MASQ_F_NO_SADDR 0x0004 /* no sport set yet */ | |
270 | @@ -86,6 +89,22 @@ | |
271 | #define IP_MASQ_F_USER 0x2000 /* from uspace */ | |
272 | #define IP_MASQ_F_SIMPLE_HASH 0x8000 /* prevent s+d and m+d hashing */ | |
273 | ||
274 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
275 | +#define IP_MASQ_F_VS 0x00010000 /* virtual server related */ | |
276 | +#define IP_MASQ_F_VS_NO_OUTPUT 0x00020000 /* output packets avoid masq */ | |
277 | +#define IP_MASQ_F_VS_INACTIVE 0x00040000 /* not established */ | |
278 | +#define IP_MASQ_F_VS_FWD_MASK 0x00700000 /* mask for the fwd method */ | |
279 | +#define IP_MASQ_F_VS_LOCALNODE 0x00100000 /* local node destination */ | |
280 | +#define IP_MASQ_F_VS_TUNNEL 0x00200000 /* packets will be tunneled */ | |
281 | +#define IP_MASQ_F_VS_DROUTE 0x00400000 /* direct routing */ | |
282 | + /* masquerading otherwise */ | |
283 | +#define IP_MASQ_VS_FWD(ms) (ms->flags & IP_MASQ_F_VS_FWD_MASK) | |
284 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
285 | + | |
286 | +#ifdef __KERNEL__ | |
287 | + | |
288 | +#define IP_MASQ_TAB_SIZE 256 | |
289 | + | |
290 | /* | |
291 | * Delta seq. info structure | |
292 | * Each MASQ struct has 2 (output AND input seq. changes). | |
293 | @@ -114,9 +133,13 @@ | |
294 | struct ip_masq *control; /* Master control connection */ | |
295 | atomic_t n_control; /* Number of "controlled" masqs */ | |
296 | unsigned flags; /* status flags */ | |
297 | - unsigned timeout; /* timeout */ | |
298 | + unsigned long timeout; /* timeout */ | |
299 | unsigned state; /* state info */ | |
300 | struct ip_masq_timeout_table *timeout_table; | |
301 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
302 | + struct ip_vs_dest *dest; /* real server */ | |
303 | + atomic_t in_pkts; /* incoming packet counter */ | |
304 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
305 | }; | |
306 | ||
307 | /* | |
308 | @@ -179,7 +202,7 @@ | |
309 | extern struct list_head ip_masq_d_table[IP_MASQ_TAB_SIZE]; | |
310 | extern const char * ip_masq_state_name(int state); | |
311 | extern struct ip_masq_hook *ip_masq_user_hook; | |
312 | -extern u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope); | |
313 | +extern int ip_masq_select_addr(struct sk_buff *skb,__u32 *maddr); | |
314 | /* | |
315 | * | |
316 | * IP_MASQ_APP: IP application masquerading definitions | |
317 | @@ -354,6 +377,10 @@ | |
318 | static const char *strProt[] = {"UDP","TCP","ICMP"}; | |
319 | int msproto = masq_proto_num(proto); | |
320 | ||
321 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
322 | + if (proto == IPPROTO_IP) | |
323 | + return "IP "; | |
324 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
325 | if (msproto<0||msproto>2) { | |
326 | sprintf(buf, "IP_%d", proto); | |
327 | return buf; | |
328 | @@ -372,6 +399,9 @@ | |
329 | IP_MASQ_S_CLOSE_WAIT, | |
330 | IP_MASQ_S_LAST_ACK, | |
331 | IP_MASQ_S_LISTEN, | |
332 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
333 | + IP_MASQ_S_SYNACK, | |
334 | +#endif | |
335 | IP_MASQ_S_UDP, | |
336 | IP_MASQ_S_ICMP, | |
337 | IP_MASQ_S_LAST | |
338 | @@ -395,8 +425,33 @@ | |
339 | ||
340 | if (!mstim) | |
341 | return; | |
342 | + ms->timeout_table = NULL; | |
343 | atomic_dec(&mstim->refcnt); | |
344 | } | |
345 | + | |
346 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
347 | + | |
348 | +extern struct ip_masq_timeout_table masq_timeout_table_dos; | |
349 | +extern void ip_masq_secure_tcp_set(int on); | |
350 | + | |
351 | +/* | |
352 | + * This is a simple mechanism to ignore packets when | |
353 | + * we are loaded. Just set ip_masq_drop_rate to 'n' and | |
354 | + * we start to drop 1/n of the packets | |
355 | + */ | |
356 | + | |
357 | +extern int ip_masq_drop_rate; | |
358 | +extern int ip_masq_drop_counter; | |
359 | + | |
360 | +static __inline__ int ip_masq_todrop(void) | |
361 | +{ | |
362 | + if (!ip_masq_drop_rate) return 0; | |
363 | + if (--ip_masq_drop_counter > 0) return 0; | |
364 | + ip_masq_drop_counter = ip_masq_drop_rate; | |
365 | + return 1; | |
366 | +} | |
367 | + | |
368 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
369 | ||
370 | #endif /* __KERNEL__ */ | |
371 | ||
372 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/include/net/ip_vs.h linux-2.2.19-vs-1.0.8/include/net/ip_vs.h | |
373 | --- linux-2.2.19/include/net/ip_vs.h Thu Jan 1 08:00:00 1970 | |
374 | +++ linux-2.2.19-vs-1.0.8/include/net/ip_vs.h Tue Apr 24 18:07:00 2001 | |
375 | @@ -0,0 +1,392 @@ | |
376 | +/* | |
377 | + * IP virtual server | |
378 | + * data structure and functionality definitions | |
379 | + */ | |
380 | + | |
381 | +#include <linux/config.h> | |
382 | + | |
383 | +#ifndef _IP_VS_H | |
384 | +#define _IP_VS_H | |
385 | + | |
386 | +#define IP_VS_VERSION_CODE 0x010008 | |
387 | +#define NVERSION(version) \ | |
388 | + (version >> 16) & 0xFF, \ | |
389 | + (version >> 8) & 0xFF, \ | |
390 | + version & 0xFF | |
391 | + | |
392 | +/* | |
393 | + * Virtual Service Flags | |
394 | + */ | |
395 | +#define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */ | |
396 | +#define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */ | |
397 | + | |
398 | +/* | |
399 | + * Destination Server Flags | |
400 | + */ | |
401 | +#define IP_VS_DEST_F_AVAILABLE 0x0001 /* Available tag */ | |
402 | + | |
403 | +/* | |
404 | + * The default IP_VS_TEMPLATE_TIMEOUT is a little larger than average | |
405 | + * connection time plus MASQUERADE_EXPIRE_TCP_FIN(2*60*HZ). Because the | |
406 | + * template won't be released until its controlled masq entries are | |
407 | + * expired. | |
408 | + * If IP_VS_TEMPLATE_TIMEOUT is too small, the template will soon expire | |
409 | + * and will be put in expire again and again, which requires additional | |
410 | + * overhead. If it is too large, the same will always visit the same | |
411 | + * server, which will make dynamic load imbalance worse. | |
412 | + */ | |
413 | +#define IP_VS_TEMPLATE_TIMEOUT 6*60*HZ | |
414 | + | |
415 | +#ifdef __KERNEL__ | |
416 | + | |
417 | +extern int ip_vs_forwarding_related_icmp(struct sk_buff *skb); | |
418 | + | |
419 | +#ifdef CONFIG_IP_VS_DEBUG | |
420 | +extern int ip_vs_get_debug_level(void); | |
421 | +#define IP_VS_DBG(level, msg...) \ | |
422 | + do { \ | |
423 | + if (level <= ip_vs_get_debug_level()) \ | |
424 | + printk(KERN_DEBUG "IPVS: " ## msg); \ | |
425 | + } while (0) | |
426 | +#else /* NO DEBUGGING at ALL */ | |
427 | +#define IP_VS_DBG(level, msg...) do {} while (0) | |
428 | +#endif | |
429 | + | |
430 | +#define IP_VS_ERR(msg...) printk(KERN_ERR "IPVS: " ## msg ) | |
431 | +#define IP_VS_INFO(msg...) printk(KERN_INFO "IPVS: " ## msg ) | |
432 | +#define IP_VS_WARNING(msg...) \ | |
433 | + printk(KERN_WARNING "IPVS: " ## msg) | |
434 | + | |
435 | +#ifdef CONFIG_IP_VS_DEBUG | |
436 | +#define EnterFunction(level) \ | |
437 | + do { \ | |
438 | + if (level <= ip_vs_get_debug_level()) \ | |
439 | + printk(KERN_DEBUG "Enter: %s, %s line %i\n", \ | |
440 | + __FUNCTION__, __FILE__, __LINE__); \ | |
441 | + } while (0) | |
442 | +#define LeaveFunction(level) \ | |
443 | + do { \ | |
444 | + if (level <= ip_vs_get_debug_level()) \ | |
445 | + printk(KERN_DEBUG "Leave: %s, %s line %i\n", \ | |
446 | + __FUNCTION__, __FILE__, __LINE__); \ | |
447 | + } while (0) | |
448 | +#else | |
449 | +#define EnterFunction(level) do {} while (0) | |
450 | +#define LeaveFunction(level) do {} while (0) | |
451 | +#endif | |
452 | + | |
453 | + | |
454 | +/* | |
455 | + * IPVS statistics object | |
456 | + */ | |
457 | +struct ip_vs_stats | |
458 | +{ | |
459 | + spinlock_t lock; /* spin lock */ | |
460 | + __u32 conns; /* connections scheduled */ | |
461 | + __u32 inpkts; /* incoming packets */ | |
462 | + __u32 outpkts; /* outgoing packets */ | |
463 | + __u64 inbytes; /* incoming bytes */ | |
464 | + __u64 outbytes; /* outgoing bytes */ | |
465 | +}; | |
466 | + | |
467 | + | |
468 | +/* | |
469 | + * The real server destination forwarding entry | |
470 | + * with ip address, port | |
471 | + */ | |
472 | +struct ip_vs_dest { | |
473 | + struct list_head n_list; /* for the dests in the service */ | |
474 | + struct list_head d_list; /* for table with all the dests */ | |
475 | + | |
476 | + __u32 addr; /* IP address of real server */ | |
477 | + __u16 port; /* port number of the service */ | |
478 | + unsigned flags; /* dest status flags */ | |
479 | + unsigned masq_flags; /* flags to copy to masq */ | |
480 | + atomic_t activeconns; /* active connections */ | |
481 | + atomic_t inactconns; /* inactive connections */ | |
482 | + atomic_t refcnt; /* reference counter */ | |
483 | + int weight; /* server weight */ | |
484 | + struct ip_vs_stats stats; /* statistics */ | |
485 | + | |
486 | + /* for virtual service */ | |
487 | + struct ip_vs_service *svc; /* service that it belongs to */ | |
488 | + __u16 protocol; /* which protocol (TCP/UDP) */ | |
489 | + __u32 vaddr; /* IP address for virtual service */ | |
490 | + __u16 vport; /* port number for the service */ | |
491 | + __u32 vfwmark; /* firewall mark of the service */ | |
492 | +}; | |
493 | + | |
494 | + | |
495 | +/* | |
496 | + * The scheduler object | |
497 | + */ | |
498 | +struct ip_vs_scheduler { | |
499 | + struct list_head n_list; /* d-linked list head */ | |
500 | + char *name; /* scheduler name */ | |
501 | + atomic_t refcnt; /* reference counter */ | |
502 | + | |
503 | + /* scheduler initializing service */ | |
504 | + int (*init_service)(struct ip_vs_service *svc); | |
505 | + /* scheduling service finish */ | |
506 | + int (*done_service)(struct ip_vs_service *svc); | |
507 | + /* scheduler updating service */ | |
508 | + int (*update_service)(struct ip_vs_service *svc); | |
509 | + | |
510 | + /* selecting a server from the given service */ | |
511 | + struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, | |
512 | + struct iphdr *iph); | |
513 | +}; | |
514 | + | |
515 | + | |
516 | +/* | |
517 | + * The information about the virtual service offered to the net | |
518 | + * and the forwarding entries | |
519 | + */ | |
520 | +struct ip_vs_service { | |
521 | + struct list_head s_list; /* for normal service table */ | |
522 | + struct list_head f_list; /* for fwmark-based service table */ | |
523 | + atomic_t refcnt; /* reference counter */ | |
524 | + | |
525 | + __u16 protocol; /* which protocol (TCP/UDP) */ | |
526 | + __u32 addr; /* IP address for virtual service */ | |
527 | + __u16 port; /* port number for the service */ | |
528 | + __u32 fwmark; /* firewall mark of the service */ | |
529 | + unsigned flags; /* service status flags */ | |
530 | + unsigned timeout; /* persistent timeout in ticks */ | |
531 | + __u32 netmask; /* grouping granularity */ | |
532 | + struct list_head destinations; /* real server d-linked list */ | |
533 | + struct ip_vs_stats stats; /* statistics for the service */ | |
534 | + | |
535 | + /* for scheduling */ | |
536 | + struct ip_vs_scheduler *scheduler; /* bound scheduler object */ | |
537 | + void *sched_data; /* scheduler application data */ | |
538 | +}; | |
539 | + | |
540 | + | |
541 | +/* | |
542 | + * IP Virtual Server masq entry hash table | |
543 | + */ | |
544 | +#define IP_VS_TAB_BITS CONFIG_IP_MASQUERADE_VS_TAB_BITS | |
545 | +#define IP_VS_TAB_SIZE (1 << IP_VS_TAB_BITS) | |
546 | +#define IP_VS_TAB_MASK (IP_VS_TAB_SIZE - 1) | |
547 | +extern struct list_head *ip_vs_table; | |
548 | + | |
549 | +/* | |
550 | + * Hash and unhash functions | |
551 | + */ | |
552 | +extern int ip_vs_hash(struct ip_masq *ms); | |
553 | +extern int ip_vs_unhash(struct ip_masq *ms); | |
554 | + | |
555 | +/* | |
556 | + * Registering/unregistering scheduler functions | |
557 | + */ | |
558 | +extern int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); | |
559 | +extern int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); | |
560 | + | |
561 | +/* | |
562 | + * Lookup functions for the hash table (caller must lock table) | |
563 | + */ | |
564 | +extern struct ip_masq * __ip_vs_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port); | |
565 | +extern struct ip_masq * __ip_vs_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port); | |
566 | + | |
567 | +/* | |
568 | + * Creating a masquerading entry for IPVS | |
569 | + */ | |
570 | +extern struct ip_masq * ip_masq_new_vs(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned flags); | |
571 | + | |
572 | +/* | |
573 | + * IPVS data and functions | |
574 | + */ | |
575 | +extern rwlock_t __ip_vs_lock; | |
576 | + | |
577 | +extern void ip_vs_set_state(struct ip_masq *ms, int new_state); | |
578 | +extern void ip_vs_bind_masq(struct ip_masq *ms, struct ip_vs_dest *dest); | |
579 | +extern void ip_vs_unbind_masq(struct ip_masq *ms); | |
580 | + | |
581 | +extern int ip_vs_ctl(int optname, struct ip_masq_ctl *mctl, int optlen); | |
582 | +extern struct ip_vs_service * | |
583 | +ip_vs_lookup_service(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport); | |
584 | +extern struct ip_vs_service * ip_vs_lookup_svc_fwm(__u32 fwmark); | |
585 | +extern struct ip_vs_dest * | |
586 | +__ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport); | |
587 | +extern struct ip_vs_dest *ip_vs_lookup_dest(struct ip_vs_service *svc, | |
588 | + __u32 daddr, __u16 dport); | |
589 | +extern struct ip_masq * ip_vs_schedule(struct ip_vs_service *svc, | |
590 | + struct iphdr *iph); | |
591 | +extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb); | |
592 | +extern int ip_vs_tunnel_xmit(struct sk_buff *skb, __u32 daddr); | |
593 | +extern int ip_vs_dr_xmit(struct sk_buff *skb, __u32 daddr); | |
594 | + | |
595 | +/* | |
596 | + * init function | |
597 | + */ | |
598 | +extern int ip_vs_init(void); | |
599 | + | |
600 | +/* | |
601 | + * init function prototypes for scheduling modules | |
602 | + * these functions will be called when they are built in kernel | |
603 | + */ | |
604 | +extern int ip_vs_rr_init(void); | |
605 | +extern int ip_vs_wrr_init(void); | |
606 | +extern int ip_vs_lc_init(void); | |
607 | +extern int ip_vs_wlc_init(void); | |
608 | +extern int ip_vs_lblc_init(void); | |
609 | +extern int ip_vs_lblcr_init(void); | |
610 | + | |
611 | + | |
612 | +/* | |
613 | + * Slow timer functions for IPVS | |
614 | + */ | |
615 | +extern void add_sltimer(struct timer_list * timer); | |
616 | +extern int del_sltimer(struct timer_list * timer); | |
617 | +extern void mod_sltimer(struct timer_list *timer, unsigned long expires); | |
618 | + | |
619 | + | |
620 | +/* | |
621 | + * IP Virtual Server statistics | |
622 | + */ | |
623 | +extern struct ip_vs_stats ip_vs_stats; | |
624 | + | |
625 | +extern __inline__ void | |
626 | +ip_vs_in_stats(struct ip_masq *ms, struct sk_buff *skb) | |
627 | +{ | |
628 | + struct ip_vs_dest *dest = ms->dest; | |
629 | + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | |
630 | + spin_lock(&dest->stats.lock); | |
631 | + dest->stats.inpkts++; | |
632 | + dest->stats.inbytes += skb->len; | |
633 | + spin_unlock(&dest->stats.lock); | |
634 | + | |
635 | + spin_lock(&dest->svc->stats.lock); | |
636 | + dest->svc->stats.inpkts++; | |
637 | + dest->svc->stats.inbytes += skb->len; | |
638 | + spin_unlock(&dest->svc->stats.lock); | |
639 | + | |
640 | + spin_lock(&ip_vs_stats.lock); | |
641 | + ip_vs_stats.inpkts++; | |
642 | + ip_vs_stats.inbytes += skb->len; | |
643 | + spin_unlock(&ip_vs_stats.lock); | |
644 | + } | |
645 | +} | |
646 | + | |
647 | + | |
648 | +extern __inline__ void | |
649 | +ip_vs_out_stats(struct ip_masq *ms, struct sk_buff *skb) | |
650 | +{ | |
651 | + struct ip_vs_dest *dest = ms->dest; | |
652 | + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | |
653 | + spin_lock(&dest->stats.lock); | |
654 | + dest->stats.outpkts++; | |
655 | + dest->stats.outbytes += skb->len; | |
656 | + spin_unlock(&dest->stats.lock); | |
657 | + | |
658 | + spin_lock(&dest->svc->stats.lock); | |
659 | + dest->svc->stats.outpkts++; | |
660 | + dest->svc->stats.outbytes += skb->len; | |
661 | + spin_unlock(&dest->svc->stats.lock); | |
662 | + | |
663 | + spin_lock(&ip_vs_stats.lock); | |
664 | + ip_vs_stats.outpkts++; | |
665 | + ip_vs_stats.outbytes += skb->len; | |
666 | + spin_unlock(&ip_vs_stats.lock); | |
667 | + } | |
668 | +} | |
669 | + | |
670 | + | |
671 | +extern __inline__ void | |
672 | +ip_vs_conn_stats(struct ip_masq *ms, struct ip_vs_service *svc) | |
673 | +{ | |
674 | + spin_lock(&ms->dest->stats.lock); | |
675 | + ms->dest->stats.conns++; | |
676 | + spin_unlock(&ms->dest->stats.lock); | |
677 | + | |
678 | + spin_lock(&svc->stats.lock); | |
679 | + svc->stats.conns++; | |
680 | + spin_unlock(&svc->stats.lock); | |
681 | + | |
682 | + spin_lock(&ip_vs_stats.lock); | |
683 | + ip_vs_stats.conns++; | |
684 | + spin_unlock(&ip_vs_stats.lock); | |
685 | +} | |
686 | + | |
687 | + | |
688 | +/* | |
689 | + * ip_vs_fwd_tag returns the forwarding tag of the masq | |
690 | + */ | |
691 | +extern __inline__ char ip_vs_fwd_tag(struct ip_masq *ms) | |
692 | +{ | |
693 | + char fwd = 'M'; | |
694 | + | |
695 | + switch (IP_MASQ_VS_FWD(ms)) { | |
696 | + case IP_MASQ_F_VS_LOCALNODE: fwd = 'L'; break; | |
697 | + case IP_MASQ_F_VS_TUNNEL: fwd = 'T'; break; | |
698 | + case IP_MASQ_F_VS_DROUTE: fwd = 'R'; break; | |
699 | + } | |
700 | + return fwd; | |
701 | +} | |
702 | + | |
703 | + | |
704 | +extern __inline__ char * ip_vs_fwd_name(unsigned masq_flags) | |
705 | +{ | |
706 | + char *fwd; | |
707 | + | |
708 | + switch (masq_flags & IP_MASQ_F_VS_FWD_MASK) { | |
709 | + case IP_MASQ_F_VS_LOCALNODE: | |
710 | + fwd = "Local"; | |
711 | + break; | |
712 | + case IP_MASQ_F_VS_TUNNEL: | |
713 | + fwd = "Tunnel"; | |
714 | + break; | |
715 | + case IP_MASQ_F_VS_DROUTE: | |
716 | + fwd = "Route"; | |
717 | + break; | |
718 | + default: | |
719 | + fwd = "Masq"; | |
720 | + } | |
721 | + return fwd; | |
722 | +} | |
723 | + | |
724 | + | |
725 | +/* | |
726 | + * ip_vs_forward counts the packet in ms->in_pkts and forwards it | |
727 | + * through tunneling, direct routing or local node (upper layer). | |
728 | + * Return values mean: | |
729 | + * 0 skb must be passed to the upper layer (local node) | |
730 | + * -1 skb must be released (no forwarding method matched) | |
731 | + * -2 skb has been released (tunneling or direct routing) | |
732 | + */ | |
733 | +extern __inline__ int ip_vs_forward(struct sk_buff *skb, struct ip_masq *ms) | |
734 | +{ | |
735 | + int ret = -1; /* kept when none of the cases below matches */ | |
736 | + | |
737 | + atomic_inc(&ms->in_pkts); | |
738 | + | |
739 | + switch (IP_MASQ_VS_FWD(ms)) { | |
740 | + case IP_MASQ_F_VS_TUNNEL: | |
741 | + if (ip_vs_tunnel_xmit(skb, ms->saddr) == 0) { | |
742 | + IP_VS_DBG(10, "tunneling failed.\n"); | |
743 | + } else { | |
744 | + IP_VS_DBG(10, "tunneling succeeded.\n"); | |
745 | + } | |
746 | + ret = -2; /* same result whether or not the transmit succeeded */ | |
747 | + break; | |
748 | + | |
749 | + case IP_MASQ_F_VS_DROUTE: | |
750 | + if (ip_vs_dr_xmit(skb, ms->saddr) == 0) { | |
751 | + IP_VS_DBG(10, "direct routing failed.\n"); | |
752 | + } else { | |
753 | + IP_VS_DBG(10, "direct routing succeeded.\n"); | |
754 | + } | |
755 | + ret = -2; /* same result whether or not the transmit succeeded */ | |
756 | + break; | |
757 | + | |
758 | + case IP_MASQ_F_VS_LOCALNODE: | |
759 | + ret = 0; | |
760 | + } | |
761 | + | |
762 | + return ret; | |
763 | +} | |
764 | + | |
765 | +#endif /* __KERNEL__ */ | |
766 | + | |
767 | +#endif /* _IP_VS_H */ | |
768 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/Config.in linux-2.2.19-vs-1.0.8/net/ipv4/Config.in | |
769 | --- linux-2.2.19/net/ipv4/Config.in Sat Dec 16 23:10:12 2000 | |
770 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/Config.in Tue Dec 12 18:35:06 2000 | |
771 | @@ -51,6 +51,17 @@ | |
772 | tristate 'IP: ipportfw masq support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_IPPORTFW | |
773 | tristate 'IP: ip fwmark masq-forwarding support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_MFW | |
774 | fi | |
775 | + bool 'IP: masquerading virtual server support (EXPERIMENTAL)' CONFIG_IP_MASQUERADE_VS | |
776 | + if [ "$CONFIG_IP_MASQUERADE_VS" = "y" ]; then | |
777 | + bool ' IP virtual server debugging' CONFIG_IP_VS_DEBUG | |
778 | + int ' IP masquerading VS table size (the Nth power of 2)' CONFIG_IP_MASQUERADE_VS_TAB_BITS 12 | |
779 | + tristate ' IPVS: round-robin scheduling' CONFIG_IP_MASQUERADE_VS_RR | |
780 | + tristate ' IPVS: weighted round-robin scheduling' CONFIG_IP_MASQUERADE_VS_WRR | |
781 | + tristate ' IPVS: least-connection scheduling' CONFIG_IP_MASQUERADE_VS_LC | |
782 | + tristate ' IPVS: weighted least-connection scheduling' CONFIG_IP_MASQUERADE_VS_WLC | |
783 | + tristate ' IPVS: locality-based least-connection scheduling' CONFIG_IP_MASQUERADE_VS_LBLC | |
784 | + tristate ' IPVS: locality-based least-connection with replication scheduling' CONFIG_IP_MASQUERADE_VS_LBLCR | |
785 | + fi | |
786 | fi | |
787 | fi | |
788 | fi | |
789 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/Makefile linux-2.2.19-vs-1.0.8/net/ipv4/Makefile | |
790 | --- linux-2.2.19/net/ipv4/Makefile Tue Jan 5 07:31:34 1999 | |
791 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/Makefile Sat Dec 2 22:32:10 2000 | |
792 | @@ -91,6 +91,58 @@ | |
793 | ||
794 | endif | |
795 | ||
796 | +ifeq ($(CONFIG_IP_MASQUERADE_VS),y) | |
797 | + IPV4X_OBJS += ip_vs.o | |
798 | + | |
799 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_RR),y) | |
800 | + IPV4_OBJS += ip_vs_rr.o | |
801 | + else | |
802 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_RR),m) | |
803 | + M_OBJS += ip_vs_rr.o | |
804 | + endif | |
805 | + endif | |
806 | + | |
807 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_WRR),y) | |
808 | + IPV4_OBJS += ip_vs_wrr.o | |
809 | + else | |
810 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_WRR),m) | |
811 | + M_OBJS += ip_vs_wrr.o | |
812 | + endif | |
813 | + endif | |
814 | + | |
815 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_LC),y) | |
816 | + IPV4_OBJS += ip_vs_lc.o | |
817 | + else | |
818 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_LC),m) | |
819 | + M_OBJS += ip_vs_lc.o | |
820 | + endif | |
821 | + endif | |
822 | + | |
823 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_WLC),y) | |
824 | + IPV4_OBJS += ip_vs_wlc.o | |
825 | + else | |
826 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_WLC),m) | |
827 | + M_OBJS += ip_vs_wlc.o | |
828 | + endif | |
829 | + endif | |
830 | + | |
831 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLC),y) | |
832 | + IPV4_OBJS += ip_vs_lblc.o | |
833 | + else | |
834 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLC),m) | |
835 | + M_OBJS += ip_vs_lblc.o | |
836 | + endif | |
837 | + endif | |
838 | + | |
839 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLCR),y) | |
840 | + IPV4_OBJS += ip_vs_lblcr.o | |
841 | + else | |
842 | + ifeq ($(CONFIG_IP_MASQUERADE_VS_LBLCR),m) | |
843 | + M_OBJS += ip_vs_lblcr.o | |
844 | + endif | |
845 | + endif | |
846 | +endif | |
847 | + | |
848 | M_OBJS += ip_masq_user.o | |
849 | M_OBJS += ip_masq_ftp.o ip_masq_irc.o ip_masq_raudio.o ip_masq_quake.o | |
850 | M_OBJS += ip_masq_vdolive.o ip_masq_cuseeme.o | |
851 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_forward.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_forward.c | |
852 | --- linux-2.2.19/net/ipv4/ip_forward.c Fri Jan 7 09:45:02 2000 | |
853 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_forward.c Fri Feb 2 15:38:28 2001 | |
854 | @@ -41,6 +41,9 @@ | |
855 | #include <linux/ip_fw.h> | |
856 | #ifdef CONFIG_IP_MASQUERADE | |
857 | #include <net/ip_masq.h> | |
858 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
859 | +#include <net/ip_vs.h> | |
860 | +#endif | |
861 | #endif | |
862 | #include <net/checksum.h> | |
863 | #include <linux/route.h> | |
864 | @@ -103,6 +106,14 @@ | |
865 | } | |
866 | #endif | |
867 | ||
868 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
869 | + if (iph->protocol == IPPROTO_ICMP && | |
870 | + !(IPCB(skb)->flags&IPSKB_MASQUERADED)) { | |
871 | + /* Related ICMP packet for IPVS ? */ | |
872 | + fw_res = ip_vs_forwarding_related_icmp(skb); | |
873 | + if (fw_res > 0) return ip_local_deliver(skb); | |
874 | + } | |
875 | +#endif | |
876 | ||
877 | #ifdef CONFIG_IP_TRANSPARENT_PROXY | |
878 | if (ip_chksock(skb)) | |
879 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_input.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_input.c | |
880 | --- linux-2.2.19/net/ipv4/ip_input.c Tue Mar 27 09:33:49 2001 | |
881 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_input.c Tue Mar 27 09:32:21 2001 | |
882 | @@ -250,6 +250,15 @@ | |
883 | */ | |
884 | { | |
885 | int ret; | |
886 | + | |
887 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
888 | + if((IPCB(skb)->flags&IPSKB_REDIRECTED)) { | |
889 | + printk(KERN_DEBUG "ip_input(): ipvs recursion detected. Check ipvs configuration\n"); | |
890 | + kfree_skb(skb); | |
891 | + return 0; | |
892 | + } | |
893 | +#endif | |
894 | + | |
895 | /* | |
896 | * Some masq modules can re-inject packets if | |
897 | * bad configured. | |
898 | @@ -262,6 +271,12 @@ | |
899 | } | |
900 | ||
901 | ret = ip_fw_demasquerade(&skb); | |
902 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
903 | + if (ret == -2) { | |
904 | + /* skb has already been released */ | |
905 | + return 0; | |
906 | + } | |
907 | +#endif | |
908 | if (ret < 0) { | |
909 | kfree_skb(skb); | |
910 | return 0; | |
911 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_masq.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_masq.c | |
912 | --- linux-2.2.19/net/ipv4/ip_masq.c Tue Mar 27 09:33:49 2001 | |
913 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_masq.c Wed Apr 18 19:58:48 2001 | |
914 | @@ -50,7 +50,12 @@ | |
915 | * Kai Bankett : do not toss other IP protos in proto_doff() | |
916 | * Dan Kegel : pointed correct NAT behavior for UDP streams | |
917 | * Julian Anastasov : use daddr and dport as hash keys | |
918 | - * | |
919 | + * Wensong Zhang : Added virtual server support | |
920 | + * Peter Kese : added masq TCP state handling for input-only | |
921 | + * Julian Anastasov : step to mSR after SYN in INPUT_ONLY table | |
922 | + * Julian Anastasov : fixed huge expire bug for IPVS after bad checksum | |
923 | + * Wensong Zhang : added server status checking for IPVS | |
924 | + * | |
925 | */ | |
926 | ||
927 | #include <linux/config.h> | |
928 | @@ -85,6 +90,10 @@ | |
929 | #include <linux/ip_fw.h> | |
930 | #include <linux/ip_masq.h> | |
931 | ||
932 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
933 | +#include <net/ip_vs.h> | |
934 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
935 | + | |
936 | int sysctl_ip_masq_debug = 0; | |
937 | int sysctl_ip_masq_udp_dloose = 0; | |
938 | ||
939 | @@ -98,6 +107,21 @@ | |
940 | ||
941 | struct ip_masq_hook *ip_masq_user_hook = NULL; | |
942 | ||
943 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
944 | +/* | |
945 | + * Use different state/timeout tables | |
946 | + */ | |
947 | +#ifndef IP_MASQ_MANY_STATE_TABLES | |
948 | +#define IP_MASQ_MANY_STATE_TABLES | |
949 | +#endif | |
950 | + | |
951 | +int ip_masq_drop_rate = 0; | |
952 | +int ip_masq_drop_counter = 0; | |
953 | + | |
954 | +#endif | |
955 | + | |
956 | +#ifndef CONFIG_IP_MASQUERADE_VS | |
957 | + | |
958 | /* | |
959 | * Timeout table[state] | |
960 | */ | |
961 | @@ -106,38 +130,104 @@ | |
962 | ATOMIC_INIT(0), /* refcnt */ | |
963 | 0, /* scale */ | |
964 | { | |
965 | - 30*60*HZ, /* IP_MASQ_S_NONE, */ | |
966 | - 15*60*HZ, /* IP_MASQ_S_ESTABLISHED, */ | |
967 | - 2*60*HZ, /* IP_MASQ_S_SYN_SENT, */ | |
968 | - 1*60*HZ, /* IP_MASQ_S_SYN_RECV, */ | |
969 | - 2*60*HZ, /* IP_MASQ_S_FIN_WAIT, */ | |
970 | - 2*60*HZ, /* IP_MASQ_S_TIME_WAIT, */ | |
971 | - 10*HZ, /* IP_MASQ_S_CLOSE, */ | |
972 | - 60*HZ, /* IP_MASQ_S_CLOSE_WAIT, */ | |
973 | - 30*HZ, /* IP_MASQ_S_LAST_ACK, */ | |
974 | - 2*60*HZ, /* IP_MASQ_S_LISTEN, */ | |
975 | - 5*60*HZ, /* IP_MASQ_S_UDP, */ | |
976 | - 1*60*HZ, /* IP_MASQ_S_ICMP, */ | |
977 | - 2*HZ,/* IP_MASQ_S_LAST */ | |
978 | + [IP_MASQ_S_NONE] = 30*60*HZ, | |
979 | + [IP_MASQ_S_ESTABLISHED] = 15*60*HZ, | |
980 | + [IP_MASQ_S_SYN_SENT] = 2*60*HZ, | |
981 | + [IP_MASQ_S_SYN_RECV] = 1*60*HZ, | |
982 | + [IP_MASQ_S_FIN_WAIT] = 2*60*HZ, | |
983 | + [IP_MASQ_S_TIME_WAIT] = 2*60*HZ, | |
984 | + [IP_MASQ_S_CLOSE] = 10*HZ, | |
985 | + [IP_MASQ_S_CLOSE_WAIT] = 60*HZ, | |
986 | + [IP_MASQ_S_LAST_ACK] = 30*HZ, | |
987 | + [IP_MASQ_S_LISTEN] = 2*60*HZ, | |
988 | + [IP_MASQ_S_UDP] = 5*60*HZ, | |
989 | + [IP_MASQ_S_ICMP] = 1*60*HZ, | |
990 | + [IP_MASQ_S_LAST] = 2*HZ, | |
991 | }, /* timeout */ | |
992 | }; | |
993 | ||
994 | +#else /* CONFIG_IP_MASQUERADE_VS */ | |
995 | + | |
996 | +/* | |
997 | + * Timeout table[state] for CONFIG_IP_MASQUERADE_VS builds, in jiffies | |
998 | + */ | |
999 | +/* Same entries as the non-VS table, plus a slot for IP_MASQ_S_SYNACK. */ | |
1000 | +static struct ip_masq_timeout_table masq_timeout_table = { | |
1001 | + ATOMIC_INIT(0), /* refcnt */ | |
1002 | + 0, /* scale */ | |
1003 | + { | |
1004 | + [IP_MASQ_S_NONE] = 30*60*HZ, | |
1005 | + [IP_MASQ_S_ESTABLISHED] = 15*60*HZ, | |
1006 | + [IP_MASQ_S_SYN_SENT] = 2*60*HZ, | |
1007 | + [IP_MASQ_S_SYN_RECV] = 1*60*HZ, | |
1008 | + [IP_MASQ_S_FIN_WAIT] = 2*60*HZ, | |
1009 | + [IP_MASQ_S_TIME_WAIT] = 2*60*HZ, | |
1010 | + [IP_MASQ_S_CLOSE] = 10*HZ, | |
1011 | + [IP_MASQ_S_CLOSE_WAIT] = 60*HZ, | |
1012 | + [IP_MASQ_S_LAST_ACK] = 30*HZ, | |
1013 | + [IP_MASQ_S_LISTEN] = 2*60*HZ, | |
1014 | + [IP_MASQ_S_SYNACK] = 120*HZ, | |
1015 | + [IP_MASQ_S_UDP] = 5*60*HZ, | |
1016 | + [IP_MASQ_S_ICMP] = 1*60*HZ, | |
1017 | + [IP_MASQ_S_LAST] = 2*HZ, | |
1018 | + }, /* timeout */ | |
1019 | +}; | |
1020 | + | |
1021 | +/* Reduced timeouts, selected by ip_masq_secure_tcp_set() when "on" is non-zero */ | |
1022 | +struct ip_masq_timeout_table masq_timeout_table_dos = { | |
1023 | + ATOMIC_INIT(0), /* refcnt */ | |
1024 | + 0, /* scale */ | |
1025 | + { | |
1026 | + [IP_MASQ_S_NONE] = 15*60*HZ, | |
1027 | + [IP_MASQ_S_ESTABLISHED] = 8*60*HZ, | |
1028 | + [IP_MASQ_S_SYN_SENT] = 60*HZ, | |
1029 | + [IP_MASQ_S_SYN_RECV] = 10*HZ, | |
1030 | + [IP_MASQ_S_FIN_WAIT] = 60*HZ, | |
1031 | + [IP_MASQ_S_TIME_WAIT] = 60*HZ, | |
1032 | + [IP_MASQ_S_CLOSE] = 10*HZ, | |
1033 | + [IP_MASQ_S_CLOSE_WAIT] = 60*HZ, | |
1034 | + [IP_MASQ_S_LAST_ACK] = 30*HZ, | |
1035 | + [IP_MASQ_S_LISTEN] = 2*60*HZ, | |
1036 | + [IP_MASQ_S_SYNACK] = 100*HZ, | |
1037 | + [IP_MASQ_S_UDP] = 3*60*HZ, | |
1038 | + [IP_MASQ_S_ICMP] = 1*60*HZ, | |
1039 | + [IP_MASQ_S_LAST] = 2*HZ, | |
1040 | + }, /* timeout */ | |
1041 | +}; | |
1042 | + | |
1043 | +/* | |
1044 | + * Timeout table to use for the VS entries | |
1045 | + * If NULL we use the default table (masq_timeout_table). | |
1046 | + * Under flood attack we switch to masq_timeout_table_dos | |
1047 | + */ | |
1048 | + | |
1049 | +struct ip_masq_timeout_table *ip_vs_timeout_table = &masq_timeout_table; | |
1050 | + | |
1051 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1052 | + | |
1053 | +#ifdef CONFIG_IP_MASQUERADE_VS /* retry timeout: the entry's own table if attached, else the default table */ | |
1054 | +#define MASQUERADE_EXPIRE_RETRY(ms) ((ms)->timeout_table ? (ms)->timeout_table->timeout[IP_MASQ_S_TIME_WAIT] : masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT]) | |
1055 | +#else | |
1056 | #define MASQUERADE_EXPIRE_RETRY masq_timeout_table.timeout[IP_MASQ_S_TIME_WAIT] | |
1057 | +#endif | |
1058 | ||
1059 | static const char * state_name_table[IP_MASQ_S_LAST+1] = { | |
1060 | - "NONE", /* IP_MASQ_S_NONE, */ | |
1061 | - "ESTABLISHED", /* IP_MASQ_S_ESTABLISHED, */ | |
1062 | - "SYN_SENT", /* IP_MASQ_S_SYN_SENT, */ | |
1063 | - "SYN_RECV", /* IP_MASQ_S_SYN_RECV, */ | |
1064 | - "FIN_WAIT", /* IP_MASQ_S_FIN_WAIT, */ | |
1065 | - "TIME_WAIT", /* IP_MASQ_S_TIME_WAIT, */ | |
1066 | - "CLOSE", /* IP_MASQ_S_CLOSE, */ | |
1067 | - "CLOSE_WAIT", /* IP_MASQ_S_CLOSE_WAIT, */ | |
1068 | - "LAST_ACK", /* IP_MASQ_S_LAST_ACK, */ | |
1069 | - "LISTEN", /* IP_MASQ_S_LISTEN, */ | |
1070 | - "UDP", /* IP_MASQ_S_UDP, */ | |
1071 | - "ICMP", /* IP_MASQ_S_ICMP, */ | |
1072 | - "BUG!", /* IP_MASQ_S_LAST */ | |
1073 | + [IP_MASQ_S_NONE] = "NONE", | |
1074 | + [IP_MASQ_S_ESTABLISHED] = "ESTABLISHED", | |
1075 | + [IP_MASQ_S_SYN_SENT] = "SYN_SENT", | |
1076 | + [IP_MASQ_S_SYN_RECV] = "SYN_RECV", | |
1077 | + [IP_MASQ_S_FIN_WAIT] = "FIN_WAIT", | |
1078 | + [IP_MASQ_S_TIME_WAIT] = "TIME_WAIT", | |
1079 | + [IP_MASQ_S_CLOSE] = "CLOSE", | |
1080 | + [IP_MASQ_S_CLOSE_WAIT] = "CLOSE_WAIT", | |
1081 | + [IP_MASQ_S_LAST_ACK] = "LAST_ACK", | |
1082 | + [IP_MASQ_S_LISTEN] = "LISTEN", | |
1083 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1084 | + [IP_MASQ_S_SYNACK] = "SYNACK", | |
1085 | +#endif | |
1086 | + [IP_MASQ_S_UDP] = "UDP", | |
1087 | + [IP_MASQ_S_ICMP] = "ICMP", | |
1088 | + [IP_MASQ_S_LAST] = "BUG!", | |
1089 | }; | |
1090 | ||
1091 | #define mNO IP_MASQ_S_NONE | |
1092 | @@ -150,6 +240,9 @@ | |
1093 | #define mCW IP_MASQ_S_CLOSE_WAIT | |
1094 | #define mLA IP_MASQ_S_LAST_ACK | |
1095 | #define mLI IP_MASQ_S_LISTEN | |
1096 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1097 | +#define mSA IP_MASQ_S_SYNACK | |
1098 | +#endif | |
1099 | ||
1100 | struct masq_tcp_states_t { | |
1101 | int next_state[IP_MASQ_S_LAST]; /* should be _LAST_TCP */ | |
1102 | @@ -159,46 +252,111 @@ | |
1103 | { | |
1104 | if (state >= IP_MASQ_S_LAST) | |
1105 | return "ERR!"; | |
1106 | - return state_name_table[state]; | |
1107 | + return state_name_table[state] ? state_name_table[state] : "?"; | |
1108 | } | |
1109 | ||
1110 | +#ifndef CONFIG_IP_MASQUERADE_VS | |
1111 | + | |
1112 | struct masq_tcp_states_t masq_tcp_states [] = { | |
1113 | /* INPUT */ | |
1114 | /* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */ | |
1115 | /*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR }}, | |
1116 | /*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI }}, | |
1117 | -/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI }}, | |
1118 | +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI }}, | |
1119 | /*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI }}, | |
1120 | ||
1121 | /* OUTPUT */ | |
1122 | /* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI */ | |
1123 | -/*syn*/ {{mSS, mES, mSS, mES, mSS, mSS, mSS, mSS, mSS, mLI }}, | |
1124 | +/*syn*/ {{mSS, mES, mSS, mSR, mSS, mSS, mSS, mSS, mSS, mLI }}, | |
1125 | /*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI }}, | |
1126 | -/*ack*/ {{mES, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mES }}, | |
1127 | +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES }}, | |
1128 | /*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL }}, | |
1129 | }; | |
1130 | ||
1131 | -static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int output) | |
1132 | +#else /* CONFIG_IP_MASQUERADE_VS */ | |
1133 | + | |
1134 | +struct masq_tcp_states_t masq_tcp_states [] = { | |
1135 | +/* INPUT */ | |
1136 | +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ | |
1137 | +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSR }}, | |
1138 | +/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI, mTW }}, | |
1139 | +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }}, | |
1140 | +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mSR }}, | |
1141 | + | |
1142 | +/* OUTPUT */ | |
1143 | +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ | |
1144 | +/*syn*/ {{mSS, mES, mSS, mSR, mSS, mSS, mSS, mSS, mSS, mLI, mSR }}, | |
1145 | +/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI, mTW }}, | |
1146 | +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES, mES }}, | |
1147 | +/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL, mCL }}, | |
1148 | + | |
1149 | +/* INPUT-ONLY */ | |
1150 | +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ | |
1151 | +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSR }}, | |
1152 | +/*fin*/ {{mCL, mFW, mSS, mTW, mFW, mTW, mCL, mCW, mLA, mLI, mTW }}, | |
1153 | +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }}, | |
1154 | +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }}, | |
1155 | +}; | |
1156 | + | |
1157 | +struct masq_tcp_states_t masq_tcp_states_dos [] = { | |
1158 | +/* INPUT */ | |
1159 | +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ | |
1160 | +/*syn*/ {{mSR, mES, mES, mSR, mSR, mSR, mSR, mSR, mSR, mSR, mSA }}, | |
1161 | +/*fin*/ {{mCL, mCW, mSS, mTW, mTW, mTW, mCL, mCW, mLA, mLI, mSA }}, | |
1162 | +/*ack*/ {{mCL, mES, mSS, mSR, mFW, mTW, mCL, mCW, mCL, mLI, mSA }}, | |
1163 | +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }}, | |
1164 | + | |
1165 | +/* OUTPUT */ | |
1166 | +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ | |
1167 | +/*syn*/ {{mSS, mES, mSS, mSA, mSS, mSS, mSS, mSS, mSS, mLI, mSA }}, | |
1168 | +/*fin*/ {{mTW, mFW, mSS, mTW, mFW, mTW, mCL, mTW, mLA, mLI, mTW }}, | |
1169 | +/*ack*/ {{mES, mES, mSS, mES, mFW, mTW, mCL, mCW, mLA, mES, mES }}, | |
1170 | +/*rst*/ {{mCL, mCL, mSS, mCL, mCL, mTW, mCL, mCL, mCL, mCL, mCL }}, | |
1171 | + | |
1172 | +/* INPUT-ONLY */ | |
1173 | +/* mNO, mES, mSS, mSR, mFW, mTW, mCL, mCW, mLA, mLI, mSA */ | |
1174 | +/*syn*/ {{mSA, mES, mES, mSR, mSA, mSA, mSA, mSA, mSA, mSA, mSA }}, | |
1175 | +/*fin*/ {{mCL, mFW, mSS, mTW, mFW, mTW, mCL, mCW, mLA, mLI, mTW }}, | |
1176 | +/*ack*/ {{mCL, mES, mSS, mES, mFW, mTW, mCL, mCW, mCL, mLI, mES }}, | |
1177 | +/*rst*/ {{mCL, mCL, mCL, mSR, mCL, mCL, mCL, mCL, mLA, mLI, mCL }}, | |
1178 | +}; | |
1179 | + | |
1180 | +struct masq_tcp_states_t *ip_vs_state_table = masq_tcp_states; | |
1181 | + | |
1182 | +void ip_masq_secure_tcp_set(int on) | |
1183 | +{ | |
1184 | + /* | |
1185 | + * Non-zero "on" selects the DoS state and timeout tables for | |
1186 | + * IPVS; zero switches back to the default ones. | |
1187 | + */ | |
1188 | + ip_vs_state_table = on ? masq_tcp_states_dos : masq_tcp_states; | |
1189 | + ip_vs_timeout_table = on ? &masq_timeout_table_dos | |
1190 | + : &masq_timeout_table; | |
1191 | +} | |
1192 | + | |
1193 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1194 | + | |
1195 | +#define MASQ_STATE_INPUT 0 | |
1196 | +#define MASQ_STATE_OUTPUT 4 | |
1197 | +#define MASQ_STATE_INPUT_ONLY 8 | |
1198 | + | |
1199 | +static __inline__ int masq_tcp_state_idx(struct tcphdr *th, int state_off) | |
1200 | { | |
1201 | /* | |
1202 | - * [0-3]: input states, [4-7]: output. | |
1203 | + * [0-3]: input states, [4-7]: output, [8-11] input only states. | |
1204 | */ | |
1205 | - if (output) | |
1206 | - output=4; | |
1207 | - | |
1208 | if (th->rst) | |
1209 | - return output+3; | |
1210 | + return state_off+3; | |
1211 | if (th->syn) | |
1212 | - return output+0; | |
1213 | + return state_off+0; | |
1214 | if (th->fin) | |
1215 | - return output+1; | |
1216 | + return state_off+1; | |
1217 | if (th->ack) | |
1218 | - return output+2; | |
1219 | + return state_off+2; | |
1220 | return -1; | |
1221 | } | |
1222 | ||
1223 | ||
1224 | - | |
1225 | static int masq_set_state_timeout(struct ip_masq *ms, int state) | |
1226 | { | |
1227 | struct ip_masq_timeout_table *mstim = ms->timeout_table; | |
1228 | @@ -221,18 +379,34 @@ | |
1229 | return state; | |
1230 | } | |
1231 | ||
1232 | -static int masq_tcp_state(struct ip_masq *ms, int output, struct tcphdr *th) | |
1233 | +static int masq_tcp_state(struct ip_masq *ms, int state_off, struct tcphdr *th) | |
1234 | { | |
1235 | int state_idx; | |
1236 | int new_state = IP_MASQ_S_CLOSE; | |
1237 | ||
1238 | - if ((state_idx = masq_tcp_state_idx(th, output)) < 0) { | |
1239 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1240 | + /* | |
1241 | + * Update state offset to INPUT_ONLY if necessary | |
1242 | + * or delete NO_OUTPUT flag if output packet detected | |
1243 | + */ | |
1244 | + if (ms->flags & IP_MASQ_F_VS_NO_OUTPUT) { | |
1245 | + if (state_off == MASQ_STATE_OUTPUT) | |
1246 | + ms->flags &= ~IP_MASQ_F_VS_NO_OUTPUT; | |
1247 | + else state_off = MASQ_STATE_INPUT_ONLY; | |
1248 | + } | |
1249 | +#endif | |
1250 | + | |
1251 | + if ((state_idx = masq_tcp_state_idx(th, state_off)) < 0) { | |
1252 | IP_MASQ_DEBUG(1, "masq_state_idx(%d)=%d!!!\n", | |
1253 | - output, state_idx); | |
1254 | + state_off, state_idx); | |
1255 | goto tcp_state_out; | |
1256 | } | |
1257 | ||
1258 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1259 | + new_state = ip_vs_state_table[state_idx].next_state[ms->state]; | |
1260 | +#else | |
1261 | new_state = masq_tcp_states[state_idx].next_state[ms->state]; | |
1262 | +#endif | |
1263 | ||
1264 | tcp_state_out: | |
1265 | if (new_state!=ms->state) | |
1266 | @@ -247,6 +421,15 @@ | |
1267 | ntohl(ms->daddr), ntohs(ms->dport), | |
1268 | ip_masq_state_name(ms->state), | |
1269 | ip_masq_state_name(new_state)); | |
1270 | + | |
1271 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1272 | + /* | |
1273 | + * Increase/Decrease the active connection counter and | |
1274 | + * set ms->flags according to ms->state and new_state. | |
1275 | + */ | |
1276 | + ip_vs_set_state(ms, new_state); | |
1277 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1278 | + | |
1279 | return masq_set_state_timeout(ms, new_state); | |
1280 | } | |
1281 | ||
1282 | @@ -254,7 +437,7 @@ | |
1283 | /* | |
1284 | * Handle state transitions | |
1285 | */ | |
1286 | -static int masq_set_state(struct ip_masq *ms, int output, struct iphdr *iph, void *tp) | |
1287 | +static int masq_set_state(struct ip_masq *ms, int state_off, struct iphdr *iph, void *tp) | |
1288 | { | |
1289 | switch (iph->protocol) { | |
1290 | case IPPROTO_ICMP: | |
1291 | @@ -262,7 +445,7 @@ | |
1292 | case IPPROTO_UDP: | |
1293 | return masq_set_state_timeout(ms, IP_MASQ_S_UDP); | |
1294 | case IPPROTO_TCP: | |
1295 | - return masq_tcp_state(ms, output, tp); | |
1296 | + return masq_tcp_state(ms, state_off, tp); | |
1297 | } | |
1298 | return -1; | |
1299 | } | |
1300 | @@ -361,6 +544,9 @@ | |
1301 | ||
1302 | EXPORT_SYMBOL(ip_masq_get_debug_level); | |
1303 | EXPORT_SYMBOL(ip_masq_new); | |
1304 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1305 | +EXPORT_SYMBOL(ip_masq_new_vs); | |
1306 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1307 | EXPORT_SYMBOL(ip_masq_listen); | |
1308 | EXPORT_SYMBOL(ip_masq_free_ports); | |
1309 | EXPORT_SYMBOL(ip_masq_out_get); | |
1310 | @@ -423,9 +609,17 @@ | |
1311 | { | |
1312 | if (tout) { | |
1313 | ms->timer.expires = jiffies+tout; | |
1314 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1315 | + add_sltimer(&ms->timer); | |
1316 | +#else | |
1317 | add_timer(&ms->timer); | |
1318 | +#endif | |
1319 | } else { | |
1320 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1321 | + del_sltimer(&ms->timer); | |
1322 | +#else | |
1323 | del_timer(&ms->timer); | |
1324 | +#endif | |
1325 | } | |
1326 | } | |
1327 | ||
1328 | @@ -741,6 +935,10 @@ | |
1329 | struct ip_masq *ms; | |
1330 | ||
1331 | read_lock(&__ip_masq_lock); | |
1332 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1333 | + ms = __ip_vs_out_get(protocol, s_addr, s_port, d_addr, d_port); | |
1334 | + if (ms == NULL) | |
1335 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1336 | ms = __ip_masq_out_get(protocol, s_addr, s_port, d_addr, d_port); | |
1337 | read_unlock(&__ip_masq_lock); | |
1338 | ||
1339 | @@ -754,7 +952,11 @@ | |
1340 | struct ip_masq *ms; | |
1341 | ||
1342 | read_lock(&__ip_masq_lock); | |
1343 | - ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port); | |
1344 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1345 | + ms = __ip_vs_in_get(protocol, s_addr, s_port, d_addr, d_port); | |
1346 | + if (ms == NULL) | |
1347 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1348 | + ms = __ip_masq_in_get(protocol, s_addr, s_port, d_addr, d_port); | |
1349 | read_unlock(&__ip_masq_lock); | |
1350 | ||
1351 | if (ms) | |
1352 | @@ -791,7 +993,11 @@ | |
1353 | static void masq_expire(unsigned long data) | |
1354 | { | |
1355 | struct ip_masq *ms = (struct ip_masq *)data; | |
1356 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1357 | + ms->timeout = MASQUERADE_EXPIRE_RETRY(ms); | |
1358 | +#else | |
1359 | ms->timeout = MASQUERADE_EXPIRE_RETRY; | |
1360 | +#endif | |
1361 | ||
1362 | /* | |
1363 | * hey, I'm using it | |
1364 | @@ -826,6 +1032,15 @@ | |
1365 | if (ms->control) | |
1366 | ip_masq_control_del(ms); | |
1367 | ||
1368 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1369 | + if (ms->flags & IP_MASQ_F_VS) { | |
1370 | + if (ip_vs_unhash(ms)) { | |
1371 | + ip_vs_unbind_masq(ms); | |
1372 | + ip_masq_unbind_app(ms); | |
1373 | + } | |
1374 | + } | |
1375 | + else | |
1376 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1377 | if (ip_masq_unhash(ms)) { | |
1378 | if (ms->flags&IP_MASQ_F_MPORT) { | |
1379 | atomic_dec(&mport_count); | |
1380 | @@ -839,6 +1054,9 @@ | |
1381 | * refcnt==1 implies I'm the only one referrer | |
1382 | */ | |
1383 | if (atomic_read(&ms->refcnt) == 1) { | |
1384 | +#ifdef IP_MASQ_MANY_STATE_TABLES | |
1385 | + ip_masq_timeout_detach(ms); | |
1386 | +#endif | |
1387 | kfree_s(ms,sizeof(*ms)); | |
1388 | sysctl_ip_always_defrag--; | |
1389 | MOD_DEC_USE_COUNT; | |
1390 | @@ -1077,6 +1295,83 @@ | |
1391 | return NULL; | |
1392 | } | |
1393 | ||
1394 | + | |
1395 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1396 | +/* | |
1397 | + * Create a new masquerade entry for IPVS, all parameters {maddr, | |
1398 | + * mport, saddr, sport, daddr, dport, mflags} are known. No need | |
1399 | + * to allocate a free mport. The entry is hashed with ip_vs_hash() | |
1400 | + * and returned after its refcnt is incremented; NULL if kmalloc fails. | |
1401 | + * Be careful, it can be called from user space (IP_MASQ_F_USER) | |
1402 | + */ | |
1403 | + | |
1404 | +struct ip_masq * ip_masq_new_vs(int proto, __u32 maddr, __u16 mport, __u32 saddr, __u16 sport, __u32 daddr, __u16 dport, unsigned mflags) | |
1405 | +{ | |
1406 | + struct ip_masq *ms; | |
1407 | + static int n_fails = 0; | |
1408 | + int prio; | |
1409 | + | |
1410 | + prio = (mflags&IP_MASQ_F_USER) ? GFP_KERNEL : GFP_ATOMIC; | |
1411 | + | |
1412 | + ms = (struct ip_masq *) kmalloc(sizeof(struct ip_masq), prio); | |
1413 | + if (ms == NULL) { | |
1414 | + if (++n_fails < 5) | |
1415 | + IP_VS_ERR("ip_masq_new_vs(proto=%s): no memory available.\n", | |
1416 | + masq_proto_name(proto)); | |
1417 | + return NULL; | |
1418 | + } | |
1419 | + MOD_INC_USE_COUNT; | |
1420 | + | |
1421 | +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,2,14) | |
1422 | + sysctl_ip_always_defrag++; | |
1423 | +#endif | |
1424 | + memset(ms, 0, sizeof(*ms)); | |
1425 | + INIT_LIST_HEAD(&ms->s_list); | |
1426 | + INIT_LIST_HEAD(&ms->m_list); | |
1427 | + INIT_LIST_HEAD(&ms->d_list); | |
1428 | + init_timer(&ms->timer); | |
1429 | + ms->timer.data = (unsigned long)ms; | |
1430 | + ms->timer.function = masq_expire; | |
1431 | + ip_masq_timeout_attach(ms,ip_vs_timeout_table); | |
1432 | + ms->protocol = proto; | |
1433 | + ms->saddr = saddr; | |
1434 | + ms->sport = sport; | |
1435 | + ms->daddr = daddr; | |
1436 | + ms->dport = dport; | |
1437 | + ms->maddr = maddr; | |
1438 | + ms->mport = mport; | |
1439 | + ms->flags = mflags; | |
1440 | + ms->app_data = NULL; | |
1441 | + ms->control = NULL; | |
1442 | + | |
1443 | + atomic_set(&ms->n_control,0); | |
1444 | + atomic_set(&ms->refcnt,0); | |
1445 | + atomic_set(&ms->in_pkts,0); | |
1446 | + | |
1447 | + if (mflags & IP_MASQ_F_USER) | |
1448 | + write_lock_bh(&__ip_masq_lock); | |
1449 | + else | |
1450 | + write_lock(&__ip_masq_lock); | |
1451 | + | |
1452 | + /* | |
1453 | + * Hash it in the ip_vs_table | |
1454 | + */ | |
1455 | + ip_vs_hash(ms); | |
1456 | + | |
1457 | + if (mflags & IP_MASQ_F_USER) | |
1458 | + write_unlock_bh(&__ip_masq_lock); | |
1459 | + else | |
1460 | + write_unlock(&__ip_masq_lock); | |
1461 | + | |
1462 | + ip_masq_bind_app(ms); | |
1463 | + n_fails = 0; | |
1464 | + atomic_inc(&ms->refcnt); | |
1465 | + masq_set_state_timeout(ms, IP_MASQ_S_NONE); | |
1466 | + return ms; | |
1467 | +} | |
1468 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1469 | + | |
1470 | + | |
1471 | /* | |
1472 | * Get transport protocol data offset, check against size | |
1473 | * return: | |
1474 | @@ -1153,25 +1448,20 @@ | |
1475 | return -1; | |
1476 | } | |
1477 | ||
1478 | +#ifndef CONFIG_IP_MASQUERADE_VS | |
1479 | /* Lets determine our maddr now, shall we? */ | |
1480 | - if (maddr == 0) { | |
1481 | - struct rtable *rt; | |
1482 | - struct rtable *skb_rt = (struct rtable*)skb->dst; | |
1483 | - struct device *skb_dev = skb_rt->u.dst.dev; | |
1484 | - | |
1485 | - if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos)|RTO_CONN, skb_dev?skb_dev->ifindex:0)) { | |
1486 | - /* Fallback on old method */ | |
1487 | - /* This really shouldn't happen... */ | |
1488 | - maddr = inet_select_addr(skb_dev, skb_rt->rt_gateway, RT_SCOPE_UNIVERSE); | |
1489 | - } else { | |
1490 | - /* Route lookup succeeded */ | |
1491 | - maddr = rt->rt_src; | |
1492 | - ip_rt_put(rt); | |
1493 | - } | |
1494 | + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) { | |
1495 | + return -1; | |
1496 | } | |
1497 | +#endif | |
1498 | ||
1499 | switch (iph->protocol) { | |
1500 | case IPPROTO_ICMP: | |
1501 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1502 | + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) { | |
1503 | + return -1; | |
1504 | + } | |
1505 | +#endif | |
1506 | return(ip_fw_masq_icmp(skb_p, maddr)); | |
1507 | case IPPROTO_UDP: | |
1508 | if (h.uh->check == 0) | |
1509 | @@ -1230,6 +1520,17 @@ | |
1510 | ||
1511 | ms = ip_masq_out_get_iph(iph); | |
1512 | if (ms!=NULL) { | |
1513 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1514 | + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) { | |
1515 | + /* | |
1516 | + * Drop this packet but don't | |
1517 | + * start the timer from the beginning | |
1518 | + */ | |
1519 | + __ip_masq_put(ms); | |
1520 | + add_sltimer(&ms->timer); | |
1521 | + return -1; | |
1522 | + } | |
1523 | +#endif | |
1524 | ||
1525 | /* | |
1526 | * If sysctl !=0 and no pkt has been received yet | |
1527 | @@ -1280,6 +1581,33 @@ | |
1528 | ms->daddr = iph->daddr; | |
1529 | } | |
1530 | } else { | |
1531 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1532 | + struct ip_vs_dest *dest; | |
1533 | + | |
1534 | + /* | |
1535 | + * Check if the packet is from our real service | |
1536 | + */ | |
1537 | + read_lock(&__ip_vs_lock); | |
1538 | + dest = __ip_vs_lookup_real_service(iph->protocol, | |
1539 | + iph->saddr, h.portp[0]); | |
1540 | + read_unlock(&__ip_vs_lock); | |
1541 | + if (dest) { | |
1542 | + /* | |
1543 | + * No existing entry: notify the real server unless | |
1544 | + * this is a TCP RST. Test the protocol first so that | |
1545 | + * h.th->rst is only read from a real TCP header. | |
1546 | + */ | |
1547 | + if (iph->protocol != IPPROTO_TCP || !h.th->rst) | |
1548 | + icmp_send(skb, ICMP_DEST_UNREACH, | |
1549 | + ICMP_PORT_UNREACH, 0); | |
1550 | + return -1; | |
1551 | + } | |
1552 | + | |
1553 | + if (!maddr && (ip_masq_select_addr(skb,&maddr) < 0)) { | |
1554 | + return -1; | |
1555 | + } | |
1556 | +#endif | |
1557 | + | |
1558 | /* | |
1559 | * Nope, not found, create a new entry for it | |
1560 | */ | |
1561 | @@ -1392,11 +1720,17 @@ | |
1562 | IP_MASQ_DEBUG(2, "O-routed from %08X:%04X with masq.addr %08X\n", | |
1563 | ntohl(ms->maddr),ntohs(ms->mport),ntohl(maddr)); | |
1564 | ||
1565 | - masq_set_state(ms, 1, iph, h.portp); | |
1566 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1567 | + /* do the IPVS statistics */ | |
1568 | + if (ms->flags & IP_MASQ_F_VS) | |
1569 | + ip_vs_out_stats(ms, skb); | |
1570 | +#endif | |
1571 | + | |
1572 | + masq_set_state(ms, MASQ_STATE_OUTPUT, iph, h.portp); | |
1573 | ip_masq_put(ms); | |
1574 | ||
1575 | return 0; | |
1576 | - } | |
1577 | +} | |
1578 | ||
1579 | /* | |
1580 | * Restore original addresses and ports in the original IP | |
1581 | @@ -1438,6 +1772,12 @@ | |
1582 | ms = __ip_masq_out_get(iph->protocol, | |
1583 | iph->daddr, portp[1], | |
1584 | iph->saddr, portp[0]); | |
1585 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1586 | + if (ms == NULL) | |
1587 | + ms = __ip_vs_out_get(iph->protocol, | |
1588 | + iph->daddr, portp[1], | |
1589 | + iph->saddr, portp[0]); | |
1590 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1591 | read_unlock(&__ip_masq_lock); | |
1592 | if (ms) { | |
1593 | IP_MASQ_DEBUG(1, "Incoming frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n", | |
1594 | @@ -1459,6 +1799,12 @@ | |
1595 | ms = __ip_masq_in_get(iph->protocol, | |
1596 | iph->daddr, portp[1], | |
1597 | iph->saddr, portp[0]); | |
1598 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1599 | + if (ms == NULL) | |
1600 | + ms = __ip_vs_in_get(iph->protocol, | |
1601 | + iph->daddr, portp[1], | |
1602 | + iph->saddr, portp[0]); | |
1603 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1604 | read_unlock(&__ip_masq_lock); | |
1605 | if (ms) { | |
1606 | IP_MASQ_DEBUG(1, "Outgoing frag_need rewrited from %d.%d.%d.%d to %d.%d.%d.%d\n", | |
1607 | @@ -1469,8 +1815,8 @@ | |
1608 | return 1; | |
1609 | } | |
1610 | return 0; | |
1611 | - | |
1612 | } | |
1613 | + | |
1614 | /* | |
1615 | * Handle ICMP messages in forward direction. | |
1616 | * Find any that might be relevant, check against existing connections, | |
1617 | @@ -1556,7 +1902,7 @@ | |
1618 | ntohs(icmp_id(icmph)), | |
1619 | icmph->type); | |
1620 | ||
1621 | - masq_set_state(ms, 1, iph, icmph); | |
1622 | + masq_set_state(ms, MASQ_STATE_OUTPUT, iph, icmph); | |
1623 | ip_masq_put(ms); | |
1624 | ||
1625 | return 1; | |
1626 | @@ -1684,11 +2030,28 @@ | |
1627 | pptr[1], | |
1628 | ciph->saddr, | |
1629 | pptr[0]); | |
1630 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1631 | + if (ms == NULL) { | |
1632 | + ms = __ip_vs_out_get(ciph->protocol, | |
1633 | + ciph->daddr, pptr[1], | |
1634 | + ciph->saddr, pptr[0]); | |
1635 | + } | |
1636 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1637 | read_unlock(&__ip_masq_lock); | |
1638 | ||
1639 | if (ms == NULL) | |
1640 | return 0; | |
1641 | ||
1642 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1643 | + if (IP_MASQ_VS_FWD(ms) != 0) { | |
1644 | + IP_VS_INFO("shouldn't get here, because tun/dr is on the half connection\n"); | |
1645 | + } | |
1646 | + | |
1647 | + /* do the IPVS statistics */ | |
1648 | + if (ms->flags & IP_MASQ_F_VS) | |
1649 | + ip_vs_out_stats(ms, skb); | |
1650 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1651 | + | |
1652 | /* Now we do real damage to this packet...! */ | |
1653 | /* First change the source IP address, and recalc checksum */ | |
1654 | iph->saddr = ms->maddr; | |
1655 | @@ -1739,6 +2102,87 @@ | |
1656 | return skb; | |
1657 | } | |
1658 | ||
1659 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1660 | + | |
1661 | +/* | |
1662 | + * Report whether this ICMP error seen in the FORWARD path refers to an | |
1663 | + * existing IPVS connection and therefore needs to be delivered locally. | |
1664 | + */ | |
1665 | +/* Returns 1 if __ip_vs_in_get() finds an entry for the embedded TCP/UDP header, else 0. */ | |
1666 | +int ip_vs_forwarding_related_icmp(struct sk_buff *skb) | |
1667 | +{ | |
1668 | + struct iphdr *iph = skb->nh.iph; | |
1669 | + struct icmphdr *icmph = (struct icmphdr *)((char *)iph + (iph->ihl<<2)); | |
1670 | + unsigned short size = ntohs(iph->tot_len) - (iph->ihl * 4); | |
1671 | + struct iphdr *ciph; /* The ip header contained within the ICMP */ | |
1672 | + __u16 *pptr; /* port numbers from TCP/UDP contained header */ | |
1673 | + struct ip_masq *ms; | |
1674 | + union ip_masq_tphdr h; | |
1675 | + int doff; | |
1676 | + | |
1677 | + /* | |
1678 | + * PACKET_HOST only, see ip_forward | |
1679 | + */ | |
1680 | + | |
1681 | + h.raw = (char*) iph + iph->ihl * 4; | |
1682 | + | |
1683 | + doff = proto_doff(iph->protocol, h.raw, size); | |
1684 | + | |
1685 | + if (doff <= 0) return 0; | |
1686 | + | |
1687 | + IP_VS_DBG(10, "icmp fwd/rev (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n", | |
1688 | + icmph->type, ntohs(icmp_id(icmph)), | |
1689 | + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); | |
1690 | + | |
1691 | + if ((icmph->type != ICMP_DEST_UNREACH) && | |
1692 | + (icmph->type != ICMP_SOURCE_QUENCH) && | |
1693 | + (icmph->type != ICMP_TIME_EXCEEDED)) | |
1694 | + return 0; | |
1695 | + | |
1696 | + /* | |
1697 | + * If we get here we have an ICMP error of one of the above 3 types. | |
1698 | + * Now find the contained IP header, which starts right after the ICMP header. | |
1699 | + */ | |
1700 | +/* NOTE(review): size is an unsigned short, so the subtraction below wraps if fewer than sizeof(struct icmphdr) bytes remain - confirm proto_doff() excludes that. */ | |
1701 | + ciph = (struct iphdr *) (icmph + 1); | |
1702 | + size -= sizeof(struct icmphdr); | |
1703 | + if (size < sizeof(struct iphdr)) return 0; | |
1704 | + | |
1705 | + /* We are only interested in ICMPs generated from TCP or UDP packets */ | |
1706 | + if (ciph->protocol == IPPROTO_TCP) { | |
1707 | + if (size < sizeof(struct tcphdr)) return 0; | |
1708 | + } | |
1709 | + else | |
1710 | + if (ciph->protocol == IPPROTO_UDP) { | |
1711 | + if (size < sizeof(struct udphdr)) return 0; | |
1712 | + } | |
1713 | + else return 0; | |
1714 | + | |
1715 | + /* We don't ensure for now the checksum is correct */ | |
1716 | +/* NOTE(review): the length checks above compare size (which still includes the embedded IP header) against the transport header size only, so pptr[0..1] may lie past the checked length - confirm. */ | |
1717 | + /* This is pretty much what __ip_masq_in_get_iph() does, | |
1718 | + except params are wrong way round */ | |
1719 | + pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]); | |
1720 | + | |
1721 | + read_lock(&__ip_masq_lock); | |
1722 | + ms = __ip_vs_in_get(ciph->protocol, | |
1723 | + ciph->daddr, | |
1724 | + pptr[1], | |
1725 | + ciph->saddr, | |
1726 | + pptr[0]); | |
1727 | + read_unlock(&__ip_masq_lock); | |
1728 | + | |
1729 | + if (!ms) return 0; | |
1730 | + IP_VS_DBG(10, "Delivering locally ICMP for %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u to %u.%u.%u.%u\n", | |
1731 | + NIPQUAD(ciph->daddr), ntohs(pptr[1]), | |
1732 | + NIPQUAD(ciph->saddr), ntohs(pptr[0]), | |
1733 | + NIPQUAD(ms->saddr)); | |
1734 | + __ip_masq_put(ms); | |
1735 | + | |
1736 | + return 1; | |
1737 | +} | |
1738 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1739 | + | |
1740 | /* | |
1741 | * Handle ICMP messages in reverse (demasquerade) direction. | |
1742 | * Find any that might be relevant, check against existing connections, | |
1743 | @@ -1812,7 +2256,7 @@ | |
1744 | ntohs(icmp_id(icmph)), | |
1745 | icmph->type); | |
1746 | ||
1747 | - masq_set_state(ms, 0, iph, icmph); | |
1748 | + masq_set_state(ms, MASQ_STATE_INPUT, iph, icmph); | |
1749 | ip_masq_put(ms); | |
1750 | ||
1751 | return 1; | |
1752 | @@ -1914,9 +2358,11 @@ | |
1753 | * *outgoing* so the ports are reversed (and addresses) | |
1754 | */ | |
1755 | pptr = (__u16 *)&(((char *)ciph)[csize]); | |
1756 | +#ifndef CONFIG_IP_MASQUERADE_VS | |
1757 | if (ntohs(pptr[0]) < PORT_MASQ_BEGIN || | |
1758 | ntohs(pptr[0]) > PORT_MASQ_END) | |
1759 | return 0; | |
1760 | +#endif | |
1761 | ||
1762 | /* Ensure the checksum is correct */ | |
1763 | if (ip_compute_csum((unsigned char *) icmph, len)) | |
1764 | @@ -1927,7 +2373,6 @@ | |
1765 | return(-1); | |
1766 | } | |
1767 | ||
1768 | - | |
1769 | IP_MASQ_DEBUG(2, "Handling reverse ICMP for %08X:%04X -> %08X:%04X\n", | |
1770 | ntohl(ciph->saddr), ntohs(pptr[0]), | |
1771 | ntohl(ciph->daddr), ntohs(pptr[1])); | |
1772 | @@ -1935,6 +2380,14 @@ | |
1773 | ||
1774 | /* This is pretty much what __ip_masq_in_get_iph() does, except params are wrong way round */ | |
1775 | read_lock(&__ip_masq_lock); | |
1776 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1777 | + ms = __ip_vs_in_get(ciph->protocol, | |
1778 | + ciph->daddr, | |
1779 | + pptr[1], | |
1780 | + ciph->saddr, | |
1781 | + pptr[0]); | |
1782 | + if (ms == NULL) | |
1783 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1784 | ms = __ip_masq_in_get(ciph->protocol, | |
1785 | ciph->daddr, | |
1786 | pptr[1], | |
1787 | @@ -1945,10 +2398,23 @@ | |
1788 | if (ms == NULL) | |
1789 | return 0; | |
1790 | ||
1791 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1792 | + /* do the IPVS statistics */ | |
1793 | + if (ms->flags & IP_MASQ_F_VS) | |
1794 | + ip_vs_in_stats(ms, skb); | |
1795 | + | |
1796 | + if (IP_MASQ_VS_FWD(ms) != 0) { | |
1797 | + int ret = ip_vs_forward(skb, ms); | |
1798 | + __ip_masq_put(ms); | |
1799 | + return ret; | |
1800 | + } | |
1801 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1802 | + | |
1803 | if ((skb=masq_skb_cow(skb_p, &iph, (unsigned char**)&icmph)) == NULL) { | |
1804 | __ip_masq_put(ms); | |
1805 | return -1; | |
1806 | } | |
1807 | + | |
1808 | ciph = (struct iphdr *) (icmph + 1); | |
1809 | pptr = (__u16 *)&(((char *)ciph)[ciph->ihl*4]); | |
1810 | ||
1811 | @@ -1998,7 +2464,10 @@ | |
1812 | int csum = 0; | |
1813 | int csum_ok = 0; | |
1814 | __u32 maddr; | |
1815 | - | |
1816 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1817 | + struct ip_vs_service *svc = NULL; | |
1818 | +#endif | |
1819 | + | |
1820 | /* | |
1821 | * Big tappo: only PACKET_HOST (nor loopback neither mcasts) | |
1822 | * ... don't know why 1st test DOES NOT include 2nd (?) | |
1823 | @@ -2039,13 +2508,21 @@ | |
1824 | return(ip_fw_demasq_icmp(skb_p)); | |
1825 | case IPPROTO_TCP: | |
1826 | case IPPROTO_UDP: | |
1827 | - /* | |
1828 | + /* | |
1829 | * Make sure packet is in the masq range | |
1830 | * ... or some mod-ule relaxes input range | |
1831 | * ... or there is still some `special' mport opened | |
1832 | */ | |
1833 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1834 | + svc = ip_vs_lookup_service(skb->fwmark, | |
1835 | + iph->protocol, maddr, h.portp[1]); | |
1836 | + if (!svc && | |
1837 | + (ntohs(h.portp[1]) < PORT_MASQ_BEGIN | |
1838 | + || ntohs(h.portp[1]) > PORT_MASQ_END) | |
1839 | +#else | |
1840 | if ((ntohs(h.portp[1]) < PORT_MASQ_BEGIN | |
1841 | || ntohs(h.portp[1]) > PORT_MASQ_END) | |
1842 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1843 | #ifdef CONFIG_IP_MASQUERADE_MOD | |
1844 | && (ip_masq_mod_in_rule(skb, iph) != 1) | |
1845 | #endif | |
1846 | @@ -2100,6 +2577,21 @@ | |
1847 | ||
1848 | ms = ip_masq_in_get_iph(iph); | |
1849 | ||
1850 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1851 | + /* | |
1852 | + * Checking the server status | |
1853 | + */ | |
1854 | + if (ms && ms->dest && !(ms->dest->flags & IP_VS_DEST_F_AVAILABLE)) { | |
1855 | + /* | |
1856 | + * If the dest is not avaiable, don't restart the timer | |
1857 | + * of the packet, but silently drop it. | |
1858 | + */ | |
1859 | + add_sltimer(&ms->timer); | |
1860 | + __ip_masq_put(ms); | |
1861 | + return -1; | |
1862 | + } | |
1863 | +#endif | |
1864 | + | |
1865 | /* | |
1866 | * Give additional modules a chance to create an entry | |
1867 | */ | |
1868 | @@ -2116,6 +2608,27 @@ | |
1869 | ip_masq_mod_in_update(skb, iph, ms); | |
1870 | #endif | |
1871 | ||
1872 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1873 | + if (!ms && | |
1874 | + (h.th->syn || (iph->protocol!=IPPROTO_TCP)) && svc) { | |
1875 | + if (ip_masq_todrop()) { | |
1876 | + /* | |
1877 | + * It seems that we are very loaded. | |
1878 | + * We have to drop this packet :( | |
1879 | + */ | |
1880 | + return -1; | |
1881 | + } | |
1882 | + /* | |
1883 | + * Let the virtual server select a real server | |
1884 | + * for the incomming connection, and create a | |
1885 | + * masquerading entry. | |
1886 | + */ | |
1887 | + ms = ip_vs_schedule(svc, iph); | |
1888 | + if (!ms) | |
1889 | + return ip_vs_leave(svc, skb); | |
1890 | + ip_vs_conn_stats(ms, svc); | |
1891 | + } | |
1892 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1893 | ||
1894 | if (ms != NULL) | |
1895 | { | |
1896 | @@ -2168,13 +2681,43 @@ | |
1897 | ||
1898 | } | |
1899 | } | |
1900 | + | |
1901 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1902 | + /* do the IPVS statistics */ | |
1903 | + if (ms->flags & IP_MASQ_F_VS) | |
1904 | + ip_vs_in_stats(ms, skb); | |
1905 | + | |
1906 | + if (IP_MASQ_VS_FWD(ms) != 0) { | |
1907 | + int ret; | |
1908 | + | |
1909 | + /* | |
1910 | + * Sorry for setting state of masq entry so early | |
1911 | + * no matter whether the packet is forwarded | |
1912 | + * successfully or not, because ip_vs_forward may | |
1913 | + * have already released the skb. Although it | |
1914 | + * breaks the original semantics, it won't lead to | |
1915 | + * serious errors. We look forward to fixing it | |
1916 | + * under the Rusty's netfilter framework both for | |
1917 | + * correctness and modularization. | |
1918 | + */ | |
1919 | + masq_set_state(ms, MASQ_STATE_INPUT, iph, h.portp); | |
1920 | + | |
1921 | + ret = ip_vs_forward(skb, ms); | |
1922 | + ip_masq_put(ms); | |
1923 | + return ret; | |
1924 | + } | |
1925 | + | |
1926 | + IP_VS_DBG(10, "masquerading packet...\n"); | |
1927 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
1928 | + | |
1929 | if ((skb=masq_skb_cow(skb_p, &iph, &h.raw)) == NULL) { | |
1930 | ip_masq_put(ms); | |
1931 | return -1; | |
1932 | } | |
1933 | + | |
1934 | iph->daddr = ms->saddr; | |
1935 | h.portp[1] = ms->sport; | |
1936 | - | |
1937 | + | |
1938 | /* | |
1939 | * Invalidate csum saving if tunnel has masq helper | |
1940 | */ | |
1941 | @@ -2231,15 +2774,28 @@ | |
1942 | h.uh->check = 0xFFFF; | |
1943 | break; | |
1944 | } | |
1945 | - ip_send_check(iph); | |
1946 | + ip_send_check(iph); | |
1947 | ||
1948 | IP_MASQ_DEBUG(2, "I-routed to %08X:%04X\n",ntohl(iph->daddr),ntohs(h.portp[1])); | |
1949 | ||
1950 | - masq_set_state (ms, 0, iph, h.portp); | |
1951 | + masq_set_state(ms, MASQ_STATE_INPUT, iph, h.portp); | |
1952 | ip_masq_put(ms); | |
1953 | ||
1954 | return 1; | |
1955 | } | |
1956 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1957 | + if (svc) { | |
1958 | + /* | |
1959 | + * Drop packet if it belongs to virtual service but no entry | |
1960 | + * is found or created. Furthermore, send DEST_UNREACH icmp | |
1961 | + * packet to clients if it is not RST or it is not TCP. | |
1962 | + */ | |
1963 | + if (!h.th->rst || iph->protocol != IPPROTO_TCP) { | |
1964 | + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); | |
1965 | + } | |
1966 | + return -1; | |
1967 | + } | |
1968 | +#endif | |
1969 | ||
1970 | /* sorry, all this trouble for a no-hit :) */ | |
1971 | return 0; | |
1972 | @@ -2350,7 +2906,6 @@ | |
1973 | len += sprintf(buffer+len, "%-127s\n", temp); | |
1974 | ||
1975 | if(len >= length) { | |
1976 | - | |
1977 | read_unlock_bh(&__ip_masq_lock); | |
1978 | goto done; | |
1979 | } | |
1980 | @@ -2358,9 +2913,52 @@ | |
1981 | read_unlock_bh(&__ip_masq_lock); | |
1982 | ||
1983 | } | |
1984 | -done: | |
1985 | ||
1986 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
1987 | + for(idx = 0; idx < IP_VS_TAB_SIZE; idx++) | |
1988 | + { | |
1989 | + /* | |
1990 | + * Lock is actually only need in next loop | |
1991 | + * we are called from uspace: must stop bh. | |
1992 | + */ | |
1993 | + read_lock_bh(&__ip_masq_lock); | |
1994 | ||
1995 | + l = &ip_vs_table[idx]; | |
1996 | + for (e=l->next; e!=l; e=e->next) { | |
1997 | + ms = list_entry(e, struct ip_masq, m_list); | |
1998 | + pos += 128; | |
1999 | + if (pos <= offset) { | |
2000 | + len = 0; | |
2001 | + continue; | |
2002 | + } | |
2003 | + | |
2004 | + /* | |
2005 | + * We have locked the tables, no need to del/add timers | |
2006 | + * nor cli() 8) | |
2007 | + */ | |
2008 | + | |
2009 | + sprintf(temp,"%s %08X:%04X %08X:%04X %04X %08X %6d %6d %7lu", | |
2010 | + masq_proto_name(ms->protocol), | |
2011 | + ntohl(ms->saddr), ntohs(ms->sport), | |
2012 | + ntohl(ms->daddr), ntohs(ms->dport), | |
2013 | + ntohs(ms->mport), | |
2014 | + ms->out_seq.init_seq, | |
2015 | + ms->out_seq.delta, | |
2016 | + ms->out_seq.previous_delta, | |
2017 | + ms->timer.expires-jiffies); | |
2018 | + len += sprintf(buffer+len, "%-127s\n", temp); | |
2019 | + | |
2020 | + if(len >= length) { | |
2021 | + read_unlock_bh(&__ip_masq_lock); | |
2022 | + goto done; | |
2023 | + } | |
2024 | + } | |
2025 | + read_unlock_bh(&__ip_masq_lock); | |
2026 | + | |
2027 | + } | |
2028 | +#endif /* CONFIG_IP_MASQUERADE_VS */ | |
2029 | + | |
2030 | +done: | |
2031 | begin = len - (pos - offset); | |
2032 | *start = buffer + begin; | |
2033 | len -= begin; | |
2034 | @@ -2386,17 +2984,29 @@ | |
2035 | len, sizeof(struct ip_fw_masq)); | |
2036 | } else { | |
2037 | masq = (struct ip_fw_masq *)m; | |
2038 | - if (masq->tcp_timeout) | |
2039 | + if (masq->tcp_timeout) { | |
2040 | masq_timeout_table.timeout[IP_MASQ_S_ESTABLISHED] | |
2041 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
2042 | + = masq_timeout_table_dos.timeout[IP_MASQ_S_ESTABLISHED] | |
2043 | +#endif | |
2044 | = masq->tcp_timeout; | |
2045 | + } | |
2046 | ||
2047 | - if (masq->tcp_fin_timeout) | |
2048 | + if (masq->tcp_fin_timeout) { | |
2049 | masq_timeout_table.timeout[IP_MASQ_S_FIN_WAIT] | |
2050 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
2051 | + = masq_timeout_table_dos.timeout[IP_MASQ_S_FIN_WAIT] | |
2052 | +#endif | |
2053 | = masq->tcp_fin_timeout; | |
2054 | + } | |
2055 | ||
2056 | - if (masq->udp_timeout) | |
2057 | + if (masq->udp_timeout) { | |
2058 | masq_timeout_table.timeout[IP_MASQ_S_UDP] | |
2059 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
2060 | + = masq_timeout_table_dos.timeout[IP_MASQ_S_UDP] | |
2061 | +#endif | |
2062 | = masq->udp_timeout; | |
2063 | + } | |
2064 | ret = 0; | |
2065 | } | |
2066 | return ret; | |
2067 | @@ -2468,6 +3078,11 @@ | |
2068 | ret = ip_masq_mod_ctl(optname, &masq_ctl, optlen); | |
2069 | break; | |
2070 | #endif | |
2071 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
2072 | + case IP_MASQ_TARGET_VS: | |
2073 | + ret = ip_vs_ctl(optname, &masq_ctl, optlen); | |
2074 | + break; | |
2075 | +#endif | |
2076 | } | |
2077 | ||
2078 | /* | |
2079 | @@ -2529,12 +3144,25 @@ | |
2080 | } | |
2081 | } | |
2082 | #endif /* CONFIG_PROC_FS */ | |
2083 | + | |
2084 | /* | |
2085 | - * Wrapper over inet_select_addr() | |
2086 | + * Determine maddr from skb | |
2087 | */ | |
2088 | -u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope) | |
2089 | +int ip_masq_select_addr(struct sk_buff *skb, __u32 *maddr) | |
2090 | { | |
2091 | - return inet_select_addr(dev, dst, scope); | |
2092 | + struct rtable *rt; | |
2093 | + struct rtable *skb_rt = (struct rtable*)skb->dst; | |
2094 | + struct device *skb_dev = skb_rt->u.dst.dev; | |
2095 | + struct iphdr *iph = skb->nh.iph; | |
2096 | + | |
2097 | + if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos)|RTO_CONN, skb_dev?skb_dev->ifindex:0)) { | |
2098 | + return -1; | |
2099 | + } else { | |
2100 | + /* Route lookup succeeded */ | |
2101 | + *maddr = rt->rt_src; | |
2102 | + ip_rt_put(rt); | |
2103 | + return 0; | |
2104 | + } | |
2105 | } | |
2106 | ||
2107 | /* | |
2108 | @@ -2587,7 +3215,7 @@ | |
2109 | (char *) IPPROTO_ICMP, | |
2110 | ip_masq_user_info | |
2111 | }); | |
2112 | -#endif | |
2113 | +#endif /* CONFIG_PROC_FS */ | |
2114 | #ifdef CONFIG_IP_MASQUERADE_IPAUTOFW | |
2115 | ip_autofw_init(); | |
2116 | #endif | |
2117 | @@ -2596,6 +3224,9 @@ | |
2118 | #endif | |
2119 | #ifdef CONFIG_IP_MASQUERADE_MFW | |
2120 | ip_mfw_init(); | |
2121 | +#endif | |
2122 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
2123 | + ip_vs_init(); | |
2124 | #endif | |
2125 | ip_masq_app_init(); | |
2126 | ||
2127 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs.c | |
2128 | --- linux-2.2.19/net/ipv4/ip_vs.c Thu Jan 1 08:00:00 1970 | |
2129 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs.c Mon May 14 22:04:50 2001 | |
2130 | @@ -0,0 +1,3015 @@ | |
2131 | +/* | |
2132 | + * IPVS An implementation of the IP virtual server support for the | |
2133 | + * LINUX operating system. IPVS is now implemented as a part | |
2134 | + * of IP masquerading code. IPVS can be used to build a | |
2135 | + * high-performance and highly available server based on a | |
2136 | + * cluster of servers. | |
2137 | + * | |
2138 | + * Version: $Id$ | |
2139 | + * | |
2140 | + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> | |
2141 | + * Peter Kese <peter.kese@ijs.si> | |
2142 | + * | |
2143 | + * This program is free software; you can redistribute it and/or | |
2144 | + * modify it under the terms of the GNU General Public License | |
2145 | + * as published by the Free Software Foundation; either version | |
2146 | + * 2 of the License, or (at your option) any later version. | |
2147 | + * | |
2148 | + * Changes: | |
2149 | + * Wensong Zhang : fixed the overflow bug in ip_vs_procinfo | |
2150 | + * Wensong Zhang : added editing dest and service functions | |
2151 | + * Wensong Zhang : changed the names of some functions | |
2152 | + * Wensong Zhang : fixed the unlocking bug in ip_vs_del_dest | |
2153 | + * Wensong Zhang : added a separate hash table for IPVS | |
2154 | + * Wensong Zhang : added slow timer for IPVS masq entries | |
2155 | + * Julian Anastasov : fixed the number of active connections | |
2156 | + * Wensong Zhang : added persistent port | |
2157 | + * Wensong Zhang : fixed the incorrect lookup in hash table | |
2158 | + * Wensong Zhang : added server status checking | |
2159 | + * Wensong Zhang : fixed the incorrect slow timer vector layout | |
2160 | + * Wensong Zhang : fixed the sltimer added twice bug of mst | |
2161 | + * Julian Anastasov : fixed the IP_MASQ_F_VS_INACTIVE cleared bug after editing dest | |
2162 | + * Wensong Zhang : added the inactive connection counter | |
2163 | + * Wensong Zhang : changed the body of ip_vs_schedule | |
2164 | + * Julian Anastasov : fixed the unlocking bug in ip_vs_schedule | |
2165 | + * Julian Anastasov : fixed the uncounting bug in creating masqs by template | |
2166 | + * Wensong Zhang : changed some condition orders for a bit performance | |
2167 | + * Julian Anastasov : don't touch counters in ip_vs_unbind_masq for templates | |
2168 | + * Wensong Zhang : added the hash table for virtual services | |
2169 | + * Wensong Zhang : changed destination lists to d-linked lists | |
2170 | + * Wensong Zhang : changed the scheduler list to the d-linked list | |
2171 | + * Wensong Zhang : added new persistent service handling | |
2172 | + * Julian Anastasov : fixed the counting bug in ip_vs_unbind_masq again | |
2173 | + * (don't touch counters for templates) | |
2174 | + * Wensong Zhang : changed some IP_VS_ERR to IP_VS_DBG in the ip_vs_tunnel_xmit | |
2175 | + * Wensong Zhang : added different timeout support for persistent svc | |
2176 | + * Wensong Zhang : fixed the bug that persistent svc cannot be edited | |
2177 | + * Julian Anastasov : removed extra read_unlock in __ip_vs_lookup_service | |
2178 | + * Julian Anastasov : changed not to restart template timers if dest is unavailable | |
2179 | + * Julian Anastasov : added the destination trash | |
2180 | + * Wensong Zhang : added the update_service call in ip_vs_del_dest | |
2181 | + * Wensong Zhang : added the ip_vs_leave function | |
2182 | + * Lars Marowsky-Bree : added persistence granularity support | |
2183 | + * Julian Anastasov : changed some cosmetic things for debugging | |
2184 | + * Wensong Zhang : use vmalloc to allocate big ipvs hash table | |
2185 | + * Wensong Zhang : changed the tunneling/direct routing methods a little | |
2186 | + * Julian Anastasov : fixed the return bug of ip_vs_leave(-2 instead of -3) | |
2187 | + * Roberto Nibali : fixed the undefined variable bug in the IP_VS_DBG of ip_vs_dr_xmit | |
2188 | + * Julian Anastasov : changed ICMP_PROT_UNREACH to ICMP_PORT_UNREACH in ip_vs_leave | |
2189 | + * Wensong Zhang : added port zero support for persistent services | |
2190 | + * Wensong Zhang : fixed the bug that virtual ftp service blocks other services not listed in ipvs table | |
2191 | + * Wensong Zhang : invalidate a persistent template when its dest is unavailable | |
2192 | + * Julian Anastasov : changed two IP_VS_ERR calls to IP_VS_DBG | |
2193 | + * Wensong Zhang : added random drop of syn entries | |
2194 | + * Wensong Zhang : added random drop of UDP entries | |
2195 | + * Julian Anastasov : added droprate defense against DoS attack | |
2196 | + * Julian Anastasov : added secure_tcp defense against DoS attack | |
2197 | + * Wensong Zhang : revisited dropentry defense against DoS attack | |
2198 | + * Horms : added the fwmark service feature | |
2199 | + * Wensong Zhang : changed to two service hash tables | |
2200 | + * Julian Anastasov : corrected trash_dest lookup for both | |
2201 | + * normal service and fwmark service | |
2202 | + * | |
2203 | + */ | |
2204 | + | |
2205 | +#include <linux/config.h> | |
2206 | +#include <linux/module.h> | |
2207 | +#include <linux/types.h> | |
2208 | +#include <linux/kernel.h> | |
2209 | +#include <linux/errno.h> | |
2210 | +#include <linux/vmalloc.h> | |
2211 | +#include <linux/swap.h> | |
2212 | +#include <net/ip_masq.h> | |
2213 | + | |
2214 | +#include <linux/sysctl.h> | |
2215 | +#include <linux/ip_fw.h> | |
2216 | +#include <linux/ip_masq.h> | |
2217 | +#include <linux/proc_fs.h> | |
2218 | + | |
2219 | +#include <linux/inetdevice.h> | |
2220 | +#include <linux/ip.h> | |
2221 | +#include <net/icmp.h> | |
2222 | +#include <net/ip.h> | |
2223 | +#include <net/route.h> | |
2224 | +#include <net/ip_vs.h> | |
2225 | + | |
2226 | +#ifdef CONFIG_KMOD | |
2227 | +#include <linux/kmod.h> | |
2228 | +#endif | |
2229 | + | |
2230 | +EXPORT_SYMBOL(register_ip_vs_scheduler); | |
2231 | +EXPORT_SYMBOL(unregister_ip_vs_scheduler); | |
2232 | +EXPORT_SYMBOL(ip_vs_bind_masq); | |
2233 | +EXPORT_SYMBOL(ip_vs_unbind_masq); | |
2234 | +EXPORT_SYMBOL(ip_vs_lookup_dest); | |
2235 | +#ifdef CONFIG_IP_VS_DEBUG | |
2236 | +EXPORT_SYMBOL(ip_vs_get_debug_level); | |
2237 | +#endif | |
2238 | + | |
2239 | +int sysctl_ip_vs_drop_entry = 0; | |
2240 | +int sysctl_ip_vs_drop_packet = 0; | |
2241 | +int sysctl_ip_vs_secure_tcp = 0; | |
2242 | +int sysctl_ip_vs_amemthresh = 1024; | |
2243 | +int sysctl_ip_vs_am_droprate = 10; | |
2244 | + | |
2245 | +#ifdef CONFIG_IP_VS_DEBUG | |
2246 | +static int sysctl_ip_vs_debug_level = 0; | |
2247 | +/* Accessor returning the current value of the file-static sysctl_ip_vs_debug_level. */ | |
2248 | +int ip_vs_get_debug_level(void) | |
2249 | +{ | |
2250 | + return sysctl_ip_vs_debug_level; | |
2251 | +} | |
2252 | +#endif | |
2253 | + | |
2254 | + | |
2255 | +int ip_vs_dropentry = 0; | |
2256 | +/* Recompute the drop_entry, drop_packet and secure_tcp defenses from their sysctl modes and a free-memory estimate. */ | |
2257 | +static inline void update_defense_level(void) | |
2258 | +{ | |
2259 | + int ip_vs_amem = nr_free_pages+page_cache_size+(buffermem>>PAGE_SHIFT); | |
2260 | + int nomem = (ip_vs_amem < sysctl_ip_vs_amemthresh); | |
2261 | +/* In each switch below: 0 = off, 1 and 2 = on only while nomem (the sysctl flips to 2 when triggered and back to 1 when memory recovers), 3 = always on. */ | |
2262 | + /* drop_entry: sets the ip_vs_dropentry flag */ | |
2263 | + switch (sysctl_ip_vs_drop_entry) { | |
2264 | + case 0: | |
2265 | + ip_vs_dropentry = 0; | |
2266 | + break; | |
2267 | + case 1: | |
2268 | + if (nomem) { | |
2269 | + ip_vs_dropentry = 1; | |
2270 | + sysctl_ip_vs_drop_entry = 2; | |
2271 | + } else { | |
2272 | + ip_vs_dropentry = 0; | |
2273 | + } | |
2274 | + break; | |
2275 | + case 2: | |
2276 | + if (nomem) { | |
2277 | + ip_vs_dropentry = 1; | |
2278 | + } else { | |
2279 | + ip_vs_dropentry = 0; | |
2280 | + sysctl_ip_vs_drop_entry = 1; | |
2281 | + }; | |
2282 | + break; | |
2283 | + case 3: | |
2284 | + ip_vs_dropentry = 1; | |
2285 | + break; | |
2286 | + } | |
2287 | + | |
2288 | + /* drop_packet: sets ip_masq_drop_rate; under nomem the rate is amemthresh / (amemthresh - amem), whose divisor is positive because nomem implies amem < amemthresh */ | |
2289 | + switch (sysctl_ip_vs_drop_packet) { | |
2290 | + case 0: | |
2291 | + ip_masq_drop_rate = 0; | |
2292 | + break; | |
2293 | + case 1: | |
2294 | + if (nomem) { | |
2295 | + ip_masq_drop_rate = ip_masq_drop_counter | |
2296 | + = sysctl_ip_vs_amemthresh / | |
2297 | + (sysctl_ip_vs_amemthresh-ip_vs_amem); | |
2298 | + sysctl_ip_vs_drop_packet = 2; | |
2299 | + } else { | |
2300 | + ip_masq_drop_rate = 0; | |
2301 | + } | |
2302 | + break; | |
2303 | + case 2: | |
2304 | + if (nomem) { | |
2305 | + ip_masq_drop_rate = ip_masq_drop_counter | |
2306 | + = sysctl_ip_vs_amemthresh / | |
2307 | + (sysctl_ip_vs_amemthresh-ip_vs_amem); | |
2308 | + } else { | |
2309 | + ip_masq_drop_rate = 0; | |
2310 | + sysctl_ip_vs_drop_packet = 1; | |
2311 | + } | |
2312 | + break; | |
2313 | + case 3: | |
2314 | + ip_masq_drop_rate = sysctl_ip_vs_am_droprate; | |
2315 | + break; | |
2316 | + } | |
2317 | + | |
2318 | + /* secure_tcp: passes 0 or 1 to ip_masq_secure_tcp_set() */ | |
2319 | + switch (sysctl_ip_vs_secure_tcp) { | |
2320 | + case 0: | |
2321 | + ip_masq_secure_tcp_set(0); | |
2322 | + break; | |
2323 | + case 1: | |
2324 | + if (nomem) { | |
2325 | + ip_masq_secure_tcp_set(1); | |
2326 | + sysctl_ip_vs_secure_tcp = 2; | |
2327 | + } else { | |
2328 | + ip_masq_secure_tcp_set(0); | |
2329 | + } | |
2330 | + break; | |
2331 | + case 2: | |
2332 | + if (nomem) { | |
2333 | + ip_masq_secure_tcp_set(1); | |
2334 | + } else { | |
2335 | + ip_masq_secure_tcp_set(0); | |
2336 | + sysctl_ip_vs_secure_tcp = 1; | |
2337 | + } | |
2338 | + break; | |
2339 | + case 3: | |
2340 | + ip_masq_secure_tcp_set(1); | |
2341 | + break; | |
2342 | + } | |
2343 | +} | |
2344 | + | |
2345 | +/* Decide whether entry ms should be dropped by the caller: returns 1 to drop, 0 to keep. */ | |
2346 | +static inline int todrop_entry(struct ip_masq *ms) | |
2347 | +{ | |
2348 | + /* | |
2349 | + * The drop rate array needs tuning for real environments. Both arrays are indexed by the entry's in_pkts count (0..8): a rate of r makes one call in every r return 1, and a rate of 0 never drops. | |
2350 | + */ | |
2351 | + static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; | |
2352 | + static char todrop_counter[9] = {0}; | |
2353 | + int i; | |
2354 | +/* Keep the entry while timeout+jiffies-expires is under 60*HZ; presumably that is the time since the timer was armed - confirm how expires is set. */ | |
2355 | + if (ms->timeout+jiffies-ms->timer.expires < 60*HZ) | |
2356 | + return 0; | |
2357 | +/* Entries that have already seen more than 8 incoming packets are never dropped here. */ | |
2358 | + i = atomic_read(&ms->in_pkts); | |
2359 | + if (i > 8) return 0; | |
2360 | + | |
2361 | + if (!todrop_rate[i]) return 0; | |
2362 | + if (--todrop_counter[i] > 0) return 0; | |
2363 | +/* Counter ran out: reload it from the rate table and report a drop. */ | |
2364 | + todrop_counter[i] = todrop_rate[i]; | |
2365 | + return 1; | |
2366 | +} | |
2367 | +/* Visit random buckets of ip_vs_table and run the timer handler early on SYN_RECV/SYNACK entries and on ESTABLISHED/UDP entries picked by todrop_entry(). */ | |
2368 | +static inline void ip_vs_random_dropentry(void) | |
2369 | +{ | |
2370 | + int i; | |
2371 | + struct ip_masq *ms; | |
2372 | + struct list_head *l,*e; | |
2373 | + struct ip_masq *mst; | |
2374 | + void (*fn)(unsigned long); | |
2375 | + | |
2376 | + /* | |
2377 | + * Randomly scan 1/32 of the whole table every second (IP_VS_TAB_SIZE>>5 randomly chosen buckets per call) | |
2378 | + */ | |
2379 | + for (i=0; i < (IP_VS_TAB_SIZE>>5); i++) { | |
2380 | + /* | |
2381 | + * Lock is actually needed in this loop. | |
2382 | + */ | |
2383 | + write_lock(&__ip_masq_lock); | |
2384 | + | |
2385 | + l = &ip_vs_table[net_random()&IP_VS_TAB_MASK]; | |
2386 | + for (e=l->next; e!=l; e=e->next) { | |
2387 | + ms = list_entry(e, struct ip_masq, m_list); | |
2388 | + if (ms->dport == 0) | |
2389 | + /* masq template */ | |
2390 | + continue; | |
2391 | + switch(ms->state) { | |
2392 | + case IP_MASQ_S_SYN_RECV: | |
2393 | + case IP_MASQ_S_SYNACK: | |
2394 | + break; | |
2395 | + | |
2396 | + case IP_MASQ_S_ESTABLISHED: | |
2397 | + case IP_MASQ_S_UDP: | |
2398 | + if (todrop_entry(ms)) | |
2399 | + break; | |
2400 | + continue; | |
2401 | + | |
2402 | + default: | |
2403 | + continue; | |
2404 | + } | |
2405 | + | |
2406 | + /* | |
2407 | + * Drop the entry, and drop its mst if not referenced. NOTE(review): the lock is released while fn() runs, yet the loop then advances through e->next; if fn() unlinks or frees ms that read is stale - confirm. | |
2408 | + */ | |
2409 | + write_unlock(&__ip_masq_lock); | |
2410 | + IP_VS_DBG(4, "Drop masq\n"); | |
2411 | + mst = ms->control; | |
2412 | + fn = (ms->timer).function; | |
2413 | + del_sltimer(&ms->timer); | |
2414 | + fn((unsigned long)ms); | |
2415 | + if (mst && !atomic_read(&mst->n_control)) { | |
2416 | + IP_VS_DBG(4, "Drop masq template\n"); | |
2417 | + del_sltimer(&mst->timer); | |
2418 | + fn((unsigned long)mst); | |
2419 | + } | |
2420 | + write_lock(&__ip_masq_lock); | |
2421 | + } | |
2422 | + write_unlock(&__ip_masq_lock); | |
2423 | + } | |
2424 | +} | |
2425 | + | |
2426 | + | |
2427 | +/* | |
2428 | + * The following block implements slow timers for IPVS, most code is stolen | |
2429 | + * from linux/kernel/sched.c | |
2430 | + * Slow timer is used to avoid the overhead of cascading timers, when lots | |
2431 | + * of masq entries (>50,000) are cluttered in the system. Slots are indexed by expires >> SHIFT_BITS; sltv1 has TVR_SIZE slots, sltv2 and sltv3 have TVN_SIZE each. | |
2432 | + */ | |
2433 | +#define SHIFT_BITS 6 | |
2434 | +#define TVN_BITS 8 | |
2435 | +#define TVR_BITS 10 | |
2436 | +#define TVN_SIZE (1 << TVN_BITS) | |
2437 | +#define TVR_SIZE (1 << TVR_BITS) | |
2438 | +#define TVN_MASK (TVN_SIZE - 1) | |
2439 | +#define TVR_MASK (TVR_SIZE - 1) | |
2440 | +/* Timer vector with TVN_SIZE slots; each slot is the head of a list of timers (type of sltv2 and sltv3). */ | |
2441 | +struct sltimer_vec { | |
2442 | + int index; | |
2443 | + struct timer_list *vec[TVN_SIZE]; | |
2444 | +}; | |
2445 | +/* Root timer vector with TVR_SIZE slots (type of sltv1). */ | |
2446 | +struct sltimer_vec_root { | |
2447 | + int index; | |
2448 | + struct timer_list *vec[TVR_SIZE]; | |
2449 | +}; | |
2450 | + | |
2451 | +static struct sltimer_vec sltv3 = { 0 }; | |
2452 | +static struct sltimer_vec sltv2 = { 0 }; | |
2453 | +static struct sltimer_vec_root sltv1 = { 0 }; | |
2454 | +/* The three vectors in the order sltv1, sltv2, sltv3; sltv1 has a different struct type, hence the cast. */ | |
2455 | +static struct sltimer_vec * const sltvecs[] = { | |
2456 | + (struct sltimer_vec *)&sltv1, &sltv2, &sltv3 | |
2457 | +}; | |
2458 | + | |
2459 | +#define NOOF_SLTVECS (sizeof(sltvecs) / sizeof(sltvecs[0])) | |
2460 | +/* Time base that internal_add_sltimer() subtracts from timer->expires when choosing a vector. */ | |
2461 | +static unsigned long sltimer_jiffies = 0; | |
2462 | +/* Push timer onto the front of the list headed at vec[idx]; timer->prev is set to the address of the slot itself, cast to a timer_list pointer. */ | |
2463 | +static inline void insert_sltimer(struct timer_list *timer, | |
2464 | + struct timer_list **vec, int idx) | |
2465 | +{ | |
2466 | + if ((timer->next = vec[idx])) | |
2467 | + vec[idx]->prev = timer; | |
2468 | + vec[idx] = timer; | |
2469 | + timer->prev = (struct timer_list *)&vec[idx]; | |
2470 | +} | |
2471 | +/* File timer into the wheel and bucket selected by timer->expires (distance measured from sltimer_jiffies). */ | |
2472 | +static inline void internal_add_sltimer(struct timer_list *timer) | |
2473 | +{ | |
2474 | + /* | |
2475 | + * Inherited from sched.c: must be cli-ed when calling this. The callers in this file hold sltimerlist_lock for writing. | |
2476 | + */ | |
2477 | + unsigned long expires = timer->expires; | |
2478 | + unsigned long idx = (expires - sltimer_jiffies) >> SHIFT_BITS; /* distance from now in root-wheel slots */ | |
2479 | + | |
2480 | + if (idx < TVR_SIZE) { | |
2481 | + int i = (expires >> SHIFT_BITS) & TVR_MASK; | |
2482 | + insert_sltimer(timer, sltv1.vec, i); | |
2483 | + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { | |
2484 | + int i = (expires >> (SHIFT_BITS+TVR_BITS)) & TVN_MASK; | |
2485 | + insert_sltimer(timer, sltv2.vec, i); | |
2486 | + } else if ((signed long)(expires - sltimer_jiffies) < 0) { | |
2487 | + /* | |
2488 | + * The timer expires before sltimer_jiffies (expires == jiffies, or a time in the past): run it with the current root bucket. | |
2489 | + * The sign test must use the unshifted delta; idx has its top SHIFT_BITS bits cleared by the shift and can never look negative. | |
2490 | + */ | |
2491 | + insert_sltimer(timer, sltv1.vec, sltv1.index); | |
2492 | + } else if (idx <= 0xffffffffUL) { | |
2493 | + int i = (expires >> (SHIFT_BITS+TVR_BITS+TVN_BITS)) & TVN_MASK; | |
2494 | + insert_sltimer(timer, sltv3.vec, i); | |
2495 | + } else { | |
2496 | + /* Can only get here on architectures with 64-bit jiffies; the timer is self-linked and not put on any wheel */ | |
2497 | + timer->next = timer->prev = timer; | |
2498 | + } | |
2499 | +} | |
2500 | + | |
2501 | +rwlock_t sltimerlist_lock = RW_LOCK_UNLOCKED; /* taken for writing around every slow-timer list manipulation in this file */ | |
2502 | +/* Queue a slow timer. A timer whose prev link is already set is reported with printk and left untouched. */ | |
2503 | +void add_sltimer(struct timer_list *timer) | |
2504 | +{ | |
2505 | + write_lock(&sltimerlist_lock); | |
2506 | + if (timer->prev) /* prev is non-NULL while the timer is linked */ | |
2507 | + goto bug; | |
2508 | + internal_add_sltimer(timer); | |
2509 | +out: | |
2510 | + write_unlock(&sltimerlist_lock); | |
2511 | + return; | |
2512 | + | |
2513 | +bug: | |
2514 | + printk("bug: kernel sltimer added twice at %p.\n", | |
2515 | + __builtin_return_address(0)); | |
2516 | + goto out; /* still releases the lock */ | |
2517 | +} | |
2518 | +/* Unlink timer from its neighbours. Returns 1 if it was linked (prev set), 0 otherwise. The next/prev fields of timer itself are left as they were. */ | |
2519 | +static inline int detach_sltimer(struct timer_list *timer) | |
2520 | +{ | |
2521 | + struct timer_list *prev = timer->prev; | |
2522 | + if (prev) { | |
2523 | + struct timer_list *next = timer->next; | |
2524 | + prev->next = next; /* for the head of a bucket, prev is the bucket slot (see insert_sltimer) */ | |
2525 | + if (next) | |
2526 | + next->prev = prev; | |
2527 | + return 1; | |
2528 | + } | |
2529 | + return 0; | |
2530 | +} | |
2531 | +/* Give timer a new expiry and (re)queue it under sltimerlist_lock, whether or not it was queued before. */ | |
2532 | +void mod_sltimer(struct timer_list *timer, unsigned long expires) | |
2533 | +{ | |
2534 | + write_lock(&sltimerlist_lock); | |
2535 | + timer->expires = expires; | |
2536 | + detach_sltimer(timer); /* no-op when the timer is not linked */ | |
2537 | + internal_add_sltimer(timer); | |
2538 | + write_unlock(&sltimerlist_lock); | |
2539 | +} | |
2540 | +/* Dequeue timer and clear its links. Returns 1 if it was linked, 0 if not. */ | |
2541 | +int del_sltimer(struct timer_list * timer) | |
2542 | +{ | |
2543 | + int ret; | |
2544 | + | |
2545 | + write_lock(&sltimerlist_lock); | |
2546 | + ret = detach_sltimer(timer); | |
2547 | + timer->next = timer->prev = 0; /* cleared prev lets add_sltimer() accept the timer again */ | |
2548 | + write_unlock(&sltimerlist_lock); | |
2549 | + return ret; | |
2550 | +} | |
2551 | + | |
2552 | +/* Empty bucket tv->index of wheel tv by re-filing each of its timers, then advance tv->index. */ | |
2553 | +static inline void cascade_sltimers(struct sltimer_vec *tv) | |
2554 | +{ | |
2555 | + /* | |
2556 | + * Cascade all the timers of the current bucket: each one is handed to internal_add_sltimer(), which files it again by its expiry. | |
2557 | + */ | |
2558 | + struct timer_list *timer; | |
2559 | + timer = tv->vec[tv->index]; | |
2560 | + /* | |
2561 | + * We are removing _all_ timers from the list, so we don't have to | |
2562 | + * detach them individually, just clear the list afterwards. | |
2563 | + */ | |
2564 | + while (timer) { | |
2565 | + struct timer_list *tmp = timer; | |
2566 | + timer = timer->next; /* read the successor before re-insertion overwrites tmp->next */ | |
2567 | + internal_add_sltimer(tmp); | |
2568 | + } | |
2569 | + tv->vec[tv->index] = NULL; | |
2570 | + tv->index = (tv->index + 1) & TVN_MASK; | |
2571 | +} | |
2572 | +/* Run every slow timer that is due. Handlers are called with sltimerlist_lock released. */ | |
2573 | +static inline void run_sltimer_list(void) | |
2574 | +{ | |
2575 | + write_lock(&sltimerlist_lock); | |
2576 | + while ((long)(jiffies - sltimer_jiffies) >= 0) { /* signed difference, so the test survives jiffies wrap-around */ | |
2577 | + struct timer_list *timer; | |
2578 | + if (!sltv1.index) { /* root wheel is at bucket 0: refill it from the upper wheels */ | |
2579 | + int n = 1; | |
2580 | + do { | |
2581 | + cascade_sltimers(sltvecs[n]); | |
2582 | + } while (sltvecs[n]->index == 1 && ++n < NOOF_SLTVECS); /* go one wheel higher only when this wheel has just advanced to bucket 1 */ | |
2583 | + } | |
2584 | + while ((timer = sltv1.vec[sltv1.index])) { | |
2585 | + void (*fn)(unsigned long) = timer->function; | |
2586 | + unsigned long data = timer->data; | |
2587 | + detach_sltimer(timer); | |
2588 | + timer->next = timer->prev = NULL; | |
2589 | + write_unlock(&sltimerlist_lock); /* timer is fully unlinked; the handler runs without the lock */ | |
2590 | + fn(data); | |
2591 | + write_lock(&sltimerlist_lock); | |
2592 | + } | |
2593 | + sltimer_jiffies += 1<<SHIFT_BITS; | |
2594 | + sltv1.index = (sltv1.index + 1) & TVR_MASK; | |
2595 | + } | |
2596 | + write_unlock(&sltimerlist_lock); | |
2597 | +} | |
2598 | +/* Timer whose handler is sltimer_handler(); the handler re-arms it with mod_timer(). */ | |
2599 | +static void sltimer_handler(unsigned long data); | |
2600 | + | |
2601 | +struct timer_list slow_timer = { | |
2602 | + NULL, NULL, /* positional initializers; presumably the next/prev links - confirm against struct timer_list */ | |
2603 | + 0, 0, /* presumably expires and data - confirm */ | |
2604 | + sltimer_handler, | |
2605 | +}; | |
2606 | + | |
2607 | +/* | |
2608 | + * Period of the slow timer handler: one second (HZ jiffies). The replacement list is parenthesized so the macro stays a single operand in any expression. | |
2609 | + */ | |
2610 | +#define SLTIMER_PERIOD (1*HZ) | |
2611 | +/* Handler of slow_timer; it keeps internal linkage from the earlier static declaration. The data argument is not used. */ | |
2612 | +void sltimer_handler(unsigned long data) | |
2613 | +{ | |
2614 | + run_sltimer_list(); /* expire the slow timers that are due */ | |
2615 | + | |
2616 | + update_defense_level(); | |
2617 | + if (ip_vs_dropentry) | |
2618 | + ip_vs_random_dropentry(); | |
2619 | + | |
2620 | + mod_timer(&slow_timer, (jiffies + SLTIMER_PERIOD)); /* re-arm for the next period */ | |
2621 | +} | |
2622 | + | |
2623 | + | |
2624 | +/* | |
2625 | + * Port numbers of the FTP control (21) and FTP data (20) services, in network byte order. | |
2626 | + */ | |
2627 | +#define FTPPORT __constant_htons(21) | |
2628 | +#define FTPDATA __constant_htons(20) | |
2629 | + | |
2630 | +/* | |
2631 | + * Lock for IPVS; the scheduler list, service table and destination list lookups below read under it. | |
2632 | + */ | |
2633 | +rwlock_t __ip_vs_lock = RW_LOCK_UNLOCKED; | |
2634 | + | |
2635 | +/* | |
2636 | + * Hash table for input and output packet lookups of IPVS. IP_MASQ_NTABLES is the reference count ip_vs_hash() adds to (and ip_vs_unhash() removes from) a hashed entry. | |
2637 | + */ | |
2638 | +#define IP_MASQ_NTABLES 3 | |
2639 | + | |
2640 | +struct list_head *ip_vs_table; /* array of bucket heads indexed by ip_vs_hash_key(); set up outside this section */ | |
2641 | + | |
2642 | +/* | |
2643 | + * Hash tables for virtual service lookups | |
2644 | + */ | |
2645 | +#define IP_VS_SVC_TAB_BITS 8 | |
2646 | +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) | |
2647 | +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) | |
2648 | + | |
2649 | +/* the service table hashed by <protocol, addr, port> */ | |
2650 | +struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; | |
2651 | +/* the service table hashed by fwmark */ | |
2652 | +struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; | |
2653 | + | |
2654 | +/* | |
2655 | + * Hash table for real service lookups, keyed by <addr, port> | |
2656 | + */ | |
2657 | +#define IP_VS_RTAB_BITS 4 | |
2658 | +#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) | |
2659 | +#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) | |
2660 | + | |
2661 | +struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; | |
2662 | + | |
2663 | +/* | |
2664 | + * List of the registered IPVS schedulers | |
2665 | + */ | |
2666 | +struct list_head ip_vs_schedulers; | |
2667 | + | |
2668 | +/* | |
2669 | + * Trash list for destinations (see __ip_vs_get_trash_dest) | |
2670 | + */ | |
2671 | +struct list_head ip_vs_dest_trash; | |
2672 | + | |
2673 | +/* | |
2674 | + * FTP and NULL (port zero) virtual service counters; ip_vs_lookup_service() tries the matching fallback lookup only while the counter is non-zero | |
2675 | + */ | |
2676 | +atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); | |
2677 | +atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); | |
2678 | + | |
2679 | +/* | |
2680 | + * Register a scheduler in the scheduler list. Returns 0 on success, or -EINVAL when the scheduler or its name is NULL or its n_list is not an empty (self-linked) list head. No lock is taken here. | |
2681 | + */ | |
2682 | +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) | |
2683 | +{ | |
2684 | + if (!scheduler) { | |
2685 | + IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); | |
2686 | + return -EINVAL; | |
2687 | + } | |
2688 | + | |
2689 | + if (!scheduler->name) { | |
2690 | + IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); | |
2691 | + return -EINVAL; | |
2692 | + } | |
2693 | + | |
2694 | + if (scheduler->n_list.next != &scheduler->n_list) { /* the caller must have initialised n_list as an empty head */ | |
2695 | + IP_VS_ERR("register_ip_vs_scheduler(): scheduler already linked\n"); | |
2696 | + return -EINVAL; | |
2697 | + } | |
2698 | + | |
2699 | + /* | |
2700 | + * Add it to the front of the doubly-linked scheduler list | |
2701 | + */ | |
2702 | + list_add(&scheduler->n_list, &ip_vs_schedulers); | |
2703 | + | |
2704 | + return 0; | |
2705 | +} | |
2706 | + | |
2707 | + | |
2708 | +/* | |
2709 | + * Unregister a scheduler from the scheduler list. Returns 0 on success, or -EINVAL when the argument is NULL, the scheduler is still referenced, or it is not linked in the list. | |
2710 | + */ | |
2711 | +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) | |
2712 | +{ | |
2713 | + if (!scheduler) { | |
2714 | + IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); | |
2715 | + return -EINVAL; | |
2716 | + } | |
2717 | + | |
2718 | + /* | |
2719 | + * Only allow unregistration if it is not referenced | |
2720 | + */ | |
2721 | + if (atomic_read(&scheduler->refcnt)) { | |
2722 | + IP_VS_ERR("unregister_ip_vs_scheduler(): is in use by %d guys. failed\n", | |
2723 | + atomic_read(&scheduler->refcnt)); | |
2724 | + return -EINVAL; | |
2725 | + } | |
2726 | + | |
2727 | + if (scheduler->n_list.next == &scheduler->n_list) { | |
2728 | + IP_VS_ERR("unregister_ip_vs_scheduler(): scheduler is not in the list. failed\n"); | |
2729 | + return -EINVAL; | |
2730 | + } | |
2731 | + | |
2732 | + /* | |
2733 | + * Remove it from the doubly-linked scheduler list and make the node self-linked again, so that the linked checks here and in register_ip_vs_scheduler() see it as unlinked. | |
2734 | + */ | |
2735 | + list_del(&scheduler->n_list); | |
2736 | + INIT_LIST_HEAD(&scheduler->n_list); | |
2737 | + return 0; | |
2738 | +} | |
2739 | + | |
2740 | + | |
2741 | +/* | |
2742 | + * Bind a service with a scheduler: store the scheduler in svc, take a reference on it and run its optional init_service hook. | |
2743 | + * Must be called with the __ip_vs_lock lock. Returns 0 on success and -EINVAL on a NULL argument or when init_service fails; in the latter case svc->scheduler stays set and the reference taken here is not dropped. | |
2744 | + */ | |
2745 | +int ip_vs_bind_scheduler(struct ip_vs_service *svc, | |
2746 | + struct ip_vs_scheduler *scheduler) | |
2747 | +{ | |
2748 | + if (svc == NULL) { | |
2749 | + IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); | |
2750 | + return -EINVAL; | |
2751 | + } | |
2752 | + if (scheduler == NULL) { | |
2753 | + IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); | |
2754 | + return -EINVAL; | |
2755 | + } | |
2756 | + | |
2757 | + svc->scheduler = scheduler; | |
2758 | + atomic_inc(&scheduler->refcnt); | |
2759 | + | |
2760 | + if(scheduler->init_service) /* the hook is optional */ | |
2761 | + if(scheduler->init_service(svc) != 0) { | |
2762 | + IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); | |
2763 | + return -EINVAL; | |
2764 | + } | |
2765 | + | |
2766 | + return 0; | |
2767 | +} | |
2768 | + | |
2769 | + | |
2770 | +/* | |
2771 | + * Unbind a service from its scheduler: run the optional done_service hook, drop the scheduler reference and clear svc->scheduler. | |
2772 | + * Must be called with the __ip_vs_lock lock. Returns 0 on success and -EINVAL when svc is NULL, svc is not bound, or done_service fails (the binding is then left in place). | |
2773 | + */ | |
2774 | +int ip_vs_unbind_scheduler(struct ip_vs_service *svc) | |
2775 | +{ | |
2776 | + struct ip_vs_scheduler *sched; | |
2777 | + | |
2778 | + if (svc == NULL) { | |
2779 | + IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); | |
2780 | + return -EINVAL; | |
2781 | + } | |
2782 | + | |
2783 | + sched = svc->scheduler; | |
2784 | + if (sched == NULL) { | |
2785 | + IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); | |
2786 | + return -EINVAL; | |
2787 | + } | |
2788 | + | |
2789 | + if(sched->done_service) /* the hook is optional */ | |
2790 | + if(sched->done_service(svc) != 0) { | |
2791 | + IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); | |
2792 | + return -EINVAL; | |
2793 | + } | |
2794 | + | |
2795 | + atomic_dec(&sched->refcnt); | |
2796 | + svc->scheduler = NULL; | |
2797 | + | |
2798 | + return 0; | |
2799 | +} | |
2800 | + | |
2801 | + | |
2802 | +/* | |
2803 | + * Get a scheduler from the scheduler list by exact name match. Returns the scheduler or NULL. The list is walked under __ip_vs_lock (read, bh) and no reference is taken on the result. | |
2804 | + */ | |
2805 | +struct ip_vs_scheduler * ip_vs_sched_getbyname(const char *sched_name) | |
2806 | +{ | |
2807 | + struct ip_vs_scheduler *sched; | |
2808 | + struct list_head *l, *e; | |
2809 | + | |
2810 | + IP_VS_DBG(6, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", | |
2811 | + sched_name); | |
2812 | + | |
2813 | + read_lock_bh(&__ip_vs_lock); | |
2814 | + | |
2815 | + l = &ip_vs_schedulers; | |
2816 | + for (e=l->next; e!=l; e=e->next) { | |
2817 | + sched = list_entry(e, struct ip_vs_scheduler, n_list); | |
2818 | + if (strcmp(sched_name, sched->name)==0) { | |
2819 | + /* HIT */ | |
2820 | + read_unlock_bh(&__ip_vs_lock); | |
2821 | + return sched; | |
2822 | + } | |
2823 | + } | |
2824 | + | |
2825 | + read_unlock_bh(&__ip_vs_lock); | |
2826 | + return NULL; | |
2827 | +} | |
2828 | + | |
2829 | + | |
2830 | +/* | |
2831 | + * Look up a scheduler by name; if it is not registered, request the module ip_vs_<sched_name> (only when CONFIG_KMOD is set) and look again. Returns the scheduler or NULL. | |
2832 | + */ | |
2833 | +struct ip_vs_scheduler * ip_vs_lookup_scheduler(const char *sched_name) | |
2834 | +{ | |
2835 | + struct ip_vs_scheduler *sched; | |
2836 | + | |
2837 | + /* | |
2838 | + * Search for the scheduler by sched_name | |
2839 | + */ | |
2840 | + sched = ip_vs_sched_getbyname(sched_name); | |
2841 | + | |
2842 | + /* | |
2843 | + * If scheduler not found, load the module and search again | |
2844 | + */ | |
2845 | + if (sched == NULL) { | |
2846 | + char module_name[IP_MASQ_TNAME_MAX+8]; | |
2847 | + sprintf(module_name,"ip_vs_%s",sched_name); /* NOTE(review): unbounded; safe only if sched_name is at most IP_MASQ_TNAME_MAX+1 characters - confirm the callers guarantee that */ | |
2848 | +#ifdef CONFIG_KMOD | |
2849 | + request_module(module_name); | |
2850 | +#endif /* CONFIG_KMOD */ | |
2851 | + sched = ip_vs_sched_getbyname(sched_name); | |
2852 | + } | |
2853 | + | |
2854 | + return sched; | |
2855 | +} | |
2856 | + | |
2857 | + | |
2858 | +/* | |
2859 | + * Returns the ip_vs_table bucket index for an IPVS masq entry. addr and port are given in network byte order and converted to host order before mixing. | |
2860 | + */ | |
2861 | + | |
2862 | +static __inline__ unsigned | |
2863 | +ip_vs_hash_key(unsigned proto, __u32 addr, __u16 port) | |
2864 | +{ | |
2865 | + unsigned addrh = ntohl(addr); | |
2866 | + | |
2867 | + return (proto^addrh^(addrh>>IP_VS_TAB_BITS)^ntohs(port)) | |
2868 | + & IP_VS_TAB_MASK; | |
2869 | +} | |
2870 | + | |
2871 | + | |
2872 | +/* | |
2873 | + * Hashes ip_masq in ip_vs_table by protocol, daddr and dport. | |
2874 | + * Should be called with locked tables. | |
2875 | + * Returns 1 on success, 0 if the entry is already flagged as hashed. | |
2876 | + */ | |
2877 | +int ip_vs_hash(struct ip_masq *ms) | |
2878 | +{ | |
2879 | + unsigned hash; | |
2880 | + | |
2881 | + if (ms->flags & IP_MASQ_F_HASHED) { | |
2882 | + IP_VS_ERR("ip_vs_hash(): request for already hashed, " | |
2883 | + "called from %p\n", __builtin_return_address(0)); | |
2884 | + return 0; | |
2885 | + } | |
2886 | + | |
2887 | + /* | |
2888 | + * Note: IP_MASQ_NTABLES references are added because ip_masq_put | |
2889 | + * sets the masq expiry only if refcnt==IP_MASQ_NTABLES; | |
2890 | + * otherwise the masq entry would never expire. | |
2891 | + */ | |
2892 | + atomic_add(IP_MASQ_NTABLES, &ms->refcnt); | |
2893 | + | |
2894 | + /* | |
2895 | + * Hash by proto,d{addr,port}, | |
2896 | + * which are the client address and port in IPVS. | |
2897 | + */ | |
2898 | + hash = ip_vs_hash_key(ms->protocol, ms->daddr, ms->dport); | |
2899 | + list_add(&ms->m_list, &ip_vs_table[hash]); | |
2900 | + | |
2901 | + ms->flags |= IP_MASQ_F_HASHED; | |
2902 | + return 1; | |
2903 | +} | |
2904 | + | |
2905 | + | |
2906 | +/* | |
2907 | + * Unhashes ip_masq from ip_vs_table. | |
2908 | + * Should be called with locked tables. | |
2909 | + * Returns 1 on success, 0 if the entry is not flagged as hashed. | |
2910 | + */ | |
2911 | +int ip_vs_unhash(struct ip_masq *ms) | |
2912 | +{ | |
2913 | + if (!(ms->flags & IP_MASQ_F_HASHED)) { | |
2914 | + IP_VS_ERR("ip_vs_unhash(): request for unhash flagged, " | |
2915 | + "called from %p\n", __builtin_return_address(0)); | |
2916 | + return 0; | |
2917 | + } | |
2918 | + | |
2919 | + /* | |
2920 | + * Remove it from the list and give back the IP_MASQ_NTABLES references taken by ip_vs_hash(). | |
2921 | + */ | |
2922 | + list_del(&ms->m_list); | |
2923 | + atomic_sub(IP_MASQ_NTABLES, &ms->refcnt); | |
2924 | + | |
2925 | + ms->flags &= ~IP_MASQ_F_HASHED; | |
2926 | + return 1; | |
2927 | +} | |
2928 | + | |
2929 | + | |
2930 | +/* | |
2931 | + * Gets ip_masq associated with supplied parameters in the ip_vs_table. | |
2932 | + * Called for pkts coming from OUTside-to-INside. | |
2933 | + * s_addr, s_port: pkt source address (foreign host), matched against daddr/dport and used as the hash key | |
2934 | + * d_addr, d_port: pkt dest address (load balancer), matched against maddr/mport | |
2935 | + * Caller must lock tables. Returns the entry with its refcnt incremented, or NULL when nothing matches. | |
2936 | + */ | |
2937 | +struct ip_masq * __ip_vs_in_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) | |
2938 | +{ | |
2939 | + unsigned hash; | |
2940 | + struct ip_masq *ms; | |
2941 | + struct list_head *l,*e; | |
2942 | + | |
2943 | + hash = ip_vs_hash_key(protocol, s_addr, s_port); | |
2944 | + | |
2945 | + l = &ip_vs_table[hash]; | |
2946 | + for (e=l->next; e!=l; e=e->next) { | |
2947 | + ms = list_entry(e, struct ip_masq, m_list); | |
2948 | + if (s_addr==ms->daddr && s_port==ms->dport && | |
2949 | + d_port==ms->mport && d_addr==ms->maddr && | |
2950 | + protocol==ms->protocol) { | |
2951 | + /* HIT */ | |
2952 | + atomic_inc(&ms->refcnt); | |
2953 | + goto out; | |
2954 | + } | |
2955 | + } | |
2956 | + ms = NULL; | |
2957 | + | |
2958 | + out: | |
2959 | + IP_VS_DBG(7, "look/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", | |
2960 | + masq_proto_name(protocol), | |
2961 | + NIPQUAD(s_addr), ntohs(s_port), | |
2962 | + NIPQUAD(d_addr), ntohs(d_port), | |
2963 | + ms?"hit":"not hit"); | |
2964 | + | |
2965 | + return ms; | |
2966 | +} | |
2967 | + | |
2968 | + | |
2969 | +/* | |
2970 | + * Gets ip_masq associated with supplied parameters in the ip_vs_table. | |
2971 | + * Called for pkts coming from inside-to-OUTside. | |
2972 | + * s_addr, s_port: pkt source address (inside host), matched against saddr/sport | |
2973 | + * d_addr, d_port: pkt dest address (foreign host), matched against daddr/dport and used as the hash key | |
2974 | + * Caller must lock tables. Returns the entry with its refcnt incremented, or NULL when nothing matches. | |
2975 | + */ | |
2976 | +struct ip_masq * __ip_vs_out_get(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) | |
2977 | +{ | |
2978 | + unsigned hash; | |
2979 | + struct ip_masq *ms; | |
2980 | + struct list_head *l,*e; | |
2981 | + | |
2982 | + /* | |
2983 | + * Check for "full" addressed entries | |
2984 | + */ | |
2985 | + hash = ip_vs_hash_key(protocol, d_addr, d_port); | |
2986 | + | |
2987 | + l = &ip_vs_table[hash]; | |
2988 | + for (e=l->next; e!=l; e=e->next) { | |
2989 | + ms = list_entry(e, struct ip_masq, m_list); | |
2990 | + if (d_addr == ms->daddr && d_port == ms->dport && | |
2991 | + s_port == ms->sport && s_addr == ms->saddr && | |
2992 | + protocol == ms->protocol) { | |
2993 | + /* HIT */ | |
2994 | + atomic_inc(&ms->refcnt); | |
2995 | + goto out; | |
2996 | + } | |
2997 | + } | |
2998 | + ms = NULL; | |
2999 | + | |
3000 | + out: | |
3001 | + IP_VS_DBG(7, "look/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", | |
3002 | + masq_proto_name(protocol), | |
3003 | + NIPQUAD(s_addr), ntohs(s_port), | |
3004 | + NIPQUAD(d_addr), ntohs(d_port), | |
3005 | + ms?"hit":"not hit"); | |
3006 | + | |
3007 | + return ms; | |
3008 | +} | |
3009 | + | |
3010 | + | |
3011 | +/* | |
3012 | + * Locked wrapper around __ip_vs_in_get(): takes __ip_masq_lock for reading around the lookup. Called by ip_vs_sched_persist to look for a masq template. | |
3013 | + */ | |
3014 | +static __inline__ struct ip_masq *ip_vs_in_get | |
3015 | +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) | |
3016 | +{ | |
3017 | + struct ip_masq *ms; | |
3018 | + | |
3019 | + read_lock(&__ip_masq_lock); | |
3020 | + ms = __ip_vs_in_get(protocol, s_addr, s_port, d_addr, d_port); | |
3021 | + read_unlock(&__ip_masq_lock); | |
3022 | + | |
3023 | + return ms; | |
3024 | +} | |
3025 | + | |
3026 | + | |
3027 | +/* | |
3028 | + * Returns the ip_vs_svc_table bucket index for a virtual service. addr and port are given in network byte order. | |
3029 | + */ | |
3030 | +static __inline__ unsigned | |
3031 | +ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port) | |
3032 | +{ | |
3033 | + register unsigned porth = ntohs(port); | |
3034 | + | |
3035 | + return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) | |
3036 | + & IP_VS_SVC_TAB_MASK; | |
3037 | +} | |
3038 | + | |
3039 | +/* | |
3040 | + * Returns the ip_vs_svc_fwm_table bucket index for a fwmark: its low IP_VS_SVC_TAB_BITS bits. | |
3041 | + */ | |
3042 | +static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) | |
3043 | +{ | |
3044 | + return fwmark & IP_VS_SVC_TAB_MASK; | |
3045 | +} | |
3046 | + | |
3047 | +/* | |
3048 | + * Hashes ip_vs_service in the ip_vs_svc_table by <proto,addr,port> when its fwmark is zero, | |
3049 | + * or in the ip_vs_svc_fwm_table by fwmark otherwise, and takes one reference on it. | |
3050 | + * Should be called with locked tables. | |
3051 | + * Returns 1 on success, 0 if the service is already flagged as hashed. | |
3052 | + */ | |
3053 | +int ip_vs_svc_hash(struct ip_vs_service *svc) | |
3054 | +{ | |
3055 | + unsigned hash; | |
3056 | + | |
3057 | + if (svc->flags & IP_VS_SVC_F_HASHED) { | |
3058 | + IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " | |
3059 | + "called from %p\n", __builtin_return_address(0)); | |
3060 | + return 0; | |
3061 | + } | |
3062 | + | |
3063 | + if (svc->fwmark == 0) { | |
3064 | + /* | |
3065 | + * Hash by <protocol,addr,port> in ip_vs_svc_table | |
3066 | + */ | |
3067 | + hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); | |
3068 | + list_add(&svc->s_list, &ip_vs_svc_table[hash]); | |
3069 | + } else { | |
3070 | + /* | |
3071 | + * Hash by fwmark in ip_vs_svc_fwm_table | |
3072 | + */ | |
3073 | + hash = ip_vs_svc_fwm_hashkey(svc->fwmark); | |
3074 | + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); | |
3075 | + } | |
3076 | + | |
3077 | + svc->flags |= IP_VS_SVC_F_HASHED; | |
3078 | + atomic_inc(&svc->refcnt); | |
3079 | + return 1; | |
3080 | +} | |
3081 | + | |
3082 | + | |
3083 | +/* | |
3084 | + * Unhashes ip_vs_service from ip_vs_svc_table/ip_vs_svc_fwm_table and drops the reference taken by ip_vs_svc_hash(). | |
3085 | + * Should be called with locked tables. | |
3086 | + * Returns 1 on success, 0 if the service is not flagged as hashed. | |
3087 | + */ | |
3088 | +int ip_vs_svc_unhash(struct ip_vs_service *svc) | |
3089 | +{ | |
3090 | + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { | |
3091 | + IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " | |
3092 | + "called from %p\n", __builtin_return_address(0)); | |
3093 | + return 0; | |
3094 | + } | |
3095 | + | |
3096 | + if (svc->fwmark == 0) { | |
3097 | + /* | |
3098 | + * Remove it from the ip_vs_svc_table table. | |
3099 | + */ | |
3100 | + list_del(&svc->s_list); | |
3101 | + } else { | |
3102 | + /* | |
3103 | + * Remove it from the ip_vs_svc_fwm_table table. | |
3104 | + */ | |
3105 | + list_del(&svc->f_list); | |
3106 | + } | |
3107 | + | |
3108 | + svc->flags &= ~IP_VS_SVC_F_HASHED; | |
3109 | + atomic_dec(&svc->refcnt); /* plain decrement: the service is not freed here even if the count reaches zero */ | |
3110 | + return 1; | |
3111 | +} | |
3112 | + | |
3113 | + | |
3114 | +/* | |
3115 | + * Lookup service by {proto,addr,port} in the service table. Returns the service or NULL; no lock and no reference is taken here. | |
3116 | + */ | |
3117 | +static __inline__ struct ip_vs_service * | |
3118 | +__ip_vs_lookup_service(__u16 protocol, __u32 vaddr, __u16 vport) | |
3119 | +{ | |
3120 | + unsigned hash; | |
3121 | + struct ip_vs_service *svc; | |
3122 | + struct list_head *l,*e; | |
3123 | + | |
3124 | + /* | |
3125 | + * Check for "full" addressed entries | |
3126 | + * Note: as long as IP_VS_SVC_TAB_BITS is larger than zero, | |
3127 | + * <TCP,addr,port> and <UDP,addr,port> have different hash | |
3128 | + * keys, so there is no need to do protocol checking. | |
3129 | + */ | |
3130 | + hash = ip_vs_svc_hashkey(protocol, vaddr, vport); | |
3131 | + | |
3132 | + l = &ip_vs_svc_table[hash]; | |
3133 | + for (e=l->next; e!=l; e=e->next) { | |
3134 | + svc = list_entry(e, struct ip_vs_service, s_list); | |
3135 | + if ((svc->addr == vaddr) | |
3136 | + && (svc->port == vport)) { | |
3137 | + /* HIT */ | |
3138 | + return svc; | |
3139 | + } | |
3140 | + } | |
3141 | + | |
3142 | + return NULL; | |
3143 | +} | |
3144 | + | |
3145 | + | |
3146 | +/* | |
3147 | + * Lookup service by fwmark in the fwmark-hashed service table. Returns the service or NULL; no lock and no reference is taken here. | |
3148 | + */ | |
3149 | +static __inline__ struct ip_vs_service * __ip_vs_lookup_svc_fwm(__u32 fwmark) | |
3150 | +{ | |
3151 | + unsigned hash; | |
3152 | + struct ip_vs_service *svc; | |
3153 | + struct list_head *l,*e; | |
3154 | + | |
3155 | + /* | |
3156 | + * Check for fwmark-indexed entries | |
3157 | + */ | |
3158 | + hash = ip_vs_svc_fwm_hashkey(fwmark); | |
3159 | + | |
3160 | + l = &ip_vs_svc_fwm_table[hash]; | |
3161 | + for (e=l->next; e!=l; e=e->next) { | |
3162 | + svc = list_entry(e, struct ip_vs_service, f_list); | |
3163 | + if (svc->fwmark == fwmark) { | |
3164 | + /* HIT */ | |
3165 | + return svc; | |
3166 | + } | |
3167 | + } | |
3168 | + | |
3169 | + return NULL; | |
3170 | +} | |
3171 | +/* Find the virtual service for a packet: by fwmark first (if non-zero), then by <protocol,vaddr,vport>, then the FTP and port-zero fallbacks. Returns the service or NULL; the lookup runs under __ip_vs_lock (read) and takes no reference. */ | |
3172 | +struct ip_vs_service * | |
3173 | +ip_vs_lookup_service(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport) | |
3174 | +{ | |
3175 | + struct ip_vs_service *svc; | |
3176 | + | |
3177 | + read_lock(&__ip_vs_lock); | |
3178 | + | |
3179 | + if (fwmark) { | |
3180 | + /* | |
3181 | + * Check the table hashed by fwmark first | |
3182 | + */ | |
3183 | + svc = __ip_vs_lookup_svc_fwm(fwmark); | |
3184 | + if (svc) | |
3185 | + goto out; | |
3186 | + } | |
3187 | + | |
3188 | + /* | |
3189 | + * Check the table hashed by <protocol,addr,port> | |
3190 | + * first for "full" addressed entries | |
3191 | + */ | |
3192 | + svc = __ip_vs_lookup_service(protocol, vaddr, vport); | |
3193 | + | |
3194 | + if (svc == NULL | |
3195 | + && protocol == IPPROTO_TCP | |
3196 | + && atomic_read(&ip_vs_ftpsvc_counter) | |
3197 | + && (vport==FTPDATA || ntohs(vport)>=PROT_SOCK)){ | |
3198 | + /* | |
3199 | + * Check if an ftp service entry exists: the packet | |
3200 | + * might belong to an FTP data connection. | |
3201 | + */ | |
3202 | + svc = __ip_vs_lookup_service(protocol, vaddr, FTPPORT); | |
3203 | + } | |
3204 | + | |
3205 | + if (svc == NULL | |
3206 | + && atomic_read(&ip_vs_nullsvc_counter)) { | |
3207 | + /* | |
3208 | + * Check if the catch-all port (port zero) exists | |
3209 | + */ | |
3210 | + svc = __ip_vs_lookup_service(protocol, vaddr, 0); | |
3211 | + } | |
3212 | + | |
3213 | + out: | |
3214 | + read_unlock(&__ip_vs_lock); | |
3215 | + | |
3216 | + IP_VS_DBG(5, "lookup_service fwm %d %s %u.%u.%u.%u:%d %s\n", | |
3217 | + fwmark, | |
3218 | + masq_proto_name(protocol), | |
3219 | + NIPQUAD(vaddr), ntohs(vport), | |
3220 | + svc?"hit":"not hit"); | |
3221 | + | |
3222 | + return svc; | |
3223 | +} | |
3224 | + | |
3225 | + | |
3226 | +/* | |
3227 | + * Bind a destination with a service: take a reference on svc and record it in dest->svc | |
3228 | + */ | |
3229 | +static inline void | |
3230 | +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) | |
3231 | +{ | |
3232 | + atomic_inc(&svc->refcnt); | |
3233 | + dest->svc = svc; | |
3234 | +} | |
3235 | + | |
3236 | +/* | |
3237 | + * Unbind a destination from its service: clear dest->svc and drop the reference; the service is freed when that was the last one. dest->svc must not be NULL on entry. | |
3238 | + */ | |
3239 | +static inline void | |
3240 | +__ip_vs_unbind_svc(struct ip_vs_dest *dest) | |
3241 | +{ | |
3242 | + struct ip_vs_service *svc = dest->svc; | |
3243 | + | |
3244 | + dest->svc = NULL; | |
3245 | + if (atomic_dec_and_test(&svc->refcnt)) { | |
3246 | + IP_VS_DBG(2, "release svc %s %u.%u.%u.%u:%d\n", | |
3247 | + masq_proto_name(svc->protocol), | |
3248 | + NIPQUAD(svc->addr), ntohs(svc->port)); | |
3249 | + kfree_s(svc, sizeof(struct ip_vs_service)); | |
3250 | + } | |
3251 | +} | |
3252 | + | |
3253 | + | |
3254 | +/* | |
3255 | + * Returns the ip_vs_rtable bucket index for a real service. addr and port are given in network byte order. | |
3256 | + */ | |
3257 | +static __inline__ unsigned | |
3258 | +ip_vs_rs_hashkey(__u32 addr, __u16 port) | |
3259 | +{ | |
3260 | + register unsigned porth = ntohs(port); | |
3261 | + | |
3262 | + return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) & IP_VS_RTAB_MASK; | |
3263 | +} | |
3264 | + | |
3265 | +/* | |
3266 | + * Hashes ip_vs_dest in ip_vs_rtable by addr,port (the protocol is not part of the key). | |
3267 | + * Should be called with locked tables. | |
3268 | + * Returns 1 on success, 0 if dest->d_list is already linked. | |
3269 | + */ | |
3270 | +int ip_vs_rs_hash(struct ip_vs_dest *dest) | |
3271 | +{ | |
3272 | + unsigned hash; | |
3273 | + | |
3274 | + if (!list_empty(&dest->d_list)) { | |
3275 | + return 0; | |
3276 | + } | |
3277 | + | |
3278 | + /* | |
3279 | + * Hash by addr,port, | |
3280 | + * which are the parameters of the real service. | |
3281 | + */ | |
3282 | + hash = ip_vs_rs_hashkey(dest->addr, dest->port); | |
3283 | + list_add(&dest->d_list, &ip_vs_rtable[hash]); | |
3284 | + | |
3285 | + return 1; | |
3286 | +} | |
3287 | + | |
3288 | +/* | |
3289 | + * Unhashes ip_vs_dest from ip_vs_rtable. | |
3290 | + * Should be called with locked tables. | |
3291 | + * Always returns 1, also when dest was not hashed. | |
3292 | + */ | |
3293 | +int ip_vs_rs_unhash(struct ip_vs_dest *dest) | |
3294 | +{ | |
3295 | + /* | |
3296 | + * Remove it from the ip_vs_rtable table and re-initialise d_list so that list_empty() reports it as unhashed. | |
3297 | + */ | |
3298 | + if (!list_empty(&dest->d_list)) { | |
3299 | + list_del(&dest->d_list); | |
3300 | + INIT_LIST_HEAD(&dest->d_list); | |
3301 | + } | |
3302 | + | |
3303 | + return 1; | |
3304 | +} | |
3305 | + | |
3306 | +/* | |
3307 | + * Lookup real service by {proto,addr,port} in the real service table. The protocol test is skipped for destinations with a non-zero vfwmark. Returns the first match or NULL; no lock is taken here. | |
3308 | + */ | |
3309 | +struct ip_vs_dest * __ip_vs_lookup_real_service(__u16 protocol, | |
3310 | + __u32 daddr, __u16 dport) | |
3311 | +{ | |
3312 | + unsigned hash; | |
3313 | + struct ip_vs_dest *dest; | |
3314 | + struct list_head *l,*e; | |
3315 | + | |
3316 | + /* | |
3317 | + * Check for "full" addressed entries | |
3318 | + * Return the first found entry | |
3319 | + */ | |
3320 | + hash = ip_vs_rs_hashkey(daddr, dport); | |
3321 | + | |
3322 | + l = &ip_vs_rtable[hash]; | |
3323 | + for (e=l->next; e!=l; e=e->next) { | |
3324 | + dest = list_entry(e, struct ip_vs_dest, d_list); | |
3325 | + if ((dest->addr == daddr) | |
3326 | + && (dest->port == dport) | |
3327 | + && ((dest->protocol == protocol) || dest->vfwmark)) { | |
3328 | + /* HIT */ | |
3329 | + return dest; | |
3330 | + } | |
3331 | + } | |
3332 | + | |
3333 | + return NULL; | |
3334 | +} | |
3335 | + | |
3336 | +/* | |
3337 | + * Lookup destination by {addr,port} in the given service. Returns the destination or NULL; the list is walked under __ip_vs_lock (read, bh) and no reference is taken on the result. | |
3338 | + */ | |
3339 | +struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc, | |
3340 | + __u32 daddr, __u16 dport) | |
3341 | +{ | |
3342 | + struct ip_vs_dest *dest; | |
3343 | + struct list_head *l, *e; | |
3344 | + | |
3345 | + read_lock_bh(&__ip_vs_lock); | |
3346 | + | |
3347 | + /* | |
3348 | + * Find the destination for the given service | |
3349 | + */ | |
3350 | + l = &svc->destinations; | |
3351 | + for (e=l->next; e!=l; e=e->next) { | |
3352 | + dest = list_entry(e, struct ip_vs_dest, n_list); | |
3353 | + if ((dest->addr == daddr) && (dest->port == dport)) { | |
3354 | + /* HIT */ | |
3355 | + read_unlock_bh(&__ip_vs_lock); | |
3356 | + return dest; | |
3357 | + } | |
3358 | + } | |
3359 | + | |
3360 | + read_unlock_bh(&__ip_vs_lock); | |
3361 | + return NULL; | |
3362 | +} | |
3363 | + | |
3364 | + | |
3365 | +/* | |
3366 | + * Lookup dest by {svc,addr,port} in the destination trash. | |
3367 | + * Called by ip_vs_add_dest with the __ip_vs_lock. | |
3368 | + * The destination trash holds the destinations that have been removed | |
3369 | + * from the service table but are still referenced by some masq entries. | |
3370 | + * The reason for the trash is that when a dest is temporarily down | |
3371 | + * (taken out either by the administrator or by a monitor program), it can | |
3372 | + * be picked back from the trash, the remaining connections to it can | |
3373 | + * continue, and its counting information stays useful for scheduling. | |
3374 | + * Returns the matching dest (still linked in the trash) or NULL. Non-matching entries passed on the way are purged when their refcnt is 1. | |
3375 | + */ | |
3376 | +struct ip_vs_dest * __ip_vs_get_trash_dest(struct ip_vs_service *svc, | |
3377 | + __u32 daddr, __u16 dport) | |
3378 | +{ | |
3379 | + struct ip_vs_dest *dest; | |
3380 | + struct list_head *l, *e; | |
3381 | + | |
3382 | + /* | |
3383 | + * Find the destination in trash | |
3384 | + */ | |
3385 | + l = &ip_vs_dest_trash; | |
3386 | + for (e=l->next; e!=l; e=e->next) { | |
3387 | + dest = list_entry(e, struct ip_vs_dest, n_list); | |
3388 | + IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%d still in trash, " | |
3389 | + "refcnt=%d\n", | |
3390 | + dest->vfwmark, | |
3391 | + NIPQUAD(dest->addr), ntohs(dest->port), | |
3392 | + atomic_read(&dest->refcnt)); | |
3393 | + if (dest->addr == daddr && | |
3394 | + dest->port == dport && | |
3395 | + dest->vfwmark == svc->fwmark && | |
3396 | + (svc->fwmark || | |
3397 | + (dest->protocol == svc->protocol && | |
3398 | + dest->vaddr == svc->addr && | |
3399 | + dest->vport == svc->port))) { | |
3400 | + /* HIT */ | |
3401 | + return dest; | |
3402 | + } | |
3403 | + | |
3404 | + /* | |
3405 | + * Try to purge the destination from trash if not referenced (refcnt 1 is presumably the reference held for the trash itself - confirm) | |
3406 | + */ | |
3407 | + if (atomic_read(&dest->refcnt) == 1) { | |
3408 | + IP_VS_DBG(3, "Remove destination %u/%u.%u.%u.%u:%d " | |
3409 | + "from trash\n", | |
3410 | + dest->vfwmark, | |
3411 | + NIPQUAD(dest->addr), ntohs(dest->port)); | |
3412 | + e = e->prev; /* step back first so the loop increment still works after dest is unlinked and freed */ | |
3413 | + list_del(&dest->n_list); | |
3414 | + __ip_vs_unbind_svc(dest); | |
3415 | + kfree_s(dest, sizeof(*dest)); | |
3416 | + } | |
3417 | + } | |
3418 | + return NULL; | |
3419 | +} | |
3420 | + | |
3421 | + | |
3422 | +/* | |
3423 | + * Update a destination in the given service | |
3424 | + */ | |
3425 | +void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, | |
3426 | + struct ip_masq_ctl *mctl) | |
3427 | +{ | |
3428 | + struct ip_vs_user *mm = &mctl->u.vs_user; | |
3429 | + | |
3430 | + /* | |
3431 | + * Set the weight and the flags | |
3432 | + */ | |
3433 | + dest->weight = mm->weight; | |
3434 | + dest->masq_flags = mm->masq_flags; | |
3435 | + | |
3436 | + dest->masq_flags |= IP_MASQ_F_VS; | |
3437 | + dest->masq_flags |= IP_MASQ_F_VS_INACTIVE; | |
3438 | + | |
3439 | + /* | |
3440 | + * Check if local node and update the flags | |
3441 | + */ | |
3442 | + if (inet_addr_type(mm->daddr) == RTN_LOCAL) { | |
3443 | + dest->masq_flags = (dest->masq_flags & ~IP_MASQ_F_VS_FWD_MASK) | |
3444 | + | IP_MASQ_F_VS_LOCALNODE; | |
3445 | + } | |
3446 | + | |
3447 | + /* | |
3448 | + * Set the IP_MASQ_F_VS_NO_OUTPUT flag if not masquerading | |
3449 | + */ | |
3450 | + if ((dest->masq_flags & IP_MASQ_F_VS_FWD_MASK) != 0) { | |
3451 | + dest->masq_flags |= IP_MASQ_F_VS_NO_OUTPUT; | |
3452 | + } else { | |
3453 | + /* | |
3454 | + * Put the real service in ip_vs_rtable if not present. | |
3455 | + * For now only for NAT! | |
3456 | + */ | |
3457 | + ip_vs_rs_hash(dest); | |
3458 | + } | |
3459 | + | |
3460 | + | |
3461 | + /* bind the service */ | |
3462 | + if (!dest->svc) { | |
3463 | + __ip_vs_bind_svc(dest, svc); | |
3464 | + } else { | |
3465 | + if (dest->svc != svc) { | |
3466 | + __ip_vs_unbind_svc(dest); | |
3467 | + __ip_vs_bind_svc(dest, svc); | |
3468 | + } | |
3469 | + } | |
3470 | + | |
3471 | + /* | |
3472 | + * Set the dest status flags | |
3473 | + */ | |
3474 | + dest->flags |= IP_VS_DEST_F_AVAILABLE; | |
3475 | +} | |
3476 | + | |
3477 | + | |
3478 | +/* | |
3479 | + * Create a destination for the given service | |
3480 | + */ | |
3481 | +struct ip_vs_dest *ip_vs_new_dest(struct ip_vs_service *svc, | |
3482 | + struct ip_masq_ctl *mctl) | |
3483 | +{ | |
3484 | + struct ip_vs_dest *dest; | |
3485 | + struct ip_vs_user *mm = &mctl->u.vs_user; | |
3486 | + | |
3487 | + EnterFunction(2); | |
3488 | + | |
3489 | + dest = (struct ip_vs_dest*) kmalloc(sizeof(struct ip_vs_dest), | |
3490 | + GFP_ATOMIC); | |
3491 | + if (dest == NULL) { | |
3492 | + IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); | |
3493 | + return NULL; | |
3494 | + } | |
3495 | + memset(dest, 0, sizeof(struct ip_vs_dest)); | |
3496 | + | |
3497 | + dest->protocol = svc->protocol; | |
3498 | + dest->vaddr = svc->addr; | |
3499 | + dest->vport = svc->port; | |
3500 | + dest->vfwmark = svc->fwmark; | |
3501 | + dest->addr = mm->daddr; | |
3502 | + dest->port = mm->dport; | |
3503 | + | |
3504 | + atomic_set(&dest->activeconns, 0); | |
3505 | + atomic_set(&dest->inactconns, 0); | |
3506 | + atomic_set(&dest->refcnt, 0); | |
3507 | + | |
3508 | + INIT_LIST_HEAD(&dest->d_list); | |
3509 | + dest->stats.lock = SPIN_LOCK_UNLOCKED; | |
3510 | + __ip_vs_update_dest(svc, dest, mctl); | |
3511 | + | |
3512 | + LeaveFunction(2); | |
3513 | + | |
3514 | + return dest; | |
3515 | +} | |
3516 | + | |
3517 | + | |
3518 | +/* | |
3519 | + * Add a destination into an existing service | |
3520 | + */ | |
3521 | +int ip_vs_add_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl) | |
3522 | +{ | |
3523 | + struct ip_vs_dest *dest; | |
3524 | + struct ip_vs_user *mm = &mctl->u.vs_user; | |
3525 | + __u32 daddr = mm->daddr; | |
3526 | + __u16 dport = mm->dport; | |
3527 | + | |
3528 | + EnterFunction(2); | |
3529 | + | |
3530 | + if (mm->weight < 0) { | |
3531 | + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); | |
3532 | + return -ERANGE; | |
3533 | + } | |
3534 | + | |
3535 | + /* | |
3536 | + * Check if the dest already exists in the list | |
3537 | + */ | |
3538 | + dest = ip_vs_lookup_dest(svc, daddr, dport); | |
3539 | + if (dest != NULL) { | |
3540 | + IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); | |
3541 | + return -EEXIST; | |
3542 | + } | |
3543 | + | |
3544 | + write_lock_bh(&__ip_vs_lock); | |
3545 | + | |
3546 | + /* | |
3547 | + * Check if the dest already exists in the trash and | |
3548 | + * is from the same service | |
3549 | + */ | |
3550 | + dest = __ip_vs_get_trash_dest(svc, daddr, dport); | |
3551 | + if (dest != NULL) { | |
3552 | + IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%d from trash, " | |
3553 | + "refcnt=%d, service %u.%u.%u.%u:%d\n", | |
3554 | + NIPQUAD(daddr), ntohs(dport), | |
3555 | + atomic_read(&dest->refcnt), | |
3556 | + NIPQUAD(dest->vaddr), | |
3557 | + ntohs(dest->vport)); | |
3558 | + | |
3559 | + /* | |
3560 | + * Get the destination from the trash | |
3561 | + */ | |
3562 | + list_del(&dest->n_list); | |
3563 | + list_add(&dest->n_list, &svc->destinations); | |
3564 | + | |
3565 | + __ip_vs_update_dest(svc, dest, mctl); | |
3566 | + | |
3567 | + write_unlock_bh(&__ip_vs_lock); | |
3568 | + return 0; | |
3569 | + } | |
3570 | + | |
3571 | + /* | |
3572 | + * Allocate and initialize the dest structure | |
3573 | + */ | |
3574 | + dest = ip_vs_new_dest(svc, mctl); | |
3575 | + if (dest == NULL) { | |
3576 | + write_unlock_bh(&__ip_vs_lock); | |
3577 | + IP_VS_ERR("ip_vs_add_dest(): out of memory\n"); | |
3578 | + return -ENOMEM; | |
3579 | + } | |
3580 | + | |
3581 | + /* | |
3582 | + * Add the dest entry into the list | |
3583 | + */ | |
3584 | + list_add(&dest->n_list, &svc->destinations); | |
3585 | + atomic_inc(&dest->refcnt); | |
3586 | + | |
3587 | + write_unlock_bh(&__ip_vs_lock); | |
3588 | + | |
3589 | + LeaveFunction(2); | |
3590 | + return 0; | |
3591 | +} | |
3592 | + | |
3593 | + | |
3594 | +/* | |
3595 | + * Edit a destination in the given service | |
3596 | + */ | |
3597 | +int ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl) | |
3598 | +{ | |
3599 | + struct ip_vs_dest *dest; | |
3600 | + struct ip_vs_user *mm = &mctl->u.vs_user; | |
3601 | + __u32 daddr = mm->daddr; | |
3602 | + __u16 dport = mm->dport; | |
3603 | + | |
3604 | + EnterFunction(2); | |
3605 | + | |
3606 | + if (mm->weight < 0) { | |
3607 | + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); | |
3608 | + return -ERANGE; | |
3609 | + } | |
3610 | + | |
3611 | + /* | |
3612 | + * Lookup the destination list | |
3613 | + */ | |
3614 | + dest = ip_vs_lookup_dest(svc, daddr, dport); | |
3615 | + if (dest == NULL) { | |
3616 | + IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); | |
3617 | + return -ENOENT; | |
3618 | + } | |
3619 | + | |
3620 | + write_lock_bh(&__ip_vs_lock); | |
3621 | + | |
3622 | + __ip_vs_update_dest(svc, dest, mctl); | |
3623 | + | |
3624 | + write_unlock_bh(&__ip_vs_lock); | |
3625 | + | |
3626 | + LeaveFunction(2); | |
3627 | + return 0; | |
3628 | +} | |
3629 | + | |
3630 | + | |
3631 | +/* | |
3632 | + * Delete a destination from the given service | |
3633 | + */ | |
3634 | +void __ip_vs_del_dest(struct ip_vs_dest *dest) | |
3635 | +{ | |
3636 | + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; | |
3637 | + | |
3638 | + /* | |
3639 | + * Remove it from the d-linked destination list. | |
3640 | + */ | |
3641 | + list_del(&dest->n_list); | |
3642 | + | |
3643 | + /* | |
3644 | + * Remove it from the d-linked list with the real services. | |
3645 | + */ | |
3646 | + ip_vs_rs_unhash(dest); | |
3647 | + | |
3648 | + /* | |
3649 | + * Decrease the refcnt of the dest, and free the dest | |
3650 | + * if nobody refers to it (refcnt=0). Otherwise, throw | |
3651 | + * the destination into the trash. | |
3652 | + */ | |
3653 | + if (atomic_dec_and_test(&dest->refcnt)) { | |
3654 | + /* simply decrease svc->refcnt here, let the caller check | |
3655 | + and release the service if nobody refers to it. | |
3656 | + Only user context can release destination and service, | |
3657 | + and only user context can update virtual service at a | |
3658 | + time, so the operation here is OK */ | |
3659 | + atomic_dec(&dest->svc->refcnt); | |
3660 | + kfree_s(dest, sizeof(*dest)); | |
3661 | + } else { | |
3662 | + IP_VS_DBG(3, "Move dest %u.%u.%u.%u:%d into trash, " | |
3663 | + "refcnt=%d\n", | |
3664 | + NIPQUAD(dest->addr), ntohs(dest->port), | |
3665 | + atomic_read(&dest->refcnt)); | |
3666 | + list_add(&dest->n_list, &ip_vs_dest_trash); | |
3667 | + atomic_inc(&dest->refcnt); | |
3668 | + } | |
3669 | +} | |
3670 | + | |
3671 | + int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_masq_ctl *mctl) /* locked wrapper around __ip_vs_del_dest: finds the dest named by the user request, removes it, then notifies the scheduler; returns 0, or -ENOENT if the dest is not in svc */ | |
3672 | + { | |
3673 | + struct ip_vs_dest *dest; | |
3674 | + struct ip_vs_user *mm = &mctl->u.vs_user; | |
3675 | + __u32 daddr = mm->daddr; | |
3676 | + __u16 dport = mm->dport; | |
3677 | + | |
3678 | + EnterFunction(2); | |
3679 | + | |
3680 | + /* | |
3681 | + * Lookup the destination list (done before the write lock below is taken) | |
3682 | + */ | |
3683 | + dest = ip_vs_lookup_dest(svc, daddr, dport); | |
3684 | + if (dest == NULL) { | |
3685 | + IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); | |
3686 | + return -ENOENT; | |
3687 | + } | |
3688 | + | |
3689 | + write_lock_bh(&__ip_vs_lock); | |
3690 | + | |
3691 | + /* | |
3692 | + * Remove dest from the destination list; dest may have been freed once this returns, so it is not touched again | |
3693 | + */ | |
3694 | + __ip_vs_del_dest(dest); | |
3695 | + | |
3696 | + /* | |
3697 | + * Call the update_service function of its scheduler | |
3698 | + */ | |
3699 | + svc->scheduler->update_service(svc); | |
3700 | + | |
3701 | + write_unlock_bh(&__ip_vs_lock); | |
3702 | + | |
3703 | + LeaveFunction(2); | |
3704 | + | |
3705 | + return 0; | |
3706 | + } | |
3707 | + | |
3708 | + | |
3709 | +/* | |
3710 | + * Add a service into the service hash table | |
3711 | + */ | |
3712 | +int ip_vs_add_service(struct ip_masq_ctl *mctl) | |
3713 | +{ | |
3714 | + struct ip_vs_user *mm = &mctl->u.vs_user; | |
3715 | + __u16 protocol = mm->protocol; | |
3716 | + __u32 vaddr = mm->vaddr; | |
3717 | + __u16 vport = mm->vport; | |
3718 | + __u32 vfwmark = mm->vfwmark; | |
3719 | + | |
3720 | + int ret = 0; | |
3721 | + struct ip_vs_scheduler *sched; | |
3722 | + struct ip_vs_service *svc; | |
3723 | + | |
3724 | + EnterFunction(2); | |
3725 | + | |
3726 | + /* | |
3727 | + * Lookup the scheduler, by 'mctl->m_tname' | |
3728 | + */ | |
3729 | + sched = ip_vs_lookup_scheduler(mctl->m_tname); | |
3730 | + if (sched == NULL) { | |
3731 | + IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n", | |
3732 | + mctl->m_tname); | |
3733 | + return -ENOENT; | |
3734 | + } | |
3735 | + | |
3736 | + write_lock_bh(&__ip_vs_lock); | |
3737 | + | |
3738 | + /* | |
3739 | + * Check if the service already exists | |
3740 | + */ | |
3741 | + if (vfwmark == 0) | |
3742 | + svc = __ip_vs_lookup_service(protocol, vaddr, vport); | |
3743 | + else | |
3744 | + svc = __ip_vs_lookup_svc_fwm(vfwmark); | |
3745 | + | |
3746 | + if (svc != NULL) { | |
3747 | + IP_VS_DBG(1, "ip_vs_add_service: service already exists.\n"); | |
3748 | + ret = -EEXIST; | |
3749 | + goto out; | |
3750 | + } | |
3751 | + | |
3752 | + svc = (struct ip_vs_service*) | |
3753 | + kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); | |
3754 | + if (svc == NULL) { | |
3755 | + IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); | |
3756 | + ret = -ENOMEM; | |
3757 | + goto out; | |
3758 | + } | |
3759 | + memset(svc, 0, sizeof(struct ip_vs_service)); | |
3760 | + | |
3761 | + svc->protocol = protocol; | |
3762 | + svc->addr = vaddr; | |
3763 | + svc->port = vport; | |
3764 | + svc->fwmark = vfwmark; | |
3765 | + svc->flags = mm->vs_flags; | |
3766 | + svc->timeout = mm->timeout; | |
3767 | + svc->netmask = mm->netmask; | |
3768 | + | |
3769 | + INIT_LIST_HEAD(&svc->destinations); | |
3770 | + atomic_set(&svc->refcnt, 0); | |
3771 | + svc->stats.lock = SPIN_LOCK_UNLOCKED; | |
3772 | + | |
3773 | + /* | |
3774 | + * Bind the scheduler | |
3775 | + */ | |
3776 | + ip_vs_bind_scheduler(svc, sched); | |
3777 | + | |
3778 | + /* | |
3779 | + * Hash the service into the service table | |
3780 | + */ | |
3781 | + ip_vs_svc_hash(svc); | |
3782 | + | |
3783 | + /* | |
3784 | + * Update the virtual service counters | |
3785 | + */ | |
3786 | + if (vport == FTPPORT) | |
3787 | + atomic_inc(&ip_vs_ftpsvc_counter); | |
3788 | + else if (vport == 0) | |
3789 | + atomic_inc(&ip_vs_nullsvc_counter); | |
3790 | + | |
3791 | + out: | |
3792 | + write_unlock_bh(&__ip_vs_lock); | |
3793 | + LeaveFunction(2); | |
3794 | + return ret; | |
3795 | +} | |
3796 | + | |
3797 | + | |
3798 | +/* | |
3799 | + * Edit a service and bind it with a new scheduler | |
3800 | + */ | |
3801 | +int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_masq_ctl *mctl) | |
3802 | +{ | |
3803 | + struct ip_vs_user *mm = &mctl->u.vs_user; | |
3804 | + struct ip_vs_scheduler *sched; | |
3805 | + | |
3806 | + EnterFunction(2); | |
3807 | + | |
3808 | + /* | |
3809 | + * Lookup the scheduler, by 'mctl->m_tname' | |
3810 | + */ | |
3811 | + sched = ip_vs_lookup_scheduler(mctl->m_tname); | |
3812 | + if (sched == NULL) { | |
3813 | + IP_VS_INFO("Scheduler module ip_vs_%s.o not found\n", | |
3814 | + mctl->m_tname); | |
3815 | + return -ENOENT; | |
3816 | + } | |
3817 | + | |
3818 | + write_lock_bh(&__ip_vs_lock); | |
3819 | + | |
3820 | + /* | |
3821 | + * Set the flags and timeout value | |
3822 | + */ | |
3823 | + svc->flags = mm->vs_flags | IP_VS_SVC_F_HASHED; | |
3824 | + svc->timeout = mm->timeout; | |
3825 | + svc->netmask = mm->netmask; | |
3826 | + | |
3827 | + /* | |
3828 | + * Unbind the old scheduler | |
3829 | + */ | |
3830 | + ip_vs_unbind_scheduler(svc); | |
3831 | + | |
3832 | + /* | |
3833 | + * Bind the new scheduler | |
3834 | + */ | |
3835 | + ip_vs_bind_scheduler(svc, sched); | |
3836 | + | |
3837 | + write_unlock_bh(&__ip_vs_lock); | |
3838 | + | |
3839 | + LeaveFunction(2); | |
3840 | + return 0; | |
3841 | +} | |
3842 | + | |
3843 | + | |
3844 | +/* | |
3845 | + * Delete a service from the service list | |
3846 | + */ | |
3847 | +int __ip_vs_del_service(struct ip_vs_service *svc) | |
3848 | +{ | |
3849 | + struct list_head *l; | |
3850 | + struct ip_vs_dest *dest; | |
3851 | + | |
3852 | + /* | |
3853 | + * Unbind scheduler | |
3854 | + */ | |
3855 | + ip_vs_unbind_scheduler(svc); | |
3856 | + | |
3857 | + /* | |
3858 | + * Unlink the whole destination list | |
3859 | + */ | |
3860 | + l = &svc->destinations; | |
3861 | + while (l->next != l) { | |
3862 | + dest = list_entry(l->next, struct ip_vs_dest, n_list); | |
3863 | + __ip_vs_del_dest(dest); | |
3864 | + } | |
3865 | + | |
3866 | + /* | |
3867 | + * Unhash it from the service table | |
3868 | + */ | |
3869 | + if (ip_vs_svc_unhash(svc)) { | |
3870 | + /* | |
3871 | + * Update the virtual service counters | |
3872 | + */ | |
3873 | + if (svc->port == FTPPORT) | |
3874 | + atomic_dec(&ip_vs_ftpsvc_counter); | |
3875 | + else if (svc->port == 0) | |
3876 | + atomic_dec(&ip_vs_nullsvc_counter); | |
3877 | + | |
3878 | + /* | |
3879 | + * Free the service if nobody refers to it | |
3880 | + */ | |
3881 | + if (atomic_read(&svc->refcnt) == 0) { | |
3882 | + IP_VS_DBG(2, "release svc %s %u.%u.%u.%u:%d\n", | |
3883 | + masq_proto_name(svc->protocol), | |
3884 | + NIPQUAD(svc->addr), ntohs(svc->port)); | |
3885 | + kfree_s(svc, sizeof(struct ip_vs_service)); | |
3886 | + } | |
3887 | + } else { | |
3888 | + /* | |
3889 | + * Called the update_service function of its scheduler | |
3890 | + */ | |
3891 | + svc->scheduler->update_service(svc); | |
3892 | + return -EPERM; | |
3893 | + } | |
3894 | + | |
3895 | + return 0; | |
3896 | +} | |
3897 | + | |
3898 | + int ip_vs_del_service(struct ip_vs_service *svc) /* locked wrapper around __ip_vs_del_service; returns -EEXIST for a NULL svc, otherwise always 0 */ | |
3899 | + { | |
3900 | + EnterFunction(2); | |
3901 | + | |
3902 | + if (svc == NULL) | |
3903 | + return -EEXIST; | |
3904 | + | |
3905 | + write_lock_bh(&__ip_vs_lock); | |
3906 | + | |
3907 | + __ip_vs_del_service(svc); /* NOTE(review): the return value (-EPERM on unhash failure) is discarded here */ | |
3908 | + | |
3909 | + write_unlock_bh(&__ip_vs_lock); | |
3910 | + LeaveFunction(2); | |
3911 | + return 0; | |
3912 | + } | |
3913 | + | |
3914 | + | |
3915 | +/* | |
3916 | + * Flush all the virtual services | |
3917 | + */ | |
3918 | +int ip_vs_flush(void) | |
3919 | +{ | |
3920 | + int idx; | |
3921 | + struct ip_vs_service *svc; | |
3922 | + struct list_head *l; | |
3923 | + | |
3924 | + write_lock_bh(&__ip_vs_lock); | |
3925 | + | |
3926 | + /* | |
3927 | + * Flush the service table hashed by <protocol,addr,port> | |
3928 | + */ | |
3929 | + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | |
3930 | + l = &ip_vs_svc_table[idx]; | |
3931 | + while (l->next != l) { | |
3932 | + svc = list_entry(l->next,struct ip_vs_service,s_list); | |
3933 | + | |
3934 | + if (__ip_vs_del_service(svc)) | |
3935 | + goto out; | |
3936 | + } | |
3937 | + } | |
3938 | + | |
3939 | + /* | |
3940 | + * Flush the service table hashed by fwmark | |
3941 | + */ | |
3942 | + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | |
3943 | + l = &ip_vs_svc_fwm_table[idx]; | |
3944 | + while (l->next != l) { | |
3945 | + svc = list_entry(l->next,struct ip_vs_service,f_list); | |
3946 | + | |
3947 | + if (__ip_vs_del_service(svc)) | |
3948 | + goto out; | |
3949 | + } | |
3950 | + } | |
3951 | + | |
3952 | + out: | |
3953 | + write_unlock_bh(&__ip_vs_lock); | |
3954 | + return 0; | |
3955 | +} | |
3956 | + | |
3957 | + | |
3958 | +/* | |
3959 | + * Change the connection counter and the flags if the masq state changes | |
3960 | + * Called by the masq_tcp_state function. | |
3961 | + */ | |
3962 | +void ip_vs_set_state(struct ip_masq *ms, int new_state) | |
3963 | +{ | |
3964 | + struct ip_vs_dest *dest = ms->dest; | |
3965 | + | |
3966 | + if (dest && | |
3967 | + (ms->flags & IP_MASQ_F_VS) && (new_state != ms->state)) { | |
3968 | + if (!(ms->flags & IP_MASQ_F_VS_INACTIVE) && | |
3969 | + (new_state != IP_MASQ_S_ESTABLISHED)) { | |
3970 | + atomic_dec(&dest->activeconns); | |
3971 | + atomic_inc(&dest->inactconns); | |
3972 | + ms->flags |= IP_MASQ_F_VS_INACTIVE; | |
3973 | + } else if ((ms->flags & IP_MASQ_F_VS_INACTIVE) && | |
3974 | + (new_state == IP_MASQ_S_ESTABLISHED)) { | |
3975 | + atomic_inc(&dest->activeconns); | |
3976 | + atomic_dec(&dest->inactconns); | |
3977 | + ms->flags &= ~IP_MASQ_F_VS_INACTIVE; | |
3978 | + } | |
3979 | + | |
3980 | + IP_VS_DBG(8, "Set-state masq fwd:%c s:%s c:%u.%u.%u.%u:%d " | |
3981 | + "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d flg:%X cnt:%d\n", | |
3982 | + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state), | |
3983 | + NIPQUAD(ms->daddr), ntohs(ms->dport), | |
3984 | + NIPQUAD(ms->maddr), ntohs(ms->mport), | |
3985 | + NIPQUAD(ms->saddr), ntohs(ms->sport), | |
3986 | + ms->flags, atomic_read(&ms->refcnt)); | |
3987 | + } | |
3988 | +} | |
3989 | + | |
3990 | + | |
3991 | +/* | |
3992 | + * Bind a masq entry with a virtual service destination | |
3993 | + * Called when a new masq entry is created for VS. | |
3994 | + */ | |
3995 | +void ip_vs_bind_masq(struct ip_masq *ms, struct ip_vs_dest *dest) | |
3996 | +{ | |
3997 | + ms->flags |= dest->masq_flags; | |
3998 | + ms->dest = dest; | |
3999 | + | |
4000 | + /* | |
4001 | + * Increase the refcnt counter of the dest. | |
4002 | + */ | |
4003 | + atomic_inc(&dest->refcnt); | |
4004 | + | |
4005 | + IP_VS_DBG(9, "Bind-masq fwd:%c s:%s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " | |
4006 | + "d:%u.%u.%u.%u:%d flg:%X cnt:%d destcnt:%d\n", | |
4007 | + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state), | |
4008 | + NIPQUAD(ms->daddr), ntohs(ms->dport), | |
4009 | + NIPQUAD(ms->maddr), ntohs(ms->mport), | |
4010 | + NIPQUAD(ms->saddr), ntohs(ms->sport), | |
4011 | + ms->flags, atomic_read(&ms->refcnt), | |
4012 | + atomic_read(&dest->refcnt)); | |
4013 | +} | |
4014 | + | |
4015 | + | |
4016 | +/* | |
4017 | + * Unbind a masq entry with its VS destination | |
4018 | + * Called by the masq_expire function. | |
4019 | + */ | |
4020 | +void ip_vs_unbind_masq(struct ip_masq *ms) | |
4021 | +{ | |
4022 | + struct ip_vs_dest *dest = ms->dest; | |
4023 | + | |
4024 | + IP_VS_DBG(9, "Unbind-masq fwd:%c s:%s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " | |
4025 | + "d:%u.%u.%u.%u:%d flg:%X cnt:%d destcnt:%d\n", | |
4026 | + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state), | |
4027 | + NIPQUAD(ms->daddr),ntohs(ms->dport), | |
4028 | + NIPQUAD(ms->maddr),ntohs(ms->mport), | |
4029 | + NIPQUAD(ms->saddr),ntohs(ms->sport), | |
4030 | + ms->flags, atomic_read(&ms->refcnt), | |
4031 | + atomic_read(&dest->refcnt)); | |
4032 | + | |
4033 | + if (dest) { | |
4034 | + /* | |
4035 | + * Decrease the inactconns or activeconns counter | |
4036 | + * if it is not a masq template (ms->dport!=0). | |
4037 | + */ | |
4038 | + if (ms->dport) { | |
4039 | + if (ms->flags & IP_MASQ_F_VS_INACTIVE) { | |
4040 | + atomic_dec(&dest->inactconns); | |
4041 | + } else { | |
4042 | + atomic_dec(&dest->activeconns); | |
4043 | + } | |
4044 | + } | |
4045 | + | |
4046 | + /* | |
4047 | + * Simply decrease the refcnt of the dest, because the | |
4048 | + * dest will be either in service's destination list | |
4049 | + * or in the trash. | |
4050 | + */ | |
4051 | + atomic_dec(&dest->refcnt); | |
4052 | + } | |
4053 | +} | |
4054 | + | |
4055 | + | |
4056 | +/* | |
4057 | + * Checking if the destination of a masq template is available. | |
4058 | + * If available, return 1, otherwise return 0 and invalidate this | |
4059 | + * masq template. | |
4060 | + */ | |
4061 | +int ip_vs_check_template(struct ip_masq *mst) | |
4062 | +{ | |
4063 | + struct ip_vs_dest *dest = mst->dest; | |
4064 | + | |
4065 | + /* | |
4066 | + * Checking the dest server status. | |
4067 | + */ | |
4068 | + if ((dest == NULL) || | |
4069 | + !(dest->flags & IP_VS_DEST_F_AVAILABLE)) { | |
4070 | + IP_VS_DBG(9, "check_template: dest not available for prot %s " | |
4071 | + "src %u.%u.%u.%u:%d dest %u.%u.%u.%u:%d -> %X:%X\n", | |
4072 | + masq_proto_name(mst->protocol), | |
4073 | + NIPQUAD(mst->daddr), ntohs(mst->dport), | |
4074 | + NIPQUAD(mst->maddr), ntohs(mst->mport), | |
4075 | + (dest!=NULL)? ntohl(dest->addr):0, | |
4076 | + (dest!=NULL)? ntohs(dest->port):0); | |
4077 | + | |
4078 | + /* | |
4079 | + * Invalidate the masq template | |
4080 | + */ | |
4081 | + ip_vs_unhash(mst); | |
4082 | + mst->sport = 65535; | |
4083 | + mst->mport = 65535; | |
4084 | + mst->dport = 0; | |
4085 | + ip_vs_hash(mst); | |
4086 | + | |
4087 | + /* | |
4088 | + * Simply decrease the refcnt of the template, | |
4089 | + * don't restart its timer. | |
4090 | + */ | |
4091 | + atomic_dec(&mst->refcnt); | |
4092 | + return 0; | |
4093 | + } | |
4094 | + return 1; | |
4095 | +} | |
4096 | + | |
4097 | + | |
4098 | +/* | |
4099 | + * IPVS persistent scheduling function | |
4100 | + * It creates a masq entry according to its template if exists, or selects | |
4101 | + * a server and creates a masq entry plus a template. | |
4102 | + */ | |
4103 | +struct ip_masq * | |
4104 | +ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph) | |
4105 | +{ | |
4106 | + struct ip_masq *ms = NULL; | |
4107 | + struct ip_vs_dest *dest; | |
4108 | + const __u16 *portp; | |
4109 | + struct ip_masq *mst; | |
4110 | + __u16 dport; /* destination port to forward */ | |
4111 | + __u32 snet; /* source network of the client, after masking */ | |
4112 | + | |
4113 | + portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); | |
4114 | + | |
4115 | + /* Mask saddr with the netmask to adjust template granularity */ | |
4116 | + snet = iph->saddr & svc->netmask; | |
4117 | + | |
4118 | + IP_VS_DBG(6, "P-schedule: src %u.%u.%u.%u:%d dest %u.%u.%u.%u:%d " | |
4119 | + "snet %u.%u.%u.%u/%u.%u.%u.%u\n", | |
4120 | + NIPQUAD(iph->saddr), ntohs(portp[0]), | |
4121 | + NIPQUAD(iph->daddr), ntohs(portp[1]), | |
4122 | + NIPQUAD(snet), NIPQUAD(svc->netmask)); | |
4123 | + | |
4124 | + /* | |
4125 | + * As far as we know, FTP is a very complicated network protocol, and | |
4126 | + * it uses control connection and data connections. For active FTP, | |
4127 | + * FTP server initilize data connection to the client, its source port | |
4128 | + * is often 20. For passive FTP, FTP server tells the clients the port | |
4129 | + * that it passively listens to, and the client issues the data | |
4130 | + * connection. In the tunneling or direct routing mode, the load | |
4131 | + * balancer is on the client-to-server half of connection, the port | |
4132 | + * number is unknown to the load balancer. So, a template masq like | |
4133 | + * <daddr, 0, maddr, 0, saddr, 0> is created for persistent FTP | |
4134 | + * service, and a template like <daddr, 0, maddr, mport, saddr, sport> | |
4135 | + * is created for other persistent services. | |
4136 | + */ | |
4137 | + if (portp[1] == svc->port) { | |
4138 | + /* Check if a template already exists */ | |
4139 | + if (svc->port != FTPPORT) | |
4140 | + mst = ip_vs_in_get(iph->protocol, snet, 0, | |
4141 | + iph->daddr, portp[1]); | |
4142 | + else | |
4143 | + mst = ip_vs_in_get(iph->protocol, snet, 0, | |
4144 | + iph->daddr, 0); | |
4145 | + | |
4146 | + if (!mst || !ip_vs_check_template(mst)) { | |
4147 | + /* | |
4148 | + * No template found or the dest of the masq | |
4149 | + * template is not available. | |
4150 | + */ | |
4151 | + read_lock(&__ip_vs_lock); | |
4152 | + | |
4153 | + dest = svc->scheduler->schedule(svc, iph); | |
4154 | + if (dest == NULL) { | |
4155 | + IP_VS_DBG(1, "P-schedule: no dest found.\n"); | |
4156 | + read_unlock(&__ip_vs_lock); | |
4157 | + return NULL; | |
4158 | + } | |
4159 | + | |
4160 | + /* | |
4161 | + * Create a template like <protocol,daddr,0, | |
4162 | + * maddr,mport,saddr,sport> for non-ftp service, | |
4163 | + * and <protocol,daddr,0,maddr,0,saddr,0> | |
4164 | + * for ftp service. | |
4165 | + */ | |
4166 | + if (svc->port != FTPPORT) | |
4167 | + mst = ip_masq_new_vs(iph->protocol, | |
4168 | + iph->daddr, portp[1], | |
4169 | + dest->addr, dest->port, | |
4170 | + snet, 0, | |
4171 | + 0); | |
4172 | + else | |
4173 | + mst = ip_masq_new_vs(iph->protocol, | |
4174 | + iph->daddr, 0, | |
4175 | + dest->addr, 0, | |
4176 | + snet, 0, | |
4177 | + 0); | |
4178 | + if (mst == NULL) { | |
4179 | + IP_VS_ERR("ip_masq_new_vs template failed\n"); | |
4180 | + read_unlock(&__ip_vs_lock); | |
4181 | + return NULL; | |
4182 | + } | |
4183 | + | |
4184 | + /* | |
4185 | + * Bind the template with dest and set timeout. | |
4186 | + */ | |
4187 | + ip_vs_bind_masq(mst, dest); | |
4188 | + mst->timeout = svc->timeout; | |
4189 | + | |
4190 | + read_unlock(&__ip_vs_lock); | |
4191 | + } else { | |
4192 | + /* | |
4193 | + * Template found and its destination is available. | |
4194 | + */ | |
4195 | + dest = mst->dest; | |
4196 | + | |
4197 | + /* | |
4198 | + * Delete its timer so that it can be put back. | |
4199 | + */ | |
4200 | + del_sltimer(&mst->timer); | |
4201 | + } | |
4202 | + dport = dest->port; | |
4203 | + } else { | |
4204 | + /* | |
4205 | + * Note: persistent fwmark-based services and persistent | |
4206 | + * port zero service are handled here. | |
4207 | + * fwmark template: <IPPROTO_IP,daddr,0,fwmark,0,saddr,0> | |
4208 | + * port zero template: <protocol,daddr,0,maddr,0,saddr,0> | |
4209 | + */ | |
4210 | + if (svc->fwmark) | |
4211 | + mst = ip_vs_in_get(IPPROTO_IP, snet, 0, | |
4212 | + htonl(svc->fwmark), 0); | |
4213 | + else | |
4214 | + mst = ip_vs_in_get(iph->protocol, | |
4215 | + snet, 0, iph->daddr, 0); | |
4216 | + | |
4217 | + if (!mst || !ip_vs_check_template(mst)) { | |
4218 | + /* | |
4219 | + * If it is not persistent port zero, return NULL. | |
4220 | + */ | |
4221 | + if (svc->port) | |
4222 | + return NULL; | |
4223 | + | |
4224 | + read_lock(&__ip_vs_lock); | |
4225 | + | |
4226 | + dest = svc->scheduler->schedule(svc, iph); | |
4227 | + if (dest == NULL) { | |
4228 | + IP_VS_DBG(1, "P-schedule: no dest found.\n"); | |
4229 | + read_unlock(&__ip_vs_lock); | |
4230 | + return NULL; | |
4231 | + } | |
4232 | + | |
4233 | + /* | |
4234 | + * Create a template according to the service | |
4235 | + */ | |
4236 | + if (svc->fwmark) | |
4237 | + mst = ip_masq_new_vs(IPPROTO_IP, | |
4238 | + htonl(svc->fwmark), 0, | |
4239 | + dest->addr, 0, | |
4240 | + snet, 0, | |
4241 | + 0); | |
4242 | + else | |
4243 | + mst = ip_masq_new_vs(iph->protocol, | |
4244 | + iph->daddr, 0, | |
4245 | + dest->addr, 0, | |
4246 | + snet, 0, | |
4247 | + 0); | |
4248 | + if (mst == NULL) { | |
4249 | + IP_VS_ERR("ip_masq_new_vs template failed\n"); | |
4250 | + read_unlock(&__ip_vs_lock); | |
4251 | + return NULL; | |
4252 | + } | |
4253 | + | |
4254 | + /* | |
4255 | + * Bind the template with dest and set timeout. | |
4256 | + */ | |
4257 | + ip_vs_bind_masq(mst, dest); | |
4258 | + mst->timeout = svc->timeout; | |
4259 | + read_unlock(&__ip_vs_lock); | |
4260 | + } else { | |
4261 | + dest = mst->dest; | |
4262 | + | |
4263 | + /* | |
4264 | + * Delete its timer so that it can be put back. | |
4265 | + */ | |
4266 | + del_sltimer(&mst->timer); | |
4267 | + } | |
4268 | + dport = portp[1]; | |
4269 | + } | |
4270 | + | |
4271 | + /* | |
4272 | + * Create a new masq according to the template | |
4273 | + */ | |
4274 | + ms = ip_masq_new_vs(iph->protocol, | |
4275 | + iph->daddr, portp[1], | |
4276 | + dest->addr, dport, | |
4277 | + iph->saddr, portp[0], | |
4278 | + 0); | |
4279 | + if (ms == NULL) { | |
4280 | + IP_VS_ERR("ip_masq_new_vs failed\n"); | |
4281 | + ip_masq_put(mst); | |
4282 | + return NULL; | |
4283 | + } | |
4284 | + | |
4285 | + /* | |
4286 | + * Bind the masq entry with the vs dest. | |
4287 | + */ | |
4288 | + ip_vs_bind_masq(ms, dest); | |
4289 | + | |
4290 | + /* | |
4291 | + * Increase the inactive connection counter | |
4292 | + * because it is in Syn-Received | |
4293 | + * state (inactive) when the masq is created. | |
4294 | + */ | |
4295 | + atomic_inc(&dest->inactconns); | |
4296 | + | |
4297 | + /* | |
4298 | + * Add its control | |
4299 | + */ | |
4300 | + ip_masq_control_add(ms, mst); | |
4301 | + | |
4302 | + ip_masq_put(mst); | |
4303 | + return ms; | |
4304 | +} | |
4305 | + | |
4306 | + | |
4307 | +/* | |
4308 | + * IPVS main scheduling function | |
4309 | + * It selects a server according to the virtual service, and | |
4310 | + * creates a masq entry. | |
4311 | + */ | |
4312 | +struct ip_masq *ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph) | |
4313 | +{ | |
4314 | + struct ip_masq *ms = NULL; | |
4315 | + struct ip_vs_dest *dest; | |
4316 | + const __u16 *portp; | |
4317 | + | |
4318 | + /* | |
4319 | + * Persistent service | |
4320 | + */ | |
4321 | + if (svc->flags & IP_VS_SVC_F_PERSISTENT) | |
4322 | + return ip_vs_sched_persist(svc, iph); | |
4323 | + | |
4324 | + /* | |
4325 | + * Non-persistent service | |
4326 | + */ | |
4327 | + portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); | |
4328 | + if (!svc->fwmark && portp[1] != svc->port) { | |
4329 | + if (!svc->port) | |
4330 | + IP_VS_ERR("Schedule: port zero only supported in persistent services, check your ipvs configuration\n"); | |
4331 | + return NULL; | |
4332 | + } | |
4333 | + | |
4334 | + read_lock(&__ip_vs_lock); | |
4335 | + | |
4336 | + dest = svc->scheduler->schedule(svc, iph); | |
4337 | + if (dest == NULL) { | |
4338 | + IP_VS_DBG(1, "Schedule: no dest found.\n"); | |
4339 | + read_unlock(&__ip_vs_lock); | |
4340 | + return NULL; | |
4341 | + } | |
4342 | + | |
4343 | + /* | |
4344 | + * Create a masquerading entry. | |
4345 | + */ | |
4346 | + ms = ip_masq_new_vs(iph->protocol, | |
4347 | + iph->daddr, portp[1], | |
4348 | + dest->addr, dest->port?dest->port:portp[1], | |
4349 | + iph->saddr, portp[0], | |
4350 | + 0); | |
4351 | + if (ms == NULL) { | |
4352 | + IP_VS_ERR("Schedule: ip_masq_new_vs failed\n"); | |
4353 | + read_unlock(&__ip_vs_lock); | |
4354 | + return NULL; | |
4355 | + } | |
4356 | + | |
4357 | + /* | |
4358 | + * Bind the masq entry with the vs dest. | |
4359 | + */ | |
4360 | + ip_vs_bind_masq(ms, dest); | |
4361 | + | |
4362 | + /* | |
4363 | + * Increase the inactive connection counter because it is in | |
4364 | + * Syn-Received state (inactive) when the masq is created. | |
4365 | + */ | |
4366 | + atomic_inc(&dest->inactconns); | |
4367 | + | |
4368 | + IP_VS_DBG(9, "Schedule masq fwd:%c s:%s c:%u.%u.%u.%u:%d " | |
4369 | + "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d flg:%X cnt:%d\n", | |
4370 | + ip_vs_fwd_tag(ms), ip_masq_state_name(ms->state), | |
4371 | + NIPQUAD(ms->daddr),ntohs(ms->dport), | |
4372 | + NIPQUAD(ms->maddr),ntohs(ms->mport), | |
4373 | + NIPQUAD(ms->saddr),ntohs(ms->sport), | |
4374 | + ms->flags, atomic_read(&ms->refcnt)); | |
4375 | + | |
4376 | + read_unlock(&__ip_vs_lock); | |
4377 | + | |
4378 | + return ms; | |
4379 | +} | |
4380 | + | |
4381 | + | |
4382 | +/* | |
4383 | + * Pass or drop the packet. | |
4384 | + * Called by ip_fw_demasquerade, when the virtual service is available but | |
4385 | + * no destination is available for a new connection. | |
4386 | + */ | |
4387 | +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb) | |
4388 | +{ | |
4389 | + struct iphdr *iph = skb->nh.iph; | |
4390 | + __u16 *portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); | |
4391 | + | |
4392 | + /* | |
4393 | + * When a virtual ftp service is present, packets destined | |
4394 | + * for other services on the VIP may get here (except services | |
4395 | + * listed in the ipvs table). Pass those packets, because it is | |
4396 | + * not the job of ipvs to decide whether to drop them. | |
4397 | + */ | |
4398 | + if ((svc->port == FTPPORT) && (portp[1] != FTPPORT)) | |
4399 | + return 0; | |
4400 | + | |
4401 | + /* | |
4402 | + * Notify the client that the destination is unreachable, and | |
4403 | + * release the socket buffer. | |
4404 | + * Since this is the IP layer, no TCP socket has actually been | |
4405 | + * created, so a TCP RST packet cannot be sent; instead, | |
4406 | + * ICMP_PORT_UNREACH is sent here whether it is TCP or UDP. --WZ | |
4407 | + */ | |
4408 | + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); | |
4409 | + kfree_skb(skb); | |
4410 | + return -2; | |
4411 | +} | |
4412 | + | |
4413 | + | |
4414 | +/* | |
4415 | + * IPVS user control entry | |
4416 | + */ | |
4417 | +int ip_vs_ctl(int optname, struct ip_masq_ctl *mctl, int optlen) | |
4418 | +{ | |
4419 | + struct ip_vs_service *svc = NULL; | |
4420 | + struct ip_vs_user *mm = &mctl->u.vs_user; | |
4421 | + __u32 vaddr = mm->vaddr; | |
4422 | + __u16 vport = mm->vport; | |
4423 | + int proto_num = masq_proto_num(mm->protocol); | |
4424 | + | |
4425 | + /* | |
4426 | + * Check the size of mctl, no overflow... | |
4427 | + */ | |
4428 | + if (optlen != sizeof(*mctl)) | |
4429 | + return -EINVAL; | |
4430 | + | |
4431 | + /* | |
4432 | + * Flush all the virtual services... | |
4433 | + */ | |
4434 | + if (mctl->m_cmd == IP_MASQ_CMD_FLUSH) | |
4435 | + return ip_vs_flush(); | |
4436 | + | |
4437 | + /* | |
4438 | + * Check for valid protocol: TCP or UDP | |
4439 | + */ | |
4440 | + if (mm->vfwmark == 0 && (proto_num < 0 || proto_num > 1)) { | |
4441 | + IP_VS_INFO("vs_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s", | |
4442 | + ntohs(mm->protocol), | |
4443 | + NIPQUAD(vaddr), ntohs(vport), mctl->m_tname); | |
4444 | + return -EFAULT; | |
4445 | + } | |
4446 | + | |
4447 | + /* | |
4448 | + * Lookup the exact service by (protocol, vaddr, vport) | |
4449 | + */ | |
4450 | + read_lock(&__ip_vs_lock); | |
4451 | + | |
4452 | + if (mm->vfwmark == 0) | |
4453 | + svc = __ip_vs_lookup_service(mm->protocol, vaddr, vport); | |
4454 | + else | |
4455 | + svc = __ip_vs_lookup_svc_fwm(mm->vfwmark); | |
4456 | + | |
4457 | + read_unlock(&__ip_vs_lock); | |
4458 | + | |
4459 | + switch (mctl->m_cmd) { | |
4460 | + case IP_MASQ_CMD_ADD: | |
4461 | + if (svc != NULL) | |
4462 | + return -EEXIST; | |
4463 | + | |
4464 | + return ip_vs_add_service(mctl); | |
4465 | + | |
4466 | + case IP_MASQ_CMD_SET: | |
4467 | + if (svc == NULL) | |
4468 | + return -ESRCH; | |
4469 | + | |
4470 | + return ip_vs_edit_service(svc, mctl); | |
4471 | + | |
4472 | + case IP_MASQ_CMD_DEL: | |
4473 | + if (svc == NULL) | |
4474 | + return -ESRCH; | |
4475 | + else | |
4476 | + return ip_vs_del_service(svc); | |
4477 | + | |
4478 | + case IP_MASQ_CMD_ADD_DEST: | |
4479 | + if (svc == NULL) | |
4480 | + return -ESRCH; | |
4481 | + else | |
4482 | + return ip_vs_add_dest(svc, mctl); | |
4483 | + | |
4484 | + case IP_MASQ_CMD_SET_DEST: | |
4485 | + if (svc == NULL) | |
4486 | + return -ESRCH; | |
4487 | + else | |
4488 | + return ip_vs_edit_dest(svc, mctl); | |
4489 | + | |
4490 | + case IP_MASQ_CMD_DEL_DEST: | |
4491 | + if (svc == NULL) | |
4492 | + return -ESRCH; | |
4493 | + else | |
4494 | + return ip_vs_del_dest(svc, mctl); | |
4495 | + } | |
4496 | + return -EINVAL; | |
4497 | +} | |
4498 | + | |
4499 | + | |
4500 | +#ifdef CONFIG_SYSCTL | |
4501 | + | |
4502 | +static int ip_vs_sysctl_defense_mode(ctl_table *ctl, int write, | |
4503 | + struct file * filp,void *buffer, size_t *lenp) | |
4504 | +{ | |
4505 | + int *valp = ctl->data; | |
4506 | + int val = *valp; | |
4507 | + int ret; | |
4508 | + | |
4509 | + ret = proc_dointvec(ctl, write, filp, buffer, lenp); | |
4510 | + if (write && (*valp != val)) { | |
4511 | + if ((*valp < 0) || (*valp > 3)) { | |
4512 | + /* Restore the correct value */ | |
4513 | + *valp = val; | |
4514 | + } else { | |
4515 | + update_defense_level(); | |
4516 | + } | |
4517 | + } | |
4518 | + return ret; | |
4519 | +} | |
4520 | + | |
4521 | +ctl_table ipv4_vs_table[] = { | |
4522 | +#ifdef CONFIG_IP_VS_DEBUG | |
4523 | + {NET_IPV4_VS_DEBUG_LEVEL, "debug_level", | |
4524 | + &sysctl_ip_vs_debug_level, sizeof(int), 0644, NULL, | |
4525 | + &proc_dointvec}, | |
4526 | +#endif | |
4527 | + {NET_IPV4_VS_AMEMTHRESH, "amemthresh", | |
4528 | + &sysctl_ip_vs_amemthresh, sizeof(int), 0644, NULL, | |
4529 | + &proc_dointvec}, | |
4530 | + {NET_IPV4_VS_AMDROPRATE, "am_droprate", | |
4531 | + &sysctl_ip_vs_am_droprate, sizeof(int), 0644, NULL, | |
4532 | + &proc_dointvec}, | |
4533 | + {NET_IPV4_VS_DROP_ENTRY, "drop_entry", | |
4534 | + &sysctl_ip_vs_drop_entry, sizeof(int), 0644, NULL, | |
4535 | + &ip_vs_sysctl_defense_mode}, | |
4536 | + {NET_IPV4_VS_DROP_PACKET, "drop_packet", | |
4537 | + &sysctl_ip_vs_drop_packet, sizeof(int), 0644, NULL, | |
4538 | + &ip_vs_sysctl_defense_mode}, | |
4539 | + {NET_IPV4_VS_SECURE_TCP, "secure_tcp", | |
4540 | + &sysctl_ip_vs_secure_tcp, sizeof(int), 0644, NULL, | |
4541 | + &ip_vs_sysctl_defense_mode}, | |
4542 | + {NET_IPV4_VS_TO_ES, "timeout_established", | |
4543 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_ESTABLISHED], | |
4544 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4545 | + {NET_IPV4_VS_TO_SS, "timeout_synsent", | |
4546 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYN_SENT], | |
4547 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4548 | + {NET_IPV4_VS_TO_SR, "timeout_synrecv", | |
4549 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYN_RECV], | |
4550 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4551 | + {NET_IPV4_VS_TO_FW, "timeout_finwait", | |
4552 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_FIN_WAIT], | |
4553 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4554 | + {NET_IPV4_VS_TO_TW, "timeout_timewait", | |
4555 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_TIME_WAIT], | |
4556 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4557 | + {NET_IPV4_VS_TO_CL, "timeout_close", | |
4558 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_CLOSE], | |
4559 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4560 | + {NET_IPV4_VS_TO_CW, "timeout_closewait", | |
4561 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_CLOSE_WAIT], | |
4562 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4563 | + {NET_IPV4_VS_TO_LA, "timeout_lastack", | |
4564 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_LAST_ACK], | |
4565 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4566 | + {NET_IPV4_VS_TO_LI, "timeout_listen", | |
4567 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_LISTEN], | |
4568 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4569 | + {NET_IPV4_VS_TO_SA, "timeout_synack", | |
4570 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_SYNACK], | |
4571 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4572 | + {NET_IPV4_VS_TO_UDP, "timeout_udp", | |
4573 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_UDP], | |
4574 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4575 | + {NET_IPV4_VS_TO_ICMP, "timeout_icmp", | |
4576 | + &masq_timeout_table_dos.timeout[IP_MASQ_S_ICMP], | |
4577 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
4578 | + {0} | |
4579 | +}; | |
4580 | +#endif | |
4581 | + | |
4582 | +#ifdef CONFIG_PROC_FS | |
4583 | +/* | |
4584 | + * Write the contents of the VS rule table to a PROCfs file. | |
4585 | + */ | |
4586 | +static int ip_vs_procinfo(char *buf, char **start, off_t offset, | |
4587 | + int length, int *eof, void *data) | |
4588 | +{ | |
4589 | + int len=0; | |
4590 | + off_t pos=0; | |
4591 | + char temp[64], temp2[32]; | |
4592 | + int idx; | |
4593 | + struct ip_vs_service *svc; | |
4594 | + struct ip_vs_dest *dest; | |
4595 | + struct list_head *l, *e, *p, *q; | |
4596 | + | |
4597 | + /* | |
4598 | + * Note: since the length of the buffer is usually a multiple | |
4599 | + * of 512, it is good to use fixed-size records whose size divides | |
4600 | + * 512, so that records won't be truncated at a buffer boundary. | |
4601 | + */ | |
4602 | + pos = 192; | |
4603 | + if (pos > offset) { | |
4604 | + sprintf(temp, | |
4605 | + "IP Virtual Server version %d.%d.%d (size=%d)", | |
4606 | + NVERSION(IP_VS_VERSION_CODE), IP_VS_TAB_SIZE); | |
4607 | + len += sprintf(buf+len, "%-63s\n", temp); | |
4608 | + len += sprintf(buf+len, "%-63s\n", | |
4609 | + "Prot LocalAddress:Port Scheduler Flags"); | |
4610 | + len += sprintf(buf+len, "%-63s\n", | |
4611 | + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn"); | |
4612 | + } | |
4613 | + | |
4614 | + read_lock_bh(&__ip_vs_lock); | |
4615 | + | |
4616 | + /* print the service table hashed by <protocol,addr,port> */ | |
4617 | + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | |
4618 | + l = &ip_vs_svc_table[idx]; | |
4619 | + for (e=l->next; e!=l; e=e->next) { | |
4620 | + svc = list_entry(e, struct ip_vs_service, s_list); | |
4621 | + pos += 64; | |
4622 | + if (pos > offset) { | |
4623 | + if (svc->flags & IP_VS_SVC_F_PERSISTENT) | |
4624 | + sprintf(temp2, "persistent %d %08X", | |
4625 | + svc->timeout, | |
4626 | + ntohl(svc->netmask)); | |
4627 | + else | |
4628 | + temp2[0] = '\0'; | |
4629 | + | |
4630 | + sprintf(temp, "%s %08X:%04X %s %s", | |
4631 | + masq_proto_name(svc->protocol), | |
4632 | + ntohl(svc->addr), | |
4633 | + ntohs(svc->port), | |
4634 | + svc->scheduler->name, temp2); | |
4635 | + len += sprintf(buf+len, "%-63s\n", temp); | |
4636 | + if (len >= length) | |
4637 | + goto done; | |
4638 | + } | |
4639 | + | |
4640 | + p = &svc->destinations; | |
4641 | + for (q=p->next; q!=p; q=q->next) { | |
4642 | + dest = list_entry(q, struct ip_vs_dest, n_list); | |
4643 | + pos += 64; | |
4644 | + if (pos <= offset) | |
4645 | + continue; | |
4646 | + sprintf(temp, | |
4647 | + " -> %08X:%04X %-7s %-6d %-10d %-10d", | |
4648 | + ntohl(dest->addr), | |
4649 | + ntohs(dest->port), | |
4650 | + ip_vs_fwd_name(dest->masq_flags), | |
4651 | + dest->weight, | |
4652 | + atomic_read(&dest->activeconns), | |
4653 | + atomic_read(&dest->inactconns)); | |
4654 | + len += sprintf(buf+len, "%-63s\n", temp); | |
4655 | + if (len >= length) | |
4656 | + goto done; | |
4657 | + } | |
4658 | + } | |
4659 | + } | |
4660 | + | |
4661 | + /* print the service table hashed by fwmark */ | |
4662 | + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | |
4663 | + l = &ip_vs_svc_fwm_table[idx]; | |
4664 | + for (e=l->next; e!=l; e=e->next) { | |
4665 | + svc = list_entry(e, struct ip_vs_service, f_list); | |
4666 | + pos += 64; | |
4667 | + if (pos > offset) { | |
4668 | + if (svc->flags & IP_VS_SVC_F_PERSISTENT) | |
4669 | + sprintf(temp2, "persistent %d %08X", | |
4670 | + svc->timeout, | |
4671 | + ntohl(svc->netmask)); | |
4672 | + else | |
4673 | + temp2[0] = '\0'; | |
4674 | + | |
4675 | + sprintf(temp, "FWM %08X %s %s", | |
4676 | + svc->fwmark, | |
4677 | + svc->scheduler->name, temp2); | |
4678 | + len += sprintf(buf+len, "%-63s\n", temp); | |
4679 | + if (len >= length) | |
4680 | + goto done; | |
4681 | + } | |
4682 | + | |
4683 | + p = &svc->destinations; | |
4684 | + for (q=p->next; q!=p; q=q->next) { | |
4685 | + dest = list_entry(q, struct ip_vs_dest, n_list); | |
4686 | + pos += 64; | |
4687 | + if (pos <= offset) | |
4688 | + continue; | |
4689 | + sprintf(temp, | |
4690 | + " -> %08X:%04X %-7s %-6d %-10d %-10d", | |
4691 | + ntohl(dest->addr), | |
4692 | + ntohs(dest->port), | |
4693 | + ip_vs_fwd_name(dest->masq_flags), | |
4694 | + dest->weight, | |
4695 | + atomic_read(&dest->activeconns), | |
4696 | + atomic_read(&dest->inactconns)); | |
4697 | + len += sprintf(buf+len, "%-63s\n", temp); | |
4698 | + if (len >= length) | |
4699 | + goto done; | |
4700 | + } | |
4701 | + } | |
4702 | + } | |
4703 | + | |
4704 | + done: | |
4705 | + read_unlock_bh(&__ip_vs_lock); | |
4706 | + | |
4707 | + *start = buf+len-(pos-offset); /* Start of wanted data */ | |
4708 | + len = pos-offset; | |
4709 | + if (len > length) | |
4710 | + len = length; | |
4711 | + if (len < 0) | |
4712 | + len = 0; | |
4713 | + return len; | |
4714 | +} | |
4715 | + | |
4716 | +struct proc_dir_entry ip_vs_proc_entry = { | |
4717 | + 0, /* dynamic inode */ | |
4718 | + 2, "vs", /* namelen and name */ | |
4719 | + S_IFREG | S_IRUGO, /* mode */ | |
4720 | + 1, 0, 0, 0, /* nlinks, owner, group, size */ | |
4721 | + &proc_net_inode_operations, /* operations */ | |
4722 | + NULL, /* get_info */ | |
4723 | + NULL, /* fill_inode */ | |
4724 | + NULL, NULL, NULL, /* next, parent, subdir */ | |
4725 | + NULL, /* data */ | |
4726 | + &ip_vs_procinfo, /* function to generate proc data */ | |
4727 | +}; | |
4728 | + | |
4729 | + | |
4730 | +/* | |
4731 | + * Write the IPVS statistic information to a PROCfs file. | |
4732 | + */ | |
4733 | +struct ip_vs_stats ip_vs_stats = {SPIN_LOCK_UNLOCKED, 0, 0}; | |
4734 | + | |
4735 | +static int | |
4736 | +ip_vs_stats_get_info(char *buf, char **start, off_t offset, | |
4737 | + int length, int *eof, void *data) | |
4738 | +{ | |
4739 | + int idx; | |
4740 | + int len=0; | |
4741 | + off_t pos=0; | |
4742 | + char temp[128]; | |
4743 | + struct ip_vs_service *svc; | |
4744 | + struct ip_vs_dest *dest; | |
4745 | + struct list_head *l, *e, *p, *q; | |
4746 | + | |
4747 | + pos += 128; | |
4748 | + if (pos > offset) { | |
4749 | + len += sprintf(buf+len, "%-63s\n", | |
4750 | +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ | |
4751 | + "TotalConns InPkts OutPkts InBytes OutBytes"); | |
4752 | + spin_lock(&ip_vs_stats.lock); | |
4753 | + sprintf(temp, " %8X %8X %8X %8X%08X %8X%08X", | |
4754 | + ip_vs_stats.conns, | |
4755 | + ip_vs_stats.inpkts, | |
4756 | + ip_vs_stats.outpkts, | |
4757 | + (__u32)(ip_vs_stats.inbytes >> 32), | |
4758 | + (__u32)ip_vs_stats.inbytes, | |
4759 | + (__u32)(ip_vs_stats.outbytes >> 32), | |
4760 | + (__u32)ip_vs_stats.outbytes); | |
4761 | + spin_unlock(&ip_vs_stats.lock); | |
4762 | + len += sprintf(buf+len, "%-63s\n", temp); | |
4763 | + } | |
4764 | + | |
4765 | + read_lock_bh(&__ip_vs_lock); | |
4766 | + | |
4767 | + /* print the service statistics */ | |
4768 | + pos += 128; | |
4769 | + if (pos > offset) { | |
4770 | + len += sprintf(buf+len, "%-127s\n", | |
4771 | + "\nVirtual Service\n" | |
4772 | + "Pro VirtService Conns InPkts OutPkts InBytes OutBytes"); | |
4773 | + } | |
4774 | + | |
4775 | + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | |
4776 | + l = &ip_vs_svc_table[idx]; | |
4777 | + for (e=l->next; e!=l; e=e->next) { | |
4778 | + svc = list_entry(e, struct ip_vs_service, s_list); | |
4779 | + pos += 128; | |
4780 | + if (pos <= offset) | |
4781 | + continue; | |
4782 | + spin_lock(&svc->stats.lock); | |
4783 | + sprintf(temp, "%3s %08X:%04X %8X %8X %8X %8X%08X %8X%08X", | |
4784 | + masq_proto_name(svc->protocol), | |
4785 | + ntohl(svc->addr), | |
4786 | + ntohs(svc->port), | |
4787 | + svc->stats.conns, | |
4788 | + svc->stats.inpkts, | |
4789 | + svc->stats.outpkts, | |
4790 | + (__u32)(svc->stats.inbytes >> 32), | |
4791 | + (__u32)svc->stats.inbytes, | |
4792 | + (__u32)(svc->stats.outbytes >> 32), | |
4793 | + (__u32)svc->stats.outbytes); | |
4794 | + spin_unlock(&svc->stats.lock); | |
4795 | + len += sprintf(buf+len, "%-127s\n", temp); | |
4796 | + if (pos >= offset+length) | |
4797 | + goto done; | |
4798 | + } | |
4799 | + } | |
4800 | + | |
4801 | + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | |
4802 | + l = &ip_vs_svc_fwm_table[idx]; | |
4803 | + for (e=l->next; e!=l; e=e->next) { | |
4804 | + svc = list_entry(e, struct ip_vs_service, f_list); | |
4805 | + pos += 128; | |
4806 | + if (pos <= offset) | |
4807 | + continue; | |
4808 | + spin_lock(&svc->stats.lock); | |
4809 | + sprintf(temp, "FWM %08X %8X %8X %8X %8X%08X %8X%08X", | |
4810 | + svc->fwmark, | |
4811 | + svc->stats.conns, | |
4812 | + svc->stats.inpkts, | |
4813 | + svc->stats.outpkts, | |
4814 | + (__u32)(svc->stats.inbytes >> 32), | |
4815 | + (__u32)svc->stats.inbytes, | |
4816 | + (__u32)(svc->stats.outbytes >> 32), | |
4817 | + (__u32)svc->stats.outbytes); | |
4818 | + spin_unlock(&svc->stats.lock); | |
4819 | + len += sprintf(buf+len, "%-127s\n", temp); | |
4820 | + if (pos >= offset+length) | |
4821 | + goto done; | |
4822 | + } | |
4823 | + } | |
4824 | + | |
4825 | + /* print the real server statistics */ | |
4826 | + pos += 128; | |
4827 | + if (pos > offset) { | |
4828 | + len += sprintf(buf+len, "%-127s\n", | |
4829 | + "\nReal Service\n" | |
4830 | + "Pro VirtService RealService Conns InPkts OutPkts InBytes OutBytes"); | |
4831 | + } | |
4832 | + | |
4833 | + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | |
4834 | + l = &ip_vs_svc_table[idx]; | |
4835 | + for (e=l->next; e!=l; e=e->next) { | |
4836 | + svc = list_entry(e, struct ip_vs_service, s_list); | |
4837 | + p = &svc->destinations; | |
4838 | + for (q=p->next; q!=p; q=q->next) { | |
4839 | + dest = list_entry(q, struct ip_vs_dest, n_list); | |
4840 | + pos += 128; | |
4841 | + if (pos <= offset) | |
4842 | + continue; | |
4843 | + spin_lock(&dest->stats.lock); | |
4844 | + sprintf(temp, | |
4845 | + "%3s %08X:%04X %08X:%04X %8X %8X %8X %8X%08X %8X%08X", | |
4846 | + masq_proto_name(svc->protocol), | |
4847 | + ntohl(svc->addr), | |
4848 | + ntohs(svc->port), | |
4849 | + ntohl(dest->addr), | |
4850 | + ntohs(dest->port), | |
4851 | + dest->stats.conns, | |
4852 | + dest->stats.inpkts, | |
4853 | + dest->stats.outpkts, | |
4854 | + (__u32)(dest->stats.inbytes >> 32), | |
4855 | + (__u32)dest->stats.inbytes, | |
4856 | + (__u32)(dest->stats.outbytes >> 32), | |
4857 | + (__u32)dest->stats.outbytes); | |
4858 | + spin_unlock(&dest->stats.lock); | |
4859 | + len += sprintf(buf+len, "%-127s\n", temp); | |
4860 | + if (pos >= offset+length) | |
4861 | + goto done; | |
4862 | + } | |
4863 | + } | |
4864 | + } | |
4865 | + | |
4866 | + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | |
4867 | + l = &ip_vs_svc_fwm_table[idx]; | |
4868 | + for (e=l->next; e!=l; e=e->next) { | |
4869 | + svc = list_entry(e, struct ip_vs_service, f_list); | |
4870 | + p = &svc->destinations; | |
4871 | + for (q=p->next; q!=p; q=q->next) { | |
4872 | + dest = list_entry(q,struct ip_vs_dest,n_list); | |
4873 | + pos += 128; | |
4874 | + if (pos <= offset) | |
4875 | + continue; | |
4876 | + spin_lock(&dest->stats.lock); | |
4877 | + sprintf(temp, | |
4878 | + "FWM %08X %08X:%04X %8X %8X %8X %8X%08X %8X%08X", | |
4879 | + svc->fwmark, | |
4880 | + ntohl(dest->addr), | |
4881 | + ntohs(dest->port), | |
4882 | + dest->stats.conns, | |
4883 | + dest->stats.inpkts, | |
4884 | + dest->stats.outpkts, | |
4885 | + (__u32)(dest->stats.inbytes >> 32), | |
4886 | + (__u32)dest->stats.inbytes, | |
4887 | + (__u32)(dest->stats.outbytes >> 32), | |
4888 | + (__u32)dest->stats.outbytes); | |
4889 | + spin_unlock(&dest->stats.lock); | |
4890 | + len += sprintf(buf+len, "%-127s\n", temp); | |
4891 | + if (pos >= offset+length) | |
4892 | + goto done; | |
4893 | + } | |
4894 | + } | |
4895 | + } | |
4896 | + done: | |
4897 | + read_unlock_bh(&__ip_vs_lock); | |
4898 | + | |
4899 | + *start = buf+len-(pos-offset); /* Start of wanted data */ | |
4900 | + len = pos-offset; | |
4901 | + if (len > length) | |
4902 | + len = length; | |
4903 | + if (len < 0) | |
4904 | + len = 0; | |
4905 | + return len; | |
4906 | +} | |
4907 | + | |
4908 | +struct proc_dir_entry ip_vs_stat_proc_entry = { | |
4909 | + 0, /* dynamic inode */ | |
4910 | + 8, "vs_stats", /* namelen and name */ | |
4911 | + S_IFREG | S_IRUGO, /* mode */ | |
4912 | + 1, 0, 0, 0, /* nlinks, owner, group, size */ | |
4913 | + &proc_net_inode_operations, /* operations */ | |
4914 | + NULL, /* get_info */ | |
4915 | + NULL, /* fill_inode */ | |
4916 | + NULL, NULL, NULL, /* next, parent, subdir */ | |
4917 | + NULL, /* data */ | |
4918 | + &ip_vs_stats_get_info, /* function to generate proc data */ | |
4919 | +}; | |
4920 | + | |
4921 | +#endif | |
4922 | + | |
4923 | + | |
4924 | +/* | |
4925 | + * This function encapsulates the packet in a new IP header, its destination | |
4926 | + * will be set to the daddr. Most code of this function is from ipip.c. | |
4927 | + * Usage: | |
4928 | + * It is called in the ip_vs_forward() function. The load balancer | |
4929 | + * selects a real server from a cluster based on a scheduling algorithm, | |
4930 | + * encapsulates the packet and forwards it to the selected server. All real | |
4931 | + * servers are configured with "ifconfig tunl0 <Virtual IP Address> up". | |
4932 | + * When the server receives the encapsulated packet, it decapsulates the | |
4933 | + * packet, processes the request and returns the reply packets directly to | |
4934 | + * the client without passing through the load balancer. This can greatly | |
4935 | + * increase the scalability of the virtual server. | |
4936 | + * Returns: | |
4937 | + * if succeeded, return 1; otherwise, return 0. | |
4938 | + */ | |
4939 | + | |
4940 | +int ip_vs_tunnel_xmit(struct sk_buff *skb, __u32 daddr) | |
4941 | +{ | |
4942 | + struct rtable *rt; /* Route to the other host */ | |
4943 | + struct device *tdev; /* Device to other host */ | |
4944 | + struct iphdr *old_iph = skb->nh.iph; | |
4945 | + u8 tos = old_iph->tos; | |
4946 | + u16 df = old_iph->frag_off; | |
4947 | + struct iphdr *iph; /* Our new IP header */ | |
4948 | + int max_headroom; /* The extra header space needed */ | |
4949 | + u32 dst = daddr; | |
4950 | + u32 src = 0; | |
4951 | + int mtu; | |
4952 | + | |
4953 | + if (skb->protocol != __constant_htons(ETH_P_IP)) { | |
4954 | + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): protocol error, ETH_P_IP: %d, skb protocol: %d\n", | |
4955 | + __constant_htons(ETH_P_IP),skb->protocol); | |
4956 | + goto tx_error; | |
4957 | + } | |
4958 | + | |
4959 | + if (ip_route_output(&rt, dst, src, RT_TOS(tos), 0)) { | |
4960 | + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): route error, dest: " | |
4961 | + "%u.%u.%u.%u\n", NIPQUAD(dst)); | |
4962 | + goto tx_error_icmp; | |
4963 | + } | |
4964 | + tdev = rt->u.dst.dev; | |
4965 | + | |
4966 | + mtu = rt->u.dst.pmtu - sizeof(struct iphdr); | |
4967 | + if (mtu < 68) { | |
4968 | + ip_rt_put(rt); | |
4969 | + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): mtu less than 68\n"); | |
4970 | + goto tx_error; | |
4971 | + } | |
4972 | + if (skb->dst && mtu < skb->dst->pmtu) | |
4973 | + skb->dst->pmtu = mtu; | |
4974 | + | |
4975 | + df |= (old_iph->frag_off&__constant_htons(IP_DF)); | |
4976 | + | |
4977 | + if ((old_iph->frag_off&__constant_htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { | |
4978 | + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); | |
4979 | + ip_rt_put(rt); | |
4980 | + IP_VS_DBG(0, "ip_vs_tunnel_xmit(): frag needed\n"); | |
4981 | + goto tx_error; | |
4982 | + } | |
4983 | + | |
4984 | + skb->h.raw = skb->nh.raw; | |
4985 | + | |
4986 | + /* | |
4987 | + * Okay, now see if we can stuff it in the buffer as-is. | |
4988 | + */ | |
4989 | + max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr)); | |
4990 | + | |
4991 | + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { | |
4992 | + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); | |
4993 | + if (!new_skb) { | |
4994 | + ip_rt_put(rt); | |
4995 | + kfree_skb(skb); | |
4996 | + IP_VS_ERR("ip_vs_tunnel_xmit(): no memory for new_skb\n"); | |
4997 | + return 0; | |
4998 | + } | |
4999 | + kfree_skb(skb); | |
5000 | + skb = new_skb; | |
5001 | + } | |
5002 | + | |
5003 | + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); | |
5004 | + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | |
5005 | + dst_release(skb->dst); | |
5006 | + skb->dst = &rt->u.dst; | |
5007 | + | |
5008 | + /* | |
5009 | + * Push down and install the IPIP header. | |
5010 | + */ | |
5011 | + | |
5012 | + iph = skb->nh.iph; | |
5013 | + iph->version = 4; | |
5014 | + iph->ihl = sizeof(struct iphdr)>>2; | |
5015 | + iph->frag_off = df; | |
5016 | + iph->protocol = IPPROTO_IPIP; | |
5017 | + iph->tos = tos; | |
5018 | + iph->daddr = rt->rt_dst; | |
5019 | + iph->saddr = rt->rt_src; | |
5020 | + iph->ttl = old_iph->ttl; | |
5021 | + iph->tot_len = htons(skb->len); | |
5022 | + iph->id = htons(ip_id_count++); | |
5023 | + ip_send_check(iph); | |
5024 | + | |
5025 | + IPCB(skb)->flags |= IPSKB_REDIRECTED; | |
5026 | + IPCB(skb)->flags |= IPSKB_MASQUERADED; | |
5027 | + | |
5028 | + ip_send(skb); | |
5029 | + return 1; | |
5030 | + | |
5031 | + tx_error_icmp: | |
5032 | + dst_link_failure(skb); | |
5033 | + tx_error: | |
5034 | + kfree_skb(skb); | |
5035 | + return 0; | |
5036 | +} | |
5037 | + | |
5038 | + | |
5039 | +/* | |
5040 | + * Direct Routing | |
5041 | + */ | |
5042 | +int ip_vs_dr_xmit(struct sk_buff *skb, __u32 daddr) | |
5043 | +{ | |
5044 | + struct rtable *rt; /* Route to the other host */ | |
5045 | + struct iphdr *iph = skb->nh.iph; | |
5046 | + u8 tos = iph->tos; | |
5047 | + int mtu; | |
5048 | + | |
5049 | + if (ip_route_output(&rt, daddr, 0, RT_TOS(tos), 0)) { | |
5050 | + IP_VS_DBG(0, "ip_vs_dr_xmit(): route error, dest: %u.%u.%u.%u\n", | |
5051 | + NIPQUAD(daddr)); | |
5052 | + goto tx_error_icmp; | |
5053 | + } | |
5054 | + | |
5055 | + /* MTU checking */ | |
5056 | + mtu = rt->u.dst.pmtu; | |
5057 | + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { | |
5058 | + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); | |
5059 | + ip_rt_put(rt); | |
5060 | + IP_VS_DBG(0, "ip_vs_dr_xmit(): frag needed\n"); | |
5061 | + goto tx_error; | |
5062 | + } | |
5063 | + | |
5064 | + dst_release(skb->dst); | |
5065 | + skb->dst = &rt->u.dst; | |
5066 | + | |
5067 | + IPCB(skb)->flags |= IPSKB_REDIRECTED; | |
5068 | + IPCB(skb)->flags |= IPSKB_MASQUERADED; | |
5069 | + | |
5070 | + ip_send(skb); | |
5071 | + return 1; | |
5072 | + | |
5073 | + tx_error_icmp: | |
5074 | + dst_link_failure(skb); | |
5075 | + tx_error: | |
5076 | + kfree_skb(skb); | |
5077 | + return 0; | |
5078 | +} | |
5079 | + | |
5080 | + | |
5081 | +/* | |
5082 | + * Initialize IP virtual server | |
5083 | + */ | |
5084 | +__initfunc(int ip_vs_init(void)) | |
5085 | +{ | |
5086 | + int idx; | |
5087 | + | |
5088 | + /* | |
5089 | + * Allocate the ip_vs_table and initialize its list head. | |
5090 | + * Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable, | |
5091 | + * ip_vs_schedulers and ip_vs_dest_trash. | |
5092 | + */ | |
5093 | + if (!(ip_vs_table = | |
5094 | + vmalloc(IP_VS_TAB_SIZE*sizeof(struct list_head)))) { | |
5095 | + return -ENOMEM; | |
5096 | + } | |
5097 | + for(idx = 0; idx < IP_VS_TAB_SIZE; idx++) { | |
5098 | + INIT_LIST_HEAD(&ip_vs_table[idx]); | |
5099 | + } | |
5100 | + IP_VS_INFO("Connection hash table configured " | |
5101 | + "(size=%d, memory=%ldKbytes)\n", | |
5102 | + IP_VS_TAB_SIZE, | |
5103 | + (long) (IP_VS_TAB_SIZE*sizeof(struct list_head))/1024); | |
5104 | + | |
5105 | + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { | |
5106 | + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); | |
5107 | + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); | |
5108 | + } | |
5109 | + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { | |
5110 | + INIT_LIST_HEAD(&ip_vs_rtable[idx]); | |
5111 | + } | |
5112 | + INIT_LIST_HEAD(&ip_vs_schedulers); | |
5113 | + INIT_LIST_HEAD(&ip_vs_dest_trash); | |
5114 | + | |
5115 | + /* | |
5116 | + * Hook the slow_timer handler in the system timer. | |
5117 | + */ | |
5118 | + slow_timer.expires = jiffies+SLTIMER_PERIOD; | |
5119 | + add_timer(&slow_timer); | |
5120 | + | |
5121 | +#ifdef CONFIG_PROC_FS | |
5122 | + ip_masq_proc_register(&ip_vs_proc_entry); | |
5123 | + ip_masq_proc_register(&ip_vs_stat_proc_entry); | |
5124 | +#endif | |
5125 | + | |
5126 | +#ifdef CONFIG_IP_MASQUERADE_VS_RR | |
5127 | + ip_vs_rr_init(); | |
5128 | +#endif | |
5129 | +#ifdef CONFIG_IP_MASQUERADE_VS_WRR | |
5130 | + ip_vs_wrr_init(); | |
5131 | +#endif | |
5132 | +#ifdef CONFIG_IP_MASQUERADE_VS_LC | |
5133 | + ip_vs_lc_init(); | |
5134 | +#endif | |
5135 | +#ifdef CONFIG_IP_MASQUERADE_VS_WLC | |
5136 | + ip_vs_wlc_init(); | |
5137 | +#endif | |
5138 | +#ifdef CONFIG_IP_MASQUERADE_VS_LBLC | |
5139 | + ip_vs_lblc_init(); | |
5140 | +#endif | |
5141 | +#ifdef CONFIG_IP_MASQUERADE_VS_LBLCR | |
5142 | + ip_vs_lblcr_init(); | |
5143 | +#endif | |
5144 | + return 0; | |
5145 | +} | |
5146 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lblc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblc.c | |
5147 | --- linux-2.2.19/net/ipv4/ip_vs_lblc.c Thu Jan 1 08:00:00 1970 | |
5148 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblc.c Fri Feb 2 18:49:08 2001 | |
5149 | @@ -0,0 +1,645 @@ | |
5150 | +/* | |
5151 | + * IPVS: Locality-Based Least-Connection scheduling module | |
5152 | + * | |
5153 | + * Version: $Id$ | |
5154 | + * | |
5155 | + * Authors: Wensong Zhang <wensong@gnuchina.org> | |
5156 | + * | |
5157 | + * This program is free software; you can redistribute it and/or | |
5158 | + * modify it under the terms of the GNU General Public License | |
5159 | + * as published by the Free Software Foundation; either version | |
5160 | + * 2 of the License, or (at your option) any later version. | |
5161 | + * | |
5162 | + * Changes: | |
5163 | + * Martin Hamilton : fixed the terrible locking bugs | |
5164 | + * *lock(tbl->lock) ==> *lock(&tbl->lock) | |
5165 | + * Wensong Zhang : fixed the uninitialized tbl->lock bug | |
5166 | + * Wensong Zhang : added doing full expiration check to | |
5167 | + * collect stale entries of 24+ hours when | |
5168 | + * no partial expire check in a half hour | |
5169 | + * | |
5170 | + */ | |
5171 | + | |
5172 | +/* | |
5173 | + * The lblc algorithm is as follows (pseudo code): | |
5174 | + * | |
5175 | + * if cachenode[dest_ip] is null then | |
5176 | + * n, cachenode[dest_ip] <- {weighted least-conn node}; | |
5177 | + * else | |
5178 | + * n <- cachenode[dest_ip]; | |
5179 | + * if (n is dead) OR | |
5180 | + * (n.conns>n.weight AND | |
5181 | + * there is a node m with m.conns<m.weight/2) then | |
5182 | + * n, cachenode[dest_ip] <- {weighted least-conn node}; | |
5183 | + * | |
5184 | + * return n; | |
5185 | + * | |
5186 | + * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing | |
5187 | + * me to write this module. | |
5188 | + */ | |
5189 | + | |
5190 | +#include <linux/config.h> | |
5191 | +#include <linux/module.h> | |
5192 | +#ifdef CONFIG_KMOD | |
5193 | +#include <linux/kmod.h> | |
5194 | +#endif | |
5195 | +#include <linux/types.h> | |
5196 | +#include <linux/kernel.h> | |
5197 | +#include <linux/errno.h> | |
5198 | +#include <linux/vmalloc.h> | |
5199 | +#include <net/ip_masq.h> | |
5200 | +#ifdef CONFIG_IP_MASQUERADE_MOD | |
5201 | +#include <net/ip_masq_mod.h> | |
5202 | +#endif | |
5203 | +#include <linux/sysctl.h> | |
5204 | +#include <linux/proc_fs.h> | |
5205 | +#include <linux/ip_fw.h> | |
5206 | +#include <net/ip_vs.h> | |
5207 | + | |
5208 | + | |
5209 | +/* | |
5210 | + * It is for garbage collection of stale IPVS lblc entries, | |
5211 | + * when the table is full. | |
5212 | + */ | |
5213 | +#define CHECK_EXPIRE_INTERVAL (60*HZ) | |
5214 | +#define ENTRY_TIMEOUT (5*60*HZ) | |
5215 | + | |
5216 | +/* | |
5217 | + * It is for full expiration check. | |
5218 | + * When there is no partial expiration check (garbage collection) | |
5219 | + * in a half hour, do a full expiration check to collect stale | |
5220 | + * entries that haven't been touched for a day (by default). | |
5221 | + */ | |
5222 | +#define COUNT_FOR_FULL_EXPIRATION 30 | |
5223 | +int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; | |
5224 | + | |
5225 | + | |
5226 | +/* | |
5227 | + * for IPVS lblc entry hash table | |
5228 | + */ | |
5229 | +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS | |
5230 | +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 | |
5231 | +#endif | |
5232 | +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS | |
5233 | +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) | |
5234 | +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) | |
5235 | + | |
5236 | + | |
5237 | +/* | |
5238 | + * IPVS lblc entry represents an association between destination | |
5239 | + * IP address and its destination server | |
5240 | + */ | |
5241 | +struct ip_vs_lblc_entry { | |
5242 | + struct list_head list; | |
5243 | + __u32 addr; /* destination IP address */ | |
5244 | + struct ip_vs_dest *dest; /* real server (cache) */ | |
5245 | + unsigned long lastuse; /* last used time */ | |
5246 | +}; | |
5247 | + | |
5248 | + | |
5249 | +/* | |
5250 | + * IPVS lblc hash table | |
5251 | + */ | |
5252 | +struct ip_vs_lblc_table { | |
5253 | + rwlock_t lock; /* lock for this table */ | |
5254 | + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ | |
5255 | + atomic_t entries; /* number of entries */ | |
5256 | + int max_size; /* maximum size of entries */ | |
5257 | + struct timer_list periodic_timer; /* collect stale entries */ | |
5258 | + int rover; /* rover for expire check */ | |
5259 | + int counter; /* counter for no expire */ | |
5260 | +}; | |
5261 | + | |
5262 | + | |
5263 | + | |
5264 | +/* | |
5265 | + * IPVS LBLC sysctl table | |
5266 | + */ | |
5267 | +struct ip_vs_lblc_sysctl_table { | |
5268 | + struct ctl_table_header *sysctl_header; | |
5269 | + ctl_table vs_vars[2]; | |
5270 | + ctl_table vs_dir[2]; | |
5271 | + ctl_table ipv4_dir[2]; | |
5272 | + ctl_table root_dir[2]; | |
5273 | +}; | |
5274 | + | |
5275 | + | |
5276 | +static struct ip_vs_lblc_sysctl_table lblc_sysctl_table = { | |
5277 | + NULL, | |
5278 | + {{NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration", | |
5279 | + &sysctl_ip_vs_lblc_expiration, | |
5280 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
5281 | + {0}}, | |
5282 | + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblc_sysctl_table.vs_vars}, | |
5283 | + {0}}, | |
5284 | + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblc_sysctl_table.vs_dir}, | |
5285 | + {0}}, | |
5286 | + {{CTL_NET, "net", NULL, 0, 0555, lblc_sysctl_table.ipv4_dir}, | |
5287 | + {0}} | |
5288 | +}; | |
5289 | + | |
5290 | + | |
5291 | +/* | |
5292 | + * new/free a ip_vs_lblc_entry, which is a mapping of a destination | |
5293 | + * IP address to a server. | |
5294 | + */ | |
5295 | +static inline struct ip_vs_lblc_entry * | |
5296 | +ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest) | |
5297 | +{ | |
5298 | + struct ip_vs_lblc_entry *en; | |
5299 | + | |
5300 | + en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); | |
5301 | + if (en == NULL) { | |
5302 | + IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); | |
5303 | + return NULL; | |
5304 | + } | |
5305 | + | |
5306 | + INIT_LIST_HEAD(&en->list); | |
5307 | + en->addr = daddr; | |
5308 | + | |
5309 | + atomic_inc(&dest->refcnt); | |
5310 | + en->dest = dest; | |
5311 | + | |
5312 | + return en; | |
5313 | +} | |
5314 | + | |
5315 | + | |
5316 | +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) | |
5317 | +{ | |
5318 | + list_del(&en->list); | |
5319 | + atomic_dec(&en->dest->refcnt); | |
5320 | + kfree(en); | |
5321 | +} | |
5322 | + | |
5323 | + | |
5324 | +/* | |
5325 | + * Returns hash value for IPVS LBLC entry | |
5326 | + */ | |
5327 | +static inline unsigned ip_vs_lblc_hashkey(__u32 addr) | |
5328 | +{ | |
5329 | + return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; | |
5330 | +} | |
5331 | + | |
5332 | + | |
5333 | +/* | |
5334 | + * Hash an entry in the ip_vs_lblc_table. | |
5335 | + * returns bool success. | |
5336 | + */ | |
5337 | +static int | |
5338 | +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) | |
5339 | +{ | |
5340 | + unsigned hash; | |
5341 | + | |
5342 | + if (!list_empty(&en->list)) { | |
5343 | + IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " | |
5344 | + "called from %p\n", __builtin_return_address(0)); | |
5345 | + return 0; | |
5346 | + } | |
5347 | + | |
5348 | + /* | |
5349 | + * Hash by destination IP address | |
5350 | + */ | |
5351 | + hash = ip_vs_lblc_hashkey(en->addr); | |
5352 | + | |
5353 | + write_lock(&tbl->lock); | |
5354 | + list_add(&en->list, &tbl->bucket[hash]); | |
5355 | + atomic_inc(&tbl->entries); | |
5356 | + write_unlock(&tbl->lock); | |
5357 | + | |
5358 | + return 1; | |
5359 | +} | |
5360 | + | |
5361 | + | |
5362 | +#if 0000 | |
5363 | +/* | |
5364 | + * Unhash ip_vs_lblc_entry from ip_vs_lblc_table. | |
5365 | + * returns bool success. | |
5366 | + */ | |
5367 | +static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl, | |
5368 | + struct ip_vs_lblc_entry *en) | |
5369 | +{ | |
5370 | + if (list_empty(&en->list)) { | |
5371 | + IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, " | |
5372 | + "called from %p\n", __builtin_return_address(0)); | |
5373 | + return 0; | |
5374 | + } | |
5375 | + | |
5376 | + /* | |
5377 | + * Remove it from the table | |
5378 | + */ | |
5379 | + write_lock(&tbl->lock); | |
5380 | + list_del(&en->list); | |
5381 | + INIT_LIST_HEAD(&en->list); | |
5382 | + write_unlock(&tbl->lock); | |
5383 | + | |
5384 | + return 1; | |
5385 | +} | |
5386 | +#endif | |
5387 | + | |
5388 | + | |
5389 | +/* | |
5390 | + * Get ip_vs_lblc_entry associated with supplied parameters. | |
5391 | + */ | |
5392 | +static inline struct ip_vs_lblc_entry * | |
5393 | +ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr) | |
5394 | +{ | |
5395 | + unsigned hash; | |
5396 | + struct ip_vs_lblc_entry *en; | |
5397 | + struct list_head *l,*e; | |
5398 | + | |
5399 | + hash = ip_vs_lblc_hashkey(addr); | |
5400 | + | |
5401 | + read_lock(&tbl->lock); | |
5402 | + | |
5403 | + l = &tbl->bucket[hash]; | |
5404 | + for (e=l->next; e!=l; e=e->next) { | |
5405 | + en = list_entry(e, struct ip_vs_lblc_entry, list); | |
5406 | + if (en->addr == addr) { | |
5407 | + /* HIT */ | |
5408 | + read_unlock(&tbl->lock); | |
5409 | + return en; | |
5410 | + } | |
5411 | + } | |
5412 | + | |
5413 | + read_unlock(&tbl->lock); | |
5414 | + | |
5415 | + return NULL; | |
5416 | +} | |
5417 | + | |
5418 | + | |
5419 | +/* | |
5420 | + * Flush all the entries of the specified table. | |
5421 | + */ | |
5422 | +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) | |
5423 | +{ | |
5424 | + int i; | |
5425 | + struct list_head *l; | |
5426 | + struct ip_vs_lblc_entry *en; | |
5427 | + | |
5428 | + for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { | |
5429 | + write_lock(&tbl->lock); | |
5430 | + for (l=&tbl->bucket[i]; l->next!=l; ) { | |
5431 | + en = list_entry(l->next, | |
5432 | + struct ip_vs_lblc_entry, list); | |
5433 | + ip_vs_lblc_free(en); | |
5434 | + atomic_dec(&tbl->entries); | |
5435 | + } | |
5436 | + write_unlock(&tbl->lock); | |
5437 | + } | |
5438 | +} | |
5439 | + | |
5440 | + | |
5441 | +static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) | |
5442 | +{ | |
5443 | + unsigned long now = jiffies; | |
5444 | + int i, j; | |
5445 | + struct list_head *l, *e; | |
5446 | + struct ip_vs_lblc_entry *en; | |
5447 | + | |
5448 | + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { | |
5449 | + j = (j + 1) & IP_VS_LBLC_TAB_MASK; | |
5450 | + e = l = &tbl->bucket[j]; | |
5451 | + write_lock(&tbl->lock); | |
5452 | + while (e->next != l) { | |
5453 | + en = list_entry(e->next, | |
5454 | + struct ip_vs_lblc_entry, list); | |
5455 | + if ((now - en->lastuse) < | |
5456 | + sysctl_ip_vs_lblc_expiration) { | |
5457 | + e = e->next; | |
5458 | + continue; | |
5459 | + } | |
5460 | + ip_vs_lblc_free(en); | |
5461 | + atomic_dec(&tbl->entries); | |
5462 | + } | |
5463 | + write_unlock(&tbl->lock); | |
5464 | + } | |
5465 | + tbl->rover = j; | |
5466 | +} | |
5467 | + | |
5468 | + | |
5469 | +/* | |
5470 | + * Periodical timer handler for IPVS lblc table | |
5471 | + * It is used to collect stale entries when the number of entries | |
5472 | + * exceeds the maximum size of the table. | |
5473 | + * | |
5474 | + * Fixme: we probably need more complicated algorithm to collect | |
5475 | + * entries that have not been used for a long time even | |
5476 | + * if the number of entries doesn't exceed the maximum size | |
5477 | + * of the table. | |
5478 | + * The full expiration check is for this purpose now. | |
5479 | + */ | |
5480 | +static void ip_vs_lblc_check_expire(unsigned long data) | |
5481 | +{ | |
5482 | + struct ip_vs_lblc_table *tbl; | |
5483 | + unsigned long now = jiffies; | |
5484 | + int goal; | |
5485 | + int i, j; | |
5486 | + struct list_head *l, *e; | |
5487 | + struct ip_vs_lblc_entry *en; | |
5488 | + | |
5489 | + tbl = (struct ip_vs_lblc_table *)data; | |
5490 | + | |
5491 | + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { | |
5492 | + /* do full expiration check */ | |
5493 | + ip_vs_lblc_full_check(tbl); | |
5494 | + tbl->counter = 1; | |
5495 | + goto out; | |
5496 | + } | |
5497 | + | |
5498 | + if (atomic_read(&tbl->entries) < tbl->max_size) { | |
5499 | + tbl->counter++; | |
5500 | + goto out; | |
5501 | + } | |
5502 | + | |
5503 | + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; | |
5504 | + if (goal > tbl->max_size/2) | |
5505 | + goal = tbl->max_size/2; | |
5506 | + | |
5507 | + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { | |
5508 | + j = (j + 1) & IP_VS_LBLC_TAB_MASK; | |
5509 | + e = l = &tbl->bucket[j]; | |
5510 | + write_lock(&tbl->lock); | |
5511 | + while (e->next != l) { | |
5512 | + en = list_entry(e->next, | |
5513 | + struct ip_vs_lblc_entry, list); | |
5514 | + if ((now - en->lastuse) < ENTRY_TIMEOUT) { | |
5515 | + e = e->next; | |
5516 | + continue; | |
5517 | + } | |
5518 | + ip_vs_lblc_free(en); | |
5519 | + atomic_dec(&tbl->entries); | |
5520 | + goal--; | |
5521 | + } | |
5522 | + write_unlock(&tbl->lock); | |
5523 | + if (goal <= 0) | |
5524 | + break; | |
5525 | + } | |
5526 | + tbl->rover = j; | |
5527 | + | |
5528 | + out: | |
5529 | + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); | |
5530 | +} | |
5531 | + | |
5532 | + | |
5533 | +/* | |
5534 | + * Allocate and set up the per-service LBLC table and start its periodic | |
5535 | + * garbage-collection timer. Returns 0, or -ENOMEM if allocation fails. | |
5536 | + */ | |
5537 | +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) | |
5538 | +{ | |
5539 | +	int i; | |
5540 | +	struct ip_vs_lblc_table *tbl; | |
5541 | + | |
5542 | +	/* Allocate the ip_vs_lblc_table for this service */ | |
5543 | +	tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); | |
5544 | +	if (tbl == NULL) { | |
5545 | +		IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); | |
5546 | +		return -ENOMEM; | |
5547 | +	} | |
5548 | +	svc->sched_data = tbl; | |
5549 | +	IP_VS_DBG(0, "LBLC hash table (memory=%dbytes) allocated for " | |
5550 | +		  "current service\n", | |
5551 | +		  sizeof(struct ip_vs_lblc_table)); | |
5552 | + | |
5553 | +	/* Initialize the hash buckets */ | |
5554 | +	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { | |
5555 | +		INIT_LIST_HEAD(&tbl->bucket[i]); | |
5556 | +	} | |
5557 | +	tbl->lock = RW_LOCK_UNLOCKED; | |
5558 | +	/* kmalloc() does not zero memory: start the entry count at 0 */ | |
5559 | +	atomic_set(&tbl->entries, 0); | |
5560 | +	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; | |
5561 | +	tbl->rover = 0; | |
5562 | +	tbl->counter = 1; | |
5563 | + | |
5564 | +	/* Hook periodic timer for garbage collection */ | |
5565 | +	init_timer(&tbl->periodic_timer); | |
5566 | +	tbl->periodic_timer.data = (unsigned long)tbl; | |
5567 | +	tbl->periodic_timer.function = ip_vs_lblc_check_expire; | |
5568 | +	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; | |
5569 | +	add_timer(&tbl->periodic_timer); | |
5570 | + | |
5571 | +	MOD_INC_USE_COUNT; | |
5572 | +	return 0; | |
5573 | +} | |
5574 | + | |
5575 | + | |
5576 | +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) | |
5577 | +{ | |
5578 | + struct ip_vs_lblc_table *tbl = svc->sched_data; | |
5579 | + | |
5580 | + /* remove periodic timer */ | |
5581 | + del_timer(&tbl->periodic_timer); | |
5582 | + | |
5583 | + /* got to clean up table entries here */ | |
5584 | + ip_vs_lblc_flush(tbl); | |
5585 | + | |
5586 | + /* release the table itself */ | |
5587 | + kfree(svc->sched_data); | |
5588 | + IP_VS_DBG(0, "LBLC hash table (memory=%dbytes) released\n", | |
5589 | + sizeof(struct ip_vs_lblc_table)); | |
5590 | + | |
5591 | + MOD_DEC_USE_COUNT; | |
5592 | + return 0; | |
5593 | +} | |
5594 | + | |
5595 | + | |
5596 | +static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) | |
5597 | +{ | |
5598 | + return 0; | |
5599 | +} | |
5600 | + | |
5601 | + | |
5602 | +static inline struct ip_vs_dest * | |
5603 | +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) | |
5604 | +{ | |
5605 | + register struct list_head *l, *e; | |
5606 | + struct ip_vs_dest *dest, *least; | |
5607 | + int loh, doh; | |
5608 | + | |
5609 | + l = &svc->destinations; | |
5610 | + if (l == l->next) | |
5611 | + return NULL; | |
5612 | + | |
5613 | + /* | |
5614 | + * We assume the overhead of processing an active connection is fifty | |
5615 | + * times that of an inactive connection on average. (The factor of fifty | |
5616 | + * might not be accurate; we will change it later.) We use | |
5617 | + * the following formula to estimate the overhead: | |
5618 | + * dest->activeconns*50 + dest->inactconns | |
5619 | + * and the load: | |
5620 | + * (dest overhead) / dest->weight | |
5621 | + * | |
5622 | + * Remember -- no floats in kernel mode!!! | |
5623 | + * The comparison of h1*w2 > h2*w1 is equivalent to that of | |
5624 | + * h1/w1 > h2/w2 | |
5625 | + * if every weight is larger than zero. | |
5626 | + * | |
5627 | + * The server with weight=0 is quiesced and will not receive any | |
5628 | + * new connection. | |
5629 | + */ | |
5630 | + | |
5631 | + for (e=l->next; e!=l; e=e->next) { | |
5632 | + least = list_entry(e, struct ip_vs_dest, n_list); | |
5633 | + if (least->weight > 0) { | |
5634 | + loh = atomic_read(&least->activeconns) * 50 | |
5635 | + + atomic_read(&least->inactconns); | |
5636 | + goto nextstage; | |
5637 | + } | |
5638 | + } | |
5639 | + return NULL; | |
5640 | + | |
5641 | + /* | |
5642 | + * Find the destination with the least load. | |
5643 | + */ | |
5644 | + nextstage: | |
5645 | + for (e=e->next; e!=l; e=e->next) | |
5646 | + { | |
5647 | + dest = list_entry(e, struct ip_vs_dest, n_list); | |
5648 | + doh = atomic_read(&dest->activeconns) * 50 | |
5649 | + + atomic_read(&dest->inactconns); | |
5650 | + if (loh * dest->weight > doh * least->weight) | |
5651 | + { | |
5652 | + least = dest; | |
5653 | + loh = doh; | |
5654 | + } | |
5655 | + } | |
5656 | + | |
5657 | + IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " | |
5658 | + "activeconns %d refcnt %d weight %d overhead %d\n", | |
5659 | + NIPQUAD(least->addr), ntohs(least->port), | |
5660 | + atomic_read(&least->activeconns), | |
5661 | + atomic_read(&least->refcnt), least->weight, loh); | |
5662 | + | |
5663 | + return least; | |
5664 | +} | |
5665 | + | |
5666 | + | |
5667 | +/* | |
5668 | + * If this destination server is overloaded and there is a less loaded | |
5669 | + * server, then return true. | |
5670 | + */ | |
5671 | +static inline int | |
5672 | +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) | |
5673 | +{ | |
5674 | + if (atomic_read(&dest->activeconns) > dest->weight) { | |
5675 | + register struct list_head *l, *e; | |
5676 | + struct ip_vs_dest *d; | |
5677 | + | |
5678 | + l = &svc->destinations; | |
5679 | + for (e=l->next; e!=l; e=e->next) { | |
5680 | + d = list_entry(e, struct ip_vs_dest, n_list); | |
5681 | + if (atomic_read(&d->activeconns)*2 < d->weight) { | |
5682 | + return 1; | |
5683 | + } | |
5684 | + } | |
5685 | + } | |
5686 | + return 0; | |
5687 | +} | |
5688 | + | |
5689 | + | |
5690 | +/* | |
5691 | + * Locality-Based (weighted) Least-Connection scheduling | |
5692 | + */ | |
5693 | +static struct ip_vs_dest * | |
5694 | +ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) | |
5695 | +{ | |
5696 | + struct ip_vs_dest *dest; | |
5697 | + struct ip_vs_lblc_table *tbl; | |
5698 | + struct ip_vs_lblc_entry *en; | |
5699 | + | |
5700 | + IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); | |
5701 | + | |
5702 | + tbl = (struct ip_vs_lblc_table *)svc->sched_data; | |
5703 | + en = ip_vs_lblc_get(tbl, iph->daddr); | |
5704 | + if (en == NULL) { | |
5705 | + dest = __ip_vs_wlc_schedule(svc, iph); | |
5706 | + if (dest == NULL) { | |
5707 | + IP_VS_DBG(1, "no destination available\n"); | |
5708 | + return NULL; | |
5709 | + } | |
5710 | + en = ip_vs_lblc_new(iph->daddr, dest); | |
5711 | + if (en == NULL) { | |
5712 | + return NULL; | |
5713 | + } | |
5714 | + ip_vs_lblc_hash(tbl, en); | |
5715 | + } else { | |
5716 | + dest = en->dest; | |
5717 | + if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) | |
5718 | + || dest->weight <= 0 | |
5719 | + || is_overloaded(dest, svc)) { | |
5720 | + dest = __ip_vs_wlc_schedule(svc, iph); | |
5721 | + if (dest == NULL) { | |
5722 | + IP_VS_DBG(1, "no destination available\n"); | |
5723 | + return NULL; | |
5724 | + } | |
5725 | + atomic_dec(&en->dest->refcnt); | |
5726 | + atomic_inc(&dest->refcnt); | |
5727 | + en->dest = dest; | |
5728 | + } | |
5729 | + } | |
5730 | + en->lastuse = jiffies; | |
5731 | + | |
5732 | + IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " | |
5733 | + "--> server %u.%u.%u.%u:%d\n", | |
5734 | + NIPQUAD(en->addr), | |
5735 | + NIPQUAD(dest->addr), | |
5736 | + ntohs(dest->port)); | |
5737 | + | |
5738 | + return dest; | |
5739 | +} | |
5740 | + | |
5741 | + | |
5742 | +static struct ip_vs_scheduler ip_vs_lblc_scheduler = | |
5743 | +{ | |
5744 | + {0}, /* n_list */ | |
5745 | + "lblc", /* name */ | |
5746 | + ATOMIC_INIT(0), /* refcnt */ | |
5747 | + ip_vs_lblc_init_svc, /* service initializer */ | |
5748 | + ip_vs_lblc_done_svc, /* service done */ | |
5749 | + ip_vs_lblc_update_svc, /* service updater */ | |
5750 | + ip_vs_lblc_schedule, /* select a server from the destination list */ | |
5751 | +}; | |
5752 | + | |
5753 | + | |
5754 | +__initfunc(int ip_vs_lblc_init(void)) | |
5755 | +{ | |
5756 | + IP_VS_INFO("Initializing LBLC scheduling\n"); | |
5757 | + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); | |
5758 | + lblc_sysctl_table.sysctl_header = | |
5759 | + register_sysctl_table(lblc_sysctl_table.root_dir, 0); | |
5760 | + return register_ip_vs_scheduler(&ip_vs_lblc_scheduler); | |
5761 | +} | |
5762 | + | |
5763 | + | |
5764 | +#ifdef MODULE | |
5765 | +EXPORT_NO_SYMBOLS; | |
5766 | + | |
5767 | +int init_module(void) | |
5768 | +{ | |
5769 | + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); | |
5770 | + | |
5771 | + /* module initialization by 'request_module' */ | |
5772 | + if (register_ip_vs_scheduler(&ip_vs_lblc_scheduler) != 0) | |
5773 | + return -EIO; | |
5774 | + | |
5775 | + lblc_sysctl_table.sysctl_header = | |
5776 | + register_sysctl_table(lblc_sysctl_table.root_dir, 0); | |
5777 | + | |
5778 | + IP_VS_INFO("LBLC scheduling module loaded.\n"); | |
5779 | + | |
5780 | + return 0; | |
5781 | +} | |
5782 | + | |
5783 | +void cleanup_module(void) | |
5784 | +{ | |
5785 | + /* module cleanup by 'release_module' */ | |
5786 | + if (unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler) != 0) { | |
5787 | + IP_VS_INFO("cannot remove LBLC scheduling module\n"); | |
5788 | + } else { | |
5789 | + IP_VS_INFO("LBLC scheduling module unloaded.\n"); | |
5790 | + } | |
5791 | + unregister_sysctl_table(lblc_sysctl_table.sysctl_header); | |
5792 | +} | |
5793 | + | |
5794 | +#endif /* MODULE */ | |
5795 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lblcr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblcr.c | |
5796 | --- linux-2.2.19/net/ipv4/ip_vs_lblcr.c Thu Jan 1 08:00:00 1970 | |
5797 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lblcr.c Tue Mar 27 17:37:00 2001 | |
5798 | @@ -0,0 +1,834 @@ | |
5799 | +/* | |
5800 | + * IPVS: Locality-Based Least-Connection with Replication scheduler | |
5801 | + * | |
5802 | + * Version: $Id$ | |
5803 | + * | |
5804 | + * Authors: Wensong Zhang <wensong@gnuchina.org> | |
5805 | + * | |
5806 | + * This program is free software; you can redistribute it and/or | |
5807 | + * modify it under the terms of the GNU General Public License | |
5808 | + * as published by the Free Software Foundation; either version | |
5809 | + * 2 of the License, or (at your option) any later version. | |
5810 | + * | |
5811 | + * Changes: | |
5812 | + * Julian Anastasov : Added the missing (dest->weight>0) | |
5813 | + * condition in the ip_vs_dest_set_max. | |
5814 | + * | |
5815 | + */ | |
5816 | + | |
5817 | +/* | |
5818 | + * The lblc/r algorithm is as follows (pseudo code): | |
5819 | + * | |
5820 | + * if serverSet[dest_ip] is null then | |
5821 | + * n, serverSet[dest_ip] <- {weighted least-conn node}; | |
5822 | + * else | |
5823 | + * n <- {least-conn (alive) node in serverSet[dest_ip]}; | |
5824 | + * if (n is null) OR | |
5825 | + * (n.conns>n.weight AND | |
5826 | + * there is a node m with m.conns<m.weight/2) then | |
5827 | + * n <- {weighted least-conn node}; | |
5828 | + * add n to serverSet[dest_ip]; | |
5829 | + * if |serverSet[dest_ip]| > 1 AND | |
5830 | + * now - serverSet[dest_ip].lastMod > T then | |
5831 | + * m <- {most conn node in serverSet[dest_ip]}; | |
5832 | + * remove m from serverSet[dest_ip]; | |
5833 | + * if serverSet[dest_ip] changed then | |
5834 | + * serverSet[dest_ip].lastMod <- now; | |
5835 | + * | |
5836 | + * return n; | |
5837 | + * | |
5838 | + */ | |
5839 | + | |
5840 | +#include <linux/config.h> | |
5841 | +#include <linux/module.h> | |
5842 | +#ifdef CONFIG_KMOD | |
5843 | +#include <linux/kmod.h> | |
5844 | +#endif | |
5845 | +#include <linux/types.h> | |
5846 | +#include <linux/kernel.h> | |
5847 | +#include <linux/errno.h> | |
5848 | +#include <linux/vmalloc.h> | |
5849 | +#include <net/ip_masq.h> | |
5850 | +#ifdef CONFIG_IP_MASQUERADE_MOD | |
5851 | +#include <net/ip_masq_mod.h> | |
5852 | +#endif | |
5853 | +#include <linux/sysctl.h> | |
5854 | +#include <linux/proc_fs.h> | |
5855 | +#include <linux/ip_fw.h> | |
5856 | +#include <net/ip_vs.h> | |
5857 | + | |
5858 | + | |
5859 | +/* | |
5860 | + * It is for garbage collection of stale IPVS lblcr entries, | |
5861 | + * when the table is full. | |
5862 | + */ | |
5863 | +#define CHECK_EXPIRE_INTERVAL (60*HZ) | |
5864 | +#define ENTRY_TIMEOUT (6*60*HZ) | |
5865 | + | |
5866 | +/* | |
5867 | + * It is for full expiration check. | |
5868 | + * When there is no partial expiration check (garbage collection) | |
5869 | + * in a half hour, do a full expiration check to collect stale | |
5870 | + * entries that haven't been touched for a day. | |
5871 | + */ | |
5872 | +#define COUNT_FOR_FULL_EXPIRATION 30 | |
5873 | +int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; | |
5874 | + | |
5875 | + | |
5876 | +/* | |
5877 | + * for IPVS lblcr entry hash table | |
5878 | + */ | |
5879 | +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS | |
5880 | +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 | |
5881 | +#endif | |
5882 | +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS | |
5883 | +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) | |
5884 | +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) | |
5885 | + | |
5886 | + | |
5887 | +/* | |
5888 | + * IPVS destination set structure and operations | |
5889 | + */ | |
5890 | +struct ip_vs_dest_list { | |
5891 | + struct ip_vs_dest_list *next; /* list link */ | |
5892 | + struct ip_vs_dest *dest; /* destination server */ | |
5893 | +}; | |
5894 | + | |
5895 | +struct ip_vs_dest_set { | |
5896 | + atomic_t size; /* set size */ | |
5897 | + unsigned long lastmod; /* last modified time */ | |
5898 | + struct ip_vs_dest_list *list; /* destination list */ | |
5899 | + rwlock_t lock; /* lock for this list */ | |
5900 | +}; | |
5901 | + | |
5902 | + | |
5903 | +/* | |
5904 | + * Add dest to the destination set unless it is already a member. | |
5905 | + * Returns the new list element, or NULL if dest was already in the | |
5906 | + * set or no memory was available. Takes a reference on dest. | |
5907 | + */ | |
5908 | +static struct ip_vs_dest_list * | |
5909 | +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) | |
5910 | +{ | |
5911 | +	struct ip_vs_dest_list *e; | |
5912 | + | |
5913 | +	for (e=set->list; e!=NULL; e=e->next) { | |
5914 | +		if (e->dest == dest) | |
5915 | +			/* already existed */ | |
5916 | +			return NULL; | |
5917 | +	} | |
5918 | + | |
5919 | +	e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); | |
5920 | +	if (e == NULL) { | |
5921 | +		IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); | |
5922 | +		return NULL; | |
5923 | +	} | |
5924 | + | |
5925 | +	atomic_inc(&dest->refcnt); | |
5926 | +	e->dest = dest; | |
5927 | + | |
5928 | +	/* push on the front: the old head (NULL if empty) becomes e->next */ | |
5929 | +	write_lock(&set->lock); | |
5930 | +	e->next = set->list; | |
5931 | +	set->list = e; | |
5932 | +	write_unlock(&set->lock); | |
5933 | + | |
5934 | +	atomic_inc(&set->size); | |
5935 | +	set->lastmod = jiffies; | |
5936 | +	return e; | |
5937 | +} | |
5938 | + | |
5939 | +static void | |
5940 | +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) | |
5941 | +{ | |
5942 | + struct ip_vs_dest_list *e, **ep; | |
5943 | + | |
5944 | + write_lock(&set->lock); | |
5945 | + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { | |
5946 | + if (e->dest == dest) { | |
5947 | + /* HIT */ | |
5948 | + *ep = e->next; | |
5949 | + atomic_dec(&set->size); | |
5950 | + set->lastmod = jiffies; | |
5951 | + atomic_dec(&e->dest->refcnt); | |
5952 | + kfree(e); | |
5953 | + break; | |
5954 | + } | |
5955 | + ep = &e->next; | |
5956 | + } | |
5957 | + write_unlock(&set->lock); | |
5958 | +} | |
5959 | + | |
5960 | +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) | |
5961 | +{ | |
5962 | + struct ip_vs_dest_list *e, **ep; | |
5963 | + | |
5964 | + write_lock(&set->lock); | |
5965 | + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { | |
5966 | + *ep = e->next; | |
5967 | + /* | |
5968 | + * We don't kfree dest because it is referred either | |
5969 | + * by its service or by the trash dest list. | |
5970 | + */ | |
5971 | + atomic_dec(&e->dest->refcnt); | |
5972 | + kfree(e); | |
5973 | + } | |
5974 | + write_unlock(&set->lock); | |
5975 | +} | |
5976 | + | |
5977 | +/* get weighted least-connection node in the destination set */ | |
5978 | +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) | |
5979 | +{ | |
5980 | + register struct ip_vs_dest_list *e; | |
5981 | + struct ip_vs_dest *dest, *least; | |
5982 | + int loh, doh; | |
5983 | + | |
5984 | + if (set == NULL) | |
5985 | + return NULL; | |
5986 | + | |
5987 | + read_lock(&set->lock); | |
5988 | + /* select the first destination server, whose weight > 0 */ | |
5989 | + for (e=set->list; e!=NULL; e=e->next) { | |
5990 | + least = e->dest; | |
5991 | + if ((least->weight > 0) | |
5992 | + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { | |
5993 | + loh = atomic_read(&least->activeconns) * 50 | |
5994 | + + atomic_read(&least->inactconns); | |
5995 | + goto nextstage; | |
5996 | + } | |
5997 | + } | |
5998 | + read_unlock(&set->lock); | |
5999 | + return NULL; | |
6000 | + | |
6001 | + /* find the destination with the weighted least load */ | |
6002 | + nextstage: | |
6003 | + for (e=e->next; e!=NULL; e=e->next) { | |
6004 | + dest = e->dest; | |
6005 | + doh = atomic_read(&dest->activeconns) * 50 | |
6006 | + + atomic_read(&dest->inactconns); | |
6007 | + if ((loh*dest->weight > doh*least->weight) | |
6008 | + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { | |
6009 | + least = dest; | |
6010 | + loh = doh; | |
6011 | + } | |
6012 | + } | |
6013 | + read_unlock(&set->lock); | |
6014 | + | |
6015 | + IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " | |
6016 | + "activeconns %d refcnt %d weight %d overhead %d\n", | |
6017 | + NIPQUAD(least->addr), ntohs(least->port), | |
6018 | + atomic_read(&least->activeconns), | |
6019 | + atomic_read(&least->refcnt), least->weight, loh); | |
6020 | + return least; | |
6021 | +} | |
6022 | + | |
6023 | + | |
6024 | +/* get weighted most-connection node in the destination set */ | |
6025 | +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) | |
6026 | +{ | |
6027 | + register struct ip_vs_dest_list *e; | |
6028 | + struct ip_vs_dest *dest, *most; | |
6029 | + int moh, doh; | |
6030 | + | |
6031 | + if (set == NULL) | |
6032 | + return NULL; | |
6033 | + | |
6034 | + read_lock(&set->lock); | |
6035 | + /* select the first destination server, whose weight > 0 */ | |
6036 | + for (e=set->list; e!=NULL; e=e->next) { | |
6037 | + most = e->dest; | |
6038 | + if (most->weight > 0) { | |
6039 | + moh = atomic_read(&most->activeconns) * 50 | |
6040 | + + atomic_read(&most->inactconns); | |
6041 | + goto nextstage; | |
6042 | + } | |
6043 | + } | |
6044 | + read_unlock(&set->lock); | |
6045 | + return NULL; | |
6046 | + | |
6047 | + /* find the destination with the weighted most load */ | |
6048 | + nextstage: | |
6049 | + for (e=e->next; e!=NULL; e=e->next) { | |
6050 | + dest = e->dest; | |
6051 | + doh = atomic_read(&dest->activeconns) * 50 | |
6052 | + + atomic_read(&dest->inactconns); | |
6053 | + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ | |
6054 | + if (moh*dest->weight < doh*most->weight | |
6055 | + && dest->weight > 0) { | |
6056 | + most = dest; | |
6057 | + moh = doh; | |
6058 | + } | |
6059 | + } | |
6060 | + read_unlock(&set->lock); | |
6061 | + | |
6062 | + IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " | |
6063 | + "activeconns %d refcnt %d weight %d overhead %d\n", | |
6064 | + NIPQUAD(most->addr), ntohs(most->port), | |
6065 | + atomic_read(&most->activeconns), | |
6066 | + atomic_read(&most->refcnt), most->weight, moh); | |
6067 | + return most; | |
6068 | +} | |
6069 | + | |
6070 | + | |
6071 | +/* | |
6072 | + * IPVS lblcr entry represents an association between destination | |
6073 | + * IP address and its destination server set | |
6074 | + */ | |
6075 | +struct ip_vs_lblcr_entry { | |
6076 | + struct list_head list; | |
6077 | + __u32 addr; /* destination IP address */ | |
6078 | + struct ip_vs_dest_set set; /* destination server set */ | |
6079 | + unsigned long lastuse; /* last used time */ | |
6080 | +}; | |
6081 | + | |
6082 | + | |
6083 | +/* | |
6084 | + * IPVS lblcr hash table | |
6085 | + */ | |
6086 | +struct ip_vs_lblcr_table { | |
6087 | + rwlock_t lock; /* lock for this table */ | |
6088 | + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ | |
6089 | + atomic_t entries; /* number of entries */ | |
6090 | + int max_size; /* maximum size of entries */ | |
6091 | + struct timer_list periodic_timer; /* collect stale entries */ | |
6092 | + int rover; /* rover for expire check */ | |
6093 | + int counter; /* counter for no expire */ | |
6094 | +}; | |
6095 | + | |
6096 | + | |
6097 | +/* | |
6098 | + * IPVS LBLCR sysctl table | |
6099 | + */ | |
6100 | +struct ip_vs_lblcr_sysctl_table { | |
6101 | + struct ctl_table_header *sysctl_header; | |
6102 | + ctl_table vs_vars[2]; | |
6103 | + ctl_table vs_dir[2]; | |
6104 | + ctl_table ipv4_dir[2]; | |
6105 | + ctl_table root_dir[2]; | |
6106 | +}; | |
6107 | + | |
6108 | + | |
6109 | +static struct ip_vs_lblcr_sysctl_table lblcr_sysctl_table = { | |
6110 | + NULL, | |
6111 | + {{NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration", | |
6112 | + &sysctl_ip_vs_lblcr_expiration, | |
6113 | + sizeof(int), 0644, NULL, &proc_dointvec_jiffies}, | |
6114 | + {0}}, | |
6115 | + {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblcr_sysctl_table.vs_vars}, | |
6116 | + {0}}, | |
6117 | + {{NET_IPV4, "ipv4", NULL, 0, 0555, lblcr_sysctl_table.vs_dir}, | |
6118 | + {0}}, | |
6119 | + {{CTL_NET, "net", NULL, 0, 0555, lblcr_sysctl_table.ipv4_dir}, | |
6120 | + {0}} | |
6121 | +}; | |
6122 | + | |
6123 | + | |
6124 | +/* | |
6125 | + * new/free a ip_vs_lblcr_entry, which maps a destination IP address to | |
6126 | + * its server set. new returns an unhashed entry, or NULL if out of memory. | |
6127 | + */ | |
6128 | +static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr) | |
6129 | +{ | |
6130 | + struct ip_vs_lblcr_entry *en; | |
6131 | + | |
6132 | + en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); | |
6133 | + if (en == NULL) { | |
6134 | + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); | |
6135 | + return NULL; | |
6136 | + } | |
6137 | + | |
6138 | + INIT_LIST_HEAD(&en->list); | |
6139 | + en->addr = daddr; | |
6140 | + en->lastuse = jiffies; /* kmalloc'ed memory is not zeroed */ | |
6141 | + /* initialize its dest set */ | |
6142 | + atomic_set(&(en->set.size), 0); | |
6143 | + en->set.list = NULL; | |
6144 | + en->set.lock = RW_LOCK_UNLOCKED; | |
6145 | + | |
6146 | + return en; | |
6147 | +} | |
6148 | + | |
6149 | + | |
6150 | +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) | |
6151 | +{ | |
6152 | + list_del(&en->list); | |
6153 | + ip_vs_dest_set_eraseall(&en->set); | |
6154 | + kfree(en); | |
6155 | +} | |
6156 | + | |
6157 | + | |
6158 | +/* | |
6159 | + * Returns hash value for IPVS LBLCR entry | |
6160 | + */ | |
6161 | +static inline unsigned ip_vs_lblcr_hashkey(__u32 addr) | |
6162 | +{ | |
6163 | + return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; | |
6164 | +} | |
6165 | + | |
6166 | + | |
6167 | +/* | |
6168 | + * Hash an entry in the ip_vs_lblcr_table. | |
6169 | + * returns bool success. | |
6170 | + */ | |
6171 | +static int | |
6172 | +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) | |
6173 | +{ | |
6174 | + unsigned hash; | |
6175 | + | |
6176 | + if (!list_empty(&en->list)) { | |
6177 | + IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " | |
6178 | + "called from %p\n", __builtin_return_address(0)); | |
6179 | + return 0; | |
6180 | + } | |
6181 | + | |
6182 | + /* | |
6183 | + * Hash by destination IP address | |
6184 | + */ | |
6185 | + hash = ip_vs_lblcr_hashkey(en->addr); | |
6186 | + | |
6187 | + write_lock(&tbl->lock); | |
6188 | + list_add(&en->list, &tbl->bucket[hash]); | |
6189 | + atomic_inc(&tbl->entries); | |
6190 | + write_unlock(&tbl->lock); | |
6191 | + | |
6192 | + return 1; | |
6193 | +} | |
6194 | + | |
6195 | + | |
6196 | +#if 0000 | |
6197 | +/* | |
6198 | + * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table. | |
6199 | + * returns bool success. | |
6200 | + */ | |
6201 | +static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl, | |
6202 | + struct ip_vs_lblcr_entry *en) | |
6203 | +{ | |
6204 | + if (list_empty(&en->list)) { | |
6205 | + IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, " | |
6206 | + "called from %p\n", __builtin_return_address(0)); | |
6207 | + return 0; | |
6208 | + } | |
6209 | + | |
6210 | + /* | |
6211 | + * Remove it from the table | |
6212 | + */ | |
6213 | + write_lock(&tbl->lock); | |
6214 | + list_del(&en->list); | |
6215 | + INIT_LIST_HEAD(&en->list); | |
6216 | + write_unlock(&tbl->lock); | |
6217 | + | |
6218 | + return 1; | |
6219 | +} | |
6220 | +#endif | |
6221 | + | |
6222 | + | |
6223 | +/* | |
6224 | + * Get ip_vs_lblcr_entry associated with supplied parameters. | |
6225 | + */ | |
6226 | +static inline struct ip_vs_lblcr_entry * | |
6227 | +ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr) | |
6228 | +{ | |
6229 | + unsigned hash; | |
6230 | + struct ip_vs_lblcr_entry *en; | |
6231 | + struct list_head *l,*e; | |
6232 | + | |
6233 | + hash = ip_vs_lblcr_hashkey(addr); | |
6234 | + l = &tbl->bucket[hash]; | |
6235 | + | |
6236 | + read_lock(&tbl->lock); | |
6237 | + | |
6238 | + for (e=l->next; e!=l; e=e->next) { | |
6239 | + en = list_entry(e, struct ip_vs_lblcr_entry, list); | |
6240 | + if (en->addr == addr) { | |
6241 | + /* HIT */ | |
6242 | + read_unlock(&tbl->lock); | |
6243 | + return en; | |
6244 | + } | |
6245 | + } | |
6246 | + | |
6247 | + read_unlock(&tbl->lock); | |
6248 | + | |
6249 | + return NULL; | |
6250 | +} | |
6251 | + | |
6252 | + | |
6253 | +/* | |
6254 | + * Flush all the entries of the specified table. | |
6255 | + */ | |
6256 | +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) | |
6257 | +{ | |
6258 | + int i; | |
6259 | + struct list_head *l; | |
6260 | + struct ip_vs_lblcr_entry *en; | |
6261 | + | |
6262 | + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { | |
6263 | + write_lock(&tbl->lock); | |
6264 | + for (l=&tbl->bucket[i]; l->next!=l; ) { | |
6265 | + en = list_entry(l->next, | |
6266 | + struct ip_vs_lblcr_entry, list); | |
6267 | + ip_vs_lblcr_free(en); | |
6268 | + atomic_dec(&tbl->entries); | |
6269 | + } | |
6270 | + write_unlock(&tbl->lock); | |
6271 | + } | |
6272 | +} | |
6273 | + | |
6274 | + | |
6275 | +static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) | |
6276 | +{ | |
6277 | + unsigned long now = jiffies; | |
6278 | + int i, j; | |
6279 | + struct list_head *l, *e; | |
6280 | + struct ip_vs_lblcr_entry *en; | |
6281 | + | |
6282 | + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { | |
6283 | + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; | |
6284 | + e = l = &tbl->bucket[j]; | |
6285 | + write_lock(&tbl->lock); | |
6286 | + while (e->next != l) { | |
6287 | + en = list_entry(e->next, | |
6288 | + struct ip_vs_lblcr_entry, list); | |
6289 | + if ((now - en->lastuse) < | |
6290 | + sysctl_ip_vs_lblcr_expiration) { | |
6291 | + e = e->next; | |
6292 | + continue; | |
6293 | + } | |
6294 | + ip_vs_lblcr_free(en); | |
6295 | + atomic_dec(&tbl->entries); | |
6296 | + } | |
6297 | + write_unlock(&tbl->lock); | |
6298 | + } | |
6299 | + tbl->rover = j; | |
6300 | +} | |
6301 | + | |
6302 | + | |
6303 | +/* | |
6304 | + * Periodical timer handler for IPVS lblcr table | |
6305 | + * It is used to collect stale entries when the number of entries | |
6306 | + * exceeds the maximum size of the table. | |
6307 | + * | |
6308 | + * Fixme: we probably need more complicated algorithm to collect | |
6309 | + * entries that have not been used for a long time even | |
6310 | + * if the number of entries doesn't exceed the maximum size | |
6311 | + * of the table. | |
6312 | + * The full expiration check is for this purpose now. | |
6313 | + */ | |
6314 | +static void ip_vs_lblcr_check_expire(unsigned long data) | |
6315 | +{ | |
6316 | + struct ip_vs_lblcr_table *tbl; | |
6317 | + unsigned long now = jiffies; | |
6318 | + int goal; | |
6319 | + int i, j; | |
6320 | + struct list_head *l, *e; | |
6321 | + struct ip_vs_lblcr_entry *en; | |
6322 | + | |
6323 | + tbl = (struct ip_vs_lblcr_table *)data; | |
6324 | + | |
6325 | + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { | |
6326 | + /* do full expiration check */ | |
6327 | + ip_vs_lblcr_full_check(tbl); | |
6328 | + tbl->counter = 1; | |
6329 | + goto out; | |
6330 | + } | |
6331 | + | |
6332 | + if (atomic_read(&tbl->entries) < tbl->max_size) { | |
6333 | + tbl->counter++; | |
6334 | + goto out; | |
6335 | + } | |
6336 | + | |
6337 | + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; | |
6338 | + if (goal > tbl->max_size/2) | |
6339 | + goal = tbl->max_size/2; | |
6340 | + | |
6341 | + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { | |
6342 | + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; | |
6343 | + e = l = &tbl->bucket[j]; | |
6344 | + write_lock(&tbl->lock); | |
6345 | + while (e->next != l) { | |
6346 | + en = list_entry(e->next, | |
6347 | + struct ip_vs_lblcr_entry, list); | |
6348 | + if ((now - en->lastuse) < ENTRY_TIMEOUT) { | |
6349 | + e = e->next; | |
6350 | + continue; | |
6351 | + } | |
6352 | + ip_vs_lblcr_free(en); | |
6353 | + atomic_dec(&tbl->entries); | |
6354 | + goal--; | |
6355 | + } | |
6356 | + write_unlock(&tbl->lock); | |
6357 | + if (goal <= 0) | |
6358 | + break; | |
6359 | + } | |
6360 | + tbl->rover = j; | |
6361 | + | |
6362 | + out: | |
6363 | + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); | |
6364 | +} | |
6365 | + | |
6366 | + | |
6367 | +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) | |
6368 | +{ | |
6369 | + int i; | |
6370 | + struct ip_vs_lblcr_table *tbl; | |
6371 | + | |
6372 | + /* | |
6373 | + * Allocate the ip_vs_lblcr_table for this service | |
6374 | + */ | |
6375 | + tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); | |
6376 | + if (tbl == NULL) { | |
6377 | + IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); | |
6378 | + return -ENOMEM; | |
6379 | + } | |
6380 | + svc->sched_data = tbl; | |
6381 | + IP_VS_DBG(0, "LBLCR hash table (memory=%dbytes) allocated for " | |
6382 | + "current service\n", (int)sizeof(struct ip_vs_lblcr_table)); | |
6383 | + | |
6384 | + /* | |
6385 | + * Initialize the hash buckets and the table bookkeeping | |
6386 | + */ | |
6387 | + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { | |
6388 | + INIT_LIST_HEAD(&tbl->bucket[i]); | |
6389 | + } | |
6390 | + atomic_set(&tbl->entries, 0); /* kmalloc() does not zero memory */ | |
6391 | + tbl->lock = RW_LOCK_UNLOCKED; | |
6392 | + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; | |
6393 | + tbl->rover = 0; | |
6394 | + tbl->counter = 1; | |
6395 | + | |
6396 | + /* | |
6397 | + * Hook periodic timer for garbage collection | |
6398 | + */ | |
6399 | + init_timer(&tbl->periodic_timer); | |
6400 | + tbl->periodic_timer.data = (unsigned long)tbl; | |
6401 | + tbl->periodic_timer.function = ip_vs_lblcr_check_expire; | |
6402 | + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; | |
6403 | + add_timer(&tbl->periodic_timer); | |
6404 | + | |
6405 | + MOD_INC_USE_COUNT; | |
6406 | + return 0; | |
6407 | +} | |
6408 | + | |
6409 | + | |
6410 | +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) | |
6411 | +{ | |
6412 | + struct ip_vs_lblcr_table *tbl = svc->sched_data; | |
6413 | + | |
6414 | + /* remove the periodic timer before the table it points to is freed */ | |
6415 | + del_timer(&tbl->periodic_timer); | |
6416 | + | |
6417 | + /* free every entry still hashed in the table */ | |
6418 | + ip_vs_lblcr_flush(tbl); | |
6419 | + | |
6420 | + /* release the table itself */ | |
6421 | + kfree(tbl); | |
6422 | + IP_VS_DBG(0, "LBLCR hash table (memory=%dbytes) released\n", | |
6423 | + (int)sizeof(struct ip_vs_lblcr_table)); | |
6424 | + | |
6425 | + MOD_DEC_USE_COUNT; | |
6426 | + return 0; | |
6427 | +} | |
6428 | + | |
6429 | + | |
6430 | +static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc) | |
6431 | +{ | |
6432 | + return 0; | |
6433 | +} | |
6434 | + | |
6435 | + | |
6436 | +static inline struct ip_vs_dest * | |
6437 | +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) | |
6438 | +{ | |
6439 | + register struct list_head *l, *e; | |
6440 | + struct ip_vs_dest *dest, *least; | |
6441 | + int loh, doh; | |
6442 | + | |
6443 | + l = &svc->destinations; | |
6444 | + if (l == l->next) | |
6445 | + return NULL; | |
6446 | + | |
6447 | + /* | |
6448 | + * We think the overhead of processing active connections is fifty | |
6449 | + * times that of inactive connections on average. (This fifty | |
6450 | + * times might be not accurate, we will change it later.) We use | |
6451 | + * the following formula to estimate the overhead: | |
6452 | + * dest->activeconns*50 + dest->inactconns | |
6453 | + * and the load: | |
6454 | + * (dest overhead) / dest->weight | |
6455 | + * | |
6456 | + * Remember -- no floats in kernel mode!!! | |
6457 | + * The comparison of h1*w2 > h2*w1 is equivalent to that of | |
6458 | + * h1/w1 > h2/w2 | |
6459 | + * if every weight is larger than zero. | |
6460 | + * | |
6461 | + * The server with weight=0 is quiesced and will not receive any | |
6462 | + * new connection. | |
6463 | + */ | |
6464 | + | |
6465 | + for (e=l->next; e!=l; e=e->next) { | |
6466 | + least = list_entry(e, struct ip_vs_dest, n_list); | |
6467 | + if (least->weight > 0) { | |
6468 | + loh = atomic_read(&least->activeconns) * 50 | |
6469 | + + atomic_read(&least->inactconns); | |
6470 | + goto nextstage; | |
6471 | + } | |
6472 | + } | |
6473 | + return NULL; | |
6474 | + | |
6475 | + /* | |
6476 | + * Find the destination with the least load. | |
6477 | + */ | |
6478 | + nextstage: | |
6479 | + for (e=e->next; e!=l; e=e->next) { | |
6480 | + dest = list_entry(e, struct ip_vs_dest, n_list); | |
6481 | + doh = atomic_read(&dest->activeconns) * 50 | |
6482 | + + atomic_read(&dest->inactconns); | |
6483 | + if (loh*dest->weight > doh*least->weight) { | |
6484 | + least = dest; | |
6485 | + loh = doh; | |
6486 | + } | |
6487 | + } | |
6488 | + | |
6489 | + IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " | |
6490 | + "activeconns %d refcnt %d weight %d overhead %d\n", | |
6491 | + NIPQUAD(least->addr), ntohs(least->port), | |
6492 | + atomic_read(&least->activeconns), | |
6493 | + atomic_read(&least->refcnt), least->weight, loh); | |
6494 | + | |
6495 | + return least; | |
6496 | +} | |
6497 | + | |
6498 | + | |
6499 | +/* | |
6500 | + * If this destination server is overloaded and there is a less loaded | |
6501 | + * server, then return true. | |
6502 | + */ | |
6503 | +static inline int | |
6504 | +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) | |
6505 | +{ | |
6506 | + if (atomic_read(&dest->activeconns) > dest->weight) { | |
6507 | + register struct list_head *l, *e; | |
6508 | + struct ip_vs_dest *d; | |
6509 | + | |
6510 | + l = &svc->destinations; | |
6511 | + for (e=l->next; e!=l; e=e->next) { | |
6512 | + d = list_entry(e, struct ip_vs_dest, n_list); | |
6513 | + if (atomic_read(&d->activeconns)*2 < d->weight) { | |
6514 | + return 1; | |
6515 | + } | |
6516 | + } | |
6517 | + } | |
6518 | + return 0; | |
6519 | +} | |
6520 | + | |
6521 | +/* | |
6522 | + * LBLCR scheduling: choose the server for iph->daddr from its cached | |
6523 | + * server set (via WLC when absent/overloaded); NULL if none available. | |
6524 | + */ | |
6525 | +static struct ip_vs_dest * | |
6526 | +ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) | |
6527 | +{ | |
6528 | + struct ip_vs_dest *dest; | |
6529 | + struct ip_vs_lblcr_table *tbl; | |
6530 | + struct ip_vs_lblcr_entry *en; | |
6531 | + | |
6532 | + IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); | |
6533 | + | |
6534 | + tbl = (struct ip_vs_lblcr_table *)svc->sched_data; | |
6535 | + en = ip_vs_lblcr_get(tbl, iph->daddr); | |
6536 | + if (en == NULL) { | |
6537 | + dest = __ip_vs_wlc_schedule(svc, iph); | |
6538 | + if (dest == NULL) { | |
6539 | + IP_VS_DBG(1, "no destination available\n"); | |
6540 | + return NULL; | |
6541 | + } | |
6542 | + en = ip_vs_lblcr_new(iph->daddr); | |
6543 | + if (en == NULL) { | |
6544 | + return dest; /* cannot cache it, but dest is still usable */ | |
6545 | + } | |
6546 | + ip_vs_dest_set_insert(&en->set, dest); | |
6547 | + ip_vs_lblcr_hash(tbl, en); | |
6548 | + } else { | |
6549 | + dest = ip_vs_dest_set_min(&en->set); | |
6550 | + if (!dest || is_overloaded(dest, svc)) { | |
6551 | + dest = __ip_vs_wlc_schedule(svc, iph); | |
6552 | + if (dest == NULL) { | |
6553 | + IP_VS_DBG(1, "no destination available\n"); | |
6554 | + return NULL; | |
6555 | + } | |
6556 | + ip_vs_dest_set_insert(&en->set, dest); | |
6557 | + } | |
6558 | + if (atomic_read(&en->set.size) > 1 && | |
6559 | + jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { | |
6560 | + struct ip_vs_dest *m; | |
6561 | + m = ip_vs_dest_set_max(&en->set); | |
6562 | + if (m) ip_vs_dest_set_erase(&en->set, m); | |
6563 | + } | |
6564 | + } | |
6565 | + en->lastuse = jiffies; /* read by the expiry scans */ | |
6566 | + | |
6567 | + IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " | |
6568 | + "--> server %u.%u.%u.%u:%d\n", | |
6569 | + NIPQUAD(en->addr), | |
6570 | + NIPQUAD(dest->addr), | |
6571 | + ntohs(dest->port)); | |
6572 | + | |
6573 | + return dest; | |
6574 | +} | |
6575 | + | |
6576 | + | |
6577 | +/* | |
6578 | + * IPVS LBLCR Scheduler structure | |
6579 | + */ | |
6580 | +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = | |
6581 | +{ | |
6582 | + {0}, /* n_list */ | |
6583 | + "lblcr", /* name */ | |
6584 | + ATOMIC_INIT(0), /* refcnt */ | |
6585 | + ip_vs_lblcr_init_svc, /* service initializer */ | |
6586 | + ip_vs_lblcr_done_svc, /* service done */ | |
6587 | + ip_vs_lblcr_update_svc, /* service updater */ | |
6588 | + ip_vs_lblcr_schedule, /* select a server from the destination list */ | |
6589 | +}; | |
6590 | + | |
6591 | + | |
6592 | +__initfunc(int ip_vs_lblcr_init(void)) | |
6593 | +{ | |
6594 | + IP_VS_INFO("Initializing LBLCR scheduling\n"); | |
6595 | + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); | |
6596 | + lblcr_sysctl_table.sysctl_header = | |
6597 | + register_sysctl_table(lblcr_sysctl_table.root_dir, 0); | |
6598 | + return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); | |
6599 | +} | |
6600 | + | |
6601 | + | |
6602 | +#ifdef MODULE | |
6603 | +EXPORT_NO_SYMBOLS; | |
6604 | + | |
6605 | +int init_module(void) | |
6606 | +{ | |
6607 | + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); | |
6608 | + | |
6609 | + /* module initialization by 'request_module' */ | |
6610 | + if (register_ip_vs_scheduler(&ip_vs_lblcr_scheduler) != 0) | |
6611 | + return -EIO; | |
6612 | + | |
6613 | + lblcr_sysctl_table.sysctl_header = | |
6614 | + register_sysctl_table(lblcr_sysctl_table.root_dir, 0); | |
6615 | + | |
6616 | + IP_VS_INFO("LBLCR scheduling module loaded.\n"); | |
6617 | + | |
6618 | + return 0; | |
6619 | +} | |
6620 | + | |
6621 | +void cleanup_module(void) | |
6622 | +{ | |
6623 | + /* module cleanup by 'release_module' */ | |
6624 | + if (unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler) != 0) { | |
6625 | + IP_VS_INFO("cannot remove LBLCR scheduling module\n"); | |
6626 | + } else { | |
6627 | + IP_VS_INFO("LBLCR scheduling module unloaded.\n"); | |
6628 | + } | |
6629 | + unregister_sysctl_table(lblcr_sysctl_table.sysctl_header); | |
6630 | +} | |
6631 | + | |
6632 | +#endif /* MODULE */ | |
6633 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_lc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lc.c | |
6634 | --- linux-2.2.19/net/ipv4/ip_vs_lc.c Thu Jan 1 08:00:00 1970 | |
6635 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_lc.c Fri Nov 24 10:02:53 2000 | |
6636 | @@ -0,0 +1,159 @@ | |
6637 | +/* | |
6638 | + * IPVS: Least-Connection Scheduling module | |
6639 | + * | |
6640 | + * Version: $Id$ | |
6641 | + * | |
6642 | + * Authors: Wensong Zhang <wensong@iinchina.net> | |
6643 | + * | |
6644 | + * This program is free software; you can redistribute it and/or | |
6645 | + * modify it under the terms of the GNU General Public License | |
6646 | + * as published by the Free Software Foundation; either version | |
6647 | + * 2 of the License, or (at your option) any later version. | |
6648 | + * | |
6649 | + * Changes: | |
6650 | + * Wensong Zhang : added the ip_vs_lc_update_svc | |
6651 | + * Wensong Zhang : added any dest with weight=0 is quiesced | |
6652 | + * | |
6653 | + */ | |
6654 | + | |
6655 | +#include <linux/config.h> | |
6656 | +#include <linux/module.h> | |
6657 | +#ifdef CONFIG_KMOD | |
6658 | +#include <linux/kmod.h> | |
6659 | +#endif | |
6660 | +#include <linux/types.h> | |
6661 | +#include <linux/kernel.h> | |
6662 | +#include <linux/errno.h> | |
6663 | +#include <net/ip_masq.h> | |
6664 | +#ifdef CONFIG_IP_MASQUERADE_MOD | |
6665 | +#include <net/ip_masq_mod.h> | |
6666 | +#endif | |
6667 | +#include <linux/ip_fw.h> | |
6668 | +#include <net/ip_vs.h> | |
6669 | + | |
6670 | + | |
6671 | +static int ip_vs_lc_init_svc(struct ip_vs_service *svc) | |
6672 | +{ | |
6673 | + MOD_INC_USE_COUNT; | |
6674 | + return 0; | |
6675 | +} | |
6676 | + | |
6677 | + | |
6678 | +static int ip_vs_lc_done_svc(struct ip_vs_service *svc) | |
6679 | +{ | |
6680 | + MOD_DEC_USE_COUNT; | |
6681 | + return 0; | |
6682 | +} | |
6683 | + | |
6684 | + | |
6685 | +static int ip_vs_lc_update_svc(struct ip_vs_service *svc) | |
6686 | +{ | |
6687 | + return 0; | |
6688 | +} | |
6689 | + | |
6690 | + | |
6691 | +/* | |
6692 | + * Least Connection scheduling | |
6693 | + */ | |
6694 | +static struct ip_vs_dest* ip_vs_lc_schedule(struct ip_vs_service *svc, | |
6695 | + struct iphdr *iph) | |
6696 | +{ | |
6697 | + struct list_head *l, *e; | |
6698 | + struct ip_vs_dest *dest, *least; | |
6699 | + int lac, dac; | |
6700 | + | |
6701 | + IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); | |
6702 | + | |
6703 | + l = &svc->destinations; | |
6704 | + if (l == l->next) | |
6705 | + return NULL; | |
6706 | + | |
6707 | + /* | |
6708 | + * Simply select the server with the least number of | |
6709 | + * (activeconns<<5) + inactconns | |
6710 | + * Except whose weight is equal to zero. | |
6711 | + * If the weight is equal to zero, it means that the server is | |
6712 | + * quiesced, the existing connections to the server still get | |
6713 | + * served, but no new connection is assigned to the server. | |
6714 | + */ | |
6715 | + | |
6716 | + for (e=l->next; e!=l; e=e->next) { | |
6717 | + least = list_entry (e, struct ip_vs_dest, n_list); | |
6718 | + if (least->weight > 0) { | |
6719 | + lac = (atomic_read(&least->activeconns) << 5) | |
6720 | + + atomic_read(&least->inactconns); | |
6721 | + goto nextstage; | |
6722 | + } | |
6723 | + } | |
6724 | + return NULL; | |
6725 | + | |
6726 | + /* | |
6727 | + * Find the destination with the least load. | |
6728 | + */ | |
6729 | + nextstage: | |
6730 | + for (e=e->next; e!=l; e=e->next) { | |
6731 | + dest = list_entry(e, struct ip_vs_dest, n_list); | |
6732 | + if (dest->weight == 0) | |
6733 | + continue; | |
6734 | + dac = (atomic_read(&dest->activeconns) << 5) | |
6735 | + + atomic_read(&dest->inactconns); | |
6736 | + if (dac < lac) { | |
6737 | + least = dest; | |
6738 | + lac = dac; | |
6739 | + } | |
6740 | + } | |
6741 | + | |
6742 | + IP_VS_DBG(6, "LC: server %d.%d.%d.%d:%d activeconns %d inactconns %d\n", | |
6743 | + NIPQUAD(least->addr), ntohs(least->port), | |
6744 | + atomic_read(&least->activeconns), | |
6745 | + atomic_read(&least->inactconns)); | |
6746 | + | |
6747 | + return least; | |
6748 | +} | |
6749 | + | |
6750 | + | |
6751 | +static struct ip_vs_scheduler ip_vs_lc_scheduler = { | |
6752 | + {0}, /* n_list */ | |
6753 | + "lc", /* name */ | |
6754 | + ATOMIC_INIT(0), /* refcnt */ | |
6755 | + ip_vs_lc_init_svc, /* service initializer */ | |
6756 | + ip_vs_lc_done_svc, /* service done */ | |
6757 | + ip_vs_lc_update_svc, /* service updater */ | |
6758 | + ip_vs_lc_schedule, /* select a server from the destination list */ | |
6759 | +}; | |
6760 | + | |
6761 | + | |
6762 | +__initfunc(int ip_vs_lc_init(void)) | |
6763 | +{ | |
6764 | + IP_VS_INFO("Initializing LC scheduling\n"); | |
6765 | + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list); | |
6766 | + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; | |
6767 | +} | |
6768 | + | |
6769 | + | |
6770 | +#ifdef MODULE | |
6771 | +EXPORT_NO_SYMBOLS; | |
6772 | + | |
6773 | +int init_module(void) | |
6774 | +{ | |
6775 | + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list); | |
6776 | + | |
6777 | + /* module initialization by 'request_module' */ | |
6778 | + if(register_ip_vs_scheduler(&ip_vs_lc_scheduler) != 0) | |
6779 | + return -EIO; | |
6780 | + | |
6781 | + IP_VS_INFO("LC scheduling module loaded.\n"); | |
6782 | + | |
6783 | + return 0; | |
6784 | +} | |
6785 | + | |
6786 | +void cleanup_module(void) | |
6787 | +{ | |
6788 | + /* module cleanup by 'release_module' */ | |
6789 | + if(unregister_ip_vs_scheduler(&ip_vs_lc_scheduler) != 0) | |
6790 | + IP_VS_INFO("cannot remove LC scheduling module\n"); | |
6791 | + else | |
6792 | + IP_VS_INFO("LC scheduling module unloaded.\n"); | |
6793 | +} | |
6794 | + | |
6795 | +#endif /* MODULE */ | |
6796 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_rr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_rr.c | |
6797 | --- linux-2.2.19/net/ipv4/ip_vs_rr.c Thu Jan 1 08:00:00 1970 | |
6798 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_rr.c Fri Nov 24 10:04:12 2000 | |
6799 | @@ -0,0 +1,145 @@ | |
6800 | +/* | |
6801 | + * IPVS: Round-Robin Scheduling module | |
6802 | + * | |
6803 | + * Version: $Id$ | |
6804 | + * | |
6805 | + * Authors: Wensong Zhang <wensong@iinchina.net> | |
6806 | + * Peter Kese <peter.kese@ijs.si> | |
6807 | + * | |
6808 | + * This program is free software; you can redistribute it and/or | |
6809 | + * modify it under the terms of the GNU General Public License | |
6810 | + * as published by the Free Software Foundation; either version | |
6811 | + * 2 of the License, or (at your option) any later version. | |
6812 | + * | |
6813 | + * Fixes/Changes: | |
6814 | + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest | |
6815 | + * Julian Anastasov : fixed the NULL pointer access bug in debugging | |
6816 | + * Wensong Zhang : changed some cosmetic things for debugging | |
6817 | + * Wensong Zhang : changed for the d-linked destination list | |
6818 | + * Wensong Zhang : added the ip_vs_rr_update_svc | |
6819 | + * Wensong Zhang : added any dest with weight=0 is quiesced | |
6820 | + * | |
6821 | + */ | |
6822 | + | |
6823 | +#include <linux/config.h> | |
6824 | +#include <linux/module.h> | |
6825 | +#ifdef CONFIG_KMOD | |
6826 | +#include <linux/kmod.h> | |
6827 | +#endif | |
6828 | +#include <linux/types.h> | |
6829 | +#include <linux/kernel.h> | |
6830 | +#include <linux/errno.h> | |
6831 | +#include <net/ip_masq.h> | |
6832 | +#ifdef CONFIG_IP_MASQUERADE_MOD | |
6833 | +#include <net/ip_masq_mod.h> | |
6834 | +#endif | |
6835 | +#include <linux/ip_fw.h> | |
6836 | +#include <net/ip_vs.h> | |
6837 | + | |
6838 | + | |
6839 | +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) | |
6840 | +{ | |
6841 | + svc->sched_data = &svc->destinations; | |
6842 | + MOD_INC_USE_COUNT; | |
6843 | + return 0; | |
6844 | +} | |
6845 | + | |
6846 | + | |
6847 | +static int ip_vs_rr_done_svc(struct ip_vs_service *svc) | |
6848 | +{ | |
6849 | + MOD_DEC_USE_COUNT; | |
6850 | + return 0; | |
6851 | +} | |
6852 | + | |
6853 | + | |
6854 | +static int ip_vs_rr_update_svc(struct ip_vs_service *svc) | |
6855 | +{ | |
6856 | + svc->sched_data = &svc->destinations; | |
6857 | + return 0; | |
6858 | +} | |
6859 | + | |
6860 | + | |
6861 | +/* | |
6862 | + * Round-Robin Scheduling | |
6863 | + */ | |
6864 | +static struct ip_vs_dest* ip_vs_rr_schedule(struct ip_vs_service *svc, | |
6865 | + struct iphdr *iph) | |
6866 | +{ | |
6867 | + register struct list_head *p, *q; | |
6868 | + struct ip_vs_dest *dest; | |
6869 | + | |
6870 | + IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); | |
6871 | + | |
6872 | + p = (struct list_head *)svc->sched_data; | |
6873 | + p = p->next; | |
6874 | + q = p; | |
6875 | + do { | |
6876 | + if (q == &svc->destinations) { | |
6877 | + q = q->next; | |
6878 | + continue; | |
6879 | + } | |
6880 | + dest = list_entry(q, struct ip_vs_dest, n_list); | |
6881 | + if (dest->weight > 0) | |
6882 | + /* HIT */ | |
6883 | + goto out; | |
6884 | + q = q->next; | |
6885 | + } while (q != p); | |
6886 | + return NULL; | |
6887 | + | |
6888 | + out: | |
6889 | + svc->sched_data = q; | |
6890 | + IP_VS_DBG(6, "RR: server %d.%d.%d.%d:%d " | |
6891 | + "activeconns %d refcnt %d weight %d\n", | |
6892 | + NIPQUAD(dest->addr), ntohs(dest->port), | |
6893 | + atomic_read(&dest->activeconns), | |
6894 | + atomic_read(&dest->refcnt), dest->weight); | |
6895 | + | |
6896 | + return dest; | |
6897 | +} | |
6898 | + | |
6899 | + | |
6900 | +static struct ip_vs_scheduler ip_vs_rr_scheduler = { | |
6901 | + {0}, /* n_list */ | |
6902 | + "rr", /* name */ | |
6903 | + ATOMIC_INIT(0), /* refcnt */ | |
6904 | + ip_vs_rr_init_svc, /* service initializer */ | |
6905 | + ip_vs_rr_done_svc, /* service done */ | |
6906 | + ip_vs_rr_update_svc, /* service updater */ | |
6907 | + ip_vs_rr_schedule, /* select a server from the destination list */ | |
6908 | +}; | |
6909 | + | |
6910 | + | |
6911 | +__initfunc(int ip_vs_rr_init(void)) | |
6912 | +{ | |
6913 | + IP_VS_INFO("Initializing RR scheduling\n"); | |
6914 | + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list); | |
6915 | + return register_ip_vs_scheduler(&ip_vs_rr_scheduler) ; | |
6916 | +} | |
6917 | + | |
6918 | + | |
6919 | +#ifdef MODULE | |
6920 | +EXPORT_NO_SYMBOLS; | |
6921 | + | |
6922 | +int init_module(void) | |
6923 | +{ | |
6924 | + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list); | |
6925 | + | |
6926 | + /* module initialization by 'request_module' */ | |
6927 | + if(register_ip_vs_scheduler(&ip_vs_rr_scheduler) != 0) | |
6928 | + return -EIO; | |
6929 | + | |
6930 | + IP_VS_INFO("RR scheduling module loaded.\n"); | |
6931 | + | |
6932 | + return 0; | |
6933 | +} | |
6934 | + | |
6935 | +void cleanup_module(void) | |
6936 | +{ | |
6937 | + /* module cleanup by 'release_module' */ | |
6938 | + if(unregister_ip_vs_scheduler(&ip_vs_rr_scheduler) != 0) | |
6939 | + IP_VS_INFO("cannot remove RR scheduling module\n"); | |
6940 | + else | |
6941 | + IP_VS_INFO("RR scheduling module unloaded.\n"); | |
6942 | +} | |
6943 | + | |
6944 | +#endif /* MODULE */ | |
6945 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_wlc.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wlc.c | |
6946 | --- linux-2.2.19/net/ipv4/ip_vs_wlc.c Thu Jan 1 08:00:00 1970 | |
6947 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wlc.c Fri Nov 24 09:59:32 2000 | |
6948 | @@ -0,0 +1,176 @@ | |
6949 | +/* | |
6950 | + * IPVS: Weighted Least-Connection Scheduling module | |
6951 | + * | |
6952 | + * Version: $Id$ | |
6953 | + * | |
6954 | + * Authors: Wensong Zhang <wensong@iinchina.net> | |
6955 | + * Peter Kese <peter.kese@ijs.si> | |
6956 | + * | |
6957 | + * This program is free software; you can redistribute it and/or | |
6958 | + * modify it under the terms of the GNU General Public License | |
6959 | + * as published by the Free Software Foundation; either version | |
6960 | + * 2 of the License, or (at your option) any later version. | |
6961 | + * | |
6962 | + * Changes: | |
6963 | + * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest | |
6964 | + * Wensong Zhang : changed to use the inactconns in scheduling | |
6965 | + * Wensong Zhang : changed some cosmetic things for debugging | |
6966 | + * Wensong Zhang : changed for the d-linked destination list | |
6967 | + * Wensong Zhang : added the ip_vs_wlc_update_svc | |
6968 | + * Wensong Zhang : added any dest with weight=0 is quiesced | |
6969 | + * | |
6970 | + */ | |
6971 | + | |
6972 | +#include <linux/config.h> | |
6973 | +#include <linux/module.h> | |
6974 | +#ifdef CONFIG_KMOD | |
6975 | +#include <linux/kmod.h> | |
6976 | +#endif | |
6977 | +#include <linux/types.h> | |
6978 | +#include <linux/kernel.h> | |
6979 | +#include <linux/errno.h> | |
6980 | +#include <net/ip_masq.h> | |
6981 | +#ifdef CONFIG_IP_MASQUERADE_MOD | |
6982 | +#include <net/ip_masq_mod.h> | |
6983 | +#endif | |
6984 | +#include <linux/ip_fw.h> | |
6985 | +#include <net/ip_vs.h> | |
6986 | + | |
6987 | + | |
6988 | +static int | |
6989 | +ip_vs_wlc_init_svc(struct ip_vs_service *svc) | |
6990 | +{ | |
6991 | + MOD_INC_USE_COUNT; | |
6992 | + return 0; | |
6993 | +} | |
6994 | + | |
6995 | + | |
6996 | +static int | |
6997 | +ip_vs_wlc_done_svc(struct ip_vs_service *svc) | |
6998 | +{ | |
6999 | + MOD_DEC_USE_COUNT; | |
7000 | + return 0; | |
7001 | +} | |
7002 | + | |
7003 | + | |
7004 | +static int | |
7005 | +ip_vs_wlc_update_svc(struct ip_vs_service *svc) | |
7006 | +{ | |
7007 | + return 0; | |
7008 | +} | |
7009 | + | |
7010 | + | |
7011 | +/* | |
7012 | + * Weighted Least Connection scheduling | |
7013 | + */ | |
7014 | +static struct ip_vs_dest * | |
7015 | +ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) | |
7016 | +{ | |
7017 | + register struct list_head *l, *e; | |
7018 | + struct ip_vs_dest *dest, *least; | |
7019 | + int loh, doh; | |
7020 | + | |
7021 | + IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); | |
7022 | + | |
7023 | + l = &svc->destinations; | |
7024 | + if (l == l->next) | |
7025 | + return NULL; | |
7026 | + | |
7027 | + /* | |
7028 | + * We assume the overhead of processing an active connection is fifty | |
7029 | + * times that of an inactive connection on average. (This factor of | |
7030 | + * fifty might not be accurate; we will change it later.) We use | |
7031 | + * the following formula to estimate the overhead: | |
7032 | + * dest->activeconns*50 + dest->inactconns | |
7033 | + * and the load: | |
7034 | + * (dest overhead) / dest->weight | |
7035 | + * | |
7036 | + * Remember -- no floats in kernel mode!!! | |
7037 | + * The comparison of h1*w2 > h2*w1 is equivalent to that of | |
7038 | + * h1/w1 > h2/w2 | |
7039 | + * if every weight is larger than zero. | |
7040 | + * | |
7041 | + * The server with weight=0 is quiesced and will not receive any | |
7042 | + * new connection. | |
7043 | + */ | |
7044 | + | |
7045 | + for (e=l->next; e!=l; e=e->next) { | |
7046 | + least = list_entry(e, struct ip_vs_dest, n_list); | |
7047 | + if (least->weight > 0) { | |
7048 | + loh = atomic_read(&least->activeconns) * 50 | |
7049 | + + atomic_read(&least->inactconns); | |
7050 | + goto nextstage; | |
7051 | + } | |
7052 | + } | |
7053 | + return NULL; | |
7054 | + | |
7055 | + /* | |
7056 | + * Find the destination with the least load. | |
7057 | + */ | |
7058 | + nextstage: | |
7059 | + for (e=e->next; e!=l; e=e->next) { | |
7060 | + dest = list_entry(e, struct ip_vs_dest, n_list); | |
7061 | + doh = atomic_read(&dest->activeconns) * 50 | |
7062 | + + atomic_read(&dest->inactconns); | |
7063 | + if (loh * dest->weight > doh * least->weight) { | |
7064 | + least = dest; | |
7065 | + loh = doh; | |
7066 | + } | |
7067 | + } | |
7068 | + | |
7069 | + IP_VS_DBG(6, "WLC: server %d.%d.%d.%d:%d " | |
7070 | + "activeconns %d refcnt %d weight %d overhead %d\n", | |
7071 | + NIPQUAD(least->addr), ntohs(least->port), | |
7072 | + atomic_read(&least->activeconns), | |
7073 | + atomic_read(&least->refcnt), least->weight, loh); | |
7074 | + | |
7075 | + return least; | |
7076 | +} | |
7077 | + | |
7078 | + | |
7079 | +static struct ip_vs_scheduler ip_vs_wlc_scheduler = | |
7080 | +{ | |
7081 | + {0}, /* n_list */ | |
7082 | + "wlc", /* name */ | |
7083 | + ATOMIC_INIT (0), /* refcnt */ | |
7084 | + ip_vs_wlc_init_svc, /* service initializer */ | |
7085 | + ip_vs_wlc_done_svc, /* service done */ | |
7086 | + ip_vs_wlc_update_svc, /* service updater */ | |
7087 | + ip_vs_wlc_schedule, /* select a server from the destination list */ | |
7088 | +}; | |
7089 | + | |
7090 | + | |
7091 | +__initfunc(int ip_vs_wlc_init (void)) | |
7092 | +{ | |
7093 | + IP_VS_INFO("Initializing WLC scheduling\n"); | |
7094 | + INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list); | |
7095 | + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); | |
7096 | +} | |
7097 | + | |
7098 | + | |
7099 | +#ifdef MODULE | |
7100 | +EXPORT_NO_SYMBOLS; | |
7101 | + | |
7102 | +int init_module(void) | |
7103 | +{ | |
7104 | + INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list); | |
7105 | + | |
7106 | + /* module initialization by 'request_module' */ | |
7107 | + if (register_ip_vs_scheduler(&ip_vs_wlc_scheduler) != 0) | |
7108 | + return -EIO; | |
7109 | + | |
7110 | + IP_VS_INFO("WLC scheduling module loaded.\n"); | |
7111 | + | |
7112 | + return 0; | |
7113 | +} | |
7114 | + | |
7115 | +void cleanup_module(void) | |
7116 | +{ | |
7117 | + /* module cleanup by 'release_module' */ | |
7118 | + if (unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler) != 0) | |
7119 | + IP_VS_INFO("cannot remove WLC scheduling module\n"); | |
7120 | + else | |
7121 | + IP_VS_INFO("WLC scheduling module unloaded.\n"); | |
7122 | +} | |
7123 | + | |
7124 | +#endif /* MODULE */ | |
7125 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/ip_vs_wrr.c linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wrr.c | |
7126 | --- linux-2.2.19/net/ipv4/ip_vs_wrr.c Thu Jan 1 08:00:00 1970 | |
7127 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/ip_vs_wrr.c Fri Nov 24 09:57:23 2000 | |
7128 | @@ -0,0 +1,209 @@ | |
7129 | +/* | |
7130 | + * IPVS: Weighted Round-Robin Scheduling module | |
7131 | + * | |
7132 | + * Version: $Id$ | |
7133 | + * | |
7134 | + * Authors: Wensong Zhang <wensong@iinchina.net> | |
7135 | + * | |
7136 | + * This program is free software; you can redistribute it and/or | |
7137 | + * modify it under the terms of the GNU General Public License | |
7138 | + * as published by the Free Software Foundation; either version | |
7139 | + * 2 of the License, or (at your option) any later version. | |
7140 | + * | |
7141 | + * Changes: | |
7142 | + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest | |
7143 | + * Wensong Zhang : changed some cosmetic things for debugging | |
7144 | + * Wensong Zhang : changed for the d-linked destination list | |
7145 | + * Wensong Zhang : added the ip_vs_wrr_update_svc | |
7146 | + * Julian Anastasov : return -ENOMEM instead of ENOMEM in the | |
7147 | + * ip_vs_wrr_init_svc | |
7148 | + * Julian Anastasov : fixed the bug of returning destination | |
7149 | + * with weight 0 when all weights are zero | |
7150 | + * | |
7151 | + */ | |
7152 | + | |
7153 | +#include <linux/config.h> | |
7154 | +#include <linux/module.h> | |
7155 | +#ifdef CONFIG_KMOD | |
7156 | +#include <linux/kmod.h> | |
7157 | +#endif | |
7158 | +#include <linux/types.h> | |
7159 | +#include <linux/kernel.h> | |
7160 | +#include <linux/errno.h> | |
7161 | +#include <net/ip_masq.h> | |
7162 | +#ifdef CONFIG_IP_MASQUERADE_MOD | |
7163 | +#include <net/ip_masq_mod.h> | |
7164 | +#endif | |
7165 | +#include <linux/ip_fw.h> | |
7166 | +#include <net/ip_vs.h> | |
7167 | + | |
7168 | +/* | |
7169 | + * current destination pointer for weighted round-robin scheduling | |
7170 | + */ | |
7171 | +struct ip_vs_wrr_mark { | |
7172 | + struct list_head *cl; /* current list head */ | |
7173 | + int cw; /* current weight */ | |
7174 | +}; | |
7175 | + | |
7176 | + | |
7177 | +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) | |
7178 | +{ | |
7179 | + /* | |
7180 | + * Allocate the mark variable for WRR scheduling | |
7181 | + */ | |
7182 | + svc->sched_data = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); | |
7183 | + | |
7184 | + if (svc->sched_data == NULL) { | |
7185 | + IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); | |
7186 | + return -ENOMEM; | |
7187 | + } | |
7188 | + memset(svc->sched_data, 0, sizeof(struct ip_vs_wrr_mark)); | |
7189 | + | |
7190 | + ((struct ip_vs_wrr_mark*)svc->sched_data)->cl = &svc->destinations; | |
7191 | + | |
7192 | + MOD_INC_USE_COUNT; | |
7193 | + return 0; | |
7194 | +} | |
7195 | + | |
7196 | + | |
7197 | +static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) | |
7198 | +{ | |
7199 | + /* | |
7200 | + * Release the mark variable | |
7201 | + */ | |
7202 | + kfree_s(svc->sched_data, sizeof(struct ip_vs_wrr_mark)); | |
7203 | + | |
7204 | + MOD_DEC_USE_COUNT; | |
7205 | + return 0; | |
7206 | +} | |
7207 | + | |
7208 | + | |
7209 | +static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) | |
7210 | +{ | |
7211 | + ((struct ip_vs_wrr_mark*)svc->sched_data)->cl = &svc->destinations; | |
7212 | + return 0; | |
7213 | +} | |
7214 | + | |
7215 | + | |
7216 | +/* | |
7217 | + * Get the maximum weight of the service destinations. | |
7218 | + */ | |
7219 | +int ip_vs_wrr_max_weight(struct ip_vs_service *svc) | |
7220 | +{ | |
7221 | + register struct list_head *l, *e; | |
7222 | + struct ip_vs_dest *dest; | |
7223 | + int weight = 0; | |
7224 | + | |
7225 | + l = &svc->destinations; | |
7226 | + for (e=l->next; e!=l; e=e->next) { | |
7227 | + dest = list_entry(e, struct ip_vs_dest, n_list); | |
7228 | + if (dest->weight > weight) | |
7229 | + weight = dest->weight; | |
7230 | + } | |
7231 | + | |
7232 | + return weight; | |
7233 | +} | |
7234 | + | |
7235 | + | |
7236 | +/* | |
7237 | + * Weighted Round-Robin Scheduling | |
7238 | + */ | |
7239 | +static struct ip_vs_dest* ip_vs_wrr_schedule(struct ip_vs_service *svc, | |
7240 | + struct iphdr *iph) | |
7241 | +{ | |
7242 | + struct ip_vs_dest *dest; | |
7243 | + struct ip_vs_wrr_mark *mark = svc->sched_data; | |
7244 | + | |
7245 | + IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); | |
7246 | + | |
7247 | + /* | |
7248 | + * This loop will always terminate, because 0<mark->cw<=max_weight, | |
7249 | + * and at least one server has its weight equal to max_weight. | |
7250 | + */ | |
7251 | + while (1) { | |
7252 | + if (mark->cl == &svc->destinations) { | |
7253 | + /* it is at the head of the destination list */ | |
7254 | + | |
7255 | + if (mark->cl == mark->cl->next) | |
7256 | + /* no dest entry */ | |
7257 | + return NULL; | |
7258 | + | |
7259 | + mark->cl = svc->destinations.next; | |
7260 | + mark->cw--; | |
7261 | + if (mark->cw <= 0) { | |
7262 | + mark->cw = ip_vs_wrr_max_weight(svc); | |
7263 | + /* | |
7264 | + * Still zero, which means no available servers. | |
7265 | + */ | |
7266 | + if (mark->cw == 0) { | |
7267 | + mark->cl = &svc->destinations; | |
7268 | + IP_VS_INFO("ip_vs_wrr_schedule(): " | |
7269 | + "no available servers\n"); | |
7270 | + return NULL; | |
7271 | + } | |
7272 | + } | |
7273 | + } | |
7274 | + else mark->cl = mark->cl->next; | |
7275 | + | |
7276 | + if (mark->cl != &svc->destinations) { | |
7277 | + /* not at the head of the list */ | |
7278 | + dest = list_entry(mark->cl, struct ip_vs_dest, n_list); | |
7279 | + if (dest->weight >= mark->cw) | |
7280 | + break; | |
7281 | + } | |
7282 | + } | |
7283 | + | |
7284 | + IP_VS_DBG(6, "WRR: server %d.%d.%d.%d:%d " | |
7285 | + "activeconns %d refcnt %d weight %d\n", | |
7286 | + NIPQUAD(dest->addr), ntohs(dest->port), | |
7287 | + atomic_read(&dest->activeconns), | |
7288 | + atomic_read(&dest->refcnt), dest->weight); | |
7289 | + | |
7290 | + return dest; | |
7291 | +} | |
7292 | + | |
7293 | + | |
7294 | +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { | |
7295 | + {0}, /* n_list */ | |
7296 | + "wrr", /* name */ | |
7297 | + ATOMIC_INIT(0), /* refcnt */ | |
7298 | + ip_vs_wrr_init_svc, /* service initializer */ | |
7299 | + ip_vs_wrr_done_svc, /* service done */ | |
7300 | + ip_vs_wrr_update_svc, /* service updater */ | |
7301 | + ip_vs_wrr_schedule, /* select a server from the destination list */ | |
7302 | +}; | |
7303 | + | |
7304 | + | |
7305 | +__initfunc(int ip_vs_wrr_init(void)) | |
7306 | +{ | |
7307 | + IP_VS_INFO("Initializing WRR scheduling\n"); | |
7308 | + INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list); | |
7309 | + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; | |
7310 | +} | |
7311 | + | |
7312 | +#ifdef MODULE | |
7313 | +EXPORT_NO_SYMBOLS; | |
7314 | + | |
7315 | +int init_module(void) | |
7316 | +{ | |
7317 | + INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list); | |
7318 | + | |
7319 | + /* module initialization by 'request_module' */ | |
7320 | + if(register_ip_vs_scheduler(&ip_vs_wrr_scheduler) != 0) | |
7321 | + return -EIO; | |
7322 | + | |
7323 | + IP_VS_INFO("WRR scheduling module loaded.\n"); | |
7324 | + | |
7325 | + return 0; | |
7326 | +} | |
7327 | + | |
7328 | +void cleanup_module(void) | |
7329 | +{ | |
7330 | + /* module cleanup by 'release_module' */ | |
7331 | + if(unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler) != 0) | |
7332 | + IP_VS_INFO("cannot remove WRR scheduling module\n"); | |
7333 | + else | |
7334 | + IP_VS_INFO("WRR scheduling module unloaded.\n"); | |
7335 | +} | |
7336 | + | |
7337 | +#endif /* MODULE */ | |
7338 | diff -urN --exclude-from=/usr/src/exclude linux-2.2.19/net/ipv4/sysctl_net_ipv4.c linux-2.2.19-vs-1.0.8/net/ipv4/sysctl_net_ipv4.c | |
7339 | --- linux-2.2.19/net/ipv4/sysctl_net_ipv4.c Tue Mar 27 09:33:49 2001 | |
7340 | +++ linux-2.2.19-vs-1.0.8/net/ipv4/sysctl_net_ipv4.c Tue Mar 27 09:32:21 2001 | |
7341 | @@ -69,6 +69,9 @@ | |
7342 | struct ipv4_config ipv4_config; | |
7343 | ||
7344 | extern ctl_table ipv4_route_table[]; | |
7345 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
7346 | +extern ctl_table ipv4_vs_table[]; | |
7347 | +#endif | |
7348 | ||
7349 | #ifdef CONFIG_SYSCTL | |
7350 | ||
7351 | @@ -198,7 +201,10 @@ | |
7352 | {NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships", | |
7353 | &sysctl_igmp_max_memberships, sizeof(int), 0644, NULL, &proc_dointvec}, | |
7354 | #endif | |
7355 | +#ifdef CONFIG_IP_MASQUERADE_VS | |
7356 | + {NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table}, | |
7357 | +#endif | |
7358 | {0} | |
7359 | }; | |
7360 | - | |
7361 | + | |
7362 | #endif /* CONFIG_SYSCTL */ |