]>
Commit | Line | Data |
---|---|---|
9a91a2bf | 1 | diff -urNp v2.6.22/linux/include/net/ip_vs.h linux/include/net/ip_vs.h |
2 | --- v2.6.22/linux/include/net/ip_vs.h 2007-02-11 01:06:29.000000000 +0200 | |
3 | +++ linux/include/net/ip_vs.h 2007-07-12 12:03:43.000000000 +0300 | |
4 | @@ -9,6 +9,16 @@ | |
5 | #include <asm/types.h> /* For __uXX types */ | |
6 | #include <linux/types.h> /* For __beXX types in userland */ | |
7 | ||
8 | +#ifdef __KERNEL__ | |
9 | +#include <linux/skbuff.h> | |
10 | +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) | |
11 | +#include <net/netfilter/nf_conntrack.h> | |
12 | +#include <net/netfilter/nf_conntrack_core.h> | |
13 | +#include <net/netfilter/nf_conntrack_expect.h> | |
14 | +#include <net/netfilter/nf_conntrack_helper.h> | |
15 | +#endif | |
16 | +#endif | |
17 | + | |
18 | #define IP_VS_VERSION_CODE 0x010201 | |
19 | #define NVERSION(version) \ | |
20 | (version >> 16) & 0xFF, \ | |
21 | @@ -358,6 +368,8 @@ enum { | |
22 | NET_IPV4_VS_SYNC_THRESHOLD=24, | |
23 | NET_IPV4_VS_NAT_ICMP_SEND=25, | |
24 | NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE=26, | |
25 | + NET_IPV4_VS_SNAT_REROUTE=27, | |
26 | + NET_IPV4_VS_CONNTRACK=28, | |
27 | NET_IPV4_VS_LAST | |
28 | }; | |
29 | ||
30 | @@ -715,6 +727,16 @@ extern void ip_vs_init_hash_table(struct | |
31 | */ | |
32 | ||
33 | /* | |
34 | + * Netfilter connection tracking | |
35 | + * (from ip_vs_nfct.c) | |
36 | + */ | |
37 | +extern int ip_vs_nfct_confirm(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int hooknum); | |
38 | +extern void ip_vs_nfct_expect_related(struct sk_buff *skb, | |
39 | + struct ip_vs_conn *cp, | |
40 | + __be16 port, __u16 proto, int from_rs); | |
41 | +extern void ip_vs_nfct_conn_drop(struct ip_vs_conn *cp); | |
42 | + | |
43 | +/* | |
44 | * IPVS connection entry hash table | |
45 | */ | |
46 | #ifndef CONFIG_IP_VS_TAB_BITS | |
47 | @@ -885,8 +907,41 @@ extern int sysctl_ip_vs_expire_nodest_co | |
48 | extern int sysctl_ip_vs_expire_quiescent_template; | |
49 | extern int sysctl_ip_vs_sync_threshold[2]; | |
50 | extern int sysctl_ip_vs_nat_icmp_send; | |
51 | +extern int sysctl_ip_vs_snat_reroute; | |
52 | extern struct ip_vs_stats ip_vs_stats; | |
53 | ||
54 | +#ifdef CONFIG_IP_VS_NFCT | |
55 | + | |
56 | +extern int sysctl_ip_vs_conntrack; | |
57 | + | |
58 | +static inline int ip_vs_use_conntrack(struct sk_buff *skb) | |
59 | +{ | |
60 | + return sysctl_ip_vs_conntrack && skb->nfct; | |
61 | +} | |
62 | + | |
63 | +/* Returns boolean and skb is freed on failure */ | |
64 | +static inline int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int hooknum) | |
65 | +{ | |
66 | + if (!ip_vs_use_conntrack(skb)) | |
67 | + return 1; | |
68 | + return nf_ct_is_confirmed((struct nf_conn *) skb->nfct) || | |
69 | + ip_vs_nfct_confirm(skb, cp, hooknum); | |
70 | +} | |
71 | + | |
72 | +#else | |
73 | + | |
74 | +static inline int ip_vs_use_conntrack(struct sk_buff *skb) | |
75 | +{ | |
76 | + return 0; | |
77 | +} | |
78 | + | |
79 | +static inline int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int hooknum) | |
80 | +{ | |
81 | + return 1; | |
82 | +} | |
83 | + | |
84 | +#endif | |
85 | + | |
86 | extern struct ip_vs_service * | |
87 | ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport); | |
88 | ||
89 | diff -urNp v2.6.22/linux/net/ipv4/ipvs/Kconfig linux/net/ipv4/ipvs/Kconfig | |
90 | --- v2.6.22/linux/net/ipv4/ipvs/Kconfig 2007-07-10 09:18:43.000000000 +0300 | |
91 | +++ linux/net/ipv4/ipvs/Kconfig 2007-07-12 09:48:59.000000000 +0300 | |
92 | @@ -221,4 +221,12 @@ config IP_VS_FTP | |
93 | If you want to compile it in kernel, say Y. To compile it as a | |
94 | module, choose M here. If unsure, say N. | |
95 | ||
96 | +config IP_VS_NFCT | |
97 | + bool "Netfilter connection tracking" | |
98 | + depends on NF_CONNTRACK | |
99 | + ---help--- | |
100 | + The Netfilter connection tracking support allows the IPVS | |
101 | + connection state to be exported to the Netfilter framework | |
102 | + for filtering purposes. | |
103 | + | |
104 | endif # IP_VS | |
105 | diff -urNp v2.6.22/linux/net/ipv4/ipvs/Makefile linux/net/ipv4/ipvs/Makefile | |
106 | --- v2.6.22/linux/net/ipv4/ipvs/Makefile 2005-06-18 08:50:52.000000000 +0300 | |
107 | +++ linux/net/ipv4/ipvs/Makefile 2007-07-12 09:47:58.000000000 +0300 | |
108 | @@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UD | |
109 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o | |
110 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o | |
111 | ||
112 | +ip_vs-extra_objs-y := | |
113 | +ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o | |
114 | + | |
115 | ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ | |
116 | ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ | |
117 | ip_vs_est.o ip_vs_proto.o \ | |
118 | - $(ip_vs_proto-objs-y) | |
119 | + $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y) | |
120 | ||
121 | ||
122 | # IPVS core | |
123 | diff -urNp v2.6.22/linux/net/ipv4/ipvs/ip_vs_conn.c linux/net/ipv4/ipvs/ip_vs_conn.c | |
124 | --- v2.6.22/linux/net/ipv4/ipvs/ip_vs_conn.c 2007-04-28 17:55:11.000000000 +0300 | |
125 | +++ linux/net/ipv4/ipvs/ip_vs_conn.c 2007-07-12 09:47:58.000000000 +0300 | |
126 | @@ -562,6 +562,11 @@ static void ip_vs_conn_expire(unsigned l | |
127 | if (cp->control) | |
128 | ip_vs_control_del(cp); | |
129 | ||
130 | +#ifdef CONFIG_IP_VS_NFCT | |
131 | + if (sysctl_ip_vs_conntrack) | |
132 | + ip_vs_nfct_conn_drop(cp); | |
133 | +#endif | |
134 | + | |
135 | if (unlikely(cp->app != NULL)) | |
136 | ip_vs_unbind_app(cp); | |
137 | ip_vs_unbind_dest(cp); | |
138 | diff -urNp v2.6.22/linux/net/ipv4/ipvs/ip_vs_core.c linux/net/ipv4/ipvs/ip_vs_core.c | |
139 | --- v2.6.22/linux/net/ipv4/ipvs/ip_vs_core.c 2007-07-10 09:18:43.000000000 +0300 | |
140 | +++ linux/net/ipv4/ipvs/ip_vs_core.c 2007-07-12 09:47:58.000000000 +0300 | |
141 | @@ -701,6 +701,8 @@ static int ip_vs_out_icmp(struct sk_buff | |
142 | ||
143 | skb->ipvs_property = 1; | |
144 | verdict = NF_ACCEPT; | |
145 | + if (sysctl_ip_vs_snat_reroute && ip_route_me_harder(pskb, RTN_LOCAL)) | |
146 | + verdict = NF_DROP; | |
147 | ||
148 | out: | |
149 | __ip_vs_conn_put(cp); | |
150 | @@ -805,6 +807,9 @@ ip_vs_out(unsigned int hooknum, struct s | |
151 | if (!ip_vs_make_skb_writable(pskb, ihl)) | |
152 | goto drop; | |
153 | ||
154 | + if (!ip_vs_confirm_conntrack(*pskb, cp, hooknum)) | |
155 | + goto out; | |
156 | + | |
157 | /* mangle the packet */ | |
158 | if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp)) | |
159 | goto drop; | |
160 | @@ -812,13 +817,23 @@ ip_vs_out(unsigned int hooknum, struct s | |
161 | ip_hdr(skb)->saddr = cp->vaddr; | |
162 | ip_send_check(ip_hdr(skb)); | |
163 | ||
164 | + /* | |
165 | + * nf_iterate does not expect change in the skb->dst->dev. | |
166 | + * It looks like it is not fatal to enable this code for hooks | |
167 | + * where our handlers are at the end of the chain list and | |
168 | + * when all next handlers use skb->dst->dev and not outdev. | |
169 | + * It will definitely route the inout NAT traffic properly | |
170 | + * when multiple paths are used. | |
171 | + */ | |
172 | + | |
173 | /* For policy routing, packets originating from this | |
174 | * machine itself may be routed differently to packets | |
175 | * passing through. We want this packet to be routed as | |
176 | * if it came from this machine itself. So re-compute | |
177 | * the routing information. | |
178 | */ | |
179 | - if (ip_route_me_harder(pskb, RTN_LOCAL) != 0) | |
180 | + | |
181 | + if (sysctl_ip_vs_snat_reroute && ip_route_me_harder(pskb, RTN_LOCAL)) | |
182 | goto drop; | |
183 | skb = *pskb; | |
184 | ||
185 | @@ -834,8 +849,11 @@ ip_vs_out(unsigned int hooknum, struct s | |
186 | return NF_ACCEPT; | |
187 | ||
188 | drop: | |
189 | - ip_vs_conn_put(cp); | |
190 | kfree_skb(*pskb); | |
191 | + | |
192 | + out: | |
193 | + ip_vs_conn_put(cp); | |
194 | + LeaveFunction(11); | |
195 | return NF_STOLEN; | |
196 | } | |
197 | ||
198 | diff -urNp v2.6.22/linux/net/ipv4/ipvs/ip_vs_ctl.c linux/net/ipv4/ipvs/ip_vs_ctl.c | |
199 | --- v2.6.22/linux/net/ipv4/ipvs/ip_vs_ctl.c 2007-07-10 09:18:43.000000000 +0300 | |
200 | +++ linux/net/ipv4/ipvs/ip_vs_ctl.c 2007-07-12 09:47:58.000000000 +0300 | |
201 | @@ -81,6 +81,10 @@ int sysctl_ip_vs_expire_nodest_conn = 0; | |
202 | int sysctl_ip_vs_expire_quiescent_template = 0; | |
203 | int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; | |
204 | int sysctl_ip_vs_nat_icmp_send = 0; | |
205 | +int sysctl_ip_vs_snat_reroute = 0; | |
206 | +#ifdef CONFIG_IP_VS_NFCT | |
207 | +int sysctl_ip_vs_conntrack = 0; | |
208 | +#endif | |
209 | ||
210 | ||
211 | #ifdef CONFIG_IP_VS_DEBUG | |
212 | @@ -1424,6 +1428,16 @@ static struct ctl_table vs_vars[] = { | |
213 | .mode = 0644, | |
214 | .proc_handler = &proc_dointvec, | |
215 | }, | |
216 | +#ifdef CONFIG_IP_VS_NFCT | |
217 | + { | |
218 | + .ctl_name = NET_IPV4_VS_CONNTRACK, | |
219 | + .procname = "conntrack", | |
220 | + .data = &sysctl_ip_vs_conntrack, | |
221 | + .maxlen = sizeof(int), | |
222 | + .mode = 0644, | |
223 | + .proc_handler = &proc_dointvec, | |
224 | + }, | |
225 | +#endif | |
226 | { | |
227 | .ctl_name = NET_IPV4_VS_DROP_ENTRY, | |
228 | .procname = "drop_entry", | |
229 | @@ -1448,6 +1462,14 @@ static struct ctl_table vs_vars[] = { | |
230 | .mode = 0644, | |
231 | .proc_handler = &proc_do_defense_mode, | |
232 | }, | |
233 | + { | |
234 | + .ctl_name = NET_IPV4_VS_SNAT_REROUTE, | |
235 | + .procname = "snat_reroute", | |
236 | + .data = &sysctl_ip_vs_snat_reroute, | |
237 | + .maxlen = sizeof(int), | |
238 | + .mode = 0644, | |
239 | + .proc_handler = &proc_dointvec, | |
240 | + }, | |
241 | #if 0 | |
242 | { | |
243 | .ctl_name = NET_IPV4_VS_TO_ES, | |
244 | diff -urNp v2.6.22/linux/net/ipv4/ipvs/ip_vs_ftp.c linux/net/ipv4/ipvs/ip_vs_ftp.c | |
245 | --- v2.6.22/linux/net/ipv4/ipvs/ip_vs_ftp.c 2007-07-10 09:18:43.000000000 +0300 | |
246 | +++ linux/net/ipv4/ipvs/ip_vs_ftp.c 2007-07-12 09:47:58.000000000 +0300 | |
247 | @@ -194,6 +194,11 @@ static int ip_vs_ftp_out(struct ip_vs_ap | |
248 | ip_vs_control_add(n_cp, cp); | |
249 | } | |
250 | ||
251 | +#ifdef CONFIG_IP_VS_NFCT | |
252 | + if ((*pskb)->nfct) | |
253 | + ip_vs_nfct_expect_related(*pskb, n_cp, 0, IPPROTO_TCP, 0); | |
254 | +#endif | |
255 | + | |
256 | /* | |
257 | * Replace the old passive address with the new one | |
258 | */ | |
259 | @@ -326,6 +331,11 @@ static int ip_vs_ftp_in(struct ip_vs_app | |
260 | ip_vs_control_add(n_cp, cp); | |
261 | } | |
262 | ||
263 | +#ifdef CONFIG_IP_VS_NFCT | |
264 | + if ((*pskb)->nfct) | |
265 | + ip_vs_nfct_expect_related(*pskb, n_cp, n_cp->dport, IPPROTO_TCP, 1); | |
266 | +#endif | |
267 | + | |
268 | /* | |
269 | * Move tunnel to listen state | |
270 | */ | |
271 | diff -urNp v2.6.22/linux/net/ipv4/ipvs/ip_vs_nfct.c linux/net/ipv4/ipvs/ip_vs_nfct.c | |
272 | --- v2.6.22/linux/net/ipv4/ipvs/ip_vs_nfct.c 1970-01-01 02:00:00.000000000 +0200 | |
273 | +++ linux/net/ipv4/ipvs/ip_vs_nfct.c 2007-07-12 12:04:31.000000000 +0300 | |
274 | @@ -0,0 +1,389 @@ | |
275 | +/* | |
276 | + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS | |
277 | + * | |
278 | + * Portions Copyright (C) 2001-2002 | |
279 | + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. | |
280 | + * | |
281 | + * Portions Copyright (C) 2003-2007 | |
282 | + * Julian Anastasov | |
283 | + * | |
284 | + * | |
285 | + * This code is free software; you can redistribute it and/or modify | |
286 | + * it under the terms of the GNU General Public License as published by | |
287 | + * the Free Software Foundation; either version 2 of the License, or | |
288 | + * (at your option) any later version. | |
289 | + * | |
290 | + * This program is distributed in the hope that it will be useful, | |
291 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
292 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
293 | + * GNU General Public License for more details. | |
294 | + * | |
295 | + * You should have received a copy of the GNU General Public License | |
296 | + * along with this program; if not, write to the Free Software | |
297 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
298 | + * | |
299 | + * | |
300 | + * Authors: | |
301 | + * Ben North <ben@redfrontdoor.org> | |
302 | + * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels | |
303 | + * | |
304 | + * | |
305 | + * Current status: | |
306 | + * | |
307 | + * - provide conntrack confirmation for new and related connections, so | |
308 | + * that we can see their proper conntrack state in all hooks | |
309 | + * - support for all forwarding methods, not only NAT | |
310 | + * - FTP support (NAT), ability to support other NAT apps with expectations | |
311 | + * - to correctly create expectations for related NAT connections the proper | |
312 | + * NF conntrack support must be already installed, eg. ip_vs_ftp requires | |
313 | + * nf_conntrack_ftp for the same ports | |
314 | + * | |
315 | + */ | |
316 | + | |
317 | +#include <linux/module.h> | |
318 | +#include <linux/types.h> | |
319 | +#include <linux/kernel.h> | |
320 | +#include <linux/errno.h> | |
321 | +#include <linux/compiler.h> | |
322 | +#include <linux/vmalloc.h> | |
323 | +#include <linux/skbuff.h> | |
324 | +#include <net/ip.h> | |
325 | +#include <linux/netfilter.h> | |
326 | +#include <linux/netfilter_ipv4.h> | |
327 | +#include <net/ip_vs.h> | |
328 | + | |
329 | + | |
330 | +EXPORT_SYMBOL(ip_vs_nfct_expect_related); | |
331 | + | |
332 | + | |
333 | +#define FMT_TUPLE "%u.%u.%u.%u:%u->%u.%u.%u.%u:%u/%u" | |
334 | +#define ARG_TUPLE(t) NIPQUAD((t)->src.u3.ip), ntohs((t)->src.u.all), \ | |
335 | + NIPQUAD((t)->dst.u3.ip), ntohs((t)->dst.u.all), \ | |
336 | + (t)->dst.protonum | |
337 | + | |
338 | +#define FMT_CONN "%u.%u.%u.%u:%u->%u.%u.%u.%u:%u->%u.%u.%u.%u:%u/%u:%u" | |
339 | +#define ARG_CONN(c) NIPQUAD((c)->caddr), ntohs((c)->cport), \ | |
340 | + NIPQUAD((c)->vaddr), ntohs((c)->vport), \ | |
341 | + NIPQUAD((c)->daddr), ntohs((c)->dport), \ | |
342 | + (c)->protocol, (c)->state | |
343 | + | |
344 | +/* Returns boolean and skb is freed on failure */ | |
345 | +static int __ip_vs_nfct_confirm(struct sk_buff *skb, struct ip_vs_conn *cp, | |
346 | + unsigned int hooknum) | |
347 | +{ | |
348 | + /* | |
349 | + * The assumptions: | |
350 | + * - the nfct is !NULL and is not confirmed | |
351 | + * - we are called before any mangle | |
352 | + */ | |
353 | + | |
354 | + struct iphdr *iph = ip_hdr(skb); | |
355 | + struct nf_conn *ct = (struct nf_conn *) skb->nfct; | |
356 | + struct nf_conntrack_tuple new_reply; | |
357 | + int ret = NF_DROP; | |
358 | + __be16 _ports[2], *pptr; | |
359 | +#ifdef CONFIG_IP_VS_DEBUG | |
360 | + struct nf_conntrack_tuple *orig_tup = | |
361 | + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; | |
362 | + struct nf_conntrack_tuple *orig_rep = | |
363 | + &ct->tuplehash[IP_CT_DIR_REPLY].tuple; | |
364 | +#endif | |
365 | +#ifdef CONFIG_NF_NAT_NEEDED | |
366 | + int initialized = !!(ct->status & IPS_NAT_DONE_MASK); | |
367 | +#else | |
368 | + int initialized = 0; | |
369 | +#endif | |
370 | + | |
371 | + IP_VS_DBG(7, "%s: ct=%p, init=%d, tuples=" FMT_TUPLE ", " FMT_TUPLE | |
372 | + ", cp=" FMT_CONN "\n", | |
373 | + __FUNCTION__, ct, initialized, | |
374 | + ARG_TUPLE(orig_tup), ARG_TUPLE(orig_rep), ARG_CONN(cp)); | |
375 | + | |
376 | +#ifdef CONFIG_NF_NAT_NEEDED | |
377 | + /* | |
378 | + * This is really bad, maybe we are trying to alter a DNAT conn? | |
379 | + * This is not supported, avoid the confirmation. | |
380 | + */ | |
381 | + if (initialized && ct->status & IPS_NAT_MASK) { | |
382 | +#ifdef CONFIG_IP_VS_DEBUG | |
383 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, init=%d\n", | |
384 | + __FUNCTION__, ct, ct->status, initialized); | |
385 | +#endif | |
386 | + return 1; | |
387 | + } | |
388 | +#endif | |
389 | + | |
390 | + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ || NF_IP_FORWARD == hooknum) | |
391 | + goto confirm; | |
392 | + | |
393 | + /* | |
394 | + * Alter reply only for IP_VS_CONN_F_MASQ in outin direction. | |
395 | + * For related connections in inout direction it is done in | |
396 | + * expectfn callback. | |
397 | + */ | |
398 | + | |
399 | + pptr = skb_header_pointer(skb, ip_hdrlen(skb), | |
400 | + sizeof(_ports), _ports); | |
401 | + if (!pptr) | |
402 | + goto out; | |
403 | + | |
404 | + new_reply = (struct nf_conntrack_tuple) { | |
405 | + .dst = { .protonum = iph->protocol, .dir = IP_CT_DIR_REPLY }}; | |
406 | + | |
407 | + new_reply.src.u3.ip = cp->daddr; | |
408 | + new_reply.src.u.tcp.port = cp->dport; | |
409 | + new_reply.src.l3num = PF_INET; | |
410 | + new_reply.dst.u3.ip = iph->saddr; | |
411 | + new_reply.dst.u.tcp.port = pptr[0]; | |
412 | + | |
413 | + nf_conntrack_alter_reply(ct, &new_reply); | |
414 | + | |
415 | + IP_VS_DBG(7, "%s: ct=%p, init=%d, orig=" FMT_TUPLE | |
416 | + ", new_reply=" FMT_TUPLE " => alter_reply\n", | |
417 | + __FUNCTION__, ct, initialized, | |
418 | + ARG_TUPLE(orig_tup), ARG_TUPLE(&new_reply)); | |
419 | + | |
420 | + /* | |
421 | + * No need to rehash NAT info because we don't change source | |
422 | + * address in original direction | |
423 | + */ | |
424 | + | |
425 | +confirm: | |
426 | + | |
427 | + ret = __nf_conntrack_confirm(&skb); | |
428 | + | |
429 | + if (ret != NF_STOLEN) { | |
430 | + IP_VS_DBG(7, "%s: ct=%p, init=%d, orig=" FMT_TUPLE " => confirm ret=%d\n", | |
431 | + __FUNCTION__, ct, initialized, ARG_TUPLE(orig_tup), ret); | |
432 | + } | |
433 | + | |
434 | + if (ret != NF_ACCEPT) | |
435 | + goto out; | |
436 | + return 1; | |
437 | + | |
438 | +out: | |
439 | + if (ret != NF_STOLEN) | |
440 | + kfree_skb(skb); | |
441 | + return 0; | |
442 | +} | |
443 | + | |
444 | +/* | |
445 | + * Confirm (and optionally alter) the conntrack entry if needed | |
446 | + * because the IPVS packets do not reach ipv4_confirm. | |
447 | + */ | |
448 | +int ip_vs_nfct_confirm(struct sk_buff *skb, struct ip_vs_conn *cp, | |
449 | + unsigned int hooknum) | |
450 | +{ | |
451 | + struct iphdr *iph = ip_hdr(skb); | |
452 | + struct nf_conn *ct = (struct nf_conn *) skb->nfct; | |
453 | + | |
454 | + /* By the time we're sending the packet out the other | |
455 | + * side, there should be a confirmed Netfilter CT entry | |
456 | + * for this connection. This may not be the case, | |
457 | + * however, if it's a brand new connection, or if the NF | |
458 | + * entry has timed out before ours has. Either way, if | |
459 | + * the NF CT entry is unconfirmed, confirm it, and deal | |
460 | + * with reply tuple mangling at the same time. | |
461 | + */ | |
462 | + | |
463 | + /* We only deal with TCP or UDP packets */ | |
464 | + if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP) | |
465 | + return 1; | |
466 | + | |
467 | + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { | |
468 | + /* | |
469 | + * Do not be surprised if non-NAT conntracks stay in SYN_SENT | |
470 | + * state, maybe the replies from the real server go | |
471 | + * directly to the client. In any case, keep them in REPLIED | |
472 | + * state (ESTABLISHED). | |
473 | + */ | |
474 | + if (iph->protocol != IPPROTO_TCP || | |
475 | + IP_VS_TCP_S_ESTABLISHED == cp->state) { | |
476 | + set_bit(IPS_SEEN_REPLY_BIT, &ct->status); | |
477 | + } | |
478 | + } | |
479 | + | |
480 | + /* | |
481 | + * We assume the reused connections do not change their rip:rport | |
482 | + * and we do not need to alter their conntrack reply | |
483 | + */ | |
484 | + return __ip_vs_nfct_confirm(skb, cp, hooknum); | |
485 | +} | |
486 | + | |
487 | +/* | |
488 | + * We are called from init_conntrack() as expectfn handler | |
489 | + */ | |
490 | + | |
491 | +static void ip_vs_nfct_expect_callback(struct nf_conn *ct, | |
492 | + struct nf_conntrack_expect *exp) | |
493 | +{ | |
494 | + struct nf_conntrack_tuple *orig, new_reply; | |
495 | + struct ip_vs_conn *cp; | |
496 | + | |
497 | + if (exp->tuple.src.l3num != PF_INET) | |
498 | + return; | |
499 | + | |
500 | + /* | |
501 | + * - We assume that no NF locks are held before this callback | |
502 | + * - ip_vs_conn_out_get and ip_vs_conn_in_get should match their | |
503 | + * expectations even if they use wildcard values, now we provide | |
504 | + * the actual values from the newly created original conntrack direction | |
505 | + * - the conntrack is confirmed when packet reaches IPVS hooks | |
506 | + */ | |
507 | + | |
508 | + /* RS->CLIENT */ | |
509 | + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; | |
510 | + cp = ip_vs_conn_out_get(orig->dst.protonum, | |
511 | + orig->src.u3.ip, orig->src.u.tcp.port, | |
512 | + orig->dst.u3.ip, orig->dst.u.tcp.port); | |
513 | + if (cp) { | |
514 | + /* Change reply CLIENT->RS to CLIENT->VS */ | |
515 | + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; | |
516 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " FMT_TUPLE | |
517 | + ", found inout cp=" FMT_CONN "\n", | |
518 | + __FUNCTION__, ct, ct->status, | |
519 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
520 | + ARG_CONN(cp)); | |
521 | + new_reply.dst.u3.ip = cp->vaddr; | |
522 | + new_reply.dst.u.tcp.port = cp->vport; | |
523 | + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE | |
524 | + ", inout cp=" FMT_CONN "\n", | |
525 | + __FUNCTION__, ct, | |
526 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
527 | + ARG_CONN(cp)); | |
528 | + goto alter; | |
529 | + } | |
530 | + | |
531 | + /* CLIENT->VS */ | |
532 | + cp = ip_vs_conn_in_get(orig->dst.protonum, | |
533 | + orig->src.u3.ip, orig->src.u.tcp.port, | |
534 | + orig->dst.u3.ip, orig->dst.u.tcp.port); | |
535 | + if (cp) { | |
536 | + /* Change reply VS->CLIENT to RS->CLIENT */ | |
537 | + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; | |
538 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " FMT_TUPLE | |
539 | + ", found outin cp=" FMT_CONN "\n", | |
540 | + __FUNCTION__, ct, ct->status, | |
541 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
542 | + ARG_CONN(cp)); | |
543 | + new_reply.src.u3.ip = cp->daddr; | |
544 | + new_reply.src.u.tcp.port = cp->dport; | |
545 | + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE | |
546 | + ", outin cp=" FMT_CONN "\n", | |
547 | + __FUNCTION__, ct, | |
548 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
549 | + ARG_CONN(cp)); | |
550 | + goto alter; | |
551 | + } | |
552 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE " - unknown expect\n", | |
553 | + __FUNCTION__, ct, ct->status, ARG_TUPLE(orig)); | |
554 | + return; | |
555 | + | |
556 | +alter: | |
557 | + | |
558 | + /* Never alter conntrack for non-NAT conns */ | |
559 | + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) | |
560 | + nf_conntrack_alter_reply(ct, &new_reply); | |
561 | + ip_vs_conn_put(cp); | |
562 | + return; | |
563 | +} | |
564 | + | |
565 | +/* | |
566 | + * Create NF conntrack expectation with wildcard (optional) source port. | |
567 | + * Then the default callback function will alter the reply and will confirm | |
568 | + * the conntrack entry when the first packet comes. | |
569 | + */ | |
570 | +void ip_vs_nfct_expect_related(struct sk_buff *skb, struct ip_vs_conn *cp, | |
571 | + __be16 port, __u16 proto, int from_rs) | |
572 | +{ | |
573 | + struct nf_conn *ct = (struct nf_conn *) skb->nfct; | |
574 | + struct nf_conntrack_expect *e; | |
575 | + | |
576 | + if (!sysctl_ip_vs_conntrack) | |
577 | + return; | |
578 | + | |
579 | + if (!ct) { | |
580 | + IP_VS_DBG(7, "%s: ct=%p for cp=" FMT_CONN "\n", | |
581 | + __FUNCTION__, ct, ARG_CONN(cp)); | |
582 | + return; | |
583 | + } | |
584 | + | |
585 | + if (!(e = nf_conntrack_expect_alloc(ct))) | |
586 | + return; | |
587 | + | |
588 | + e->expectfn = ip_vs_nfct_expect_callback; | |
589 | + e->helper = NULL; | |
590 | + e->flags = 0; | |
591 | + memset(&e->tuple, 0, sizeof(e->tuple)); | |
592 | + e->tuple.src.u.tcp.port = port; | |
593 | + e->tuple.src.l3num = PF_INET; | |
594 | + e->tuple.dst.protonum = proto; | |
595 | + memset(&e->mask, 0, sizeof(e->mask)); | |
596 | + e->mask.src.u3.ip = 0xffffffff; | |
597 | + e->mask.src.u.all = port? 0xffff : 0; | |
598 | + e->mask.src.l3num = 0xffff; | |
599 | + e->mask.dst.u3.ip = 0xffffffff; | |
600 | + e->mask.dst.u.all = 0xffff; | |
601 | + e->mask.dst.protonum = 0xff; | |
602 | + | |
603 | + if (from_rs) { | |
604 | + e->tuple.src.u3.ip = cp->daddr; | |
605 | + e->tuple.dst.u3.ip = cp->caddr; | |
606 | + e->tuple.dst.u.tcp.port = cp->cport; | |
607 | + } else { | |
608 | + e->tuple.src.u3.ip = cp->caddr; | |
609 | + e->tuple.dst.u3.ip = cp->vaddr; | |
610 | + e->tuple.dst.u.tcp.port = cp->vport; | |
611 | + } | |
612 | + | |
613 | + IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", | |
614 | + __FUNCTION__, ct, ARG_TUPLE(&e->tuple)); | |
615 | + nf_conntrack_expect_related(e); | |
616 | + nf_conntrack_expect_put(e); | |
617 | +} | |
618 | + | |
619 | +/* | |
620 | + * Our connection was terminated, try to drop the conntrack immediately | |
621 | + */ | |
622 | +void ip_vs_nfct_conn_drop(struct ip_vs_conn *cp) | |
623 | +{ | |
624 | + struct nf_conntrack_tuple_hash *h; | |
625 | + struct nf_conn *ct; | |
626 | + struct nf_conntrack_tuple tuple; | |
627 | + | |
628 | + if (!cp->cport) | |
629 | + return; | |
630 | + | |
631 | + tuple = (struct nf_conntrack_tuple) { | |
632 | + .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; | |
633 | + tuple.src.u3.ip = cp->caddr; | |
634 | + tuple.src.u.all = cp->cport; | |
635 | + tuple.src.l3num = PF_INET; | |
636 | + tuple.dst.u3.ip = cp->vaddr; | |
637 | + tuple.dst.u.all = cp->vport; | |
638 | + | |
639 | + IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE | |
640 | + " for conn " FMT_CONN "\n", | |
641 | + __FUNCTION__, ARG_TUPLE(&tuple), ARG_CONN(cp)); | |
642 | + | |
643 | + h = nf_conntrack_find_get(&tuple, NULL); | |
644 | + if (h) { | |
645 | + ct = nf_ct_tuplehash_to_ctrack(h); | |
646 | + if (del_timer(&ct->timeout)) { | |
647 | + IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" | |
648 | + FMT_TUPLE "\n", | |
649 | + __FUNCTION__, ct, ARG_TUPLE(&tuple)); | |
650 | + if (ct->timeout.function) | |
651 | + ct->timeout.function(ct->timeout.data); | |
652 | + } else { | |
653 | + IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" | |
654 | + FMT_TUPLE "\n", | |
655 | + __FUNCTION__, ct, ARG_TUPLE(&tuple)); | |
656 | + } | |
657 | + nf_ct_put(ct); | |
658 | + } else { | |
659 | + IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", | |
660 | + __FUNCTION__, ARG_TUPLE(&tuple)); | |
661 | + } | |
662 | +} | |
663 | + | |
664 | diff -urNp v2.6.22/linux/net/ipv4/ipvs/ip_vs_xmit.c linux/net/ipv4/ipvs/ip_vs_xmit.c | |
665 | --- v2.6.22/linux/net/ipv4/ipvs/ip_vs_xmit.c 2007-07-10 09:18:43.000000000 +0300 | |
666 | +++ linux/net/ipv4/ipvs/ip_vs_xmit.c 2007-07-12 09:54:45.000000000 +0300 | |
667 | @@ -199,6 +199,9 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s | |
668 | dst_release(skb->dst); | |
669 | skb->dst = &rt->u.dst; | |
670 | ||
671 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_IP_LOCAL_IN)) | |
672 | + goto tx_error_out; | |
673 | + | |
674 | /* Another hack: avoid icmp_send in ip_fragment */ | |
675 | skb->local_df = 1; | |
676 | ||
677 | @@ -211,6 +214,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s | |
678 | dst_link_failure(skb); | |
679 | tx_error: | |
680 | kfree_skb(skb); | |
681 | + tx_error_out: | |
682 | LeaveFunction(10); | |
683 | return NF_STOLEN; | |
684 | } | |
685 | @@ -263,6 +267,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru | |
686 | dst_release(skb->dst); | |
687 | skb->dst = &rt->u.dst; | |
688 | ||
689 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_IP_LOCAL_IN)) | |
690 | + goto tx_error_out; | |
691 | + | |
692 | /* mangle the packet */ | |
693 | if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp)) | |
694 | goto tx_error; | |
695 | @@ -286,8 +293,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru | |
696 | tx_error_icmp: | |
697 | dst_link_failure(skb); | |
698 | tx_error: | |
699 | - LeaveFunction(10); | |
700 | kfree_skb(skb); | |
701 | + tx_error_out: | |
702 | + LeaveFunction(10); | |
703 | return NF_STOLEN; | |
704 | tx_error_put: | |
705 | ip_rt_put(rt); | |
706 | @@ -386,14 +394,17 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s | |
707 | /* fix old IP header checksum */ | |
708 | ip_send_check(old_iph); | |
709 | ||
710 | - skb_push(skb, sizeof(struct iphdr)); | |
711 | - skb_reset_network_header(skb); | |
712 | - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | |
713 | - | |
714 | /* drop old route */ | |
715 | dst_release(skb->dst); | |
716 | skb->dst = &rt->u.dst; | |
717 | ||
718 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_IP_LOCAL_IN)) | |
719 | + goto tx_error_out; | |
720 | + | |
721 | + skb_push(skb, sizeof(struct iphdr)); | |
722 | + skb_reset_network_header(skb); | |
723 | + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | |
724 | + | |
725 | /* | |
726 | * Push down and install the IPIP header. | |
727 | */ | |
728 | @@ -423,6 +434,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s | |
729 | dst_link_failure(skb); | |
730 | tx_error: | |
731 | kfree_skb(skb); | |
732 | + tx_error_out: | |
733 | LeaveFunction(10); | |
734 | return NF_STOLEN; | |
735 | } | |
736 | @@ -468,6 +480,9 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc | |
737 | dst_release(skb->dst); | |
738 | skb->dst = &rt->u.dst; | |
739 | ||
740 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_IP_LOCAL_IN)) | |
741 | + goto tx_error_out; | |
742 | + | |
743 | /* Another hack: avoid icmp_send in ip_fragment */ | |
744 | skb->local_df = 1; | |
745 | ||
746 | @@ -480,6 +495,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc | |
747 | dst_link_failure(skb); | |
748 | tx_error: | |
749 | kfree_skb(skb); | |
750 | + tx_error_out: | |
751 | LeaveFunction(10); | |
752 | return NF_STOLEN; | |
753 | } | |
754 | @@ -539,6 +555,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str | |
755 | dst_release(skb->dst); | |
756 | skb->dst = &rt->u.dst; | |
757 | ||
758 | + /* TODO: properly alter reply for NFCT */ | |
759 | + | |
760 | ip_vs_nat_icmp(skb, pp, cp, 0); | |
761 | ||
762 | /* Another hack: avoid icmp_send in ip_fragment */ |