]>
Commit | Line | Data |
---|---|---|
6d4e1af8 | 1 | diff -urNp v2.6.25/linux/include/net/ip_vs.h linux/include/net/ip_vs.h |
2 | --- v2.6.25/linux/include/net/ip_vs.h 2008-04-17 09:58:08.000000000 +0300 | |
3 | +++ linux/include/net/ip_vs.h 2008-04-19 19:59:24.000000000 +0300 | |
ec625505 | 4 | @@ -22,6 +22,16 @@ |
6d4e1af8 | 5 | |
ec625505 | 6 | #include <net/checksum.h> |
db744e5b | 7 | |
8 | +#ifdef __KERNEL__ | |
9 | +#include <linux/skbuff.h> | |
10 | +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) | |
11 | +#include <net/netfilter/nf_conntrack.h> | |
12 | +#include <net/netfilter/nf_conntrack_core.h> | |
13 | +#include <net/netfilter/nf_conntrack_expect.h> | |
14 | +#include <net/netfilter/nf_conntrack_helper.h> | |
15 | +#endif | |
16 | +#endif | |
17 | + | |
ec625505 AM |
18 | #ifdef CONFIG_IP_VS_DEBUG |
19 | #include <linux/net.h> | |
20 | ||
6d4e1af8 | 21 | @@ -686,6 +696,16 @@ extern void ip_vs_init_hash_table(struct |
db744e5b | 22 | */ |
23 | ||
24 | /* | |
25 | + * Netfilter connection tracking | |
26 | + * (from ip_vs_nfct.c) | |
27 | + */ | |
28 | +extern int ip_vs_nfct_confirm(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int hooknum); | |
29 | +extern void ip_vs_nfct_expect_related(struct sk_buff *skb, | |
30 | + struct ip_vs_conn *cp, | |
31 | + __be16 port, __u16 proto, int from_rs); | |
32 | +extern void ip_vs_nfct_conn_drop(struct ip_vs_conn *cp); | |
33 | + | |
34 | +/* | |
35 | * IPVS connection entry hash table | |
36 | */ | |
37 | #ifndef CONFIG_IP_VS_TAB_BITS | |
ec625505 | 38 | @@ -653,9 +663,42 @@ |
db744e5b | 39 | extern int sysctl_ip_vs_expire_quiescent_template; |
40 | extern int sysctl_ip_vs_sync_threshold[2]; | |
41 | extern int sysctl_ip_vs_nat_icmp_send; | |
42 | +extern int sysctl_ip_vs_snat_reroute; | |
43 | extern struct ip_vs_stats ip_vs_stats; | |
ec625505 | 44 | extern const struct ctl_path net_vs_ctl_path[]; |
db744e5b | 45 | |
46 | +#ifdef CONFIG_IP_VS_NFCT | |
47 | + | |
48 | +extern int sysctl_ip_vs_conntrack; | |
49 | + | |
50 | +static inline int ip_vs_use_conntrack(struct sk_buff *skb) | |
51 | +{ | |
ec625505 | 52 | + return sysctl_ip_vs_conntrack && skb->nfct; |
db744e5b | 53 | +} |
54 | + | |
55 | +/* Returns boolean and skb is freed on failure */ | |
56 | +static inline int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int hooknum) | |
57 | +{ | |
ec625505 AM |
58 | + if (!ip_vs_use_conntrack(skb)) |
59 | + return 1; | |
60 | + return nf_ct_is_confirmed((struct nf_conn *) skb->nfct) || | |
61 | + ip_vs_nfct_confirm(skb, cp, hooknum); | |
db744e5b | 62 | +} |
63 | + | |
64 | +#else | |
65 | + | |
66 | +static inline int ip_vs_use_conntrack(struct sk_buff *skb) | |
67 | +{ | |
ec625505 | 68 | + return 0; |
db744e5b | 69 | +} |
70 | + | |
71 | +static inline int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int hooknum) | |
72 | +{ | |
ec625505 | 73 | + return 1; |
db744e5b | 74 | +} |
75 | + | |
76 | +#endif | |
77 | + | |
78 | extern struct ip_vs_service * | |
79 | ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport); | |
80 | ||
6d4e1af8 | 81 | diff -urNp v2.6.25/linux/net/ipv4/ipvs/Kconfig linux/net/ipv4/ipvs/Kconfig |
82 | --- v2.6.25/linux/net/ipv4/ipvs/Kconfig 2007-07-10 09:18:43.000000000 +0300 | |
83 | +++ linux/net/ipv4/ipvs/Kconfig 2008-04-19 19:55:40.000000000 +0300 | |
db744e5b | 84 | @@ -221,4 +221,12 @@ config IP_VS_FTP |
85 | If you want to compile it in kernel, say Y. To compile it as a | |
86 | module, choose M here. If unsure, say N. | |
87 | ||
88 | +config IP_VS_NFCT | |
89 | + bool "Netfilter connection tracking" | |
90 | + depends on NF_CONNTRACK | |
91 | + ---help--- | |
92 | + The Netfilter connection tracking support allows the IPVS | |
93 | + connection state to be exported to the Netfilter framework | |
94 | + for filtering purposes. | |
95 | + | |
96 | endif # IP_VS | |
6d4e1af8 | 97 | diff -urNp v2.6.25/linux/net/ipv4/ipvs/Makefile linux/net/ipv4/ipvs/Makefile |
98 | --- v2.6.25/linux/net/ipv4/ipvs/Makefile 2005-06-18 08:50:52.000000000 +0300 | |
99 | +++ linux/net/ipv4/ipvs/Makefile 2008-04-19 19:55:40.000000000 +0300 | |
db744e5b | 100 | @@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UD |
101 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o | |
102 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o | |
103 | ||
104 | +ip_vs-extra_objs-y := | |
105 | +ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o | |
106 | + | |
107 | ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ | |
108 | ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ | |
109 | ip_vs_est.o ip_vs_proto.o \ | |
110 | - $(ip_vs_proto-objs-y) | |
111 | + $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y) | |
112 | ||
113 | ||
114 | # IPVS core | |
6d4e1af8 | 115 | diff -urNp v2.6.25/linux/net/ipv4/ipvs/ip_vs_conn.c linux/net/ipv4/ipvs/ip_vs_conn.c |
116 | --- v2.6.25/linux/net/ipv4/ipvs/ip_vs_conn.c 2008-04-17 09:58:09.000000000 +0300 | |
117 | +++ linux/net/ipv4/ipvs/ip_vs_conn.c 2008-04-19 19:55:40.000000000 +0300 | |
118 | @@ -593,6 +593,11 @@ static void ip_vs_conn_expire(unsigned l | |
db744e5b | 119 | if (cp->control) |
120 | ip_vs_control_del(cp); | |
121 | ||
122 | +#ifdef CONFIG_IP_VS_NFCT | |
123 | + if (sysctl_ip_vs_conntrack) | |
124 | + ip_vs_nfct_conn_drop(cp); | |
125 | +#endif | |
126 | + | |
127 | if (unlikely(cp->app != NULL)) | |
128 | ip_vs_unbind_app(cp); | |
129 | ip_vs_unbind_dest(cp); | |
6d4e1af8 | 130 | diff -urNp v2.6.25/linux/net/ipv4/ipvs/ip_vs_core.c linux/net/ipv4/ipvs/ip_vs_core.c |
131 | --- v2.6.25/linux/net/ipv4/ipvs/ip_vs_core.c 2008-04-17 09:58:09.000000000 +0300 | |
132 | +++ linux/net/ipv4/ipvs/ip_vs_core.c 2008-04-19 19:55:40.000000000 +0300 | |
db744e5b | 133 | @@ -661,6 +661,8 @@ static int ip_vs_out_icmp(struct sk_buff |
134 | ||
135 | skb->ipvs_property = 1; | |
136 | verdict = NF_ACCEPT; | |
137 | + if (sysctl_ip_vs_snat_reroute && ip_route_me_harder(skb, RTN_LOCAL)) | |
138 | + verdict = NF_DROP; | |
139 | ||
140 | out: | |
141 | __ip_vs_conn_put(cp); | |
142 | @@ -761,19 +763,31 @@ ip_vs_out(unsigned int hooknum, struct s | |
143 | if (!skb_make_writable(skb, ihl)) | |
144 | goto drop; | |
145 | ||
146 | + if (!ip_vs_confirm_conntrack(skb, cp, hooknum)) | |
147 | + goto out; | |
148 | + | |
149 | /* mangle the packet */ | |
150 | if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) | |
151 | goto drop; | |
152 | ip_hdr(skb)->saddr = cp->vaddr; | |
153 | ip_send_check(ip_hdr(skb)); | |
154 | ||
155 | + /* | |
156 | + * nf_iterate does not expect change in the skb->dst->dev. | |
157 | + * It looks like it is not fatal to enable this code for hooks | |
158 | + * where our handlers are at the end of the chain list and | |
159 | + * when all next handlers use skb->dst->dev and not outdev. | |
160 | + * It will definitely route the inout NAT traffic properly | |
161 | + * when multiple paths are used. | |
162 | + */ | |
163 | + | |
164 | /* For policy routing, packets originating from this | |
165 | * machine itself may be routed differently to packets | |
166 | * passing through. We want this packet to be routed as | |
167 | * if it came from this machine itself. So re-compute | |
168 | * the routing information. | |
169 | */ | |
170 | - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) | |
171 | + if (sysctl_ip_vs_snat_reroute && ip_route_me_harder(skb, RTN_LOCAL) != 0) | |
172 | goto drop; | |
173 | ||
174 | IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); | |
175 | @@ -788,8 +802,11 @@ ip_vs_out(unsigned int hooknum, struct s | |
176 | return NF_ACCEPT; | |
177 | ||
178 | drop: | |
179 | - ip_vs_conn_put(cp); | |
180 | kfree_skb(skb); | |
181 | + | |
182 | + out: | |
183 | + ip_vs_conn_put(cp); | |
184 | + LeaveFunction(11); | |
185 | return NF_STOLEN; | |
186 | } | |
187 | ||
6d4e1af8 | 188 | diff -urNp v2.6.25/linux/net/ipv4/ipvs/ip_vs_ctl.c linux/net/ipv4/ipvs/ip_vs_ctl.c |
189 | --- v2.6.25/linux/net/ipv4/ipvs/ip_vs_ctl.c 2008-04-17 09:58:09.000000000 +0300 | |
190 | +++ linux/net/ipv4/ipvs/ip_vs_ctl.c 2008-04-19 19:55:40.000000000 +0300 | |
db744e5b | 191 | @@ -81,6 +81,10 @@ int sysctl_ip_vs_expire_nodest_conn = 0; |
192 | int sysctl_ip_vs_expire_quiescent_template = 0; | |
193 | int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; | |
194 | int sysctl_ip_vs_nat_icmp_send = 0; | |
195 | +int sysctl_ip_vs_snat_reroute = 0; | |
196 | +#ifdef CONFIG_IP_VS_NFCT | |
197 | +int sysctl_ip_vs_conntrack = 0; | |
198 | +#endif | |
199 | ||
200 | ||
201 | #ifdef CONFIG_IP_VS_DEBUG | |
202 | @@ -1446,6 +1450,15 @@ static struct ctl_table vs_vars[] = { | |
203 | .mode = 0644, | |
204 | .proc_handler = &proc_dointvec, | |
205 | }, | |
206 | +#ifdef CONFIG_IP_VS_NFCT | |
207 | + { | |
208 | + .procname = "conntrack", | |
209 | + .data = &sysctl_ip_vs_conntrack, | |
210 | + .maxlen = sizeof(int), | |
211 | + .mode = 0644, | |
212 | + .proc_handler = &proc_dointvec, | |
213 | + }, | |
214 | +#endif | |
215 | { | |
216 | .procname = "drop_entry", | |
217 | .data = &sysctl_ip_vs_drop_entry, | |
218 | @@ -1467,6 +1480,13 @@ static struct ctl_table vs_vars[] = { | |
219 | .mode = 0644, | |
220 | .proc_handler = &proc_do_defense_mode, | |
221 | }, | |
222 | + { | |
223 | + .procname = "snat_reroute", | |
224 | + .data = &sysctl_ip_vs_snat_reroute, | |
225 | + .maxlen = sizeof(int), | |
226 | + .mode = 0644, | |
227 | + .proc_handler = &proc_dointvec, | |
228 | + }, | |
229 | #if 0 | |
230 | { | |
231 | .procname = "timeout_established", | |
6d4e1af8 | 232 | diff -urNp v2.6.25/linux/net/ipv4/ipvs/ip_vs_ftp.c linux/net/ipv4/ipvs/ip_vs_ftp.c |
233 | --- v2.6.25/linux/net/ipv4/ipvs/ip_vs_ftp.c 2008-01-25 10:45:06.000000000 +0200 | |
234 | +++ linux/net/ipv4/ipvs/ip_vs_ftp.c 2008-04-19 19:55:40.000000000 +0300 | |
db744e5b | 235 | @@ -195,6 +195,11 @@ static int ip_vs_ftp_out(struct ip_vs_ap |
236 | ip_vs_control_add(n_cp, cp); | |
237 | } | |
238 | ||
239 | +#ifdef CONFIG_IP_VS_NFCT | |
240 | + if (skb->nfct) | |
241 | + ip_vs_nfct_expect_related(skb, n_cp, 0, IPPROTO_TCP, 0); | |
242 | +#endif | |
243 | + | |
244 | /* | |
245 | * Replace the old passive address with the new one | |
246 | */ | |
247 | @@ -327,6 +332,11 @@ static int ip_vs_ftp_in(struct ip_vs_app | |
248 | ip_vs_control_add(n_cp, cp); | |
249 | } | |
250 | ||
251 | +#ifdef CONFIG_IP_VS_NFCT | |
252 | + if (skb->nfct) | |
253 | + ip_vs_nfct_expect_related(skb, n_cp, n_cp->dport, IPPROTO_TCP, 1); | |
254 | +#endif | |
255 | + | |
256 | /* | |
257 | * Move tunnel to listen state | |
258 | */ | |
6d4e1af8 | 259 | diff -urNp v2.6.25/linux/net/ipv4/ipvs/ip_vs_nfct.c linux/net/ipv4/ipvs/ip_vs_nfct.c |
260 | --- v2.6.25/linux/net/ipv4/ipvs/ip_vs_nfct.c 1970-01-01 02:00:00.000000000 +0200 | |
261 | +++ linux/net/ipv4/ipvs/ip_vs_nfct.c 2008-04-19 20:06:46.000000000 +0300 | |
db744e5b | 262 | @@ -0,0 +1,385 @@ |
263 | +/* | |
264 | + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS | |
265 | + * | |
266 | + * Portions Copyright (C) 2001-2002 | |
267 | + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. | |
268 | + * | |
269 | + * Portions Copyright (C) 2003-2008 | |
270 | + * Julian Anastasov | |
271 | + * | |
272 | + * | |
273 | + * This code is free software; you can redistribute it and/or modify | |
274 | + * it under the terms of the GNU General Public License as published by | |
275 | + * the Free Software Foundation; either version 2 of the License, or | |
276 | + * (at your option) any later version. | |
277 | + * | |
278 | + * This program is distributed in the hope that it will be useful, | |
279 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
280 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
281 | + * GNU General Public License for more details. | |
282 | + * | |
283 | + * You should have received a copy of the GNU General Public License | |
284 | + * along with this program; if not, write to the Free Software | |
285 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
286 | + * | |
287 | + * | |
288 | + * Authors: | |
289 | + * Ben North <ben@redfrontdoor.org> | |
290 | + * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels | |
291 | + * | |
292 | + * | |
293 | + * Current status: | |
294 | + * | |
295 | + * - provide conntrack confirmation for new and related connections, | |
296 | + * so that their proper conntrack state is visible in all hooks | |
297 | + * - support for all forwarding methods, not only NAT | |
298 | + * - FTP support (NAT), ability to support other NAT apps with expectations | |
299 | + * - to correctly create expectations for related NAT connections the proper | |
300 | + * NF conntrack support must be already installed, eg. ip_vs_ftp requires | |
301 | + * nf_conntrack_ftp for the same ports | |
302 | + * | |
303 | + */ | |
304 | + | |
305 | +#include <linux/module.h> | |
306 | +#include <linux/types.h> | |
307 | +#include <linux/kernel.h> | |
308 | +#include <linux/errno.h> | |
309 | +#include <linux/compiler.h> | |
310 | +#include <linux/vmalloc.h> | |
311 | +#include <linux/skbuff.h> | |
312 | +#include <net/ip.h> | |
313 | +#include <linux/netfilter.h> | |
314 | +#include <linux/netfilter_ipv4.h> | |
315 | +#include <net/ip_vs.h> | |
316 | + | |
317 | + | |
318 | +EXPORT_SYMBOL(ip_vs_nfct_expect_related); | |
319 | + | |
320 | + | |
321 | +#define FMT_TUPLE "%u.%u.%u.%u:%u->%u.%u.%u.%u:%u/%u" | |
322 | +#define ARG_TUPLE(t) NIPQUAD((t)->src.u3.ip), ntohs((t)->src.u.all), \ | |
323 | + NIPQUAD((t)->dst.u3.ip), ntohs((t)->dst.u.all), \ | |
324 | + (t)->dst.protonum | |
325 | + | |
326 | +#define FMT_CONN "%u.%u.%u.%u:%u->%u.%u.%u.%u:%u->%u.%u.%u.%u:%u/%u:%u" | |
327 | +#define ARG_CONN(c) NIPQUAD((c)->caddr), ntohs((c)->cport), \ | |
328 | + NIPQUAD((c)->vaddr), ntohs((c)->vport), \ | |
329 | + NIPQUAD((c)->daddr), ntohs((c)->dport), \ | |
330 | + (c)->protocol, (c)->state | |
331 | + | |
332 | +/* Returns boolean and skb is freed on failure */ | |
333 | +static int __ip_vs_nfct_confirm(struct sk_buff *skb, struct ip_vs_conn *cp, | |
334 | + unsigned int hooknum) | |
335 | +{ | |
336 | + /* | |
337 | + * The assumptions: | |
338 | + * - the nfct is !NULL and is not confirmed | |
339 | + * - we are called before any mangle | |
340 | + */ | |
341 | + | |
342 | + struct iphdr *iph = ip_hdr(skb); | |
343 | + struct nf_conn *ct = (struct nf_conn *) skb->nfct; | |
344 | + struct nf_conntrack_tuple new_reply; | |
345 | + int ret = NF_DROP; | |
346 | + __be16 _ports[2], *pptr; | |
347 | +#ifdef CONFIG_IP_VS_DEBUG | |
348 | + struct nf_conntrack_tuple *orig_tup = | |
349 | + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; | |
350 | + struct nf_conntrack_tuple *orig_rep = | |
351 | + &ct->tuplehash[IP_CT_DIR_REPLY].tuple; | |
352 | +#endif | |
353 | +#ifdef CONFIG_NF_NAT_NEEDED | |
354 | + int initialized = !!(ct->status & IPS_NAT_DONE_MASK); | |
355 | +#else | |
356 | + int initialized = 0; | |
357 | +#endif | |
358 | + | |
359 | + IP_VS_DBG(7, "%s: ct=%p, init=%d, tuples=" FMT_TUPLE ", " FMT_TUPLE | |
360 | + ", cp=" FMT_CONN "\n", | |
361 | + __FUNCTION__, ct, initialized, | |
362 | + ARG_TUPLE(orig_tup), ARG_TUPLE(orig_rep), ARG_CONN(cp)); | |
363 | + | |
364 | +#ifdef CONFIG_NF_NAT_NEEDED | |
365 | + /* | |
366 | + * This is really bad, maybe we are trying to alter a DNAT conn? | |
367 | + * This is not supported, avoid the confirmation. | |
368 | + */ | |
369 | + if (initialized && ct->status & IPS_NAT_MASK) { | |
370 | +#ifdef CONFIG_IP_VS_DEBUG | |
371 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, init=%d\n", | |
372 | + __FUNCTION__, ct, ct->status, initialized); | |
373 | +#endif | |
374 | + return 1; | |
375 | + } | |
376 | +#endif | |
377 | + | |
6d4e1af8 | 378 | + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ || NF_INET_FORWARD == hooknum) |
db744e5b | 379 | + goto confirm; |
380 | + | |
381 | + /* | |
382 | + * Alter reply only for IP_VS_CONN_F_MASQ in outin direction. | |
383 | + * For related connections in inout direction it is done in | |
384 | + * expectfn callback. | |
385 | + */ | |
386 | + | |
387 | + pptr = skb_header_pointer(skb, ip_hdrlen(skb), | |
388 | + sizeof(_ports), _ports); | |
389 | + if (!pptr) | |
390 | + goto out; | |
391 | + | |
392 | + new_reply = (struct nf_conntrack_tuple) { | |
393 | + .dst = { .protonum = iph->protocol, .dir = IP_CT_DIR_REPLY }}; | |
394 | + | |
395 | + new_reply.src.u3.ip = cp->daddr; | |
396 | + new_reply.src.u.tcp.port = cp->dport; | |
397 | + new_reply.src.l3num = PF_INET; | |
398 | + new_reply.dst.u3.ip = iph->saddr; | |
399 | + new_reply.dst.u.tcp.port = pptr[0]; | |
400 | + | |
401 | + nf_conntrack_alter_reply(ct, &new_reply); | |
402 | + | |
403 | + IP_VS_DBG(7, "%s: ct=%p, init=%d, orig=" FMT_TUPLE | |
404 | + ", new_reply=" FMT_TUPLE " => alter_reply\n", | |
405 | + __FUNCTION__, ct, initialized, | |
406 | + ARG_TUPLE(orig_tup), ARG_TUPLE(&new_reply)); | |
407 | + | |
408 | + /* | |
409 | + * No need to rehash NAT info because we don't change source | |
410 | + * address in original direction | |
411 | + */ | |
412 | + | |
413 | +confirm: | |
414 | + | |
415 | + ret = __nf_conntrack_confirm(skb); | |
416 | + | |
417 | + if (ret != NF_STOLEN) { | |
418 | + IP_VS_DBG(7, "%s: ct=%p, init=%d, orig=" FMT_TUPLE " => confirm ret=%d\n", | |
419 | + __FUNCTION__, ct, initialized, ARG_TUPLE(orig_tup), ret); | |
420 | + } | |
421 | + | |
422 | + if (ret != NF_ACCEPT) | |
423 | + goto out; | |
424 | + return 1; | |
425 | + | |
426 | +out: | |
427 | + if (ret != NF_STOLEN) | |
428 | + kfree_skb(skb); | |
429 | + return 0; | |
430 | +} | |
431 | + | |
432 | +/* | |
433 | + * Confirm (and optionally alter) the conntrack entry if needed | |
434 | + * because the IPVS packets do not reach ipv4_confirm. | |
435 | + */ | |
436 | +int ip_vs_nfct_confirm(struct sk_buff *skb, struct ip_vs_conn *cp, | |
437 | + unsigned int hooknum) | |
438 | +{ | |
439 | + struct iphdr *iph = ip_hdr(skb); | |
440 | + struct nf_conn *ct = (struct nf_conn *) skb->nfct; | |
441 | + | |
442 | + /* By the time we're sending the packet out the other | |
443 | + * side, there should be a confirmed Netfilter CT entry | |
444 | + * for this connection. This may not be the case, | |
445 | + * however, if it's a brand new connection, or if the NF | |
446 | + * entry has timed out before ours has. Either way, if | |
447 | + * the NF CT entry is unconfirmed, confirm it, and deal | |
448 | + * with reply tuple mangling at the same time. | |
449 | + */ | |
450 | + | |
451 | + /* We only deal with TCP or UDP packets */ | |
452 | + if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP) | |
453 | + return 1; | |
454 | + | |
455 | + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { | |
456 | + /* | |
457 | + * Do not be surprised if non-NAT conntracks stay in SYN_SENT | |
458 | + * state, maybe the replies from the real server go | |
459 | + * directly to client. In any case, keep them in REPLIED | |
460 | + * state (ESTABLISHED). | |
461 | + */ | |
462 | + if (iph->protocol != IPPROTO_TCP || | |
463 | + IP_VS_TCP_S_ESTABLISHED == cp->state) { | |
464 | + set_bit(IPS_SEEN_REPLY_BIT, &ct->status); | |
465 | + } | |
466 | + } | |
467 | + | |
468 | + /* | |
469 | + * We assume the reused connections do not change their rip:rport | |
470 | + * and we do not need to alter their conntrack reply | |
471 | + */ | |
472 | + return __ip_vs_nfct_confirm(skb, cp, hooknum); | |
473 | +} | |
474 | + | |
475 | +/* | |
476 | + * We are called from init_conntrack() as expectfn handler | |
477 | + */ | |
478 | + | |
479 | +static void ip_vs_nfct_expect_callback(struct nf_conn *ct, | |
480 | + struct nf_conntrack_expect *exp) | |
481 | +{ | |
482 | + struct nf_conntrack_tuple *orig, new_reply; | |
483 | + struct ip_vs_conn *cp; | |
484 | + | |
485 | + if (exp->tuple.src.l3num != PF_INET) | |
486 | + return; | |
487 | + | |
488 | + /* | |
489 | + * - We assume that no NF locks are held before this callback | |
490 | + * - ip_vs_conn_out_get and ip_vs_conn_in_get should match their | |
491 | + * expectations even if they use wildcard values, now we provide | |
492 | + * the actual values from the newly created original conntrack direction | |
493 | + * - the conntrack is confirmed when packet reaches IPVS hooks | |
494 | + */ | |
495 | + | |
496 | + /* RS->CLIENT */ | |
497 | + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; | |
498 | + cp = ip_vs_conn_out_get(orig->dst.protonum, | |
499 | + orig->src.u3.ip, orig->src.u.tcp.port, | |
500 | + orig->dst.u3.ip, orig->dst.u.tcp.port); | |
501 | + if (cp) { | |
502 | + /* Change reply CLIENT->RS to CLIENT->VS */ | |
503 | + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; | |
504 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " FMT_TUPLE | |
505 | + ", found inout cp=" FMT_CONN "\n", | |
506 | + __FUNCTION__, ct, ct->status, | |
507 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
508 | + ARG_CONN(cp)); | |
509 | + new_reply.dst.u3.ip = cp->vaddr; | |
510 | + new_reply.dst.u.tcp.port = cp->vport; | |
511 | + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE | |
512 | + ", inout cp=" FMT_CONN "\n", | |
513 | + __FUNCTION__, ct, | |
514 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
515 | + ARG_CONN(cp)); | |
516 | + goto alter; | |
517 | + } | |
518 | + | |
519 | + /* CLIENT->VS */ | |
520 | + cp = ip_vs_conn_in_get(orig->dst.protonum, | |
521 | + orig->src.u3.ip, orig->src.u.tcp.port, | |
522 | + orig->dst.u3.ip, orig->dst.u.tcp.port); | |
523 | + if (cp) { | |
524 | + /* Change reply VS->CLIENT to RS->CLIENT */ | |
525 | + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; | |
526 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " FMT_TUPLE | |
527 | + ", found outin cp=" FMT_CONN "\n", | |
528 | + __FUNCTION__, ct, ct->status, | |
529 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
530 | + ARG_CONN(cp)); | |
531 | + new_reply.src.u3.ip = cp->daddr; | |
532 | + new_reply.src.u.tcp.port = cp->dport; | |
533 | + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE | |
534 | + ", outin cp=" FMT_CONN "\n", | |
535 | + __FUNCTION__, ct, | |
536 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
537 | + ARG_CONN(cp)); | |
538 | + goto alter; | |
539 | + } | |
540 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE " - unknown expect\n", | |
541 | + __FUNCTION__, ct, ct->status, ARG_TUPLE(orig)); | |
542 | + return; | |
543 | + | |
544 | +alter: | |
545 | + | |
546 | + /* Never alter conntrack for non-NAT conns */ | |
547 | + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) | |
548 | + nf_conntrack_alter_reply(ct, &new_reply); | |
549 | + ip_vs_conn_put(cp); | |
550 | + return; | |
551 | +} | |
552 | + | |
553 | +/* | |
554 | + * Create NF conntrack expectation with wildcard (optional) source port. | |
555 | + * Then the default callback function will alter the reply and will confirm | |
556 | + * the conntrack entry when the first packet comes. | |
557 | + */ | |
558 | +void ip_vs_nfct_expect_related(struct sk_buff *skb, struct ip_vs_conn *cp, | |
559 | + __be16 port, __u16 proto, int from_rs) | |
560 | +{ | |
561 | + struct nf_conn *ct = (struct nf_conn *) skb->nfct; | |
562 | + struct nf_conntrack_expect *e; | |
563 | + | |
564 | + if (!sysctl_ip_vs_conntrack) | |
565 | + return; | |
566 | + | |
567 | + if (!ct) { | |
568 | + IP_VS_DBG(7, "%s: ct=%p for cp=" FMT_CONN "\n", | |
569 | + __FUNCTION__, ct, ARG_CONN(cp)); | |
570 | + return; | |
571 | + } | |
572 | + | |
573 | + if (!(e = nf_ct_expect_alloc(ct))) | |
574 | + return; | |
575 | + | |
576 | + e->expectfn = ip_vs_nfct_expect_callback; | |
577 | + e->helper = NULL; | |
578 | + e->flags = 0; | |
579 | + memset(&e->tuple, 0, sizeof(e->tuple)); | |
580 | + e->tuple.src.u.tcp.port = port; | |
581 | + e->tuple.src.l3num = PF_INET; | |
582 | + e->tuple.dst.protonum = proto; | |
583 | + memset(&e->mask, 0, sizeof(e->mask)); | |
584 | + e->mask.src.u3.ip = 0xffffffff; | |
585 | + e->mask.src.u.all = port? 0xffff : 0; | |
586 | + | |
587 | + if (from_rs) { | |
588 | + e->tuple.src.u3.ip = cp->daddr; | |
589 | + e->tuple.dst.u3.ip = cp->caddr; | |
590 | + e->tuple.dst.u.tcp.port = cp->cport; | |
591 | + } else { | |
592 | + e->tuple.src.u3.ip = cp->caddr; | |
593 | + e->tuple.dst.u3.ip = cp->vaddr; | |
594 | + e->tuple.dst.u.tcp.port = cp->vport; | |
595 | + } | |
596 | + | |
597 | + IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", | |
598 | + __FUNCTION__, ct, ARG_TUPLE(&e->tuple)); | |
599 | + nf_ct_expect_related(e); | |
600 | + nf_ct_expect_put(e); | |
601 | +} | |
602 | + | |
603 | +/* | |
604 | + * Our connection was terminated, try to drop the conntrack immediately | |
605 | + */ | |
606 | +void ip_vs_nfct_conn_drop(struct ip_vs_conn *cp) | |
607 | +{ | |
608 | + struct nf_conntrack_tuple_hash *h; | |
609 | + struct nf_conn *ct; | |
610 | + struct nf_conntrack_tuple tuple; | |
611 | + | |
612 | + if (!cp->cport) | |
613 | + return; | |
614 | + | |
615 | + tuple = (struct nf_conntrack_tuple) { | |
616 | + .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; | |
617 | + tuple.src.u3.ip = cp->caddr; | |
618 | + tuple.src.u.all = cp->cport; | |
619 | + tuple.src.l3num = PF_INET; | |
620 | + tuple.dst.u3.ip = cp->vaddr; | |
621 | + tuple.dst.u.all = cp->vport; | |
622 | + | |
623 | + IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE | |
624 | + " for conn " FMT_CONN "\n", | |
625 | + __FUNCTION__, ARG_TUPLE(&tuple), ARG_CONN(cp)); | |
626 | + | |
627 | + h = nf_conntrack_find_get(&tuple); | |
628 | + if (h) { | |
629 | + ct = nf_ct_tuplehash_to_ctrack(h); | |
630 | + if (del_timer(&ct->timeout)) { | |
631 | + IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" | |
632 | + FMT_TUPLE "\n", | |
633 | + __FUNCTION__, ct, ARG_TUPLE(&tuple)); | |
634 | + if (ct->timeout.function) | |
635 | + ct->timeout.function(ct->timeout.data); | |
636 | + } else { | |
637 | + IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" | |
638 | + FMT_TUPLE "\n", | |
639 | + __FUNCTION__, ct, ARG_TUPLE(&tuple)); | |
640 | + } | |
641 | + nf_ct_put(ct); | |
642 | + } else { | |
643 | + IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", | |
644 | + __FUNCTION__, ARG_TUPLE(&tuple)); | |
645 | + } | |
646 | +} | |
647 | + | |
6d4e1af8 | 648 | diff -urNp v2.6.25/linux/net/ipv4/ipvs/ip_vs_xmit.c linux/net/ipv4/ipvs/ip_vs_xmit.c |
649 | --- v2.6.25/linux/net/ipv4/ipvs/ip_vs_xmit.c 2008-04-17 09:58:09.000000000 +0300 | |
650 | +++ linux/net/ipv4/ipvs/ip_vs_xmit.c 2008-04-19 20:04:42.000000000 +0300 | |
db744e5b | 651 | @@ -141,7 +141,6 @@ int |
652 | ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, | |
653 | struct ip_vs_protocol *pp) | |
654 | { | |
655 | - /* we do not touch skb and do not need pskb ptr */ | |
656 | return NF_ACCEPT; | |
657 | } | |
658 | ||
659 | @@ -199,6 +198,9 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s | |
660 | dst_release(skb->dst); | |
661 | skb->dst = &rt->u.dst; | |
662 | ||
6d4e1af8 | 663 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_INET_LOCAL_IN)) |
db744e5b | 664 | + goto tx_error_out; |
665 | + | |
666 | /* Another hack: avoid icmp_send in ip_fragment */ | |
667 | skb->local_df = 1; | |
668 | ||
669 | @@ -211,6 +213,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s | |
670 | dst_link_failure(skb); | |
671 | tx_error: | |
672 | kfree_skb(skb); | |
673 | + tx_error_out: | |
674 | LeaveFunction(10); | |
675 | return NF_STOLEN; | |
676 | } | |
677 | @@ -263,6 +266,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru | |
678 | dst_release(skb->dst); | |
679 | skb->dst = &rt->u.dst; | |
680 | ||
6d4e1af8 | 681 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_INET_LOCAL_IN)) |
db744e5b | 682 | + goto tx_error_out; |
683 | + | |
684 | /* mangle the packet */ | |
685 | if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) | |
686 | goto tx_error; | |
687 | @@ -286,8 +292,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru | |
688 | tx_error_icmp: | |
689 | dst_link_failure(skb); | |
690 | tx_error: | |
691 | - LeaveFunction(10); | |
692 | kfree_skb(skb); | |
693 | + tx_error_out: | |
694 | + LeaveFunction(10); | |
695 | return NF_STOLEN; | |
696 | tx_error_put: | |
697 | ip_rt_put(rt); | |
698 | @@ -386,14 +393,17 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s | |
699 | /* fix old IP header checksum */ | |
700 | ip_send_check(old_iph); | |
701 | ||
702 | - skb_push(skb, sizeof(struct iphdr)); | |
703 | - skb_reset_network_header(skb); | |
704 | - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | |
705 | - | |
706 | /* drop old route */ | |
707 | dst_release(skb->dst); | |
708 | skb->dst = &rt->u.dst; | |
709 | ||
6d4e1af8 | 710 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_INET_LOCAL_IN)) |
db744e5b | 711 | + goto tx_error_out; |
712 | + | |
713 | + skb_push(skb, sizeof(struct iphdr)); | |
714 | + skb_reset_network_header(skb); | |
715 | + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | |
716 | + | |
717 | /* | |
718 | * Push down and install the IPIP header. | |
719 | */ | |
6d4e1af8 | 720 | @@ -421,6 +431,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s |
db744e5b | 721 | dst_link_failure(skb); |
722 | tx_error: | |
723 | kfree_skb(skb); | |
724 | + tx_error_out: | |
725 | LeaveFunction(10); | |
726 | return NF_STOLEN; | |
727 | } | |
6d4e1af8 | 728 | @@ -466,6 +477,9 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc |
db744e5b | 729 | dst_release(skb->dst); |
730 | skb->dst = &rt->u.dst; | |
731 | ||
6d4e1af8 | 732 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_INET_LOCAL_IN)) |
db744e5b | 733 | + goto tx_error_out; |
734 | + | |
735 | /* Another hack: avoid icmp_send in ip_fragment */ | |
736 | skb->local_df = 1; | |
737 | ||
6d4e1af8 | 738 | @@ -478,6 +492,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc |
db744e5b | 739 | dst_link_failure(skb); |
740 | tx_error: | |
741 | kfree_skb(skb); | |
742 | + tx_error_out: | |
743 | LeaveFunction(10); | |
744 | return NF_STOLEN; | |
745 | } | |
6d4e1af8 | 746 | @@ -537,6 +552,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str |
db744e5b | 747 | dst_release(skb->dst); |
748 | skb->dst = &rt->u.dst; | |
749 | ||
750 | + /* TODO: properly alter reply for NFCT */ | |
751 | + | |
752 | ip_vs_nat_icmp(skb, pp, cp, 0); | |
753 | ||
754 | /* Another hack: avoid icmp_send in ip_fragment */ |