]>
Commit | Line | Data |
---|---|---|
8e6b03ae | 1 | diff -urNp v2.6.27/linux/include/net/ip_vs.h linux/include/net/ip_vs.h |
2 | --- v2.6.27/linux/include/net/ip_vs.h 2008-10-11 12:46:15.000000000 +0300 | |
3 | +++ linux/include/net/ip_vs.h 2008-10-11 14:24:47.000000000 +0300 | |
4 | @@ -21,6 +21,13 @@ | |
5 | #include <linux/timer.h> | |
6d4e1af8 | 6 | |
ec625505 | 7 | #include <net/checksum.h> |
db744e5b | 8 | +#include <linux/skbuff.h> |
9 | +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) | |
10 | +#include <net/netfilter/nf_conntrack.h> | |
11 | +#include <net/netfilter/nf_conntrack_core.h> | |
12 | +#include <net/netfilter/nf_conntrack_expect.h> | |
13 | +#include <net/netfilter/nf_conntrack_helper.h> | |
14 | +#endif | |
8e6b03ae | 15 | |
ec625505 AM |
16 | #ifdef CONFIG_IP_VS_DEBUG |
17 | #include <linux/net.h> | |
8e6b03ae | 18 | @@ -474,6 +481,16 @@ extern void ip_vs_init_hash_table(struct |
db744e5b | 19 | */ |
20 | ||
21 | /* | |
22 | + * Netfilter connection tracking | |
23 | + * (from ip_vs_nfct.c) | |
24 | + */ | |
25 | +extern int ip_vs_nfct_confirm(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int hooknum); | |
26 | +extern void ip_vs_nfct_expect_related(struct sk_buff *skb, | |
27 | + struct ip_vs_conn *cp, | |
28 | + __be16 port, __u16 proto, int from_rs); | |
29 | +extern void ip_vs_nfct_conn_drop(struct ip_vs_conn *cp); | |
30 | + | |
31 | +/* | |
32 | * IPVS connection entry hash table | |
33 | */ | |
34 | #ifndef CONFIG_IP_VS_TAB_BITS | |
8e6b03ae | 35 | @@ -643,9 +660,42 @@ extern int sysctl_ip_vs_expire_nodest_co |
db744e5b | 36 | extern int sysctl_ip_vs_expire_quiescent_template; |
37 | extern int sysctl_ip_vs_sync_threshold[2]; | |
38 | extern int sysctl_ip_vs_nat_icmp_send; | |
39 | +extern int sysctl_ip_vs_snat_reroute; | |
40 | extern struct ip_vs_stats ip_vs_stats; | |
ec625505 | 41 | extern const struct ctl_path net_vs_ctl_path[]; |
db744e5b | 42 | |
43 | +#ifdef CONFIG_IP_VS_NFCT | |
44 | + | |
45 | +extern int sysctl_ip_vs_conntrack; | |
46 | + | |
47 | +static inline int ip_vs_use_conntrack(struct sk_buff *skb) | |
48 | +{ | |
8e6b03ae | 49 | + return sysctl_ip_vs_conntrack && skb->nfct; |
db744e5b | 50 | +} |
51 | + | |
52 | +/* Returns boolean and skb is freed on failure */ | |
53 | +static inline int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int hooknum) | |
54 | +{ | |
8e6b03ae | 55 | + if (!ip_vs_use_conntrack(skb)) |
56 | + return 1; | |
57 | + return nf_ct_is_confirmed((struct nf_conn *) skb->nfct) || | |
58 | + ip_vs_nfct_confirm(skb, cp, hooknum); | |
db744e5b | 59 | +} |
60 | + | |
61 | +#else | |
62 | + | |
63 | +static inline int ip_vs_use_conntrack(struct sk_buff *skb) | |
64 | +{ | |
8e6b03ae | 65 | + return 0; |
db744e5b | 66 | +} |
67 | + | |
68 | +static inline int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, unsigned int hooknum) | |
69 | +{ | |
8e6b03ae | 70 | + return 1; |
db744e5b | 71 | +} |
72 | + | |
73 | +#endif | |
74 | + | |
75 | extern struct ip_vs_service * | |
76 | ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport); | |
77 | ||
8e6b03ae | 78 | diff -urNp v2.6.27/linux/net/ipv4/ipvs/Kconfig linux/net/ipv4/ipvs/Kconfig |
79 | --- v2.6.27/linux/net/ipv4/ipvs/Kconfig 2007-07-10 09:18:43.000000000 +0300 | |
80 | +++ linux/net/ipv4/ipvs/Kconfig 2008-10-11 14:19:27.000000000 +0300 | |
db744e5b | 81 | @@ -221,4 +221,12 @@ config IP_VS_FTP |
82 | If you want to compile it in kernel, say Y. To compile it as a | |
83 | module, choose M here. If unsure, say N. | |
84 | ||
85 | +config IP_VS_NFCT | |
86 | + bool "Netfilter connection tracking" | |
87 | + depends on NF_CONNTRACK | |
88 | + ---help--- | |
89 | + The Netfilter connection tracking support allows the IPVS | |
90 | + connection state to be exported to the Netfilter framework | |
91 | + for filtering purposes. | |
92 | + | |
93 | endif # IP_VS | |
8e6b03ae | 94 | diff -urNp v2.6.27/linux/net/ipv4/ipvs/Makefile linux/net/ipv4/ipvs/Makefile |
95 | --- v2.6.27/linux/net/ipv4/ipvs/Makefile 2005-06-18 08:50:52.000000000 +0300 | |
96 | +++ linux/net/ipv4/ipvs/Makefile 2008-10-11 14:19:27.000000000 +0300 | |
db744e5b | 97 | @@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UD |
98 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o | |
99 | ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o | |
100 | ||
101 | +ip_vs-extra_objs-y := | |
102 | +ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o | |
103 | + | |
104 | ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ | |
105 | ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ | |
106 | ip_vs_est.o ip_vs_proto.o \ | |
107 | - $(ip_vs_proto-objs-y) | |
108 | + $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y) | |
109 | ||
110 | ||
111 | # IPVS core | |
8e6b03ae | 112 | diff -urNp v2.6.27/linux/net/ipv4/ipvs/ip_vs_conn.c linux/net/ipv4/ipvs/ip_vs_conn.c |
113 | --- v2.6.27/linux/net/ipv4/ipvs/ip_vs_conn.c 2008-10-11 12:46:16.000000000 +0300 | |
114 | +++ linux/net/ipv4/ipvs/ip_vs_conn.c 2008-10-11 14:19:27.000000000 +0300 | |
115 | @@ -591,6 +591,11 @@ static void ip_vs_conn_expire(unsigned l | |
db744e5b | 116 | if (cp->control) |
117 | ip_vs_control_del(cp); | |
118 | ||
119 | +#ifdef CONFIG_IP_VS_NFCT | |
120 | + if (sysctl_ip_vs_conntrack) | |
121 | + ip_vs_nfct_conn_drop(cp); | |
122 | +#endif | |
123 | + | |
124 | if (unlikely(cp->app != NULL)) | |
125 | ip_vs_unbind_app(cp); | |
126 | ip_vs_unbind_dest(cp); | |
8e6b03ae | 127 | diff -urNp v2.6.27/linux/net/ipv4/ipvs/ip_vs_core.c linux/net/ipv4/ipvs/ip_vs_core.c |
128 | --- v2.6.27/linux/net/ipv4/ipvs/ip_vs_core.c 2008-10-11 12:46:16.000000000 +0300 | |
129 | +++ linux/net/ipv4/ipvs/ip_vs_core.c 2008-10-11 14:19:27.000000000 +0300 | |
130 | @@ -659,6 +659,8 @@ static int ip_vs_out_icmp(struct sk_buff | |
db744e5b | 131 | |
132 | skb->ipvs_property = 1; | |
133 | verdict = NF_ACCEPT; | |
134 | + if (sysctl_ip_vs_snat_reroute && ip_route_me_harder(skb, RTN_LOCAL)) | |
135 | + verdict = NF_DROP; | |
136 | ||
137 | out: | |
138 | __ip_vs_conn_put(cp); | |
8e6b03ae | 139 | @@ -759,19 +761,31 @@ ip_vs_out(unsigned int hooknum, struct s |
db744e5b | 140 | if (!skb_make_writable(skb, ihl)) |
141 | goto drop; | |
142 | ||
143 | + if (!ip_vs_confirm_conntrack(skb, cp, hooknum)) | |
144 | + goto out; | |
145 | + | |
146 | /* mangle the packet */ | |
147 | if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) | |
148 | goto drop; | |
149 | ip_hdr(skb)->saddr = cp->vaddr; | |
150 | ip_send_check(ip_hdr(skb)); | |
151 | ||
152 | + /* | |
153 | + * nf_iterate does not expect change in the skb->dst->dev. | |
154 | + * It looks like it is not fatal to enable this code for hooks | |
155 | + * where our handlers are at the end of the chain list and | |
156 | + * when all next handlers use skb->dst->dev and not outdev. | |
157 | + * It will definitely route the inout NAT traffic properly | |
158 | + * when multiple paths are used. | |
159 | + */ | |
160 | + | |
161 | /* For policy routing, packets originating from this | |
162 | * machine itself may be routed differently to packets | |
163 | * passing through. We want this packet to be routed as | |
164 | * if it came from this machine itself. So re-compute | |
165 | * the routing information. | |
166 | */ | |
167 | - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) | |
168 | + if (sysctl_ip_vs_snat_reroute && ip_route_me_harder(skb, RTN_LOCAL) != 0) | |
169 | goto drop; | |
170 | ||
171 | IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); | |
8e6b03ae | 172 | @@ -786,8 +800,11 @@ ip_vs_out(unsigned int hooknum, struct s |
db744e5b | 173 | return NF_ACCEPT; |
174 | ||
175 | drop: | |
176 | - ip_vs_conn_put(cp); | |
177 | kfree_skb(skb); | |
178 | + | |
179 | + out: | |
180 | + ip_vs_conn_put(cp); | |
181 | + LeaveFunction(11); | |
182 | return NF_STOLEN; | |
183 | } | |
184 | ||
8e6b03ae | 185 | diff -urNp v2.6.27/linux/net/ipv4/ipvs/ip_vs_ctl.c linux/net/ipv4/ipvs/ip_vs_ctl.c |
186 | --- v2.6.27/linux/net/ipv4/ipvs/ip_vs_ctl.c 2008-10-11 12:46:16.000000000 +0300 | |
187 | +++ linux/net/ipv4/ipvs/ip_vs_ctl.c 2008-10-11 14:19:27.000000000 +0300 | |
188 | @@ -79,6 +79,10 @@ int sysctl_ip_vs_expire_nodest_conn = 0; | |
db744e5b | 189 | int sysctl_ip_vs_expire_quiescent_template = 0; |
190 | int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; | |
191 | int sysctl_ip_vs_nat_icmp_send = 0; | |
192 | +int sysctl_ip_vs_snat_reroute = 0; | |
193 | +#ifdef CONFIG_IP_VS_NFCT | |
194 | +int sysctl_ip_vs_conntrack = 0; | |
195 | +#endif | |
196 | ||
197 | ||
198 | #ifdef CONFIG_IP_VS_DEBUG | |
8e6b03ae | 199 | @@ -1457,6 +1461,15 @@ static struct ctl_table vs_vars[] = { |
db744e5b | 200 | .mode = 0644, |
201 | .proc_handler = &proc_dointvec, | |
202 | }, | |
203 | +#ifdef CONFIG_IP_VS_NFCT | |
204 | + { | |
205 | + .procname = "conntrack", | |
206 | + .data = &sysctl_ip_vs_conntrack, | |
207 | + .maxlen = sizeof(int), | |
208 | + .mode = 0644, | |
209 | + .proc_handler = &proc_dointvec, | |
210 | + }, | |
211 | +#endif | |
212 | { | |
213 | .procname = "drop_entry", | |
214 | .data = &sysctl_ip_vs_drop_entry, | |
8e6b03ae | 215 | @@ -1478,6 +1491,13 @@ static struct ctl_table vs_vars[] = { |
db744e5b | 216 | .mode = 0644, |
217 | .proc_handler = &proc_do_defense_mode, | |
218 | }, | |
219 | + { | |
220 | + .procname = "snat_reroute", | |
221 | + .data = &sysctl_ip_vs_snat_reroute, | |
222 | + .maxlen = sizeof(int), | |
223 | + .mode = 0644, | |
224 | + .proc_handler = &proc_dointvec, | |
225 | + }, | |
226 | #if 0 | |
227 | { | |
228 | .procname = "timeout_established", | |
8e6b03ae | 229 | diff -urNp v2.6.27/linux/net/ipv4/ipvs/ip_vs_ftp.c linux/net/ipv4/ipvs/ip_vs_ftp.c |
230 | --- v2.6.27/linux/net/ipv4/ipvs/ip_vs_ftp.c 2008-10-11 12:46:16.000000000 +0300 | |
231 | +++ linux/net/ipv4/ipvs/ip_vs_ftp.c 2008-10-11 14:19:27.000000000 +0300 | |
232 | @@ -193,6 +193,11 @@ static int ip_vs_ftp_out(struct ip_vs_ap | |
db744e5b | 233 | ip_vs_control_add(n_cp, cp); |
234 | } | |
235 | ||
236 | +#ifdef CONFIG_IP_VS_NFCT | |
237 | + if (skb->nfct) | |
238 | + ip_vs_nfct_expect_related(skb, n_cp, 0, IPPROTO_TCP, 0); | |
239 | +#endif | |
240 | + | |
241 | /* | |
242 | * Replace the old passive address with the new one | |
243 | */ | |
8e6b03ae | 244 | @@ -325,6 +330,11 @@ static int ip_vs_ftp_in(struct ip_vs_app |
db744e5b | 245 | ip_vs_control_add(n_cp, cp); |
246 | } | |
247 | ||
248 | +#ifdef CONFIG_IP_VS_NFCT | |
249 | + if (skb->nfct) | |
250 | + ip_vs_nfct_expect_related(skb, n_cp, n_cp->dport, IPPROTO_TCP, 1); | |
251 | +#endif | |
252 | + | |
253 | /* | |
254 | * Move tunnel to listen state | |
255 | */ | |
8e6b03ae | 256 | diff -urNp v2.6.27/linux/net/ipv4/ipvs/ip_vs_nfct.c linux/net/ipv4/ipvs/ip_vs_nfct.c |
257 | --- v2.6.27/linux/net/ipv4/ipvs/ip_vs_nfct.c 1970-01-01 02:00:00.000000000 +0200 | |
258 | +++ linux/net/ipv4/ipvs/ip_vs_nfct.c 2008-10-11 14:19:27.000000000 +0300 | |
259 | @@ -0,0 +1,386 @@ | |
db744e5b | 260 | +/* |
261 | + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS | |
262 | + * | |
263 | + * Portions Copyright (C) 2001-2002 | |
264 | + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. | |
265 | + * | |
266 | + * Portions Copyright (C) 2003-2008 | |
267 | + * Julian Anastasov | |
268 | + * | |
269 | + * | |
270 | + * This code is free software; you can redistribute it and/or modify | |
271 | + * it under the terms of the GNU General Public License as published by | |
272 | + * the Free Software Foundation; either version 2 of the License, or | |
273 | + * (at your option) any later version. | |
274 | + * | |
275 | + * This program is distributed in the hope that it will be useful, | |
276 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
277 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
278 | + * GNU General Public License for more details. | |
279 | + * | |
280 | + * You should have received a copy of the GNU General Public License | |
281 | + * along with this program; if not, write to the Free Software | |
282 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
283 | + * | |
284 | + * | |
285 | + * Authors: | |
286 | + * Ben North <ben@redfrontdoor.org> | |
287 | + * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels | |
288 | + * | |
289 | + * | |
290 | + * Current status: | |
291 | + * | |
292 | + * - provide conntrack confirmation for new and related connections, so | |
293 | + * that we can see their proper conntrack state in all hooks | |
294 | + * - support for all forwarding methods, not only NAT | |
295 | + * - FTP support (NAT), ability to support other NAT apps with expectations | |
296 | + * - to correctly create expectations for related NAT connections the proper | |
297 | + * NF conntrack support must be already installed, eg. ip_vs_ftp requires | |
298 | + * nf_conntrack_ftp for the same ports | |
299 | + * | |
300 | + */ | |
301 | + | |
302 | +#include <linux/module.h> | |
303 | +#include <linux/types.h> | |
304 | +#include <linux/kernel.h> | |
305 | +#include <linux/errno.h> | |
306 | +#include <linux/compiler.h> | |
307 | +#include <linux/vmalloc.h> | |
308 | +#include <linux/skbuff.h> | |
309 | +#include <net/ip.h> | |
310 | +#include <linux/netfilter.h> | |
311 | +#include <linux/netfilter_ipv4.h> | |
312 | +#include <net/ip_vs.h> | |
313 | + | |
314 | + | |
315 | +EXPORT_SYMBOL(ip_vs_nfct_expect_related); | |
316 | + | |
317 | + | |
318 | +#define FMT_TUPLE "%u.%u.%u.%u:%u->%u.%u.%u.%u:%u/%u" | |
319 | +#define ARG_TUPLE(t) NIPQUAD((t)->src.u3.ip), ntohs((t)->src.u.all), \ | |
320 | + NIPQUAD((t)->dst.u3.ip), ntohs((t)->dst.u.all), \ | |
321 | + (t)->dst.protonum | |
322 | + | |
323 | +#define FMT_CONN "%u.%u.%u.%u:%u->%u.%u.%u.%u:%u->%u.%u.%u.%u:%u/%u:%u" | |
324 | +#define ARG_CONN(c) NIPQUAD((c)->caddr), ntohs((c)->cport), \ | |
325 | + NIPQUAD((c)->vaddr), ntohs((c)->vport), \ | |
326 | + NIPQUAD((c)->daddr), ntohs((c)->dport), \ | |
327 | + (c)->protocol, (c)->state | |
328 | + | |
329 | +/* Returns boolean and skb is freed on failure */ | |
330 | +static int __ip_vs_nfct_confirm(struct sk_buff *skb, struct ip_vs_conn *cp, | |
331 | + unsigned int hooknum) | |
332 | +{ | |
333 | + /* | |
334 | + * The assumptions: | |
335 | + * - the nfct is !NULL and is not confirmed | |
336 | + * - we are called before any mangle | |
337 | + */ | |
338 | + | |
339 | + struct iphdr *iph = ip_hdr(skb); | |
340 | + struct nf_conn *ct = (struct nf_conn *) skb->nfct; | |
341 | + struct nf_conntrack_tuple new_reply; | |
342 | + int ret = NF_DROP; | |
343 | + __be16 _ports[2], *pptr; | |
344 | +#ifdef CONFIG_IP_VS_DEBUG | |
345 | + struct nf_conntrack_tuple *orig_tup = | |
346 | + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; | |
347 | + struct nf_conntrack_tuple *orig_rep = | |
348 | + &ct->tuplehash[IP_CT_DIR_REPLY].tuple; | |
349 | +#endif | |
350 | +#ifdef CONFIG_NF_NAT_NEEDED | |
351 | + int initialized = !!(ct->status & IPS_NAT_DONE_MASK); | |
352 | +#else | |
353 | + int initialized = 0; | |
354 | +#endif | |
355 | + | |
356 | + IP_VS_DBG(7, "%s: ct=%p, init=%d, tuples=" FMT_TUPLE ", " FMT_TUPLE | |
357 | + ", cp=" FMT_CONN "\n", | |
358 | + __FUNCTION__, ct, initialized, | |
359 | + ARG_TUPLE(orig_tup), ARG_TUPLE(orig_rep), ARG_CONN(cp)); | |
360 | + | |
361 | +#ifdef CONFIG_NF_NAT_NEEDED | |
362 | + /* | |
363 | + * This is really bad, maybe we are trying to alter DNAT conn? | |
364 | + * This is not supported, avoid the confirmation. | |
365 | + */ | |
366 | + if (initialized && ct->status & IPS_NAT_MASK) { | |
367 | +#ifdef CONFIG_IP_VS_DEBUG | |
368 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, init=%d\n", | |
369 | + __FUNCTION__, ct, ct->status, initialized); | |
370 | +#endif | |
371 | + return 1; | |
372 | + } | |
373 | +#endif | |
374 | + | |
6d4e1af8 | 375 | + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ || NF_INET_FORWARD == hooknum) |
db744e5b | 376 | + goto confirm; |
377 | + | |
378 | + /* | |
379 | + * Alter reply only for IP_VS_CONN_F_MASQ in outin direction. | |
380 | + * For related connections in inout direction it is done in | |
381 | + * expectfn callback. | |
382 | + */ | |
383 | + | |
384 | + pptr = skb_header_pointer(skb, ip_hdrlen(skb), | |
385 | + sizeof(_ports), _ports); | |
386 | + if (!pptr) | |
387 | + goto out; | |
388 | + | |
389 | + new_reply = (struct nf_conntrack_tuple) { | |
390 | + .dst = { .protonum = iph->protocol, .dir = IP_CT_DIR_REPLY }}; | |
391 | + | |
392 | + new_reply.src.u3.ip = cp->daddr; | |
393 | + new_reply.src.u.tcp.port = cp->dport; | |
394 | + new_reply.src.l3num = PF_INET; | |
395 | + new_reply.dst.u3.ip = iph->saddr; | |
396 | + new_reply.dst.u.tcp.port = pptr[0]; | |
397 | + | |
398 | + nf_conntrack_alter_reply(ct, &new_reply); | |
399 | + | |
400 | + IP_VS_DBG(7, "%s: ct=%p, init=%d, orig=" FMT_TUPLE | |
401 | + ", new_reply=" FMT_TUPLE " => alter_reply\n", | |
402 | + __FUNCTION__, ct, initialized, | |
403 | + ARG_TUPLE(orig_tup), ARG_TUPLE(&new_reply)); | |
404 | + | |
405 | + /* | |
406 | + * No need to rehash NAT info because we don't change source | |
407 | + * address in original direction | |
408 | + */ | |
409 | + | |
410 | +confirm: | |
411 | + | |
412 | + ret = __nf_conntrack_confirm(skb); | |
413 | + | |
414 | + if (ret != NF_STOLEN) { | |
415 | + IP_VS_DBG(7, "%s: ct=%p, init=%d, orig=" FMT_TUPLE " => confirm ret=%d\n", | |
416 | + __FUNCTION__, ct, initialized, ARG_TUPLE(orig_tup), ret); | |
417 | + } | |
418 | + | |
419 | + if (ret != NF_ACCEPT) | |
420 | + goto out; | |
421 | + return 1; | |
422 | + | |
423 | +out: | |
424 | + if (ret != NF_STOLEN) | |
425 | + kfree_skb(skb); | |
426 | + return 0; | |
427 | +} | |
428 | + | |
429 | +/* | |
430 | + * Confirm (and optionally alter) the conntrack entry if needed | |
431 | + * because the IPVS packets do not reach ipv4_confirm. | |
432 | + */ | |
433 | +int ip_vs_nfct_confirm(struct sk_buff *skb, struct ip_vs_conn *cp, | |
434 | + unsigned int hooknum) | |
435 | +{ | |
436 | + struct iphdr *iph = ip_hdr(skb); | |
437 | + struct nf_conn *ct = (struct nf_conn *) skb->nfct; | |
438 | + | |
439 | + /* By the time we're sending the packet out the other | |
440 | + * side, there should be a confirmed Netfilter CT entry | |
441 | + * for this connection. This may not be the case, | |
442 | + * however, if it's a brand new connection, or if the NF | |
443 | + * entry has timed out before ours has. Either way, if | |
444 | + * the NF CT entry is unconfirmed, confirm it, and deal | |
445 | + * with reply tuple mangling at the same time. | |
446 | + */ | |
447 | + | |
448 | + /* We only deal with TCP or UDP packets */ | |
449 | + if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP) | |
450 | + return 1; | |
451 | + | |
452 | + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { | |
453 | + /* | |
454 | + * Do not be surprised if non-NAT conntracks stay in SYN_SENT | |
455 | + * state, maybe the replies from the real server go | |
456 | + * directly to client. In any case, keep them in REPLIED | |
457 | + * state (ESTABLISHED). | |
458 | + */ | |
459 | + if (iph->protocol != IPPROTO_TCP || | |
460 | + IP_VS_TCP_S_ESTABLISHED == cp->state) { | |
461 | + set_bit(IPS_SEEN_REPLY_BIT, &ct->status); | |
462 | + } | |
463 | + } | |
464 | + | |
465 | + /* | |
466 | + * We assume the reused connections do not change their rip:rport | |
467 | + * and we do not need to alter their conntrack reply | |
468 | + */ | |
469 | + return __ip_vs_nfct_confirm(skb, cp, hooknum); | |
470 | +} | |
471 | + | |
472 | +/* | |
473 | + * We are called from init_conntrack() as expectfn handler | |
474 | + */ | |
475 | + | |
476 | +static void ip_vs_nfct_expect_callback(struct nf_conn *ct, | |
477 | + struct nf_conntrack_expect *exp) | |
478 | +{ | |
479 | + struct nf_conntrack_tuple *orig, new_reply; | |
480 | + struct ip_vs_conn *cp; | |
481 | + | |
482 | + if (exp->tuple.src.l3num != PF_INET) | |
483 | + return; | |
484 | + | |
485 | + /* | |
486 | + * - We assume that no NF locks are held before this callback | |
487 | + * - ip_vs_conn_out_get and ip_vs_conn_in_get should match their | |
488 | + * expectations even if they use wildcard values, now we provide | |
489 | + * the actual values from the newly created original conntrack direction | |
490 | + * - the conntrack is confirmed when packet reaches IPVS hooks | |
491 | + */ | |
492 | + | |
493 | + /* RS->CLIENT */ | |
494 | + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; | |
495 | + cp = ip_vs_conn_out_get(orig->dst.protonum, | |
496 | + orig->src.u3.ip, orig->src.u.tcp.port, | |
497 | + orig->dst.u3.ip, orig->dst.u.tcp.port); | |
498 | + if (cp) { | |
499 | + /* Change reply CLIENT->RS to CLIENT->VS */ | |
500 | + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; | |
501 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " FMT_TUPLE | |
502 | + ", found inout cp=" FMT_CONN "\n", | |
503 | + __FUNCTION__, ct, ct->status, | |
504 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
505 | + ARG_CONN(cp)); | |
506 | + new_reply.dst.u3.ip = cp->vaddr; | |
507 | + new_reply.dst.u.tcp.port = cp->vport; | |
508 | + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE | |
509 | + ", inout cp=" FMT_CONN "\n", | |
510 | + __FUNCTION__, ct, | |
511 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
512 | + ARG_CONN(cp)); | |
513 | + goto alter; | |
514 | + } | |
515 | + | |
516 | + /* CLIENT->VS */ | |
517 | + cp = ip_vs_conn_in_get(orig->dst.protonum, | |
518 | + orig->src.u3.ip, orig->src.u.tcp.port, | |
519 | + orig->dst.u3.ip, orig->dst.u.tcp.port); | |
520 | + if (cp) { | |
521 | + /* Change reply VS->CLIENT to RS->CLIENT */ | |
522 | + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; | |
523 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " FMT_TUPLE | |
524 | + ", found outin cp=" FMT_CONN "\n", | |
525 | + __FUNCTION__, ct, ct->status, | |
526 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
527 | + ARG_CONN(cp)); | |
528 | + new_reply.src.u3.ip = cp->daddr; | |
529 | + new_reply.src.u.tcp.port = cp->dport; | |
530 | + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE | |
531 | + ", outin cp=" FMT_CONN "\n", | |
532 | + __FUNCTION__, ct, | |
533 | + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), | |
534 | + ARG_CONN(cp)); | |
535 | + goto alter; | |
536 | + } | |
537 | + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE " - unknown expect\n", | |
538 | + __FUNCTION__, ct, ct->status, ARG_TUPLE(orig)); | |
539 | + return; | |
540 | + | |
541 | +alter: | |
542 | + | |
543 | + /* Never alter conntrack for non-NAT conns */ | |
544 | + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) | |
545 | + nf_conntrack_alter_reply(ct, &new_reply); | |
546 | + ip_vs_conn_put(cp); | |
547 | + return; | |
548 | +} | |
549 | + | |
550 | +/* | |
551 | + * Create NF conntrack expectation with wildcard (optional) source port. | |
552 | + * Then the default callback function will alter the reply and will confirm | |
553 | + * the conntrack entry when the first packet comes. | |
554 | + */ | |
555 | +void ip_vs_nfct_expect_related(struct sk_buff *skb, struct ip_vs_conn *cp, | |
556 | + __be16 port, __u16 proto, int from_rs) | |
557 | +{ | |
558 | + struct nf_conn *ct = (struct nf_conn *) skb->nfct; | |
559 | + struct nf_conntrack_expect *e; | |
560 | + | |
561 | + if (!sysctl_ip_vs_conntrack) | |
562 | + return; | |
563 | + | |
564 | + if (!ct) { | |
565 | + IP_VS_DBG(7, "%s: ct=%p for cp=" FMT_CONN "\n", | |
566 | + __FUNCTION__, ct, ARG_CONN(cp)); | |
567 | + return; | |
568 | + } | |
569 | + | |
570 | + if (!(e = nf_ct_expect_alloc(ct))) | |
571 | + return; | |
572 | + | |
573 | + e->expectfn = ip_vs_nfct_expect_callback; | |
574 | + e->helper = NULL; | |
575 | + e->flags = 0; | |
8e6b03ae | 576 | + e->class = NF_CT_EXPECT_CLASS_DEFAULT; |
db744e5b | 577 | + memset(&e->tuple, 0, sizeof(e->tuple)); |
578 | + e->tuple.src.u.tcp.port = port; | |
579 | + e->tuple.src.l3num = PF_INET; | |
580 | + e->tuple.dst.protonum = proto; | |
581 | + memset(&e->mask, 0, sizeof(e->mask)); | |
582 | + e->mask.src.u3.ip = 0xffffffff; | |
583 | + e->mask.src.u.all = port? 0xffff : 0; | |
584 | + | |
585 | + if (from_rs) { | |
586 | + e->tuple.src.u3.ip = cp->daddr; | |
587 | + e->tuple.dst.u3.ip = cp->caddr; | |
588 | + e->tuple.dst.u.tcp.port = cp->cport; | |
589 | + } else { | |
590 | + e->tuple.src.u3.ip = cp->caddr; | |
591 | + e->tuple.dst.u3.ip = cp->vaddr; | |
592 | + e->tuple.dst.u.tcp.port = cp->vport; | |
593 | + } | |
594 | + | |
595 | + IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", | |
596 | + __FUNCTION__, ct, ARG_TUPLE(&e->tuple)); | |
597 | + nf_ct_expect_related(e); | |
598 | + nf_ct_expect_put(e); | |
599 | +} | |
600 | + | |
601 | +/* | |
602 | + * Our connection was terminated, try to drop the conntrack immediately | |
603 | + */ | |
604 | +void ip_vs_nfct_conn_drop(struct ip_vs_conn *cp) | |
605 | +{ | |
606 | + struct nf_conntrack_tuple_hash *h; | |
607 | + struct nf_conn *ct; | |
608 | + struct nf_conntrack_tuple tuple; | |
609 | + | |
610 | + if (!cp->cport) | |
611 | + return; | |
612 | + | |
613 | + tuple = (struct nf_conntrack_tuple) { | |
614 | + .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; | |
615 | + tuple.src.u3.ip = cp->caddr; | |
616 | + tuple.src.u.all = cp->cport; | |
617 | + tuple.src.l3num = PF_INET; | |
618 | + tuple.dst.u3.ip = cp->vaddr; | |
619 | + tuple.dst.u.all = cp->vport; | |
620 | + | |
621 | + IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE | |
622 | + " for conn " FMT_CONN "\n", | |
623 | + __FUNCTION__, ARG_TUPLE(&tuple), ARG_CONN(cp)); | |
624 | + | |
625 | + h = nf_conntrack_find_get(&tuple); | |
626 | + if (h) { | |
627 | + ct = nf_ct_tuplehash_to_ctrack(h); | |
628 | + if (del_timer(&ct->timeout)) { | |
629 | + IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" | |
630 | + FMT_TUPLE "\n", | |
631 | + __FUNCTION__, ct, ARG_TUPLE(&tuple)); | |
632 | + if (ct->timeout.function) | |
633 | + ct->timeout.function(ct->timeout.data); | |
634 | + } else { | |
635 | + IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" | |
636 | + FMT_TUPLE "\n", | |
637 | + __FUNCTION__, ct, ARG_TUPLE(&tuple)); | |
638 | + } | |
639 | + nf_ct_put(ct); | |
640 | + } else { | |
641 | + IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", | |
642 | + __FUNCTION__, ARG_TUPLE(&tuple)); | |
643 | + } | |
644 | +} | |
645 | + | |
8e6b03ae | 646 | diff -urNp v2.6.27/linux/net/ipv4/ipvs/ip_vs_xmit.c linux/net/ipv4/ipvs/ip_vs_xmit.c |
647 | --- v2.6.27/linux/net/ipv4/ipvs/ip_vs_xmit.c 2008-10-11 12:46:16.000000000 +0300 | |
648 | +++ linux/net/ipv4/ipvs/ip_vs_xmit.c 2008-10-11 14:19:27.000000000 +0300 | |
649 | @@ -139,7 +139,6 @@ int | |
db744e5b | 650 | ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, |
651 | struct ip_vs_protocol *pp) | |
652 | { | |
653 | - /* we do not touch skb and do not need pskb ptr */ | |
654 | return NF_ACCEPT; | |
655 | } | |
656 | ||
8e6b03ae | 657 | @@ -197,6 +196,9 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s |
db744e5b | 658 | dst_release(skb->dst); |
659 | skb->dst = &rt->u.dst; | |
660 | ||
6d4e1af8 | 661 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_INET_LOCAL_IN)) |
db744e5b | 662 | + goto tx_error_out; |
663 | + | |
664 | /* Another hack: avoid icmp_send in ip_fragment */ | |
665 | skb->local_df = 1; | |
666 | ||
8e6b03ae | 667 | @@ -209,6 +211,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s |
db744e5b | 668 | dst_link_failure(skb); |
669 | tx_error: | |
670 | kfree_skb(skb); | |
671 | + tx_error_out: | |
672 | LeaveFunction(10); | |
673 | return NF_STOLEN; | |
674 | } | |
8e6b03ae | 675 | @@ -261,6 +264,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru |
db744e5b | 676 | dst_release(skb->dst); |
677 | skb->dst = &rt->u.dst; | |
678 | ||
6d4e1af8 | 679 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_INET_LOCAL_IN)) |
db744e5b | 680 | + goto tx_error_out; |
681 | + | |
682 | /* mangle the packet */ | |
683 | if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) | |
684 | goto tx_error; | |
8e6b03ae | 685 | @@ -284,8 +290,9 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru |
db744e5b | 686 | tx_error_icmp: |
687 | dst_link_failure(skb); | |
688 | tx_error: | |
689 | - LeaveFunction(10); | |
690 | kfree_skb(skb); | |
691 | + tx_error_out: | |
692 | + LeaveFunction(10); | |
693 | return NF_STOLEN; | |
694 | tx_error_put: | |
695 | ip_rt_put(rt); | |
8e6b03ae | 696 | @@ -384,14 +391,17 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s |
db744e5b | 697 | /* fix old IP header checksum */ |
698 | ip_send_check(old_iph); | |
699 | ||
700 | - skb_push(skb, sizeof(struct iphdr)); | |
701 | - skb_reset_network_header(skb); | |
702 | - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | |
703 | - | |
704 | /* drop old route */ | |
705 | dst_release(skb->dst); | |
706 | skb->dst = &rt->u.dst; | |
707 | ||
6d4e1af8 | 708 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_INET_LOCAL_IN)) |
db744e5b | 709 | + goto tx_error_out; |
710 | + | |
711 | + skb_push(skb, sizeof(struct iphdr)); | |
712 | + skb_reset_network_header(skb); | |
713 | + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); | |
714 | + | |
715 | /* | |
716 | * Push down and install the IPIP header. | |
717 | */ | |
8e6b03ae | 718 | @@ -419,6 +429,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s |
db744e5b | 719 | dst_link_failure(skb); |
720 | tx_error: | |
721 | kfree_skb(skb); | |
722 | + tx_error_out: | |
723 | LeaveFunction(10); | |
724 | return NF_STOLEN; | |
725 | } | |
8e6b03ae | 726 | @@ -464,6 +475,9 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc |
db744e5b | 727 | dst_release(skb->dst); |
728 | skb->dst = &rt->u.dst; | |
729 | ||
6d4e1af8 | 730 | + if (!ip_vs_confirm_conntrack(skb, cp, NF_INET_LOCAL_IN)) |
db744e5b | 731 | + goto tx_error_out; |
732 | + | |
733 | /* Another hack: avoid icmp_send in ip_fragment */ | |
734 | skb->local_df = 1; | |
735 | ||
8e6b03ae | 736 | @@ -476,6 +490,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc |
db744e5b | 737 | dst_link_failure(skb); |
738 | tx_error: | |
739 | kfree_skb(skb); | |
740 | + tx_error_out: | |
741 | LeaveFunction(10); | |
742 | return NF_STOLEN; | |
743 | } | |
8e6b03ae | 744 | @@ -535,6 +550,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str |
db744e5b | 745 | dst_release(skb->dst); |
746 | skb->dst = &rt->u.dst; | |
747 | ||
748 | + /* TODO: properly alter reply for NFCT */ | |
749 | + | |
750 | ip_vs_nat_icmp(skb, pp, cp, 0); | |
751 | ||
752 | /* Another hack: avoid icmp_send in ip_fragment */ |