diff -Nur linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h
--- linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h	2003-11-26 21:44:58.000000000 +0100
+++ linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h	2003-12-17 14:02:02.000000000 +0100
@@ -14,7 +14,7 @@
 union ip_conntrack_manip_proto
 {
 	/* Add other protocols here. */
-	u_int16_t all;
+	u_int32_t all;
 
 	struct {
 		u_int16_t port;
@@ -25,6 +25,9 @@
 	struct {
 		u_int16_t id;
 	} icmp;
+	struct {
+		u_int32_t key;
+	} gre;
 };
 
 /* The manipulable part of the tuple. */
@@ -44,7 +47,7 @@
 		u_int32_t ip;
 		union {
 			/* Add other protocols here. */
-			u_int16_t all;
+			u_int64_t all;
 
 			struct {
 				u_int16_t port;
@@ -55,6 +58,11 @@
 			struct {
 				u_int8_t type, code;
 			} icmp;
+			struct {
+				u_int16_t protocol;
+				u_int8_t version;
+				u_int32_t key;
+			} gre;
 		} u;
 
 		/* The protocol. */
@@ -80,10 +88,16 @@
 #ifdef __KERNEL__
 
 #define DUMP_TUPLE(tp) \
-DEBUGP("tuple %p: %u %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n", \
+DEBUGP("tuple %p: %u %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n", \
 	(tp), (tp)->dst.protonum, \
-	NIPQUAD((tp)->src.ip), ntohs((tp)->src.u.all), \
-	NIPQUAD((tp)->dst.ip), ntohs((tp)->dst.u.all))
+	NIPQUAD((tp)->src.ip), ntohl((tp)->src.u.all), \
+	NIPQUAD((tp)->dst.ip), ntohl((tp)->dst.u.all))
+
+#define DUMP_TUPLE_RAW(x) \
+	DEBUGP("tuple %p: %u %u.%u.%u.%u:0x%08x -> %u.%u.%u.%u:0x%08x\n",\
+	       (x), (x)->dst.protonum, \
+	       NIPQUAD((x)->src.ip), ntohl((x)->src.u.all), \
+	       NIPQUAD((x)->dst.ip), ntohl((x)->dst.u.all))
 
 #define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
 
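
The hunk above widens the manipulable union: src.u.all grows from 16 to
32 bits and dst.u.all to 64 bits, so a GRE entry (whose 32-bit key is
where PPTP keeps its call IDs) can sit beside the 16-bit TCP/UDP ports.
A minimal stand-alone sketch of the widened source union; this is
illustrative user-space C, not part of the patch, with <stdint.h> types
standing in for the kernel's u_int32_t:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

union manip_proto {
	uint32_t all;			/* was u_int16_t before this patch */
	struct { uint16_t port; } tcp;
	struct { uint16_t port; } udp;
	struct { uint16_t id; } icmp;
	struct { uint32_t key; } gre;	/* new: 32-bit GRE key */
};

int main(void)
{
	union manip_proto u = { .gre = { .key = 0xdeadbeef } };

	/* The widened 'all' view covers the full GRE key, so the
	 * generic tuple code can hash and compare GRE flows like any
	 * port-based protocol. */
	printf("all=0x%08" PRIx32 " gre.key=0x%08" PRIx32 "\n",
	       u.all, u.gre.key);
	return 0;
}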
diff -Nur linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig
--- linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig	2003-11-26 21:44:58.000000000 +0100
@@ -0,0 +1,139 @@
+#ifndef _IP_CONNTRACK_TUPLE_H
+#define _IP_CONNTRACK_TUPLE_H
+
+/* A `tuple' is a structure containing the information to uniquely
+   identify a connection. ie. if two packets have the same tuple, they
+   are in the same connection; if not, they are not.
+
+   We divide the structure along "manipulatable" and
+   "non-manipulatable" lines, for the benefit of the NAT code.
+*/
+
+/* The protocol-specific manipulable parts of the tuple: always in
+   network order! */
+union ip_conntrack_manip_proto
+{
+	/* Add other protocols here. */
+	u_int16_t all;
+
+	struct {
+		u_int16_t port;
+	} tcp;
+	struct {
+		u_int16_t port;
+	} udp;
+	struct {
+		u_int16_t id;
+	} icmp;
+};
+
+/* The manipulable part of the tuple. */
+struct ip_conntrack_manip
+{
+	u_int32_t ip;
+	union ip_conntrack_manip_proto u;
+};
+
+/* This contains the information to distinguish a connection. */
+struct ip_conntrack_tuple
+{
+	struct ip_conntrack_manip src;
+
+	/* These are the parts of the tuple which are fixed. */
+	struct {
+		u_int32_t ip;
+		union {
+			/* Add other protocols here. */
+			u_int16_t all;
+
+			struct {
+				u_int16_t port;
+			} tcp;
+			struct {
+				u_int16_t port;
+			} udp;
+			struct {
+				u_int8_t type, code;
+			} icmp;
+		} u;
+
+		/* The protocol. */
+		u_int16_t protonum;
+	} dst;
+};
+
+/* This is optimized opposed to a memset of the whole structure. Everything we
+ * really care about is the source/destination unions */
+#define IP_CT_TUPLE_U_BLANK(tuple) \
+	do { \
+		(tuple)->src.u.all = 0; \
+		(tuple)->dst.u.all = 0; \
+	} while (0)
+
+enum ip_conntrack_dir
+{
+	IP_CT_DIR_ORIGINAL,
+	IP_CT_DIR_REPLY,
+	IP_CT_DIR_MAX
+};
+
+#ifdef __KERNEL__
+
+#define DUMP_TUPLE(tp) \
+DEBUGP("tuple %p: %u %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n", \
+	(tp), (tp)->dst.protonum, \
+	NIPQUAD((tp)->src.ip), ntohs((tp)->src.u.all), \
+	NIPQUAD((tp)->dst.ip), ntohs((tp)->dst.u.all))
+
+#define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
+
+/* If we're the first tuple, it's the original dir. */
+#define DIRECTION(h) ((enum ip_conntrack_dir)(&(h)->ctrack->tuplehash[1] == (h)))
+
+/* Connections have two entries in the hash table: one for each way */
+struct ip_conntrack_tuple_hash
+{
+	struct list_head list;
+
+	struct ip_conntrack_tuple tuple;
+
+	/* this == &ctrack->tuplehash[DIRECTION(this)]. */
+	struct ip_conntrack *ctrack;
+};
+
+#endif /* __KERNEL__ */
+
+static inline int ip_ct_tuple_src_equal(const struct ip_conntrack_tuple *t1,
+					const struct ip_conntrack_tuple *t2)
+{
+	return t1->src.ip == t2->src.ip
+		&& t1->src.u.all == t2->src.u.all;
+}
+
+static inline int ip_ct_tuple_dst_equal(const struct ip_conntrack_tuple *t1,
+					const struct ip_conntrack_tuple *t2)
+{
+	return t1->dst.ip == t2->dst.ip
+		&& t1->dst.u.all == t2->dst.u.all
+		&& t1->dst.protonum == t2->dst.protonum;
+}
+
+static inline int ip_ct_tuple_equal(const struct ip_conntrack_tuple *t1,
+				    const struct ip_conntrack_tuple *t2)
+{
+	return ip_ct_tuple_src_equal(t1, t2) && ip_ct_tuple_dst_equal(t1, t2);
+}
+
+static inline int ip_ct_tuple_mask_cmp(const struct ip_conntrack_tuple *t,
+				       const struct ip_conntrack_tuple *tuple,
+				       const struct ip_conntrack_tuple *mask)
+{
+	return !(((t->src.ip ^ tuple->src.ip) & mask->src.ip)
+		 || ((t->dst.ip ^ tuple->dst.ip) & mask->dst.ip)
+		 || ((t->src.u.all ^ tuple->src.u.all) & mask->src.u.all)
+		 || ((t->dst.u.all ^ tuple->dst.u.all) & mask->dst.u.all)
+		 || ((t->dst.protonum ^ tuple->dst.protonum)
+		     & mask->dst.protonum));
+}
+
+#endif /* _IP_CONNTRACK_TUPLE_H */
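
The helpers at the end of the header above compare tuples under a mask:
a bit takes part in the comparison only where the mask has it set, which
is what lets one expectation cover e.g. "any source port". The XOR/AND
idiom of ip_ct_tuple_mask_cmp() can be tried in isolation; a hedged
stand-alone sketch with an illustrative cut-down struct:

#include <stdint.h>
#include <stdio.h>

/* Cut-down stand-in for struct ip_conntrack_tuple; names are
 * illustrative and only the fields the idiom needs are kept. */
struct mini_tuple {
	uint32_t src_ip;
	uint32_t dst_ip;
	uint16_t protonum;
};

/* Same shape as ip_ct_tuple_mask_cmp(): XOR exposes differing bits,
 * AND keeps only the bits the mask cares about; the tuples match
 * iff no masked bit differs. */
static int mask_cmp(const struct mini_tuple *t,
		    const struct mini_tuple *tuple,
		    const struct mini_tuple *mask)
{
	return !(((t->src_ip ^ tuple->src_ip) & mask->src_ip)
		 || ((t->dst_ip ^ tuple->dst_ip) & mask->dst_ip)
		 || ((t->protonum ^ tuple->protonum) & mask->protonum));
}

int main(void)
{
	struct mini_tuple a = { 0xc0a80001, 0xc0a80002, 6 };
	struct mini_tuple b = { 0xc0a80001, 0xc0a800ff, 6 };
	/* Mask out the last octet of dst_ip: a and b now match. */
	struct mini_tuple m = { 0xffffffff, 0xffffff00, 0xffff };

	printf("match=%d\n", mask_cmp(&a, &b, &m));	/* prints match=1 */
	return 0;
}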
diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile linux-2.6.0-test11/net/ipv4/netfilter/Makefile
--- linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile	2003-11-26 21:43:25.000000000 +0100
+++ linux-2.6.0-test11/net/ipv4/netfilter/Makefile	2003-12-17 14:02:02.000000000 +0100
@@ -19,6 +19,15 @@
 # connection tracking
 obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
 
+# connection tracking protocol helpers
+obj-$(CONFIG_IP_NF_CT_PROTO_GRE) += ip_conntrack_proto_gre.o
+ifdef CONFIG_IP_NF_CT_PROTO_GRE
+	export-objs += ip_conntrack_proto_gre.o
+endif
+
+# NAT protocol helpers
+obj-$(CONFIG_IP_NF_NAT_PROTO_GRE) += ip_nat_proto_gre.o
+
 # connection tracking helpers
 obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
 obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
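
The hunk above only wires the new objects into the build; the
ip_conntrack_proto_gre.c and ip_nat_proto_gre.c sources are not part of
this excerpt. For orientation, a hypothetical skeleton of what such a
conntrack protocol helper module registers. Everything here is an
assumption: the field set is abridged, and while
ip_conntrack_protocol_register() exists in 2.6.0-test11, a matching
unregister may have to come from the patch itself.

/* Hypothetical sketch only - not the patch's actual GRE module. */
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>

static struct ip_conntrack_protocol gre = {
	.proto = 47,	/* IPPROTO_GRE */
	.name  = "gre",
	/* .pkt_to_tuple/.invert_tuple would read and swap the GRE
	 * keys added to the tuple union by the header hunk above;
	 * .packet/.new would manage timeouts. */
};

static int __init init(void)
{
	return ip_conntrack_protocol_register(&gre);
}

static void __exit fini(void)
{
	/* Assumption: the stock tree has no protocol unregister;
	 * the real patch must supply one or avoid needing it. */
}

module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");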
diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile.orig linux-2.6.0-test11/net/ipv4/netfilter/Makefile.orig
--- linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile.orig	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.0-test11/net/ipv4/netfilter/Makefile.orig	2003-11-26 21:43:25.000000000 +0100
@@ -0,0 +1,96 @@
+#
+# Makefile for the netfilter modules on top of IPv4.
+#
+
+# objects for the conntrack and NAT core (used by standalone and backw. compat)
+ip_nf_conntrack-objs := ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
+ip_nf_nat-objs := ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
+
+# objects for the standalone - connection tracking / NAT
+ip_conntrack-objs := ip_conntrack_standalone.o $(ip_nf_conntrack-objs)
+iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o $(ip_nf_nat-objs)
+
+# objects for backwards compatibility mode
+ip_nf_compat-objs := ip_fw_compat.o ip_fw_compat_redir.o ip_fw_compat_masq.o $(ip_nf_conntrack-objs) $(ip_nf_nat-objs)
+
+ipfwadm-objs := $(ip_nf_compat-objs) ipfwadm_core.o
+ipchains-objs := $(ip_nf_compat-objs) ipchains_core.o
+
+# connection tracking
+obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
+
+# connection tracking helpers
+obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
+obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
+obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
+obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+
+# NAT helpers
+obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
+obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
+obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
+obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
+
+# generic IP tables
+obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+
+# the three instances of ip_tables
+obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
+obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
+
+# matches
+obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
+obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
+obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
+obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
+obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
+
+obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o
+obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o
+obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
+obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
+
+obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
+
+obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
+obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o
+obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o
+
+obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o
+
+obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
+obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
+obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
+obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
+
+obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
+
+# targets
+obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
+obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o
+obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
+obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o
+obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o
+obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
+obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
+obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
+obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
+obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o
+obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o
+obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
+obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
+obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
+
+# generic ARP tables
+obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
+obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
+
+# just filtering instance of ARP tables for now
+obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
+
+# backwards compatibility
+obj-$(CONFIG_IP_NF_COMPAT_IPCHAINS) += ipchains.o
+obj-$(CONFIG_IP_NF_COMPAT_IPFWADM) += ipfwadm.o
+
+obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c
--- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c	2003-11-26 21:42:40.000000000 +0100
+++ linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c	2003-12-17 14:02:02.000000000 +0100
@@ -150,6 +150,8 @@
 	inverse->dst.ip = orig->src.ip;
 	inverse->dst.protonum = orig->dst.protonum;
 
+	inverse->src.u.all = inverse->dst.u.all = 0;
+
 	return protocol->invert_tuple(inverse, orig);
 }
 
@@ -925,8 +927,8 @@
 	 * so there is no need to use the tuple lock too */
 
 	DEBUGP("ip_conntrack_expect_related %p\n", related_to);
-	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
-	DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
+	DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
+	DEBUGP("mask: "); DUMP_TUPLE_RAW(&expect->mask);
 
 	old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
 			struct ip_conntrack_expect *, &expect->tuple,
@@ -1051,15 +1053,14 @@
 
 	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
 	WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
-
 	DEBUGP("change_expect:\n");
-	DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
-	DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask);
-	DEBUGP("newtuple: "); DUMP_TUPLE(newtuple);
+	DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
+	DEBUGP("exp mask: "); DUMP_TUPLE_RAW(&expect->mask);
+	DEBUGP("newtuple: "); DUMP_TUPLE_RAW(newtuple);
 	if (expect->ct_tuple.dst.protonum == 0) {
 		/* Never seen before */
 		DEBUGP("change expect: never seen before\n");
-		if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
+		if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
 		    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
 			         struct ip_conntrack_expect *, newtuple, &expect->mask)) {
 			/* Force NAT to find an unused tuple */
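
The first hunk above is the subtle one: it clears both port/key unions
before delegating to the protocol's invert_tuple() callback. Once the
unions are widened to 32/64 bits, a handler that writes only a 16-bit
port leaves whatever was previously in the remaining bits of the 'all'
view, and hashing/comparison read that view. A stand-alone sketch of
the hazard (illustrative user-space C, not the patch's code):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

union manip {
	uint32_t all;			/* wide view used for hash/compare */
	struct { uint16_t port; } tcp;	/* what a TCP handler fills in */
};

int main(void)
{
	union manip m;

	m.all = 0xdeadbeef;	/* stale contents from an earlier tuple */
	m.tcp.port = 80;	/* handler writes only 16 bits */
	/* The bits the port write didn't touch keep their old value: */
	printf("dirty: all=0x%08" PRIx32 "\n", m.all);

	memset(&m, 0, sizeof(m));	/* what the added lines do */
	m.tcp.port = 80;
	printf("clean: all=0x%08" PRIx32 "\n", m.all);
	return 0;
}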
diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.orig linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.orig
--- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.orig	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.orig	2003-11-26 21:42:40.000000000 +0100
@@ -0,0 +1,1430 @@
+/* Connection state tracking for netfilter. This is separated from,
+   but required by, the NAT layer; it can also be used by an iptables
+   extension. */
+
+/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
+ * Public Licence.
+ *
+ * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
+ *	- new API and handling of conntrack/nat helpers
+ *	- now capable of multiple expectations for one master
+ * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
+ *	- add usage/reference counts to ip_conntrack_expect
+ *	- export ip_conntrack[_expect]_{find_get,put} functions
+ * */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <net/checksum.h>
+#include <linux/stddef.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+/* For ERR_PTR(). Yeah, I know... --RR */
+#include <linux/fs.h>
+
+/* This rwlock protects the main hash table, protocol/helper/expected
+   registrations, conntrack timers*/
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#define IP_CONNTRACK_VERSION "2.1"
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_RWLOCK(ip_conntrack_lock);
+DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
+
+void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
+LIST_HEAD(ip_conntrack_expect_list);
+LIST_HEAD(protocol_list);
+static LIST_HEAD(helpers);
+unsigned int ip_conntrack_htable_size = 0;
+int ip_conntrack_max;
+static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
+struct list_head *ip_conntrack_hash;
+static kmem_cache_t *ip_conntrack_cachep;
+
+extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
+
+static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
+			      u_int8_t protocol)
+{
+	return protocol == curr->proto;
+}
+
+struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
+{
+	struct ip_conntrack_protocol *p;
+
+	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	p = LIST_FIND(&protocol_list, proto_cmpfn,
+		      struct ip_conntrack_protocol *, protocol);
+	if (!p)
+		p = &ip_conntrack_generic_protocol;
+
+	return p;
+}
+
+struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
+{
+	struct ip_conntrack_protocol *p;
+
+	READ_LOCK(&ip_conntrack_lock);
+	p = __ip_ct_find_proto(protocol);
+	READ_UNLOCK(&ip_conntrack_lock);
+	return p;
+}
+
+inline void
+ip_conntrack_put(struct ip_conntrack *ct)
+{
+	IP_NF_ASSERT(ct);
+	IP_NF_ASSERT(ct->infos[0].master);
+	/* nf_conntrack_put wants to go via an info struct, so feed it
+	   one at random. */
+	nf_conntrack_put(&ct->infos[0]);
+}
+
+static int ip_conntrack_hash_rnd_initted;
+static unsigned int ip_conntrack_hash_rnd;
+
+static u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+#if 0
+	dump_tuple(tuple);
+#endif
+	return (jhash_3words(tuple->src.ip,
+			     (tuple->dst.ip ^ tuple->dst.protonum),
+			     (tuple->src.u.all | (tuple->dst.u.all << 16)),
+			     ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
+}
+
+int
+get_tuple(const struct iphdr *iph,
+	  const struct sk_buff *skb,
+	  unsigned int dataoff,
+	  struct ip_conntrack_tuple *tuple,
+	  const struct ip_conntrack_protocol *protocol)
+{
+	/* Never happen */
+	if (iph->frag_off & htons(IP_OFFSET)) {
+		printk("ip_conntrack_core: Frag of proto %u.\n",
+		       iph->protocol);
+		return 0;
+	}
+
+	tuple->src.ip = iph->saddr;
+	tuple->dst.ip = iph->daddr;
+	tuple->dst.protonum = iph->protocol;
+
+	return protocol->pkt_to_tuple(skb, dataoff, tuple);
+}
+
+static int
+invert_tuple(struct ip_conntrack_tuple *inverse,
+	     const struct ip_conntrack_tuple *orig,
+	     const struct ip_conntrack_protocol *protocol)
+{
+	inverse->src.ip = orig->dst.ip;
+	inverse->dst.ip = orig->src.ip;
+	inverse->dst.protonum = orig->dst.protonum;
+
+	return protocol->invert_tuple(inverse, orig);
+}
+
+
+/* ip_conntrack_expect helper functions */
+
+/* Compare tuple parts depending on mask. */
+static inline int expect_cmp(const struct ip_conntrack_expect *i,
+			     const struct ip_conntrack_tuple *tuple)
+{
+	MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
+	return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
+}
+
+static void
+destroy_expect(struct ip_conntrack_expect *exp)
+{
+	DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
+	IP_NF_ASSERT(atomic_read(&exp->use));
+	IP_NF_ASSERT(!timer_pending(&exp->timeout));
+
+	kfree(exp);
+}
+
+
+inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
+{
+	IP_NF_ASSERT(exp);
+
+	if (atomic_dec_and_test(&exp->use)) {
+		/* usage count dropped to zero */
+		destroy_expect(exp);
+	}
+}
+
+static inline struct ip_conntrack_expect *
+__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
+{
+	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
+	return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
+			 struct ip_conntrack_expect *, tuple);
+}
+
+/* Find a expectation corresponding to a tuple. */
+struct ip_conntrack_expect *
+ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_expect *exp;
+
+	READ_LOCK(&ip_conntrack_lock);
+	READ_LOCK(&ip_conntrack_expect_tuple_lock);
+	exp = __ip_ct_expect_find(tuple);
+	if (exp)
+		atomic_inc(&exp->use);
+	READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
+	READ_UNLOCK(&ip_conntrack_lock);
+
+	return exp;
+}
+
+/* remove one specific expectation from all lists and drop refcount,
+ * does _NOT_ delete the timer. */
+static void __unexpect_related(struct ip_conntrack_expect *expect)
+{
+	DEBUGP("unexpect_related(%p)\n", expect);
+	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+
+	/* we're not allowed to unexpect a confirmed expectation! */
+	IP_NF_ASSERT(!expect->sibling);
+
+	/* delete from global and local lists */
+	list_del(&expect->list);
+	list_del(&expect->expected_list);
+
+	/* decrement expect-count of master conntrack */
+	if (expect->expectant)
+		expect->expectant->expecting--;
+
+	ip_conntrack_expect_put(expect);
+}
+
+/* remove one specific expecatation from all lists, drop refcount
+ * and expire timer.
+ * This function can _NOT_ be called for confirmed expects! */
+static void unexpect_related(struct ip_conntrack_expect *expect)
+{
+	IP_NF_ASSERT(expect->expectant);
+	IP_NF_ASSERT(expect->expectant->helper);
+	/* if we are supposed to have a timer, but we can't delete
+	 * it: race condition. __unexpect_related will
+	 * be calledd by timeout function */
+	if (expect->expectant->helper->timeout
+	    && !del_timer(&expect->timeout))
+		return;
+
+	__unexpect_related(expect);
+}
+
+/* delete all unconfirmed expectations for this conntrack */
+static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
+{
+	struct list_head *exp_entry, *next;
+	struct ip_conntrack_expect *exp;
+
+	DEBUGP("remove_expectations(%p)\n", ct);
+
+	list_for_each_safe(exp_entry, next, &ct->sibling_list) {
+		exp = list_entry(exp_entry, struct ip_conntrack_expect,
+				 expected_list);
+
+		/* we skip established expectations, as we want to delete
+		 * the un-established ones only */
+		if (exp->sibling) {
+			DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
+			if (drop_refcount) {
+				/* Indicate that this expectations parent is dead */
+				ip_conntrack_put(exp->expectant);
+				exp->expectant = NULL;
+			}
+			continue;
+		}
+
+		IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
+		IP_NF_ASSERT(exp->expectant == ct);
+
+		/* delete expectation from global and private lists */
+		unexpect_related(exp);
+	}
+}
+
+static void
+clean_from_lists(struct ip_conntrack *ct)
+{
+	unsigned int ho, hr;
+
+	DEBUGP("clean_from_lists(%p)\n", ct);
+	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+
+	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
+
+	/* Destroy all un-established, pending expectations */
+	remove_expectations(ct, 1);
+}
+
+static void
+destroy_conntrack(struct nf_conntrack *nfct)
+{
+	struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
+	struct ip_conntrack_protocol *proto;
+
+	DEBUGP("destroy_conntrack(%p)\n", ct);
+	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
+	IP_NF_ASSERT(!timer_pending(&ct->timeout));
+
+	/* To make sure we don't get any weird locking issues here:
+	 * destroy_conntrack() MUST NOT be called with a write lock
+	 * to ip_conntrack_lock!!! -HW */
+	proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+	if (proto && proto->destroy)
+		proto->destroy(ct);
+
+	if (ip_conntrack_destroyed)
+		ip_conntrack_destroyed(ct);
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	/* Delete us from our own list to prevent corruption later */
+	list_del(&ct->sibling_list);
+
+	/* Delete our master expectation */
+	if (ct->master) {
+		if (ct->master->expectant) {
+			/* can't call __unexpect_related here,
+			 * since it would screw up expect_list */
+			list_del(&ct->master->expected_list);
+			master = ct->master->expectant;
+		}
+		kfree(ct->master);
+	}
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	if (master)
+		ip_conntrack_put(master);
+
+	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
+	kmem_cache_free(ip_conntrack_cachep, ct);
+	atomic_dec(&ip_conntrack_count);
+}
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+	struct ip_conntrack *ct = (void *)ul_conntrack;
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	clean_from_lists(ct);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+	ip_conntrack_put(ct);
+}
+
+static inline int
+conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
+		    const struct ip_conntrack_tuple *tuple,
+		    const struct ip_conntrack *ignored_conntrack)
+{
+	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	return i->ctrack != ignored_conntrack
+		&& ip_ct_tuple_equal(tuple, &i->tuple);
+}
+
+static struct ip_conntrack_tuple_hash *
+__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
+		    const struct ip_conntrack *ignored_conntrack)
+{
+	struct ip_conntrack_tuple_hash *h;
+	unsigned int hash = hash_conntrack(tuple);
+
+	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	h = LIST_FIND(&ip_conntrack_hash[hash],
+		      conntrack_tuple_cmp,
+		      struct ip_conntrack_tuple_hash *,
+		      tuple, ignored_conntrack);
+	return h;
+}
+
+/* Find a connection corresponding to a tuple. */
+struct ip_conntrack_tuple_hash *
+ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
+		      const struct ip_conntrack *ignored_conntrack)
+{
+	struct ip_conntrack_tuple_hash *h;
+
+	READ_LOCK(&ip_conntrack_lock);
+	h = __ip_conntrack_find(tuple, ignored_conntrack);
+	if (h)
+		atomic_inc(&h->ctrack->ct_general.use);
+	READ_UNLOCK(&ip_conntrack_lock);
+
+	return h;
+}
+
+static inline struct ip_conntrack *
+__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
+{
+	struct ip_conntrack *ct
+		= (struct ip_conntrack *)nfct->master;
+
+	/* ctinfo is the index of the nfct inside the conntrack */
+	*ctinfo = nfct - ct->infos;
+	IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
+	return ct;
+}
+
+/* Return conntrack and conntrack_info given skb->nfct->master */
+struct ip_conntrack *
+ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
+{
+	if (skb->nfct)
+		return __ip_conntrack_get(skb->nfct, ctinfo);
+	return NULL;
+}
+
+/* Confirm a connection given skb->nfct; places it in hash table */
+int
+__ip_conntrack_confirm(struct nf_ct_info *nfct)
+{
+	unsigned int hash, repl_hash;
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+
+	ct = __ip_conntrack_get(nfct, &ctinfo);
+
+	/* ipt_REJECT uses ip_conntrack_attach to attach related
+	   ICMP/TCP RST packets in other direction. Actual packet
+	   which created connection will be IP_CT_NEW or for an
+	   expected connection, IP_CT_RELATED. */
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		return NF_ACCEPT;
+
+	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	/* We're not in hash table, and we refuse to set up related
+	   connections for unconfirmed conns. But packet copies and
+	   REJECT will give spurious warnings here. */
+	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
+
+	/* No external references means noone else could have
+	   confirmed us. */
+	IP_NF_ASSERT(!is_confirmed(ct));
+	DEBUGP("Confirming conntrack %p\n", ct);
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	/* See if there's one in the list already, including reverse:
+	   NAT could have grabbed it without realizing, since we're
+	   not in the hash. If there is, we lost race. */
+	if (!LIST_FIND(&ip_conntrack_hash[hash],
+		       conntrack_tuple_cmp,
+		       struct ip_conntrack_tuple_hash *,
+		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
+	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
+			  conntrack_tuple_cmp,
+			  struct ip_conntrack_tuple_hash *,
+			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
+		list_prepend(&ip_conntrack_hash[hash],
+			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+		list_prepend(&ip_conntrack_hash[repl_hash],
+			     &ct->tuplehash[IP_CT_DIR_REPLY]);
+		/* Timer relative to confirmation time, not original
+		   setting time, otherwise we'd get timer wrap in
+		   weird delay cases. */
+		ct->timeout.expires += jiffies;
+		add_timer(&ct->timeout);
+		atomic_inc(&ct->ct_general.use);
+		set_bit(IPS_CONFIRMED_BIT, &ct->status);
+		WRITE_UNLOCK(&ip_conntrack_lock);
+		return NF_ACCEPT;
+	}
+
+	WRITE_UNLOCK(&ip_conntrack_lock);
+	return NF_DROP;
+}
+
+/* Returns true if a connection correspondings to the tuple (required
+   for NAT). */
+int
+ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
+			 const struct ip_conntrack *ignored_conntrack)
+{
+	struct ip_conntrack_tuple_hash *h;
+
+	READ_LOCK(&ip_conntrack_lock);
+	h = __ip_conntrack_find(tuple, ignored_conntrack);
+	READ_UNLOCK(&ip_conntrack_lock);
+
+	return h != NULL;
+}
+
+/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
+struct ip_conntrack *
+icmp_error_track(struct sk_buff *skb,
+		 enum ip_conntrack_info *ctinfo,
+		 unsigned int hooknum)
+{
+	struct ip_conntrack_tuple innertuple, origtuple;
+	struct {
+		struct icmphdr icmp;
+		struct iphdr ip;
+	} inside;
+	struct ip_conntrack_protocol *innerproto;
+	struct ip_conntrack_tuple_hash *h;
+	int dataoff;
+
+	IP_NF_ASSERT(skb->nfct == NULL);
+
+	/* Not enough header? */
+	if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
+		return NULL;
+
+	if (inside.icmp.type != ICMP_DEST_UNREACH
+	    && inside.icmp.type != ICMP_SOURCE_QUENCH
+	    && inside.icmp.type != ICMP_TIME_EXCEEDED
+	    && inside.icmp.type != ICMP_PARAMETERPROB
+	    && inside.icmp.type != ICMP_REDIRECT)
+		return NULL;
+
+	/* Ignore ICMP's containing fragments (shouldn't happen) */
+	if (inside.ip.frag_off & htons(IP_OFFSET)) {
+		DEBUGP("icmp_error_track: fragment of proto %u\n",
+		       inside.ip.protocol);
+		return NULL;
+	}
+
+	innerproto = ip_ct_find_proto(inside.ip.protocol);
+	dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
+	/* Are they talking about one of our connections? */
+	if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
+		DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
+		return NULL;
+	}
+
+	/* Ordinarily, we'd expect the inverted tupleproto, but it's
+	   been preserved inside the ICMP. */
+	if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
+		DEBUGP("icmp_error_track: Can't invert tuple\n");
+		return NULL;
+	}
+
+	*ctinfo = IP_CT_RELATED;
+
+	h = ip_conntrack_find_get(&innertuple, NULL);
+	if (!h) {
+		/* Locally generated ICMPs will match inverted if they
+		   haven't been SNAT'ed yet */
+		/* FIXME: NAT code has to handle half-done double NAT --RR */
+		if (hooknum == NF_IP_LOCAL_OUT)
+			h = ip_conntrack_find_get(&origtuple, NULL);
+
+		if (!h) {
+			DEBUGP("icmp_error_track: no match\n");
+			return NULL;
+		}
+		/* Reverse direction from that found */
+		if (DIRECTION(h) != IP_CT_DIR_REPLY)
+			*ctinfo += IP_CT_IS_REPLY;
+	} else {
+		if (DIRECTION(h) == IP_CT_DIR_REPLY)
+			*ctinfo += IP_CT_IS_REPLY;
+	}
+
+	/* Update skb to refer to this connection */
+	skb->nfct = &h->ctrack->infos[*ctinfo];
+	return h->ctrack;
+}
+
+/* There's a small race here where we may free a just-assured
+   connection. Too bad: we're in trouble anyway. */
+static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
+{
+	return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
+}
+
+static int early_drop(struct list_head *chain)
+{
+	/* Traverse backwards: gives us oldest, which is roughly LRU */
+	struct ip_conntrack_tuple_hash *h;
+	int dropped = 0;
+
+	READ_LOCK(&ip_conntrack_lock);
+	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
+	if (h)
+		atomic_inc(&h->ctrack->ct_general.use);
+	READ_UNLOCK(&ip_conntrack_lock);
+
+	if (!h)
+		return dropped;
+
+	if (del_timer(&h->ctrack->timeout)) {
+		death_by_timeout((unsigned long)h->ctrack);
+		dropped = 1;
+	}
+	ip_conntrack_put(h->ctrack);
+	return dropped;
+}
+
+static inline int helper_cmp(const struct ip_conntrack_helper *i,
+			     const struct ip_conntrack_tuple *rtuple)
+{
+	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
+}
+
+struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+{
+	return LIST_FIND(&helpers, helper_cmp,
+			 struct ip_conntrack_helper *,
+			 tuple);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+   failed due to stress. Otherwise it really is unclassifiable. */
+static struct ip_conntrack_tuple_hash *
+init_conntrack(const struct ip_conntrack_tuple *tuple,
+	       struct ip_conntrack_protocol *protocol,
+	       struct sk_buff *skb)
+{
+	struct ip_conntrack *conntrack;
+	struct ip_conntrack_tuple repl_tuple;
+	size_t hash;
+	struct ip_conntrack_expect *expected;
+	int i;
+	static unsigned int drop_next;
+
+	if (!ip_conntrack_hash_rnd_initted) {
+		get_random_bytes(&ip_conntrack_hash_rnd, 4);
+		ip_conntrack_hash_rnd_initted = 1;
+	}
+
+	hash = hash_conntrack(tuple);
+
+	if (ip_conntrack_max &&
+	    atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+		/* Try dropping from random chain, or else from the
+		   chain about to put into (in case they're trying to
+		   bomb one hash chain). */
+		unsigned int next = (drop_next++)%ip_conntrack_htable_size;
+
+		if (!early_drop(&ip_conntrack_hash[next])
+		    && !early_drop(&ip_conntrack_hash[hash])) {
+			if (net_ratelimit())
+				printk(KERN_WARNING
+				       "ip_conntrack: table full, dropping"
+				       " packet.\n");
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
+	if (!invert_tuple(&repl_tuple, tuple, protocol)) {
+		DEBUGP("Can't invert tuple.\n");
+		return NULL;
+	}
+
+	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
+	if (!conntrack) {
+		DEBUGP("Can't allocate conntrack.\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	memset(conntrack, 0, sizeof(*conntrack));
+	atomic_set(&conntrack->ct_general.use, 1);
+	conntrack->ct_general.destroy = destroy_conntrack;
+	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
+	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
+	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
+	conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
+	for (i=0; i < IP_CT_NUMBER; i++)
+		conntrack->infos[i].master = &conntrack->ct_general;
+
+	if (!protocol->new(conntrack, skb)) {
+		kmem_cache_free(ip_conntrack_cachep, conntrack);
+		return NULL;
+	}
+	/* Don't set timer yet: wait for confirmation */
+	init_timer(&conntrack->timeout);
+	conntrack->timeout.data = (unsigned long)conntrack;
+	conntrack->timeout.function = death_by_timeout;
+
+	INIT_LIST_HEAD(&conntrack->sibling_list);
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	/* Need finding and deleting of expected ONLY if we win race */
+	READ_LOCK(&ip_conntrack_expect_tuple_lock);
+	expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
+			     struct ip_conntrack_expect *, tuple);
+	READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
+
+	/* If master is not in hash table yet (ie. packet hasn't left
+	   this machine yet), how can other end know about expected?
+	   Hence these are not the droids you are looking for (if
+	   master ct never got confirmed, we'd hold a reference to it
+	   and weird things would happen to future packets). */
+	if (expected && !is_confirmed(expected->expectant))
+		expected = NULL;
+
+	/* Look up the conntrack helper for master connections only */
+	if (!expected)
+		conntrack->helper = ip_ct_find_helper(&repl_tuple);
+
+	/* If the expectation is dying, then this is a loser. */
+	if (expected
+	    && expected->expectant->helper->timeout
+	    && ! del_timer(&expected->timeout))
+		expected = NULL;
+
+	if (expected) {
+		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
+			conntrack, expected);
+		/* Welcome, Mr. Bond. We've been expecting you... */
+		IP_NF_ASSERT(master_ct(conntrack));
+		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
+		conntrack->master = expected;
+		expected->sibling = conntrack;
+		LIST_DELETE(&ip_conntrack_expect_list, expected);
+		expected->expectant->expecting--;
+		nf_conntrack_get(&master_ct(conntrack)->infos[0]);
+	}
+	atomic_inc(&ip_conntrack_count);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	if (expected && expected->expectfn)
+		expected->expectfn(conntrack);
+	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
+}
+
+/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
+static inline struct ip_conntrack *
+resolve_normal_ct(struct sk_buff *skb,
+		  struct ip_conntrack_protocol *proto,
+		  int *set_reply,
+		  unsigned int hooknum,
+		  enum ip_conntrack_info *ctinfo)
+{
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack_tuple_hash *h;
+
+	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
+
+	if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
+		return NULL;
+
+	/* look for tuple match */
+	h = ip_conntrack_find_get(&tuple, NULL);
+	if (!h) {
+		h = init_conntrack(&tuple, proto, skb);
+		if (!h)
+			return NULL;
+		if (IS_ERR(h))
+			return (void *)h;
+	}
+
+	/* It exists; we have (non-exclusive) reference. */
+	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
+		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
+		/* Please set reply bit if this packet OK */
+		*set_reply = 1;
+	} else {
+		/* Once we've had two way comms, always ESTABLISHED. */
+		if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
+			DEBUGP("ip_conntrack_in: normal packet for %p\n",
+			       h->ctrack);
+			*ctinfo = IP_CT_ESTABLISHED;
+		} else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
+			DEBUGP("ip_conntrack_in: related packet for %p\n",
+			       h->ctrack);
+			*ctinfo = IP_CT_RELATED;
+		} else {
+			DEBUGP("ip_conntrack_in: new packet for %p\n",
+			       h->ctrack);
+			*ctinfo = IP_CT_NEW;
+		}
+		*set_reply = 0;
+	}
+	skb->nfct = &h->ctrack->infos[*ctinfo];
+	return h->ctrack;
+}
+
+/* Netfilter hook itself. */
+unsigned int ip_conntrack_in(unsigned int hooknum,
+			     struct sk_buff **pskb,
+			     const struct net_device *in,
+			     const struct net_device *out,
+			     int (*okfn)(struct sk_buff *))
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack_protocol *proto;
+	int set_reply;
+	int ret;
+
+	/* FIXME: Do this right please. --RR */
+	(*pskb)->nfcache |= NFC_UNKNOWN;
+
+/* Doesn't cover locally-generated broadcast, so not worth it. */
+#if 0
+	/* Ignore broadcast: no `connection'. */
+	if ((*pskb)->pkt_type == PACKET_BROADCAST) {
+		printk("Broadcast packet!\n");
+		return NF_ACCEPT;
+	} else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
+		   == htonl(0x000000FF)) {
+		printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
+		       NIPQUAD((*pskb)->nh.iph->saddr),
+		       NIPQUAD((*pskb)->nh.iph->daddr),
+		       (*pskb)->sk, (*pskb)->pkt_type);
+	}
+#endif
+
+	/* Previously seen (loopback)? Ignore. Do this before
+	   fragment check. */
+	if ((*pskb)->nfct)
+		return NF_ACCEPT;
+
+	/* Gather fragments. */
+	if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+		*pskb = ip_ct_gather_frags(*pskb);
+		if (!*pskb)
+			return NF_STOLEN;
+	}
+
+	proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
+
+	/* It may be an icmp error... */
+	if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
+	    && icmp_error_track(*pskb, &ctinfo, hooknum))
+		return NF_ACCEPT;
+
+	if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
+		/* Not valid part of a connection */
+		return NF_ACCEPT;
+
+	if (IS_ERR(ct))
+		/* Too stressed to deal. */
+		return NF_DROP;
+
+	IP_NF_ASSERT((*pskb)->nfct);
+
+	ret = proto->packet(ct, *pskb, ctinfo);
+	if (ret == -1) {
+		/* Invalid */
+		nf_conntrack_put((*pskb)->nfct);
+		(*pskb)->nfct = NULL;
+		return NF_ACCEPT;
+	}
+
+	if (ret != NF_DROP && ct->helper) {
+		ret = ct->helper->help(*pskb, ct, ctinfo);
+		if (ret == -1) {
+			/* Invalid */
+			nf_conntrack_put((*pskb)->nfct);
+			(*pskb)->nfct = NULL;
+			return NF_ACCEPT;
+		}
+	}
+	if (set_reply)
+		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+
+	return ret;
+}
+
+int invert_tuplepr(struct ip_conntrack_tuple *inverse,
+		   const struct ip_conntrack_tuple *orig)
+{
+	return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
+}
+
+static inline int resent_expect(const struct ip_conntrack_expect *i,
+				const struct ip_conntrack_tuple *tuple,
+				const struct ip_conntrack_tuple *mask)
+{
+	DEBUGP("resent_expect\n");
+	DEBUGP(" tuple: "); DUMP_TUPLE(&i->tuple);
+	DEBUGP("ct_tuple: "); DUMP_TUPLE(&i->ct_tuple);
+	DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
+	return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
+		 || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
+		&& ip_ct_tuple_equal(&i->mask, mask));
+}
+
+/* Would two expected things clash? */
+static inline int expect_clash(const struct ip_conntrack_expect *i,
+			       const struct ip_conntrack_tuple *tuple,
+			       const struct ip_conntrack_tuple *mask)
+{
+	/* Part covered by intersection of masks must be unequal,
+	   otherwise they clash */
+	struct ip_conntrack_tuple intersect_mask
+		= { { i->mask.src.ip & mask->src.ip,
+		      { i->mask.src.u.all & mask->src.u.all } },
+		    { i->mask.dst.ip & mask->dst.ip,
+		      { i->mask.dst.u.all & mask->dst.u.all },
+		      i->mask.dst.protonum & mask->dst.protonum } };
+
+	return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
+}
+
+inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
+{
+	WRITE_LOCK(&ip_conntrack_lock);
+	unexpect_related(expect);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+}
+
+static void expectation_timed_out(unsigned long ul_expect)
+{
+	struct ip_conntrack_expect *expect = (void *) ul_expect;
+
+	DEBUGP("expectation %p timed out\n", expect);
+	WRITE_LOCK(&ip_conntrack_lock);
+	__unexpect_related(expect);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+}
+
+/* Add a related connection. */
+int ip_conntrack_expect_related(struct ip_conntrack *related_to,
+				struct ip_conntrack_expect *expect)
+{
+	struct ip_conntrack_expect *old, *new;
+	int ret = 0;
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	/* Because of the write lock, no reader can walk the lists,
+	 * so there is no need to use the tuple lock too */
+
+	DEBUGP("ip_conntrack_expect_related %p\n", related_to);
+	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
+	DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
+
+	old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
+		        struct ip_conntrack_expect *, &expect->tuple,
+			&expect->mask);
+	if (old) {
+		/* Helper private data may contain offsets but no pointers
+		   pointing into the payload - otherwise we should have to copy
+		   the data filled out by the helper over the old one */
+		DEBUGP("expect_related: resent packet\n");
+		if (related_to->helper->timeout) {
+			if (!del_timer(&old->timeout)) {
+				/* expectation is dying. Fall through */
+				old = NULL;
+			} else {
+				old->timeout.expires = jiffies +
+					related_to->helper->timeout * HZ;
+				add_timer(&old->timeout);
+			}
+		}
+
+		if (old) {
+			WRITE_UNLOCK(&ip_conntrack_lock);
+			return -EEXIST;
+		}
+	} else if (related_to->helper->max_expected &&
+		   related_to->expecting >= related_to->helper->max_expected) {
+		struct list_head *cur_item;
+		/* old == NULL */
+		if (!(related_to->helper->flags &
+		      IP_CT_HELPER_F_REUSE_EXPECT)) {
+			WRITE_UNLOCK(&ip_conntrack_lock);
+			if (net_ratelimit())
+				printk(KERN_WARNING
+				       "ip_conntrack: max number of expected "
+				       "connections %i of %s reached for "
+				       "%u.%u.%u.%u->%u.%u.%u.%u\n",
+				       related_to->helper->max_expected,
+				       related_to->helper->name,
+				       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
+				       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
+			return -EPERM;
+		}
+		DEBUGP("ip_conntrack: max number of expected "
+		       "connections %i of %s reached for "
+		       "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
+		       related_to->helper->max_expected,
+		       related_to->helper->name,
+		       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
+		       NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
+
+		/* choose the the oldest expectation to evict */
+		list_for_each(cur_item, &related_to->sibling_list) {
+			struct ip_conntrack_expect *cur;
+
+			cur = list_entry(cur_item,
+					 struct ip_conntrack_expect,
+					 expected_list);
+			if (cur->sibling == NULL) {
+				old = cur;
+				break;
+			}
+		}
+
+		/* (!old) cannot happen, since related_to->expecting is the
+		 * number of unconfirmed expects */
+		IP_NF_ASSERT(old);
+
+		/* newnat14 does not reuse the real allocated memory
+		 * structures but rather unexpects the old and
+		 * allocates a new. unexpect_related will decrement
+		 * related_to->expecting.
+		 */
+		unexpect_related(old);
+		ret = -EPERM;
+	} else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
+			     struct ip_conntrack_expect *, &expect->tuple,
+			     &expect->mask)) {
+		WRITE_UNLOCK(&ip_conntrack_lock);
+		DEBUGP("expect_related: busy!\n");
+		return -EBUSY;
+	}
+
+	new = (struct ip_conntrack_expect *)
+	      kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
+	if (!new) {
+		WRITE_UNLOCK(&ip_conntrack_lock);
+		DEBUGP("expect_relaed: OOM allocating expect\n");
+		return -ENOMEM;
+	}
+
+	DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
+	memcpy(new, expect, sizeof(*expect));
+	new->expectant = related_to;
+	new->sibling = NULL;
+	atomic_set(&new->use, 1);
+
+	/* add to expected list for this connection */
+	list_add(&new->expected_list, &related_to->sibling_list);
+	/* add to global list of expectations */
+	list_prepend(&ip_conntrack_expect_list, &new->list);
+	/* add and start timer if required */
+	if (related_to->helper->timeout) {
+		init_timer(&new->timeout);
+		new->timeout.data = (unsigned long)new;
+		new->timeout.function = expectation_timed_out;
+		new->timeout.expires = jiffies +
+			related_to->helper->timeout * HZ;
+		add_timer(&new->timeout);
+	}
+	related_to->expecting++;
+
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	return ret;
+}
+
+/* Change tuple in an existing expectation */
+int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
+			       struct ip_conntrack_tuple *newtuple)
+{
+	int ret;
+
+	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
+
+	DEBUGP("change_expect:\n");
+	DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
+	DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask);
+	DEBUGP("newtuple: "); DUMP_TUPLE(newtuple);
+	if (expect->ct_tuple.dst.protonum == 0) {
+		/* Never seen before */
+		DEBUGP("change expect: never seen before\n");
+		if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
+		    && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
+				 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
+			/* Force NAT to find an unused tuple */
+			ret = -1;
+		} else {
+			memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
+			memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
+			ret = 0;
+		}
+	} else {
+		/* Resent packet */
+		DEBUGP("change expect: resent packet\n");
+		if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
+			ret = 0;
+		} else {
+			/* Force NAT to choose again the same port */
+			ret = -1;
+		}
+	}
+	WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
+
+	return ret;
+}
+
+/* Alter reply tuple (maybe alter helper). If it's already taken,
+   return 0 and don't do alteration. */
+int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
+			     const struct ip_conntrack_tuple *newreply)
+{
+	WRITE_LOCK(&ip_conntrack_lock);
+	if (__ip_conntrack_find(newreply, conntrack)) {
+		WRITE_UNLOCK(&ip_conntrack_lock);
+		return 0;
+	}
+	/* Should be unconfirmed, so not in hash table yet */
+	IP_NF_ASSERT(!is_confirmed(conntrack));
+
+	DEBUGP("Altering reply tuple of %p to ", conntrack);
+	DUMP_TUPLE(newreply);
+
+	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
+	if (!conntrack->master)
+		conntrack->helper = LIST_FIND(&helpers, helper_cmp,
+					      struct ip_conntrack_helper *,
+					      newreply);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	return 1;
+}
+
+int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
+{
+	WRITE_LOCK(&ip_conntrack_lock);
+	list_prepend(&helpers, me);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	return 0;
+}
+
+static inline int unhelp(struct ip_conntrack_tuple_hash *i,
+			 const struct ip_conntrack_helper *me)
+{
+	if (i->ctrack->helper == me) {
+		/* Get rid of any expected. */
+		remove_expectations(i->ctrack, 0);
+		/* And *then* set helper to NULL */
+		i->ctrack->helper = NULL;
+	}
+	return 0;
+}
+
+void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
+{
+	unsigned int i;
+
+	/* Need write lock here, to delete helper. */
+	WRITE_LOCK(&ip_conntrack_lock);
+	LIST_DELETE(&helpers, me);
+
+	/* Get rid of expecteds, set helpers to NULL. */
+	for (i = 0; i < ip_conntrack_htable_size; i++)
+		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
+			    struct ip_conntrack_tuple_hash *, me);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	/* Someone could be still looking at the helper in a bh. */
+	synchronize_net();
+}
+
+/* Refresh conntrack for this many jiffies. */
+void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
+{
+	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	/* If not in hash table, timer will not be active yet */
+	if (!is_confirmed(ct))
+		ct->timeout.expires = extra_jiffies;
+	else {
+		/* Need del_timer for race avoidance (may already be dying). */
+		if (del_timer(&ct->timeout)) {
+			ct->timeout.expires = jiffies + extra_jiffies;
+			add_timer(&ct->timeout);
+		}
+	}
+	WRITE_UNLOCK(&ip_conntrack_lock);
+}
1542+
1543+/* Returns new sk_buff, or NULL */
1544+struct sk_buff *
1545+ip_ct_gather_frags(struct sk_buff *skb)
1546+{
1547+ struct sock *sk = skb->sk;
1548+#ifdef CONFIG_NETFILTER_DEBUG
1549+ unsigned int olddebug = skb->nf_debug;
1550+#endif
1551+ if (sk) {
1552+ sock_hold(sk);
1553+ skb_orphan(skb);
1554+ }
1555+
1556+ local_bh_disable();
1557+ skb = ip_defrag(skb);
1558+ local_bh_enable();
1559+
1560+ if (!skb) {
1561+ if (sk)
1562+ sock_put(sk);
1563+ return skb;
1564+ }
1565+
1566+ if (sk) {
1567+ skb_set_owner_w(skb, sk);
1568+ sock_put(sk);
1569+ }
1570+
1571+ ip_send_check(skb->nh.iph);
1572+ skb->nfcache |= NFC_ALTERED;
1573+#ifdef CONFIG_NETFILTER_DEBUG
1574+ /* Packet path as if nothing had happened. */
1575+ skb->nf_debug = olddebug;
1576+#endif
1577+ return skb;
1578+}
1579+
1580+/* Used by ipt_REJECT. */
1581+static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
1582+{
1583+ struct ip_conntrack *ct;
1584+ enum ip_conntrack_info ctinfo;
1585+
1586+ ct = __ip_conntrack_get(nfct, &ctinfo);
1587+
1588+ /* This ICMP is in reverse direction to the packet which
1589+ caused it */
1590+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1591+ ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1592+ else
1593+ ctinfo = IP_CT_RELATED;
1594+
1595+ /* Attach new skbuff, and increment count */
1596+ nskb->nfct = &ct->infos[ctinfo];
1597+ atomic_inc(&ct->ct_general.use);
1598+}
1599+
1600+static inline int
1601+do_kill(const struct ip_conntrack_tuple_hash *i,
1602+ int (*kill)(const struct ip_conntrack *i, void *data),
1603+ void *data)
1604+{
1605+ return kill(i->ctrack, data);
1606+}
1607+
1608+/* Bring out ya dead! */
1609+static struct ip_conntrack_tuple_hash *
1610+get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
1611+ void *data)
1612+{
1613+ struct ip_conntrack_tuple_hash *h = NULL;
1614+ unsigned int i;
1615+
1616+ READ_LOCK(&ip_conntrack_lock);
1617+ for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
1618+ h = LIST_FIND(&ip_conntrack_hash[i], do_kill,
1619+ struct ip_conntrack_tuple_hash *, kill, data);
1620+ }
1621+ if (h)
1622+ atomic_inc(&h->ctrack->ct_general.use);
1623+ READ_UNLOCK(&ip_conntrack_lock);
1624+
1625+ return h;
1626+}
1627+
1628+void
1629+ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
1630+ void *data)
1631+{
1632+ struct ip_conntrack_tuple_hash *h;
1633+
1634+ /* This is order n^2, by the way. */
1635+ while ((h = get_next_corpse(kill, data)) != NULL) {
1636+ /* Time to push up daises... */
1637+ if (del_timer(&h->ctrack->timeout))
1638+ death_by_timeout((unsigned long)h->ctrack);
1639+ /* ... else the timer will get him soon. */
1640+
1641+ ip_conntrack_put(h->ctrack);
1642+ }
1643+}
1644+
1645+/* Fast function for those who don't want to parse /proc (and I don't
1646+ blame them). */
1647+/* Reversing the socket's dst/src point of view gives us the reply
1648+ mapping. */
1649+static int
1650+getorigdst(struct sock *sk, int optval, void *user, int *len)
1651+{
1652+ struct inet_opt *inet = inet_sk(sk);
1653+ struct ip_conntrack_tuple_hash *h;
1654+ struct ip_conntrack_tuple tuple;
1655+
1656+ IP_CT_TUPLE_U_BLANK(&tuple);
1657+ tuple.src.ip = inet->rcv_saddr;
1658+ tuple.src.u.tcp.port = inet->sport;
1659+ tuple.dst.ip = inet->daddr;
1660+ tuple.dst.u.tcp.port = inet->dport;
1661+ tuple.dst.protonum = IPPROTO_TCP;
1662+
1663+ /* We only do TCP at the moment: is there a better way? */
1664+ if (strcmp(sk->sk_prot->name, "TCP")) {
1665+ DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1666+ return -ENOPROTOOPT;
1667+ }
1668+
1669+ if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1670+ DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1671+ *len, sizeof(struct sockaddr_in));
1672+ return -EINVAL;
1673+ }
1674+
1675+ h = ip_conntrack_find_get(&tuple, NULL);
1676+ if (h) {
1677+ struct sockaddr_in sin;
1678+
1679+ sin.sin_family = AF_INET;
1680+ sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1681+ .tuple.dst.u.tcp.port;
1682+ sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1683+ .tuple.dst.ip;
1684+
1685+ DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1686+ NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1687+ ip_conntrack_put(h->ctrack);
1688+ if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1689+ return -EFAULT;
1690+ else
1691+ return 0;
1692+ }
1693+ DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1694+ NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1695+ NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1696+ return -ENOENT;
1697+}
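
/* A minimal userspace sketch (not part of the patch) of how the
 * SO_ORIGINAL_DST option served by getorigdst() above is consumed: a
 * transparent proxy that accepted a REDIRECTed TCP connection asks
 * conntrack for the pre-NAT destination. Function and variable names
 * here are illustrative only. */
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/netfilter_ipv4.h>	/* SO_ORIGINAL_DST */

int print_original_dst(int connfd)
{
	struct sockaddr_in orig;
	socklen_t len = sizeof(orig);

	/* Level SOL_IP, matching the PF_INET sockopt registered above. */
	if (getsockopt(connfd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) < 0) {
		perror("SO_ORIGINAL_DST");
		return -1;
	}
	printf("client really wanted %s:%u\n",
	       inet_ntoa(orig.sin_addr), ntohs(orig.sin_port));
	return 0;
}
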
1698+
1699+static struct nf_sockopt_ops so_getorigdst = {
1700+ .pf = PF_INET,
1701+ .get_optmin = SO_ORIGINAL_DST,
1702+ .get_optmax = SO_ORIGINAL_DST+1,
1703+ .get = &getorigdst,
1704+};
1705+
1706+static int kill_all(const struct ip_conntrack *i, void *data)
1707+{
1708+ return 1;
1709+}
1710+
1711+/* Mishearing the voices in his head, our hero wonders how he's
1712+ supposed to kill the mall. */
1713+void ip_conntrack_cleanup(void)
1714+{
1715+ ip_ct_attach = NULL;
1716+ /* This makes sure all current packets have passed through
1717+ netfilter framework. Roll on, two-stage module
1718+ delete... */
1719+ synchronize_net();
1720+
1721+ i_see_dead_people:
1722+ ip_ct_selective_cleanup(kill_all, NULL);
1723+ if (atomic_read(&ip_conntrack_count) != 0) {
1724+ schedule();
1725+ goto i_see_dead_people;
1726+ }
1727+
1728+ kmem_cache_destroy(ip_conntrack_cachep);
1729+ vfree(ip_conntrack_hash);
1730+ nf_unregister_sockopt(&so_getorigdst);
1731+}
1732+
1733+static int hashsize;
1734+MODULE_PARM(hashsize, "i");
1735+
1736+int __init ip_conntrack_init(void)
1737+{
1738+ unsigned int i;
1739+ int ret;
1740+
1741+ /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1742+ * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1743+ if (hashsize) {
1744+ ip_conntrack_htable_size = hashsize;
1745+ } else {
1746+ ip_conntrack_htable_size
1747+ = (((num_physpages << PAGE_SHIFT) / 16384)
1748+ / sizeof(struct list_head));
1749+ if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1750+ ip_conntrack_htable_size = 8192;
1751+ if (ip_conntrack_htable_size < 16)
1752+ ip_conntrack_htable_size = 16;
1753+ }
1754+ ip_conntrack_max = 8 * ip_conntrack_htable_size;
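
/* Worked example of the sizing rule above, as a standalone userspace
 * re-derivation (assumes an 8-byte struct list_head, as on i386):
 * buckets = (RAM / 16384) / sizeof(struct list_head), capped at 8192
 * above 1GB and floored at 16; ip_conntrack_max then defaults to
 * 8 * buckets. */
#include <stdio.h>

static unsigned int buckets_for(unsigned long long ram_bytes,
				unsigned int list_head_bytes)
{
	unsigned long long n = (ram_bytes / 16384) / list_head_bytes;

	if (ram_bytes > 1024ULL * 1024 * 1024)	/* > 1GB: cap */
		n = 8192;
	if (n < 16)				/* floor */
		n = 16;
	return (unsigned int)n;
}

int main(void)
{
	printf("32MB: %u buckets\n", buckets_for(32ULL << 20, 8));  /* 256 */
	printf("2GB:  %u buckets\n", buckets_for(2ULL << 30, 8));   /* 8192 */
	return 0;
}
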
1755+
1756+ printk("ip_conntrack version %s (%u buckets, %d max)"
1757+ " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1758+ ip_conntrack_htable_size, ip_conntrack_max,
1759+ sizeof(struct ip_conntrack));
1760+
1761+ ret = nf_register_sockopt(&so_getorigdst);
1762+ if (ret != 0) {
1763+ printk(KERN_ERR "Unable to register netfilter socket option\n");
1764+ return ret;
1765+ }
1766+
1767+ ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1768+ * ip_conntrack_htable_size);
1769+ if (!ip_conntrack_hash) {
1770+ printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1771+ goto err_unreg_sockopt;
1772+ }
1773+
1774+ ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1775+ sizeof(struct ip_conntrack), 0,
1776+ SLAB_HWCACHE_ALIGN, NULL, NULL);
1777+ if (!ip_conntrack_cachep) {
1778+ printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1779+ goto err_free_hash;
1780+ }
1781+ /* Don't NEED lock here, but good form anyway. */
1782+ WRITE_LOCK(&ip_conntrack_lock);
1783+ /* Sew in builtin protocols. */
1784+ list_append(&protocol_list, &ip_conntrack_protocol_tcp);
1785+ list_append(&protocol_list, &ip_conntrack_protocol_udp);
1786+ list_append(&protocol_list, &ip_conntrack_protocol_icmp);
1787+ WRITE_UNLOCK(&ip_conntrack_lock);
1788+
1789+ for (i = 0; i < ip_conntrack_htable_size; i++)
1790+ INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1791+
1792+ /* For use by ipt_REJECT */
1793+ ip_ct_attach = ip_conntrack_attach;
1794+ return ret;
1795+
1796+err_free_hash:
1797+ vfree(ip_conntrack_hash);
1798+err_unreg_sockopt:
1799+ nf_unregister_sockopt(&so_getorigdst);
1800+
1801+ return -ENOMEM;
1802+}
1803diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.rej linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.rej
1804--- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.rej 1970-01-01 01:00:00.000000000 +0100
1805+++ linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.rej 2003-12-17 14:02:02.000000000 +0100
1806@@ -0,0 +1,17 @@
1807+***************
1808+*** 142,147 ****
1809+ tuple->dst.ip = iph->daddr;
1810+ tuple->dst.protonum = iph->protocol;
1811+
1812+ ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
1813+ len - 4*iph->ihl,
1814+ tuple);
1815+--- 142,149 ----
1816+ tuple->dst.ip = iph->daddr;
1817+ tuple->dst.protonum = iph->protocol;
1818+
1819++ tuple->src.u.all = tuple->dst.u.all = 0;
1820++
1821+ ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
1822+ len - 4*iph->ihl,
1823+ tuple);
1824diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c
1825--- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c 2003-11-26 21:43:07.000000000 +0100
1826+++ linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c 2003-12-17 14:02:03.000000000 +0100
1827@@ -432,7 +432,7 @@
1828 *tuple = *orig_tuple;
1829 while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
1830 != NULL) {
1831- DEBUGP("Found best for "); DUMP_TUPLE(tuple);
1832+ DEBUGP("Found best for "); DUMP_TUPLE_RAW(tuple);
1833 /* 3) The per-protocol part of the manip is made to
1834 map into the range to make a unique tuple. */
1835
1836@@ -573,9 +573,9 @@
1837 HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
1838 conntrack);
1839 DEBUGP("Original: ");
1840- DUMP_TUPLE(&orig_tp);
1841+ DUMP_TUPLE_RAW(&orig_tp);
1842 DEBUGP("New: ");
1843- DUMP_TUPLE(&new_tuple);
1844+ DUMP_TUPLE_RAW(&new_tuple);
1845 #endif
1846
1847 /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
1848diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c.orig linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c.orig
1849--- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c.orig 1970-01-01 01:00:00.000000000 +0100
1850+++ linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c.orig 2003-11-26 21:43:07.000000000 +0100
1851@@ -0,0 +1,1030 @@
1852+/* NAT for netfilter; shared with compatibility layer. */
1853+
1854+/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
1855+ Public Licence. */
1856+#include <linux/module.h>
1857+#include <linux/types.h>
1858+#include <linux/timer.h>
1859+#include <linux/skbuff.h>
1860+#include <linux/netfilter_ipv4.h>
1861+#include <linux/vmalloc.h>
1862+#include <net/checksum.h>
1863+#include <net/icmp.h>
1864+#include <net/ip.h>
1865+#include <net/tcp.h> /* For tcp_prot in getorigdst */
1866+#include <linux/icmp.h>
1867+#include <linux/udp.h>
1868+
1869+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
1870+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
1871+
1872+#include <linux/netfilter_ipv4/ip_conntrack.h>
1873+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
1874+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
1875+#include <linux/netfilter_ipv4/ip_nat.h>
1876+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
1877+#include <linux/netfilter_ipv4/ip_nat_core.h>
1878+#include <linux/netfilter_ipv4/ip_nat_helper.h>
1879+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
1880+#include <linux/netfilter_ipv4/listhelp.h>
1881+
1882+#if 0
1883+#define DEBUGP printk
1884+#else
1885+#define DEBUGP(format, args...)
1886+#endif
1887+
1888+DECLARE_RWLOCK(ip_nat_lock);
1889+DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);
1890+
1891+/* Calculated at init based on memory size */
1892+static unsigned int ip_nat_htable_size;
1893+
1894+static struct list_head *bysource;
1895+static struct list_head *byipsproto;
1896+LIST_HEAD(protos);
1897+LIST_HEAD(helpers);
1898+
1899+extern struct ip_nat_protocol unknown_nat_protocol;
1900+
1901+/* We keep extra hashes for each conntrack, for fast searching. */
1902+static inline size_t
1903+hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
1904+{
1905+ /* Modified src and dst, to ensure we don't create two
1906+ identical streams. */
1907+ return (src + dst + proto) % ip_nat_htable_size;
1908+}
1909+
1910+static inline size_t
1911+hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
1912+{
1913+ /* Original src, to ensure we map it consistently if possible. */
1914+ return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
1915+}
1916+
1917+/* No one is using the conntrack by the time this is called. */
1918+static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
1919+{
1920+ struct ip_nat_info *info = &conn->nat.info;
1921+ unsigned int hs, hp;
1922+
1923+ if (!info->initialized)
1924+ return;
1925+
1926+ IP_NF_ASSERT(info->bysource.conntrack);
1927+ IP_NF_ASSERT(info->byipsproto.conntrack);
1928+
1929+ hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
1930+ conn->tuplehash[IP_CT_DIR_ORIGINAL]
1931+ .tuple.dst.protonum);
1932+
1933+ hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
1934+ conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
1935+ conn->tuplehash[IP_CT_DIR_REPLY]
1936+ .tuple.dst.protonum);
1937+
1938+ WRITE_LOCK(&ip_nat_lock);
1939+ LIST_DELETE(&bysource[hs], &info->bysource);
1940+ LIST_DELETE(&byipsproto[hp], &info->byipsproto);
1941+ WRITE_UNLOCK(&ip_nat_lock);
1942+}
1943+
1944+/* We do checksum mangling, so if they were wrong before they're still
1945+ * wrong. Also works for incomplete packets (eg. ICMP dest
1946+ * unreachables). */
1947+u_int16_t
1948+ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
1949+{
1950+ u_int32_t diffs[] = { oldvalinv, newval };
1951+ return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
1952+ oldcheck^0xFFFF));
1953+}
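
/* Standalone demonstration (userspace, RFC 1624 style; a simplified
 * stand-in for the kernel's csum_partial/csum_fold) that the trick
 * above works: to patch a checksum after replacing one 32-bit field,
 * sum {~old, new} onto the un-complemented old checksum. */
#include <stdio.h>
#include <stdint.h>

static uint16_t fold(uint32_t s)	/* fold carries into 16 bits */
{
	while (s >> 16)
		s = (s & 0xffff) + (s >> 16);
	return (uint16_t)s;
}

static uint16_t csum(const uint16_t *w, int n)	/* full checksum */
{
	uint32_t s = 0;
	while (n--)
		s += *w++;
	return (uint16_t)~fold(s);
}

static uint16_t cheat(uint32_t oldinv, uint32_t newv, uint16_t oldck)
{
	uint32_t s = (uint16_t)(oldck ^ 0xffff);   /* un-complement */
	s += oldinv >> 16;  s += oldinv & 0xffff;  /* halves of ~old */
	s += newv   >> 16;  s += newv   & 0xffff;  /* halves of new */
	return (uint16_t)~fold(s);
}

int main(void)
{
	uint16_t buf[4] = { 0x1234, 0x5678, 0x9abc, 0xdef0 };
	uint16_t old = csum(buf, 4);
	uint32_t oldval = 0x12345678, newval = 0xdeadbeef;

	buf[0] = newval >> 16;  buf[1] = newval & 0xffff;
	/* Both paths must agree: prints "e8b4 e8b4". */
	printf("%04x %04x\n", csum(buf, 4), cheat(~oldval, newval, old));
	return 0;
}
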
1954+
1955+static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
1956+{
1957+ return i->protonum == proto;
1958+}
1959+
1960+struct ip_nat_protocol *
1961+find_nat_proto(u_int16_t protonum)
1962+{
1963+ struct ip_nat_protocol *i;
1964+
1965+ MUST_BE_READ_LOCKED(&ip_nat_lock);
1966+ i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
1967+ if (!i)
1968+ i = &unknown_nat_protocol;
1969+ return i;
1970+}
1971+
1972+/* Is this tuple already taken? (not by us) */
1973+int
1974+ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
1975+ const struct ip_conntrack *ignored_conntrack)
1976+{
1977+ /* Conntrack doesn't keep track of outgoing tuples; only
1978+ incoming ones. NAT means they don't have a fixed mapping,
1979+ so we invert the tuple and look for the incoming reply.
1980+
1981+ We could keep a separate hash if this proves too slow. */
1982+ struct ip_conntrack_tuple reply;
1983+
1984+ invert_tuplepr(&reply, tuple);
1985+ return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
1986+}
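
/* The inversion trick above, on a toy tuple (userspace sketch; struct
 * toy_tuple is an illustrative stand-in for struct ip_conntrack_tuple):
 * to ask "is outgoing tuple T taken?", flip src and dst and probe for
 * the reply an existing connection would produce. */
#include <stdio.h>
#include <stdint.h>

struct toy_tuple {
	uint32_t src_ip, dst_ip;
	uint16_t src_port, dst_port;
};

static struct toy_tuple invert(const struct toy_tuple *t)
{
	struct toy_tuple r = { t->dst_ip, t->src_ip,
			       t->dst_port, t->src_port };
	return r;
}

int main(void)
{
	struct toy_tuple out = { 0xc0a80002, 0x08080808, 40000, 53 };
	struct toy_tuple rep = invert(&out);

	/* The reply of 192.168.0.2:40000 -> 8.8.8.8:53 is
	   8.8.8.8:53 -> 192.168.0.2:40000. */
	printf("reply %08x:%u -> %08x:%u\n",
	       rep.src_ip, rep.src_port, rep.dst_ip, rep.dst_port);
	return 0;
}
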
1987+
1988+/* Does the tuple + the source manip fall within the range mr? */
1989+static int
1990+in_range(const struct ip_conntrack_tuple *tuple,
1991+ const struct ip_conntrack_manip *manip,
1992+ const struct ip_nat_multi_range *mr)
1993+{
1994+ struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
1995+ unsigned int i;
1996+ struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
1997+
1998+ for (i = 0; i < mr->rangesize; i++) {
1999+ /* If we are allowed to map IPs, then we must be in the
2000+ range specified, otherwise we must be unchanged. */
2001+ if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
2002+ if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
2003+ || (ntohl(newtuple.src.ip)
2004+ > ntohl(mr->range[i].max_ip)))
2005+ continue;
2006+ } else {
2007+ if (newtuple.src.ip != tuple->src.ip)
2008+ continue;
2009+ }
2010+
2011+ if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
2012+ || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
2013+ &mr->range[i].min, &mr->range[i].max))
2014+ return 1;
2015+ }
2016+ return 0;
2017+}
2018+
2019+static inline int
2020+src_cmp(const struct ip_nat_hash *i,
2021+ const struct ip_conntrack_tuple *tuple,
2022+ const struct ip_nat_multi_range *mr)
2023+{
2024+ return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
2025+ == tuple->dst.protonum
2026+ && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
2027+ == tuple->src.ip
2028+ && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
2029+ == tuple->src.u.all
2030+ && in_range(tuple,
2031+ &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2032+ .tuple.src,
2033+ mr));
2034+}
2035+
2036+/* Only called for SRC manip */
2037+static struct ip_conntrack_manip *
2038+find_appropriate_src(const struct ip_conntrack_tuple *tuple,
2039+ const struct ip_nat_multi_range *mr)
2040+{
2041+ unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
2042+ struct ip_nat_hash *i;
2043+
2044+ MUST_BE_READ_LOCKED(&ip_nat_lock);
2045+ i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
2046+ if (i)
2047+ return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
2048+ else
2049+ return NULL;
2050+}
2051+
2052+#ifdef CONFIG_IP_NF_NAT_LOCAL
2053+/* If it's really a local destination manip, it may need to do a
2054+ source manip too. */
2055+static int
2056+do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
2057+{
2058+ struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
2059+ struct rtable *rt;
2060+
2061+ /* FIXME: IPTOS_TOS(iph->tos) --RR */
2062+ if (ip_route_output_key(&rt, &fl) != 0) {
2063+ DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
2064+ NIPQUAD(var_ip));
2065+ return 0;
2066+ }
2067+
2068+ *other_ipp = rt->rt_src;
2069+ ip_rt_put(rt);
2070+ return 1;
2071+}
2072+#endif
2073+
2074+/* Simple way to iterate through all. */
2075+static inline int fake_cmp(const struct ip_nat_hash *i,
2076+ u_int32_t src, u_int32_t dst, u_int16_t protonum,
2077+ unsigned int *score,
2078+ const struct ip_conntrack *conntrack)
2079+{
2080+ /* Compare backwards: we're dealing with OUTGOING tuples, and
2081+ inside the conntrack is the REPLY tuple. Don't count this
2082+ conntrack. */
2083+ if (i->conntrack != conntrack
2084+ && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
2085+ && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
2086+ && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
2087+ == protonum))
2088+ (*score)++;
2089+ return 0;
2090+}
2091+
2092+static inline unsigned int
2093+count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
2094+ const struct ip_conntrack *conntrack)
2095+{
2096+ unsigned int score = 0;
2097+ unsigned int h;
2098+
2099+ MUST_BE_READ_LOCKED(&ip_nat_lock);
2100+ h = hash_by_ipsproto(src, dst, protonum);
2101+ LIST_FIND(&byipsproto[h], fake_cmp, struct ip_nat_hash *,
2102+ src, dst, protonum, &score, conntrack);
2103+
2104+ return score;
2105+}
2106+
2107+/* For [FUTURE] fragmentation handling, we want the least-used
2108+ src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
2109+ if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
2110+ 1-65535, we don't do pro-rata allocation based on ports; we choose
2111+ the ip with the lowest src-ip/dst-ip/proto usage.
2112+
2113+ If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
2114+ range), we eliminate that and try again. This is not the most
2115+ efficient approach, but if you're worried about that, don't hand us
2116+ ranges you don't really have. */
2117+static struct ip_nat_range *
2118+find_best_ips_proto(struct ip_conntrack_tuple *tuple,
2119+ const struct ip_nat_multi_range *mr,
2120+ const struct ip_conntrack *conntrack,
2121+ unsigned int hooknum)
2122+{
2123+ unsigned int i;
2124+ struct {
2125+ const struct ip_nat_range *range;
2126+ unsigned int score;
2127+ struct ip_conntrack_tuple tuple;
2128+ } best = { NULL, 0xFFFFFFFF };
2129+ u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
2130+ static unsigned int randomness;
2131+
2132+ if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
2133+ var_ipp = &tuple->src.ip;
2134+ saved_ip = tuple->dst.ip;
2135+ other_ipp = &tuple->dst.ip;
2136+ } else {
2137+ var_ipp = &tuple->dst.ip;
2138+ saved_ip = tuple->src.ip;
2139+ other_ipp = &tuple->src.ip;
2140+ }
2141+ /* Don't do do_extra_mangle unless necessary (it overrides
2142+ explicit socket bindings, for example) */
2143+ orig_dstip = tuple->dst.ip;
2144+
2145+ IP_NF_ASSERT(mr->rangesize >= 1);
2146+ for (i = 0; i < mr->rangesize; i++) {
2147+ /* Host order */
2148+ u_int32_t minip, maxip, j;
2149+
2150+ /* Don't do ranges which are already eliminated. */
2151+ if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
2152+ continue;
2153+ }
2154+
2155+ if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
2156+ minip = ntohl(mr->range[i].min_ip);
2157+ maxip = ntohl(mr->range[i].max_ip);
2158+ } else
2159+ minip = maxip = ntohl(*var_ipp);
2160+
2161+ randomness++;
2162+ for (j = 0; j < maxip - minip + 1; j++) {
2163+ unsigned int score;
2164+
2165+ *var_ipp = htonl(minip + (randomness + j)
2166+ % (maxip - minip + 1));
2167+
2168+ /* Reset the other ip in case it was mangled by
2169+ * do_extra_mangle last time. */
2170+ *other_ipp = saved_ip;
2171+
2172+#ifdef CONFIG_IP_NF_NAT_LOCAL
2173+ if (hooknum == NF_IP_LOCAL_OUT
2174+ && *var_ipp != orig_dstip
2175+ && !do_extra_mangle(*var_ipp, other_ipp)) {
2176+ DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
2177+ i, NIPQUAD(*var_ipp));
2178+ /* Can't route? This whole range part is
2179+ * probably screwed, but keep trying
2180+ * anyway. */
2181+ continue;
2182+ }
2183+#endif
2184+
2185+ /* Count how many others map onto this. */
2186+ score = count_maps(tuple->src.ip, tuple->dst.ip,
2187+ tuple->dst.protonum, conntrack);
2188+ if (score < best.score) {
2189+ /* Optimization: doesn't get any better than
2190+ this. */
2191+ if (score == 0)
2192+ return (struct ip_nat_range *)
2193+ &mr->range[i];
2194+
2195+ best.score = score;
2196+ best.tuple = *tuple;
2197+ best.range = &mr->range[i];
2198+ }
2199+ }
2200+ }
2201+ *tuple = best.tuple;
2202+
2203+ /* Discard const. */
2204+ return (struct ip_nat_range *)best.range;
2205+}
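
/* The core of the selection loop above, reduced to a userspace sketch
 * (the usage callback stands in for count_maps(); names here are
 * illustrative): keep the least-used candidate and stop early on a
 * score of zero, mirroring the shortcut in find_best_ips_proto(). */
#include <stdio.h>
#include <stdint.h>
#include <limits.h>

static uint32_t pick_least_used(uint32_t minip, uint32_t maxip,
				unsigned int (*usage)(uint32_t))
{
	uint32_t ip = minip, best_ip = minip;
	unsigned int best = UINT_MAX;

	for (;;) {
		unsigned int score = usage(ip);
		if (score < best) {
			best = score;
			best_ip = ip;
			if (score == 0)		/* can't get better */
				break;
		}
		if (ip == maxip)
			break;
		ip++;
	}
	return best_ip;
}

static unsigned int fake_usage(uint32_t ip)
{
	return ip & 3;				/* toy usage counts */
}

int main(void)
{
	printf("%u\n", pick_least_used(1, 6, fake_usage)); /* prints 4 */
	return 0;
}
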
2206+
2207+/* Fast version doesn't iterate through hash chains, but only handles
2208+ the common case of a single IP address (null NAT, masquerade) */
2209+static struct ip_nat_range *
2210+find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
2211+ const struct ip_nat_multi_range *mr,
2212+ const struct ip_conntrack *conntrack,
2213+ unsigned int hooknum)
2214+{
2215+ if (mr->rangesize != 1
2216+ || (mr->range[0].flags & IP_NAT_RANGE_FULL)
2217+ || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
2218+ && mr->range[0].min_ip != mr->range[0].max_ip))
2219+ return find_best_ips_proto(tuple, mr, conntrack, hooknum);
2220+
2221+ if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
2222+ if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
2223+ tuple->src.ip = mr->range[0].min_ip;
2224+ else {
2225+ /* Only do extra mangle when required (breaks
2226+ socket binding) */
2227+#ifdef CONFIG_IP_NF_NAT_LOCAL
2228+ if (tuple->dst.ip != mr->range[0].min_ip
2229+ && hooknum == NF_IP_LOCAL_OUT
2230+ && !do_extra_mangle(mr->range[0].min_ip,
2231+ &tuple->src.ip))
2232+ return NULL;
2233+#endif
2234+ tuple->dst.ip = mr->range[0].min_ip;
2235+ }
2236+ }
2237+
2238+ /* Discard const. */
2239+ return (struct ip_nat_range *)&mr->range[0];
2240+}
2241+
2242+static int
2243+get_unique_tuple(struct ip_conntrack_tuple *tuple,
2244+ const struct ip_conntrack_tuple *orig_tuple,
2245+ const struct ip_nat_multi_range *mrr,
2246+ struct ip_conntrack *conntrack,
2247+ unsigned int hooknum)
2248+{
2249+ struct ip_nat_protocol *proto
2250+ = find_nat_proto(orig_tuple->dst.protonum);
2251+ struct ip_nat_range *rptr;
2252+ unsigned int i;
2253+ int ret;
2254+
2255+ /* We temporarily use flags for marking full parts, but we
2256+ always clean up afterwards */
2257+ struct ip_nat_multi_range *mr = (void *)mrr;
2258+
2259+ /* 1) If this srcip/proto/src-proto-part is currently mapped,
2260+ and that same mapping gives a unique tuple within the given
2261+ range, use that.
2262+
2263+ This is only required for source (ie. NAT/masq) mappings.
2264+ So far, we don't do local source mappings, so multiple
2265+ manips are not an issue. */
2266+ if (hooknum == NF_IP_POST_ROUTING) {
2267+ struct ip_conntrack_manip *manip;
2268+
2269+ manip = find_appropriate_src(orig_tuple, mr);
2270+ if (manip) {
2271+ /* Apply same source manipulation. */
2272+ *tuple = ((struct ip_conntrack_tuple)
2273+ { *manip, orig_tuple->dst });
2274+ DEBUGP("get_unique_tuple: Found current src map\n");
2275+ if (!ip_nat_used_tuple(tuple, conntrack))
2276+ return 1;
2277+ }
2278+ }
2279+
2280+ /* 2) Select the least-used IP/proto combination in the given
2281+ range.
2282+ */
2283+ *tuple = *orig_tuple;
2284+ while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
2285+ != NULL) {
2286+ DEBUGP("Found best for "); DUMP_TUPLE(tuple);
2287+ /* 3) The per-protocol part of the manip is made to
2288+ map into the range to make a unique tuple. */
2289+
2290+ /* Only bother mapping if it's not already in range
2291+ and unique */
2292+ if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
2293+ || proto->in_range(tuple, HOOK2MANIP(hooknum),
2294+ &rptr->min, &rptr->max))
2295+ && !ip_nat_used_tuple(tuple, conntrack)) {
2296+ ret = 1;
2297+ goto clear_fulls;
2298+ } else {
2299+ if (proto->unique_tuple(tuple, rptr,
2300+ HOOK2MANIP(hooknum),
2301+ conntrack)) {
2302+ /* Must be unique. */
2303+ IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
2304+ conntrack));
2305+ ret = 1;
2306+ goto clear_fulls;
2307+ } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
2308+ /* Try implicit source NAT; protocol
2309+ may be able to play with ports to
2310+ make it unique. */
2311+ struct ip_nat_range r
2312+ = { IP_NAT_RANGE_MAP_IPS,
2313+ tuple->src.ip, tuple->src.ip,
2314+ { 0 }, { 0 } };
2315+ DEBUGP("Trying implicit mapping\n");
2316+ if (proto->unique_tuple(tuple, &r,
2317+ IP_NAT_MANIP_SRC,
2318+ conntrack)) {
2319+ /* Must be unique. */
2320+ IP_NF_ASSERT(!ip_nat_used_tuple
2321+ (tuple, conntrack));
2322+ ret = 1;
2323+ goto clear_fulls;
2324+ }
2325+ }
2326+ DEBUGP("Protocol can't get unique tuple %u.\n",
2327+ hooknum);
2328+ }
2329+
2330+ /* Eliminate that from range, and try again. */
2331+ rptr->flags |= IP_NAT_RANGE_FULL;
2332+ *tuple = *orig_tuple;
2333+ }
2334+
2335+ ret = 0;
2336+
2337+ clear_fulls:
2338+ /* Clear full flags. */
2339+ IP_NF_ASSERT(mr->rangesize >= 1);
2340+ for (i = 0; i < mr->rangesize; i++)
2341+ mr->range[i].flags &= ~IP_NAT_RANGE_FULL;
2342+
2343+ return ret;
2344+}
2345+
2346+static inline int
2347+helper_cmp(const struct ip_nat_helper *helper,
2348+ const struct ip_conntrack_tuple *tuple)
2349+{
2350+ return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
2351+}
2352+
2353+/* Where to manip the reply packets (will be reverse manip). */
2354+static unsigned int opposite_hook[NF_IP_NUMHOOKS]
2355+= { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
2356+ [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
2357+#ifdef CONFIG_IP_NF_NAT_LOCAL
2358+ [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
2359+ [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
2360+#endif
2361+};
2362+
2363+unsigned int
2364+ip_nat_setup_info(struct ip_conntrack *conntrack,
2365+ const struct ip_nat_multi_range *mr,
2366+ unsigned int hooknum)
2367+{
2368+ struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
2369+ struct ip_conntrack_tuple orig_tp;
2370+ struct ip_nat_info *info = &conntrack->nat.info;
2371+ int in_hashes = info->initialized;
2372+
2373+ MUST_BE_WRITE_LOCKED(&ip_nat_lock);
2374+ IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
2375+ || hooknum == NF_IP_POST_ROUTING
2376+ || hooknum == NF_IP_LOCAL_OUT);
2377+ IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
2378+ IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));
2379+
2380+ /* What we've got will look like the inverse of the reply. Normally
2381+ this is what is in the conntrack, except for prior
2382+ manipulations (future optimization: if num_manips == 0,
2383+ orig_tp =
2384+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
2385+ invert_tuplepr(&orig_tp,
2386+ &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
2387+
2388+#if 0
2389+ {
2390+ unsigned int i;
2391+
2392+ DEBUGP("Hook %u (%s), ", hooknum,
2393+ HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
2394+ DUMP_TUPLE(&orig_tp);
2395+ DEBUGP("Range %p: ", mr);
2396+ for (i = 0; i < mr->rangesize; i++) {
2397+ DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
2398+ i,
2399+ (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
2400+ ? " MAP_IPS" : "",
2401+ (mr->range[i].flags
2402+ & IP_NAT_RANGE_PROTO_SPECIFIED)
2403+ ? " PROTO_SPECIFIED" : "",
2404+ (mr->range[i].flags & IP_NAT_RANGE_FULL)
2405+ ? " FULL" : "",
2406+ NIPQUAD(mr->range[i].min_ip),
2407+ NIPQUAD(mr->range[i].max_ip),
2408+ mr->range[i].min.all,
2409+ mr->range[i].max.all);
2410+ }
2411+ }
2412+#endif
2413+
2414+ do {
2415+ if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
2416+ hooknum)) {
2417+ DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
2418+ conntrack);
2419+ return NF_DROP;
2420+ }
2421+
2422+#if 0
2423+ DEBUGP("Hook %u (%s) %p\n", hooknum,
2424+ HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
2425+ conntrack);
2426+ DEBUGP("Original: ");
2427+ DUMP_TUPLE(&orig_tp);
2428+ DEBUGP("New: ");
2429+ DUMP_TUPLE(&new_tuple);
2430+#endif
2431+
2432+ /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
2433+ the original (A/B/C/D') and the mangled one (E/F/G/H').
2434+
2435+ We're only allowed to work with the SRC per-proto
2436+ part, so we create inverses of both to start, then
2437+ derive the other fields we need. */
2438+
2439+ /* Reply connection: simply invert the new tuple
2440+ (G/H/E/F') */
2441+ invert_tuplepr(&reply, &new_tuple);
2442+
2443+ /* Alter conntrack table so it recognizes replies.
2444+ If we lose this race (the reply tuple is now used), repeat. */
2445+ } while (!ip_conntrack_alter_reply(conntrack, &reply));
2446+
2447+ /* FIXME: We can simply use the existing conntrack reply tuple
2448+ here --RR */
2449+ /* Create inverse of original: C/D/A/B' */
2450+ invert_tuplepr(&inv_tuple, &orig_tp);
2451+
2452+ /* Has the source changed? */
2453+ if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
2454+ /* In this direction, a source manip. */
2455+ info->manips[info->num_manips++] =
2456+ ((struct ip_nat_info_manip)
2457+ { IP_CT_DIR_ORIGINAL, hooknum,
2458+ IP_NAT_MANIP_SRC, new_tuple.src });
2459+
2460+ IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
2461+
2462+ /* In the reverse direction, a destination manip. */
2463+ info->manips[info->num_manips++] =
2464+ ((struct ip_nat_info_manip)
2465+ { IP_CT_DIR_REPLY, opposite_hook[hooknum],
2466+ IP_NAT_MANIP_DST, orig_tp.src });
2467+ IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
2468+ }
2469+
2470+ /* Has destination changed? */
2471+ if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
2472+ /* In this direction, a destination manip */
2473+ info->manips[info->num_manips++] =
2474+ ((struct ip_nat_info_manip)
2475+ { IP_CT_DIR_ORIGINAL, hooknum,
2476+ IP_NAT_MANIP_DST, reply.src });
2477+
2478+ IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
2479+
2480+ /* In the reverse direction, a source manip. */
2481+ info->manips[info->num_manips++] =
2482+ ((struct ip_nat_info_manip)
2483+ { IP_CT_DIR_REPLY, opposite_hook[hooknum],
2484+ IP_NAT_MANIP_SRC, inv_tuple.src });
2485+ IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
2486+ }
2487+
2488+ /* If there's a helper, assign it, based on the new tuple. */
2489+ if (!conntrack->master)
2490+ info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
2491+ &reply);
2492+
2493+ /* It's done. */
2494+ info->initialized |= (1 << HOOK2MANIP(hooknum));
2495+
2496+ if (in_hashes) {
2497+ IP_NF_ASSERT(info->bysource.conntrack);
2498+ replace_in_hashes(conntrack, info);
2499+ } else {
2500+ place_in_hashes(conntrack, info);
2501+ }
2502+
2503+ return NF_ACCEPT;
2504+}
2505+
2506+void replace_in_hashes(struct ip_conntrack *conntrack,
2507+ struct ip_nat_info *info)
2508+{
2509+ /* Source has changed, so replace in hashes. */
2510+ unsigned int srchash
2511+ = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2512+ .tuple.src,
2513+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2514+ .tuple.dst.protonum);
2515+ /* We place the packet as seen OUTGOING in the byipsproto hash
2516+ (ie. reverse the dst and src of the reply packet). */
2517+ unsigned int ipsprotohash
2518+ = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
2519+ .tuple.dst.ip,
2520+ conntrack->tuplehash[IP_CT_DIR_REPLY]
2521+ .tuple.src.ip,
2522+ conntrack->tuplehash[IP_CT_DIR_REPLY]
2523+ .tuple.dst.protonum);
2524+
2525+ IP_NF_ASSERT(info->bysource.conntrack == conntrack);
2526+ MUST_BE_WRITE_LOCKED(&ip_nat_lock);
2527+
2528+ list_del(&info->bysource.list);
2529+ list_del(&info->byipsproto.list);
2530+
2531+ list_prepend(&bysource[srchash], &info->bysource);
2532+ list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
2533+}
2534+
2535+void place_in_hashes(struct ip_conntrack *conntrack,
2536+ struct ip_nat_info *info)
2537+{
2538+ unsigned int srchash
2539+ = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2540+ .tuple.src,
2541+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2542+ .tuple.dst.protonum);
2543+ /* We place the packet as seen OUTGOING in the byipsproto hash
2544+ (ie. reverse the dst and src of the reply packet). */
2545+ unsigned int ipsprotohash
2546+ = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
2547+ .tuple.dst.ip,
2548+ conntrack->tuplehash[IP_CT_DIR_REPLY]
2549+ .tuple.src.ip,
2550+ conntrack->tuplehash[IP_CT_DIR_REPLY]
2551+ .tuple.dst.protonum);
2552+
2553+ IP_NF_ASSERT(!info->bysource.conntrack);
2554+
2555+ MUST_BE_WRITE_LOCKED(&ip_nat_lock);
2556+ info->byipsproto.conntrack = conntrack;
2557+ info->bysource.conntrack = conntrack;
2558+
2559+ list_prepend(&bysource[srchash], &info->bysource);
2560+ list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
2561+}
2562+
2563+/* Returns true if succeeded. */
2564+static int
2565+manip_pkt(u_int16_t proto,
2566+ struct sk_buff **pskb,
2567+ unsigned int iphdroff,
2568+ const struct ip_conntrack_manip *manip,
2569+ enum ip_nat_manip_type maniptype)
2570+{
2571+ struct iphdr *iph;
2572+
2573+ (*pskb)->nfcache |= NFC_ALTERED;
2574+ if (!skb_ip_make_writable(pskb, iphdroff+sizeof(iph)))
2575+ return 0;
2576+
2577+ iph = (void *)(*pskb)->data + iphdroff;
2578+
2579+ /* Manipulate the protocol part. */
2580+ if (!find_nat_proto(proto)->manip_pkt(pskb,
2581+ iphdroff + iph->ihl*4,
2582+ manip, maniptype))
2583+ return 0;
2584+
2585+ iph = (void *)(*pskb)->data + iphdroff;
2586+
2587+ if (maniptype == IP_NAT_MANIP_SRC) {
2588+ iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
2589+ iph->check);
2590+ iph->saddr = manip->ip;
2591+ } else {
2592+ iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
2593+ iph->check);
2594+ iph->daddr = manip->ip;
2595+ }
2596+ return 1;
2597+}
2598+
2599+static inline int exp_for_packet(struct ip_conntrack_expect *exp,
2600+ struct sk_buff *skb)
2601+{
2602+ struct ip_conntrack_protocol *proto;
2603+ int ret = 1;
2604+
2605+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
2606+ proto = __ip_ct_find_proto(skb->nh.iph->protocol);
2607+ if (proto->exp_matches_pkt)
2608+ ret = proto->exp_matches_pkt(exp, skb);
2609+
2610+ return ret;
2611+}
2612+
2613+/* Do packet manipulations according to binding. */
2614+unsigned int
2615+do_bindings(struct ip_conntrack *ct,
2616+ enum ip_conntrack_info ctinfo,
2617+ struct ip_nat_info *info,
2618+ unsigned int hooknum,
2619+ struct sk_buff **pskb)
2620+{
2621+ unsigned int i;
2622+ struct ip_nat_helper *helper;
2623+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
2624+ int proto = (*pskb)->nh.iph->protocol;
2625+
2626+ /* Need nat lock to protect against modification, but neither
2627+ the conntrack (referenced) nor the helper (deleted with
2628+ synchronize_bh()) can vanish. */
2629+ READ_LOCK(&ip_nat_lock);
2630+ for (i = 0; i < info->num_manips; i++) {
2631+ if (info->manips[i].direction == dir
2632+ && info->manips[i].hooknum == hooknum) {
2633+ DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
2634+ *pskb,
2635+ info->manips[i].maniptype == IP_NAT_MANIP_SRC
2636+ ? "SRC" : "DST",
2637+ NIPQUAD(info->manips[i].manip.ip),
2638+ htons(info->manips[i].manip.u.all));
2639+ if (!manip_pkt(proto, pskb, 0,
2640+ &info->manips[i].manip,
2641+ info->manips[i].maniptype)) {
2642+ READ_UNLOCK(&ip_nat_lock);
2643+ return NF_DROP;
2644+ }
2645+ }
2646+ }
2647+ helper = info->helper;
2648+ READ_UNLOCK(&ip_nat_lock);
2649+
2650+ if (helper) {
2651+ struct ip_conntrack_expect *exp = NULL;
2652+ struct list_head *cur_item;
2653+ int ret = NF_ACCEPT;
2654+ int helper_called = 0;
2655+
2656+ DEBUGP("do_bindings: helper existing for (%p)\n", ct);
2657+
2658+ /* Always defragged for helpers */
2659+ IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
2660+ & htons(IP_MF|IP_OFFSET)));
2661+
2662+ /* Have to grab read lock before sibling_list traversal */
2663+ READ_LOCK(&ip_conntrack_lock);
2664+ list_for_each(cur_item, &ct->sibling_list) {
2665+ exp = list_entry(cur_item, struct ip_conntrack_expect,
2666+ expected_list);
2667+
2668+ /* if this expectation is already established, skip */
2669+ if (exp->sibling)
2670+ continue;
2671+
2672+ if (exp_for_packet(exp, *pskb)) {
2673+ /* FIXME: May be true multiple times in the
2674+ * case of UDP!! */
2675+ DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
2676+ ret = helper->help(ct, exp, info, ctinfo,
2677+ hooknum, pskb);
2678+ if (ret != NF_ACCEPT) {
2679+ READ_UNLOCK(&ip_conntrack_lock);
2680+ return ret;
2681+ }
2682+ helper_called = 1;
2683+ }
2684+ }
2685+ /* Helper might want to manip the packet even when there is no
2686+ * matching expectation for this packet */
2687+ if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
2688+ DEBUGP("calling nat helper for packet without expectation\n");
2689+ ret = helper->help(ct, NULL, info, ctinfo,
2690+ hooknum, pskb);
2691+ if (ret != NF_ACCEPT) {
2692+ READ_UNLOCK(&ip_conntrack_lock);
2693+ return ret;
2694+ }
2695+ }
2696+ READ_UNLOCK(&ip_conntrack_lock);
2697+
2698+ /* Adjust sequence number only once per packet
2699+ * (helper is called at all hooks) */
2700+ if (proto == IPPROTO_TCP
2701+ && (hooknum == NF_IP_POST_ROUTING
2702+ || hooknum == NF_IP_LOCAL_IN)) {
2703+ DEBUGP("ip_nat_core: adjusting sequence number\n");
2704+ /* future: put this in an l4-proto specific function,
2705+ * and call this function here. */
2706+ if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
2707+ ret = NF_DROP;
2708+ }
2709+
2710+ return ret;
2711+
2712+ } else
2713+ return NF_ACCEPT;
2714+
2715+ /* not reached */
2716+}
2717+
2718+int
2719+icmp_reply_translation(struct sk_buff **pskb,
2720+ struct ip_conntrack *conntrack,
2721+ unsigned int hooknum,
2722+ int dir)
2723+{
2724+ struct {
2725+ struct icmphdr icmp;
2726+ struct iphdr ip;
2727+ } *inside;
2728+ unsigned int i;
2729+ struct ip_nat_info *info = &conntrack->nat.info;
2730+ int hdrlen;
2731+
2732+ if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
2733+ return 0;
2734+ inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
2735+
2736+ /* We're actually going to mangle it beyond trivial checksum
2737+ adjustment, so make sure the current checksum is correct. */
2738+ if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
2739+ hdrlen = (*pskb)->nh.iph->ihl * 4;
2740+ if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
2741+ (*pskb)->len - hdrlen, 0)))
2742+ return 0;
2743+ }
2744+
2745+ /* Must be RELATED */
2746+ IP_NF_ASSERT((*pskb)->nfct
2747+ - (struct ip_conntrack *)(*pskb)->nfct->master
2748+ == IP_CT_RELATED
2749+ || (*pskb)->nfct
2750+ - (struct ip_conntrack *)(*pskb)->nfct->master
2751+ == IP_CT_RELATED+IP_CT_IS_REPLY);
2752+
2753+ /* Redirects on non-null nats must be dropped, else they'll
2754+ start talking to each other without our translation, and be
2755+ confused... --RR */
2756+ if (inside->icmp.type == ICMP_REDIRECT) {
2757+ /* Don't care about races here. */
2758+ if (info->initialized
2759+ != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
2760+ || info->num_manips != 0)
2761+ return 0;
2762+ }
2763+
2764+ DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
2765+ *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
2766+ /* Note: May not be from a NAT'd host, but probably safest to
2767+ do translation always as if it came from the host itself
2768+ (even though a "host unreachable" coming from the host
2769+ itself is a bit weird).
2770+
2771+ More explanation: some people use NAT for anonymizing.
2772+ Also, CERT recommends dropping all packets from private IP
2773+ addresses (although ICMP errors from internal links with
2774+ such addresses are not too uncommon, as Alan Cox points
2775+ out) */
2776+
2777+ READ_LOCK(&ip_nat_lock);
2778+ for (i = 0; i < info->num_manips; i++) {
2779+ DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
2780+ i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
2781+ "ORIG" : "REPLY", info->manips[i].hooknum);
2782+
2783+ if (info->manips[i].direction != dir)
2784+ continue;
2785+
2786+ /* Mapping the inner packet is just like mapping a normal
2787+ packet, except it was never src/dst reversed, so
2788+ where we would normally apply a dst manip, we apply
2789+ a src, and vice versa. */
2790+ if (info->manips[i].hooknum == hooknum) {
2791+ DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
2792+ info->manips[i].maniptype == IP_NAT_MANIP_SRC
2793+ ? "DST" : "SRC",
2794+ NIPQUAD(info->manips[i].manip.ip),
2795+ ntohs(info->manips[i].manip.u.udp.port));
2796+ if (!manip_pkt(inside->ip.protocol, pskb,
2797+ (*pskb)->nh.iph->ihl*4
2798+ + sizeof(inside->icmp),
2799+ &info->manips[i].manip,
2800+ !info->manips[i].maniptype))
2801+ goto unlock_fail;
2802+
2803+ /* Outer packet needs to have IP header NATed like
2804+ it's a reply. */
2805+
2806+ /* Use mapping to map outer packet: 0 gives no
2807+ per-proto mapping */
2808+ DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
2809+ info->manips[i].maniptype == IP_NAT_MANIP_SRC
2810+ ? "SRC" : "DST",
2811+ NIPQUAD(info->manips[i].manip.ip));
2812+ if (!manip_pkt(0, pskb, 0,
2813+ &info->manips[i].manip,
2814+ info->manips[i].maniptype))
2815+ goto unlock_fail;
2816+ }
2817+ }
2818+ READ_UNLOCK(&ip_nat_lock);
2819+
2820+ hdrlen = (*pskb)->nh.iph->ihl * 4;
2821+
2822+ inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
2823+
2824+ inside->icmp.checksum = 0;
2825+ inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
2826+ (*pskb)->len - hdrlen,
2827+ 0));
2828+ return 1;
2829+
2830+ unlock_fail:
2831+ READ_UNLOCK(&ip_nat_lock);
2832+ return 0;
2833+}
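
/* Why the `!info->manips[i].maniptype` above flips the manip for the
 * embedded packet: the two manip types are assumed to be 0 and 1
 * (IP_NAT_MANIP_SRC, IP_NAT_MANIP_DST in ip_nat.h), so logical NOT
 * swaps them - the inner packet was never direction-reversed, hence
 * SRC and DST manips trade places. Toy demonstration: */
#include <stdio.h>

enum toy_manip { TOY_MANIP_SRC = 0, TOY_MANIP_DST = 1 };

int main(void)
{
	/* prints "1 0": SRC maps to DST and vice versa */
	printf("%d %d\n", !TOY_MANIP_SRC, !TOY_MANIP_DST);
	return 0;
}
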
2834+
2835+int __init ip_nat_init(void)
2836+{
2837+ size_t i;
2838+
2839+ /* Leave them the same for the moment. */
2840+ ip_nat_htable_size = ip_conntrack_htable_size;
2841+
2842+ /* One vmalloc for both hash tables */
2843+ bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
2844+ if (!bysource) {
2845+ return -ENOMEM;
2846+ }
2847+ byipsproto = bysource + ip_nat_htable_size;
2848+
2849+ /* Sew in builtin protocols. */
2850+ WRITE_LOCK(&ip_nat_lock);
2851+ list_append(&protos, &ip_nat_protocol_tcp);
2852+ list_append(&protos, &ip_nat_protocol_udp);
2853+ list_append(&protos, &ip_nat_protocol_icmp);
2854+ WRITE_UNLOCK(&ip_nat_lock);
2855+
2856+ for (i = 0; i < ip_nat_htable_size; i++) {
2857+ INIT_LIST_HEAD(&bysource[i]);
2858+ INIT_LIST_HEAD(&byipsproto[i]);
2859+ }
2860+
2861+ /* FIXME: Man, this is a hack. <SIGH> */
2862+ IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
2863+ ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
2864+
2865+ return 0;
2866+}
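
/* The single-vmalloc layout above in isolation (userspace sketch with
 * malloc standing in for vmalloc): one block of 2*N list heads, with
 * the second table aliased at offset N, so a single free releases both. */
#include <stdio.h>
#include <stdlib.h>

struct toy_list_head { struct toy_list_head *next, *prev; };

int main(void)
{
	unsigned int i, n = 8;	/* stand-in for ip_nat_htable_size */
	struct toy_list_head *bysource, *byipsproto;

	bysource = malloc(sizeof(*bysource) * n * 2);
	if (!bysource)
		return 1;
	byipsproto = bysource + n;	/* second table, same block */

	for (i = 0; i < n; i++) {	/* INIT_LIST_HEAD equivalent */
		bysource[i].next = bysource[i].prev = &bysource[i];
		byipsproto[i].next = byipsproto[i].prev = &byipsproto[i];
	}
	printf("bysource=%p byipsproto=%p\n",
	       (void *)bysource, (void *)byipsproto);
	free(bysource);
	return 0;
}
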
2867+
2868+/* Clear NAT section of all conntracks, in case we're loaded again. */
2869+static int clean_nat(const struct ip_conntrack *i, void *data)
2870+{
2871+ memset((void *)&i->nat, 0, sizeof(i->nat));
2872+ return 0;
2873+}
2874+
2875+/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
2876+void ip_nat_cleanup(void)
2877+{
2878+ ip_ct_selective_cleanup(&clean_nat, NULL);
2879+ ip_conntrack_destroyed = NULL;
2880+ vfree(bysource);
2881+}