Source: git.pld-linux.org — packages/kernel.git (blob, rel 1.19)
File: 2.6.0-t11-pptp-conntrack-nat.patch
PPTP connection-tracking / NAT helper patch (GRE protocol support) for Linux 2.6.0-test11.
1 diff -Nur linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h
2 --- linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h    2003-11-26 21:44:58.000000000 +0100
3 +++ linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h        2003-12-17 14:02:02.000000000 +0100
4 @@ -14,7 +14,7 @@
5  union ip_conntrack_manip_proto
6  {
7         /* Add other protocols here. */
8 -       u_int16_t all;
9 +       u_int32_t all;
10  
11         struct {
12                 u_int16_t port;
13 @@ -25,6 +25,9 @@
14         struct {
15                 u_int16_t id;
16         } icmp;
17 +       struct {
18 +               u_int32_t key;
19 +       } gre;
20  };
21  
22  /* The manipulable part of the tuple. */
23 @@ -44,7 +47,7 @@
24                 u_int32_t ip;
25                 union {
26                         /* Add other protocols here. */
27 -                       u_int16_t all;
28 +                       u_int64_t all;
29  
30                         struct {
31                                 u_int16_t port;
32 @@ -55,6 +58,11 @@
33                         struct {
34                                 u_int8_t type, code;
35                         } icmp;
36 +                       struct {
37 +                               u_int16_t protocol;
38 +                               u_int8_t version;
39 +                               u_int32_t key;
40 +                       } gre;
41                 } u;
42  
43                 /* The protocol. */
44 @@ -80,10 +88,16 @@
45  #ifdef __KERNEL__
46  
47  #define DUMP_TUPLE(tp)                                         \
48 -DEBUGP("tuple %p: %u %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n",    \
49 +DEBUGP("tuple %p: %u %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n",      \
50         (tp), (tp)->dst.protonum,                               \
51 -       NIPQUAD((tp)->src.ip), ntohs((tp)->src.u.all),          \
52 -       NIPQUAD((tp)->dst.ip), ntohs((tp)->dst.u.all))
53 +       NIPQUAD((tp)->src.ip), ntohl((tp)->src.u.all),          \
54 +       NIPQUAD((tp)->dst.ip), ntohl((tp)->dst.u.all))
55 +
56 +#define DUMP_TUPLE_RAW(x)                                              \
57 +       DEBUGP("tuple %p: %u %u.%u.%u.%u:0x%08x -> %u.%u.%u.%u:0x%08x\n",\
58 +       (x), (x)->dst.protonum,                                         \
59 +       NIPQUAD((x)->src.ip), ntohl((x)->src.u.all),                    \
60 +       NIPQUAD((x)->dst.ip), ntohl((x)->dst.u.all))
61  
62  #define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
63  
64 diff -Nur linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig
65 --- linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig       1970-01-01 01:00:00.000000000 +0100
66 +++ linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig   2003-11-26 21:44:58.000000000 +0100
67 @@ -0,0 +1,139 @@
68 +#ifndef _IP_CONNTRACK_TUPLE_H
69 +#define _IP_CONNTRACK_TUPLE_H
70 +
71 +/* A `tuple' is a structure containing the information to uniquely
72 +  identify a connection.  ie. if two packets have the same tuple, they
73 +  are in the same connection; if not, they are not.
74 +
75 +  We divide the structure along "manipulatable" and
76 +  "non-manipulatable" lines, for the benefit of the NAT code.
77 +*/
78 +
79 +/* The protocol-specific manipulable parts of the tuple: always in
80 +   network order! */
81 +union ip_conntrack_manip_proto
82 +{
83 +       /* Add other protocols here. */
84 +       u_int16_t all;
85 +
86 +       struct {
87 +               u_int16_t port;
88 +       } tcp;
89 +       struct {
90 +               u_int16_t port;
91 +       } udp;
92 +       struct {
93 +               u_int16_t id;
94 +       } icmp;
95 +};
96 +
97 +/* The manipulable part of the tuple. */
98 +struct ip_conntrack_manip
99 +{
100 +       u_int32_t ip;
101 +       union ip_conntrack_manip_proto u;
102 +};
103 +
104 +/* This contains the information to distinguish a connection. */
105 +struct ip_conntrack_tuple
106 +{
107 +       struct ip_conntrack_manip src;
108 +
109 +       /* These are the parts of the tuple which are fixed. */
110 +       struct {
111 +               u_int32_t ip;
112 +               union {
113 +                       /* Add other protocols here. */
114 +                       u_int16_t all;
115 +
116 +                       struct {
117 +                               u_int16_t port;
118 +                       } tcp;
119 +                       struct {
120 +                               u_int16_t port;
121 +                       } udp;
122 +                       struct {
123 +                               u_int8_t type, code;
124 +                       } icmp;
125 +               } u;
126 +
127 +               /* The protocol. */
128 +               u_int16_t protonum;
129 +       } dst;
130 +};
131 +
132 +/* This is optimized opposed to a memset of the whole structure.  Everything we
133 + * really care about is the  source/destination unions */
134 +#define IP_CT_TUPLE_U_BLANK(tuple)                             \
135 +       do {                                                    \
136 +               (tuple)->src.u.all = 0;                         \
137 +               (tuple)->dst.u.all = 0;                         \
138 +       } while (0)
139 +
140 +enum ip_conntrack_dir
141 +{
142 +       IP_CT_DIR_ORIGINAL,
143 +       IP_CT_DIR_REPLY,
144 +       IP_CT_DIR_MAX
145 +};
146 +
147 +#ifdef __KERNEL__
148 +
149 +#define DUMP_TUPLE(tp)                                         \
150 +DEBUGP("tuple %p: %u %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n",    \
151 +       (tp), (tp)->dst.protonum,                               \
152 +       NIPQUAD((tp)->src.ip), ntohs((tp)->src.u.all),          \
153 +       NIPQUAD((tp)->dst.ip), ntohs((tp)->dst.u.all))
154 +
155 +#define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
156 +
157 +/* If we're the first tuple, it's the original dir. */
158 +#define DIRECTION(h) ((enum ip_conntrack_dir)(&(h)->ctrack->tuplehash[1] == (h)))
159 +
160 +/* Connections have two entries in the hash table: one for each way */
161 +struct ip_conntrack_tuple_hash
162 +{
163 +       struct list_head list;
164 +
165 +       struct ip_conntrack_tuple tuple;
166 +
167 +       /* this == &ctrack->tuplehash[DIRECTION(this)]. */
168 +       struct ip_conntrack *ctrack;
169 +};
170 +
171 +#endif /* __KERNEL__ */
172 +
173 +static inline int ip_ct_tuple_src_equal(const struct ip_conntrack_tuple *t1,
174 +                                       const struct ip_conntrack_tuple *t2)
175 +{
176 +       return t1->src.ip == t2->src.ip
177 +               && t1->src.u.all == t2->src.u.all;
178 +}
179 +
180 +static inline int ip_ct_tuple_dst_equal(const struct ip_conntrack_tuple *t1,
181 +                                       const struct ip_conntrack_tuple *t2)
182 +{
183 +       return t1->dst.ip == t2->dst.ip
184 +               && t1->dst.u.all == t2->dst.u.all
185 +               && t1->dst.protonum == t2->dst.protonum;
186 +}
187 +
188 +static inline int ip_ct_tuple_equal(const struct ip_conntrack_tuple *t1,
189 +                                   const struct ip_conntrack_tuple *t2)
190 +{
191 +       return ip_ct_tuple_src_equal(t1, t2) && ip_ct_tuple_dst_equal(t1, t2);
192 +}
193 +
194 +static inline int ip_ct_tuple_mask_cmp(const struct ip_conntrack_tuple *t,
195 +                                      const struct ip_conntrack_tuple *tuple,
196 +                                      const struct ip_conntrack_tuple *mask)
197 +{
198 +       return !(((t->src.ip ^ tuple->src.ip) & mask->src.ip)
199 +                || ((t->dst.ip ^ tuple->dst.ip) & mask->dst.ip)
200 +                || ((t->src.u.all ^ tuple->src.u.all) & mask->src.u.all)
201 +                || ((t->dst.u.all ^ tuple->dst.u.all) & mask->dst.u.all)
202 +                || ((t->dst.protonum ^ tuple->dst.protonum)
203 +                    & mask->dst.protonum));
204 +}
205 +
206 +#endif /* _IP_CONNTRACK_TUPLE_H */
207 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile linux-2.6.0-test11/net/ipv4/netfilter/Makefile
208 --- linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile  2003-11-26 21:43:25.000000000 +0100
209 +++ linux-2.6.0-test11/net/ipv4/netfilter/Makefile      2003-12-17 14:02:02.000000000 +0100
210 @@ -19,6 +19,15 @@
211  # connection tracking
212  obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
213  
214 +# connection tracking protocol helpers
215 +obj-$(CONFIG_IP_NF_CT_PROTO_GRE) += ip_conntrack_proto_gre.o
216 +ifdef CONFIG_IP_NF_CT_PROTO_GRE
217 +       export-objs += ip_conntrack_proto_gre.o
218 +endif
219 +
220 +# NAT protocol helpers
221 +obj-$(CONFIG_IP_NF_NAT_PROTO_GRE) += ip_nat_proto_gre.o
222 +
223  # connection tracking helpers
224  obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
225  obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
226 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile.orig linux-2.6.0-test11/net/ipv4/netfilter/Makefile.orig
227 --- linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile.orig     1970-01-01 01:00:00.000000000 +0100
228 +++ linux-2.6.0-test11/net/ipv4/netfilter/Makefile.orig 2003-11-26 21:43:25.000000000 +0100
229 @@ -0,0 +1,96 @@
230 +#
231 +# Makefile for the netfilter modules on top of IPv4.
232 +#
233 +
234 +# objects for the conntrack and NAT core (used by standalone and backw. compat)
235 +ip_nf_conntrack-objs   := ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
236 +ip_nf_nat-objs         := ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
237 +
238 +# objects for the standalone - connection tracking / NAT
239 +ip_conntrack-objs      := ip_conntrack_standalone.o $(ip_nf_conntrack-objs)
240 +iptable_nat-objs       := ip_nat_standalone.o ip_nat_rule.o $(ip_nf_nat-objs)
241 +
242 +# objects for backwards compatibility mode
243 +ip_nf_compat-objs      := ip_fw_compat.o ip_fw_compat_redir.o ip_fw_compat_masq.o $(ip_nf_conntrack-objs) $(ip_nf_nat-objs)
244 +
245 +ipfwadm-objs           := $(ip_nf_compat-objs) ipfwadm_core.o
246 +ipchains-objs          := $(ip_nf_compat-objs) ipchains_core.o
247 +
248 +# connection tracking
249 +obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
250 +
251 +# connection tracking helpers
252 +obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
253 +obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
254 +obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
255 +obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
256 +
257 +# NAT helpers 
258 +obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
259 +obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
260 +obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
261 +obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
262 +
263 +# generic IP tables 
264 +obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
265 +
266 +# the three instances of ip_tables
267 +obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
268 +obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
269 +obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
270 +
271 +# matches
272 +obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
273 +obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
274 +obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
275 +obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
276 +obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
277 +
278 +obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o
279 +obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o
280 +obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
281 +obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
282 +
283 +obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
284 +
285 +obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
286 +obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o
287 +obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o
288 +
289 +obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o
290 +
291 +obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
292 +obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
293 +obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
294 +obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
295 +
296 +obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
297 +
298 +# targets
299 +obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
300 +obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o
301 +obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
302 +obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o
303 +obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o
304 +obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
305 +obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
306 +obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
307 +obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
308 +obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o
309 +obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o
310 +obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
311 +obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
312 +obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
313 +
314 +# generic ARP tables
315 +obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
316 +obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
317 +
318 +# just filtering instance of ARP tables for now
319 +obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
320 +
321 +# backwards compatibility 
322 +obj-$(CONFIG_IP_NF_COMPAT_IPCHAINS) += ipchains.o
323 +obj-$(CONFIG_IP_NF_COMPAT_IPFWADM) += ipfwadm.o
324 +
325 +obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
326 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c
327 --- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c       2003-11-26 21:42:40.000000000 +0100
328 +++ linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c   2003-12-17 14:02:02.000000000 +0100
329 @@ -150,6 +150,8 @@
330         inverse->dst.ip = orig->src.ip;
331         inverse->dst.protonum = orig->dst.protonum;
332  
333 +       inverse->src.u.all = inverse->dst.u.all = 0;
334 +
335         return protocol->invert_tuple(inverse, orig);
336  }
337  
338 @@ -925,8 +927,8 @@
339          * so there is no need to use the tuple lock too */
340  
341         DEBUGP("ip_conntrack_expect_related %p\n", related_to);
342 -       DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
343 -       DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
344 +       DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
345 +       DEBUGP("mask:  "); DUMP_TUPLE_RAW(&expect->mask);
346  
347         old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
348                         struct ip_conntrack_expect *, &expect->tuple, 
349 @@ -1051,15 +1053,14 @@
350  
351         MUST_BE_READ_LOCKED(&ip_conntrack_lock);
352         WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
353 -
354         DEBUGP("change_expect:\n");
355 -       DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
356 -       DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
357 -       DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
358 +       DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
359 +       DEBUGP("exp mask:  "); DUMP_TUPLE_RAW(&expect->mask);
360 +       DEBUGP("newtuple:  "); DUMP_TUPLE_RAW(newtuple);
361         if (expect->ct_tuple.dst.protonum == 0) {
362                 /* Never seen before */
363                 DEBUGP("change expect: never seen before\n");
364 -               if (!ip_ct_tuple_equal(&expect->tuple, newtuple) 
365 +               if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
366                     && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
367                                  struct ip_conntrack_expect *, newtuple, &expect->mask)) {
368                         /* Force NAT to find an unused tuple */
369 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.orig linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.orig
370 --- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.orig  1970-01-01 01:00:00.000000000 +0100
371 +++ linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.orig      2003-11-26 21:42:40.000000000 +0100
372 @@ -0,0 +1,1430 @@
373 +/* Connection state tracking for netfilter.  This is separated from,
374 +   but required by, the NAT layer; it can also be used by an iptables
375 +   extension. */
376 +
377 +/* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
378 + * Public Licence. 
379 + *
380 + * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
381 + *     - new API and handling of conntrack/nat helpers
382 + *     - now capable of multiple expectations for one master
383 + * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
384 + *     - add usage/reference counts to ip_conntrack_expect
385 + *     - export ip_conntrack[_expect]_{find_get,put} functions
386 + * */
387 +
388 +#include <linux/config.h>
389 +#include <linux/types.h>
390 +#include <linux/icmp.h>
391 +#include <linux/ip.h>
392 +#include <linux/netfilter.h>
393 +#include <linux/netfilter_ipv4.h>
394 +#include <linux/module.h>
395 +#include <linux/skbuff.h>
396 +#include <linux/proc_fs.h>
397 +#include <linux/vmalloc.h>
398 +#include <net/checksum.h>
399 +#include <linux/stddef.h>
400 +#include <linux/sysctl.h>
401 +#include <linux/slab.h>
402 +#include <linux/random.h>
403 +#include <linux/jhash.h>
404 +/* For ERR_PTR().  Yeah, I know... --RR */
405 +#include <linux/fs.h>
406 +
407 +/* This rwlock protects the main hash table, protocol/helper/expected
408 +   registrations, conntrack timers*/
409 +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
410 +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
411 +
412 +#include <linux/netfilter_ipv4/ip_conntrack.h>
413 +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
414 +#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
415 +#include <linux/netfilter_ipv4/ip_conntrack_core.h>
416 +#include <linux/netfilter_ipv4/listhelp.h>
417 +
418 +#define IP_CONNTRACK_VERSION   "2.1"
419 +
420 +#if 0
421 +#define DEBUGP printk
422 +#else
423 +#define DEBUGP(format, args...)
424 +#endif
425 +
426 +DECLARE_RWLOCK(ip_conntrack_lock);
427 +DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
428 +
429 +void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
430 +LIST_HEAD(ip_conntrack_expect_list);
431 +LIST_HEAD(protocol_list);
432 +static LIST_HEAD(helpers);
433 +unsigned int ip_conntrack_htable_size = 0;
434 +int ip_conntrack_max;
435 +static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
436 +struct list_head *ip_conntrack_hash;
437 +static kmem_cache_t *ip_conntrack_cachep;
438 +
439 +extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
440 +
441 +static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
442 +                             u_int8_t protocol)
443 +{
444 +       return protocol == curr->proto;
445 +}
446 +
447 +struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
448 +{
449 +       struct ip_conntrack_protocol *p;
450 +
451 +       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
452 +       p = LIST_FIND(&protocol_list, proto_cmpfn,
453 +                     struct ip_conntrack_protocol *, protocol);
454 +       if (!p)
455 +               p = &ip_conntrack_generic_protocol;
456 +
457 +       return p;
458 +}
459 +
460 +struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
461 +{
462 +       struct ip_conntrack_protocol *p;
463 +
464 +       READ_LOCK(&ip_conntrack_lock);
465 +       p = __ip_ct_find_proto(protocol);
466 +       READ_UNLOCK(&ip_conntrack_lock);
467 +       return p;
468 +}
469 +
470 +inline void 
471 +ip_conntrack_put(struct ip_conntrack *ct)
472 +{
473 +       IP_NF_ASSERT(ct);
474 +       IP_NF_ASSERT(ct->infos[0].master);
475 +       /* nf_conntrack_put wants to go via an info struct, so feed it
476 +           one at random. */
477 +       nf_conntrack_put(&ct->infos[0]);
478 +}
479 +
480 +static int ip_conntrack_hash_rnd_initted;
481 +static unsigned int ip_conntrack_hash_rnd;
482 +
483 +static u_int32_t
484 +hash_conntrack(const struct ip_conntrack_tuple *tuple)
485 +{
486 +#if 0
487 +       dump_tuple(tuple);
488 +#endif
489 +       return (jhash_3words(tuple->src.ip,
490 +                            (tuple->dst.ip ^ tuple->dst.protonum),
491 +                            (tuple->src.u.all | (tuple->dst.u.all << 16)),
492 +                            ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
493 +}
494 +
495 +int
496 +get_tuple(const struct iphdr *iph,
497 +         const struct sk_buff *skb,
498 +         unsigned int dataoff,
499 +         struct ip_conntrack_tuple *tuple,
500 +         const struct ip_conntrack_protocol *protocol)
501 +{
502 +       /* Never happen */
503 +       if (iph->frag_off & htons(IP_OFFSET)) {
504 +               printk("ip_conntrack_core: Frag of proto %u.\n",
505 +                      iph->protocol);
506 +               return 0;
507 +       }
508 +
509 +       tuple->src.ip = iph->saddr;
510 +       tuple->dst.ip = iph->daddr;
511 +       tuple->dst.protonum = iph->protocol;
512 +
513 +       return protocol->pkt_to_tuple(skb, dataoff, tuple);
514 +}
515 +
516 +static int
517 +invert_tuple(struct ip_conntrack_tuple *inverse,
518 +            const struct ip_conntrack_tuple *orig,
519 +            const struct ip_conntrack_protocol *protocol)
520 +{
521 +       inverse->src.ip = orig->dst.ip;
522 +       inverse->dst.ip = orig->src.ip;
523 +       inverse->dst.protonum = orig->dst.protonum;
524 +
525 +       return protocol->invert_tuple(inverse, orig);
526 +}
527 +
528 +
529 +/* ip_conntrack_expect helper functions */
530 +
531 +/* Compare tuple parts depending on mask. */
532 +static inline int expect_cmp(const struct ip_conntrack_expect *i,
533 +                            const struct ip_conntrack_tuple *tuple)
534 +{
535 +       MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
536 +       return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
537 +}
538 +
539 +static void
540 +destroy_expect(struct ip_conntrack_expect *exp)
541 +{
542 +       DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
543 +       IP_NF_ASSERT(atomic_read(&exp->use));
544 +       IP_NF_ASSERT(!timer_pending(&exp->timeout));
545 +
546 +       kfree(exp);
547 +}
548 +
549 +
550 +inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
551 +{
552 +       IP_NF_ASSERT(exp);
553 +
554 +       if (atomic_dec_and_test(&exp->use)) {
555 +               /* usage count dropped to zero */
556 +               destroy_expect(exp);
557 +       }
558 +}
559 +
560 +static inline struct ip_conntrack_expect *
561 +__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
562 +{
563 +       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
564 +       MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
565 +       return LIST_FIND(&ip_conntrack_expect_list, expect_cmp, 
566 +                        struct ip_conntrack_expect *, tuple);
567 +}
568 +
569 +/* Find a expectation corresponding to a tuple. */
570 +struct ip_conntrack_expect *
571 +ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
572 +{
573 +       struct ip_conntrack_expect *exp;
574 +
575 +       READ_LOCK(&ip_conntrack_lock);
576 +       READ_LOCK(&ip_conntrack_expect_tuple_lock);
577 +       exp = __ip_ct_expect_find(tuple);
578 +       if (exp)
579 +               atomic_inc(&exp->use);
580 +       READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
581 +       READ_UNLOCK(&ip_conntrack_lock);
582 +
583 +       return exp;
584 +}
585 +
586 +/* remove one specific expectation from all lists and drop refcount,
587 + * does _NOT_ delete the timer. */
588 +static void __unexpect_related(struct ip_conntrack_expect *expect)
589 +{
590 +       DEBUGP("unexpect_related(%p)\n", expect);
591 +       MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
592 +
593 +       /* we're not allowed to unexpect a confirmed expectation! */
594 +       IP_NF_ASSERT(!expect->sibling);
595 +
596 +       /* delete from global and local lists */
597 +       list_del(&expect->list);
598 +       list_del(&expect->expected_list);
599 +
600 +       /* decrement expect-count of master conntrack */
601 +       if (expect->expectant)
602 +               expect->expectant->expecting--;
603 +
604 +       ip_conntrack_expect_put(expect);
605 +}
606 +
607 +/* remove one specific expecatation from all lists, drop refcount
608 + * and expire timer. 
609 + * This function can _NOT_ be called for confirmed expects! */
610 +static void unexpect_related(struct ip_conntrack_expect *expect)
611 +{
612 +       IP_NF_ASSERT(expect->expectant);
613 +       IP_NF_ASSERT(expect->expectant->helper);
614 +       /* if we are supposed to have a timer, but we can't delete
615 +        * it: race condition.  __unexpect_related will
616 +        * be calledd by timeout function */
617 +       if (expect->expectant->helper->timeout
618 +           && !del_timer(&expect->timeout))
619 +               return;
620 +
621 +       __unexpect_related(expect);
622 +}
623 +
624 +/* delete all unconfirmed expectations for this conntrack */
625 +static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
626 +{
627 +       struct list_head *exp_entry, *next;
628 +       struct ip_conntrack_expect *exp;
629 +
630 +       DEBUGP("remove_expectations(%p)\n", ct);
631 +
632 +       list_for_each_safe(exp_entry, next, &ct->sibling_list) {
633 +               exp = list_entry(exp_entry, struct ip_conntrack_expect,
634 +                                expected_list);
635 +
636 +               /* we skip established expectations, as we want to delete
637 +                * the un-established ones only */
638 +               if (exp->sibling) {
639 +                       DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
640 +                       if (drop_refcount) {
641 +                               /* Indicate that this expectations parent is dead */
642 +                               ip_conntrack_put(exp->expectant);
643 +                               exp->expectant = NULL;
644 +                       }
645 +                       continue;
646 +               }
647 +
648 +               IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
649 +               IP_NF_ASSERT(exp->expectant == ct);
650 +
651 +               /* delete expectation from global and private lists */
652 +               unexpect_related(exp);
653 +       }
654 +}
655 +
656 +static void
657 +clean_from_lists(struct ip_conntrack *ct)
658 +{
659 +       unsigned int ho, hr;
660 +       
661 +       DEBUGP("clean_from_lists(%p)\n", ct);
662 +       MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
663 +
664 +       ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
665 +       hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
666 +       LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
667 +       LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
668 +
669 +       /* Destroy all un-established, pending expectations */
670 +       remove_expectations(ct, 1);
671 +}
672 +
673 +static void
674 +destroy_conntrack(struct nf_conntrack *nfct)
675 +{
676 +       struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
677 +       struct ip_conntrack_protocol *proto;
678 +
679 +       DEBUGP("destroy_conntrack(%p)\n", ct);
680 +       IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
681 +       IP_NF_ASSERT(!timer_pending(&ct->timeout));
682 +
683 +       /* To make sure we don't get any weird locking issues here:
684 +        * destroy_conntrack() MUST NOT be called with a write lock
685 +        * to ip_conntrack_lock!!! -HW */
686 +       proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
687 +       if (proto && proto->destroy)
688 +               proto->destroy(ct);
689 +
690 +       if (ip_conntrack_destroyed)
691 +               ip_conntrack_destroyed(ct);
692 +
693 +       WRITE_LOCK(&ip_conntrack_lock);
694 +       /* Delete us from our own list to prevent corruption later */
695 +       list_del(&ct->sibling_list);
696 +
697 +       /* Delete our master expectation */
698 +       if (ct->master) {
699 +               if (ct->master->expectant) {
700 +                       /* can't call __unexpect_related here,
701 +                        * since it would screw up expect_list */
702 +                       list_del(&ct->master->expected_list);
703 +                       master = ct->master->expectant;
704 +               }
705 +               kfree(ct->master);
706 +       }
707 +       WRITE_UNLOCK(&ip_conntrack_lock);
708 +
709 +       if (master)
710 +               ip_conntrack_put(master);
711 +
712 +       DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
713 +       kmem_cache_free(ip_conntrack_cachep, ct);
714 +       atomic_dec(&ip_conntrack_count);
715 +}
716 +
717 +static void death_by_timeout(unsigned long ul_conntrack)
718 +{
719 +       struct ip_conntrack *ct = (void *)ul_conntrack;
720 +
721 +       WRITE_LOCK(&ip_conntrack_lock);
722 +       clean_from_lists(ct);
723 +       WRITE_UNLOCK(&ip_conntrack_lock);
724 +       ip_conntrack_put(ct);
725 +}
726 +
727 +static inline int
728 +conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
729 +                   const struct ip_conntrack_tuple *tuple,
730 +                   const struct ip_conntrack *ignored_conntrack)
731 +{
732 +       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
733 +       return i->ctrack != ignored_conntrack
734 +               && ip_ct_tuple_equal(tuple, &i->tuple);
735 +}
736 +
737 +static struct ip_conntrack_tuple_hash *
738 +__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
739 +                   const struct ip_conntrack *ignored_conntrack)
740 +{
741 +       struct ip_conntrack_tuple_hash *h;
742 +       unsigned int hash = hash_conntrack(tuple);
743 +
744 +       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
745 +       h = LIST_FIND(&ip_conntrack_hash[hash],
746 +                     conntrack_tuple_cmp,
747 +                     struct ip_conntrack_tuple_hash *,
748 +                     tuple, ignored_conntrack);
749 +       return h;
750 +}
751 +
752 +/* Find a connection corresponding to a tuple. */
753 +struct ip_conntrack_tuple_hash *
754 +ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
755 +                     const struct ip_conntrack *ignored_conntrack)
756 +{
757 +       struct ip_conntrack_tuple_hash *h;
758 +
759 +       READ_LOCK(&ip_conntrack_lock);
760 +       h = __ip_conntrack_find(tuple, ignored_conntrack);
761 +       if (h)
762 +               atomic_inc(&h->ctrack->ct_general.use);
763 +       READ_UNLOCK(&ip_conntrack_lock);
764 +
765 +       return h;
766 +}
767 +
768 +static inline struct ip_conntrack *
769 +__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
770 +{
771 +       struct ip_conntrack *ct
772 +               = (struct ip_conntrack *)nfct->master;
773 +
774 +       /* ctinfo is the index of the nfct inside the conntrack */
775 +       *ctinfo = nfct - ct->infos;
776 +       IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
777 +       return ct;
778 +}
779 +
780 +/* Return conntrack and conntrack_info given skb->nfct->master */
781 +struct ip_conntrack *
782 +ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
783 +{
784 +       if (skb->nfct) 
785 +               return __ip_conntrack_get(skb->nfct, ctinfo);
786 +       return NULL;
787 +}
788 +
789 +/* Confirm a connection given skb->nfct; places it in hash table */
790 +int
791 +__ip_conntrack_confirm(struct nf_ct_info *nfct)
792 +{
793 +       unsigned int hash, repl_hash;
794 +       struct ip_conntrack *ct;
795 +       enum ip_conntrack_info ctinfo;
796 +
797 +       ct = __ip_conntrack_get(nfct, &ctinfo);
798 +
799 +       /* ipt_REJECT uses ip_conntrack_attach to attach related
800 +          ICMP/TCP RST packets in other direction.  Actual packet
801 +          which created connection will be IP_CT_NEW or for an
802 +          expected connection, IP_CT_RELATED. */
803 +       if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
804 +               return NF_ACCEPT;
805 +
806 +       hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
807 +       repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
808 +
809 +       /* We're not in hash table, and we refuse to set up related
810 +          connections for unconfirmed conns.  But packet copies and
811 +          REJECT will give spurious warnings here. */
812 +       /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
813 +
814 +       /* No external references means no one else could have
815 +           confirmed us. */
816 +       IP_NF_ASSERT(!is_confirmed(ct));
817 +       DEBUGP("Confirming conntrack %p\n", ct);
818 +
819 +       WRITE_LOCK(&ip_conntrack_lock);
820 +       /* See if there's one in the list already, including reverse:
821 +           NAT could have grabbed it without realizing, since we're
822 +           not in the hash.  If there is, we lost race. */
823 +       if (!LIST_FIND(&ip_conntrack_hash[hash],
824 +                      conntrack_tuple_cmp,
825 +                      struct ip_conntrack_tuple_hash *,
826 +                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
827 +           && !LIST_FIND(&ip_conntrack_hash[repl_hash],
828 +                         conntrack_tuple_cmp,
829 +                         struct ip_conntrack_tuple_hash *,
830 +                         &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
831 +               list_prepend(&ip_conntrack_hash[hash],
832 +                            &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
833 +               list_prepend(&ip_conntrack_hash[repl_hash],
834 +                            &ct->tuplehash[IP_CT_DIR_REPLY]);
835 +               /* Timer relative to confirmation time, not original
836 +                  setting time, otherwise we'd get timer wrap in
837 +                  weird delay cases. */
838 +               ct->timeout.expires += jiffies;
839 +               add_timer(&ct->timeout);
840 +               atomic_inc(&ct->ct_general.use);
841 +               set_bit(IPS_CONFIRMED_BIT, &ct->status);
842 +               WRITE_UNLOCK(&ip_conntrack_lock);
843 +               return NF_ACCEPT;
844 +       }
845 +
846 +       WRITE_UNLOCK(&ip_conntrack_lock);
847 +       return NF_DROP;
848 +}
849 +
850 +/* Returns true if a connection corresponds to the tuple (required
851 +   for NAT). */
852 +int
853 +ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
854 +                        const struct ip_conntrack *ignored_conntrack)
855 +{
856 +       struct ip_conntrack_tuple_hash *h;
857 +
858 +       READ_LOCK(&ip_conntrack_lock);
859 +       h = __ip_conntrack_find(tuple, ignored_conntrack);
860 +       READ_UNLOCK(&ip_conntrack_lock);
861 +
862 +       return h != NULL;
863 +}
864 +
865 +/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
866 +struct ip_conntrack *
867 +icmp_error_track(struct sk_buff *skb,
868 +                enum ip_conntrack_info *ctinfo,
869 +                unsigned int hooknum)
870 +{
871 +       struct ip_conntrack_tuple innertuple, origtuple;
872 +       struct {
873 +               struct icmphdr icmp;
874 +               struct iphdr ip;
875 +       } inside;
876 +       struct ip_conntrack_protocol *innerproto;
877 +       struct ip_conntrack_tuple_hash *h;
878 +       int dataoff;
879 +
880 +       IP_NF_ASSERT(skb->nfct == NULL);
881 +
882 +       /* Not enough header? */
883 +       if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
884 +               return NULL;
885 +
886 +       if (inside.icmp.type != ICMP_DEST_UNREACH
887 +           && inside.icmp.type != ICMP_SOURCE_QUENCH
888 +           && inside.icmp.type != ICMP_TIME_EXCEEDED
889 +           && inside.icmp.type != ICMP_PARAMETERPROB
890 +           && inside.icmp.type != ICMP_REDIRECT)
891 +               return NULL;
892 +
893 +       /* Ignore ICMP's containing fragments (shouldn't happen) */
894 +       if (inside.ip.frag_off & htons(IP_OFFSET)) {
895 +               DEBUGP("icmp_error_track: fragment of proto %u\n",
896 +                      inside.ip.protocol);
897 +               return NULL;
898 +       }
899 +
900 +       innerproto = ip_ct_find_proto(inside.ip.protocol);
901 +       dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
902 +       /* Are they talking about one of our connections? */
903 +       if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
904 +               DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
905 +               return NULL;
906 +       }
907 +
908 +       /* Ordinarily, we'd expect the inverted tupleproto, but it's
909 +          been preserved inside the ICMP. */
910 +       if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
911 +               DEBUGP("icmp_error_track: Can't invert tuple\n");
912 +               return NULL;
913 +       }
914 +
915 +       *ctinfo = IP_CT_RELATED;
916 +
917 +       h = ip_conntrack_find_get(&innertuple, NULL);
918 +       if (!h) {
919 +               /* Locally generated ICMPs will match inverted if they
920 +                  haven't been SNAT'ed yet */
921 +               /* FIXME: NAT code has to handle half-done double NAT --RR */
922 +               if (hooknum == NF_IP_LOCAL_OUT)
923 +                       h = ip_conntrack_find_get(&origtuple, NULL);
924 +
925 +               if (!h) {
926 +                       DEBUGP("icmp_error_track: no match\n");
927 +                       return NULL;
928 +               }
929 +               /* Reverse direction from that found */
930 +               if (DIRECTION(h) != IP_CT_DIR_REPLY)
931 +                       *ctinfo += IP_CT_IS_REPLY;
932 +       } else {
933 +               if (DIRECTION(h) == IP_CT_DIR_REPLY)
934 +                       *ctinfo += IP_CT_IS_REPLY;
935 +       }
936 +
937 +       /* Update skb to refer to this connection */
938 +       skb->nfct = &h->ctrack->infos[*ctinfo];
939 +       return h->ctrack;
940 +}
941 +
942 +/* There's a small race here where we may free a just-assured
943 +   connection.  Too bad: we're in trouble anyway. */
944 +static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
945 +{
946 +       return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
947 +}
948 +
949 +static int early_drop(struct list_head *chain)
950 +{
951 +       /* Traverse backwards: gives us oldest, which is roughly LRU */
952 +       struct ip_conntrack_tuple_hash *h;
953 +       int dropped = 0;
954 +
955 +       READ_LOCK(&ip_conntrack_lock);
956 +       h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
957 +       if (h)
958 +               atomic_inc(&h->ctrack->ct_general.use);
959 +       READ_UNLOCK(&ip_conntrack_lock);
960 +
961 +       if (!h)
962 +               return dropped;
963 +
964 +       if (del_timer(&h->ctrack->timeout)) {
965 +               death_by_timeout((unsigned long)h->ctrack);
966 +               dropped = 1;
967 +       }
968 +       ip_conntrack_put(h->ctrack);
969 +       return dropped;
970 +}
971 +
972 +static inline int helper_cmp(const struct ip_conntrack_helper *i,
973 +                            const struct ip_conntrack_tuple *rtuple)
974 +{
975 +       return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
976 +}
977 +
978 +struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
979 +{
980 +       return LIST_FIND(&helpers, helper_cmp,
981 +                        struct ip_conntrack_helper *,
982 +                        tuple);
983 +}
984 +
985 +/* Allocate a new conntrack: we return -ENOMEM if classification
986 +   failed due to stress.  Otherwise it really is unclassifiable. */
987 +static struct ip_conntrack_tuple_hash *
988 +init_conntrack(const struct ip_conntrack_tuple *tuple,
989 +              struct ip_conntrack_protocol *protocol,
990 +              struct sk_buff *skb)
991 +{
992 +       struct ip_conntrack *conntrack;
993 +       struct ip_conntrack_tuple repl_tuple;
994 +       size_t hash;
995 +       struct ip_conntrack_expect *expected;
996 +       int i;
997 +       static unsigned int drop_next;
998 +
999 +       if (!ip_conntrack_hash_rnd_initted) {
1000 +               get_random_bytes(&ip_conntrack_hash_rnd, 4);
1001 +               ip_conntrack_hash_rnd_initted = 1;
1002 +       }
1003 +
1004 +       hash = hash_conntrack(tuple);
1005 +
1006 +       if (ip_conntrack_max &&
1007 +           atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
1008 +               /* Try dropping from random chain, or else from the
1009 +                   chain about to put into (in case they're trying to
1010 +                   bomb one hash chain). */
1011 +               unsigned int next = (drop_next++)%ip_conntrack_htable_size;
1012 +
1013 +               if (!early_drop(&ip_conntrack_hash[next])
1014 +                   && !early_drop(&ip_conntrack_hash[hash])) {
1015 +                       if (net_ratelimit())
1016 +                               printk(KERN_WARNING
1017 +                                      "ip_conntrack: table full, dropping"
1018 +                                      " packet.\n");
1019 +                       return ERR_PTR(-ENOMEM);
1020 +               }
1021 +       }
1022 +
1023 +       if (!invert_tuple(&repl_tuple, tuple, protocol)) {
1024 +               DEBUGP("Can't invert tuple.\n");
1025 +               return NULL;
1026 +       }
1027 +
1028 +       conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
1029 +       if (!conntrack) {
1030 +               DEBUGP("Can't allocate conntrack.\n");
1031 +               return ERR_PTR(-ENOMEM);
1032 +       }
1033 +
1034 +       memset(conntrack, 0, sizeof(*conntrack));
1035 +       atomic_set(&conntrack->ct_general.use, 1);
1036 +       conntrack->ct_general.destroy = destroy_conntrack;
1037 +       conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
1038 +       conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
1039 +       conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
1040 +       conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
1041 +       for (i=0; i < IP_CT_NUMBER; i++)
1042 +               conntrack->infos[i].master = &conntrack->ct_general;
1043 +
1044 +       if (!protocol->new(conntrack, skb)) {
1045 +               kmem_cache_free(ip_conntrack_cachep, conntrack);
1046 +               return NULL;
1047 +       }
1048 +       /* Don't set timer yet: wait for confirmation */
1049 +       init_timer(&conntrack->timeout);
1050 +       conntrack->timeout.data = (unsigned long)conntrack;
1051 +       conntrack->timeout.function = death_by_timeout;
1052 +
1053 +       INIT_LIST_HEAD(&conntrack->sibling_list);
1054 +
1055 +       WRITE_LOCK(&ip_conntrack_lock);
1056 +       /* Need finding and deleting of expected ONLY if we win race */
1057 +       READ_LOCK(&ip_conntrack_expect_tuple_lock);
1058 +       expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
1059 +                            struct ip_conntrack_expect *, tuple);
1060 +       READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
1061 +
1062 +       /* If master is not in hash table yet (ie. packet hasn't left
1063 +          this machine yet), how can other end know about expected?
1064 +          Hence these are not the droids you are looking for (if
1065 +          master ct never got confirmed, we'd hold a reference to it
1066 +          and weird things would happen to future packets). */
1067 +       if (expected && !is_confirmed(expected->expectant))
1068 +               expected = NULL;
1069 +
1070 +       /* Look up the conntrack helper for master connections only */
1071 +       if (!expected)
1072 +               conntrack->helper = ip_ct_find_helper(&repl_tuple);
1073 +
1074 +       /* If the expectation is dying, then this is a loser. */
1075 +       if (expected
1076 +           && expected->expectant->helper->timeout
1077 +           && ! del_timer(&expected->timeout))
1078 +               expected = NULL;
1079 +
1080 +       if (expected) {
1081 +               DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
1082 +                       conntrack, expected);
1083 +               /* Welcome, Mr. Bond.  We've been expecting you... */
1084 +               IP_NF_ASSERT(master_ct(conntrack));
1085 +               __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
1086 +               conntrack->master = expected;
1087 +               expected->sibling = conntrack;
1088 +               LIST_DELETE(&ip_conntrack_expect_list, expected);
1089 +               expected->expectant->expecting--;
1090 +               nf_conntrack_get(&master_ct(conntrack)->infos[0]);
1091 +       }
1092 +       atomic_inc(&ip_conntrack_count);
1093 +       WRITE_UNLOCK(&ip_conntrack_lock);
1094 +
1095 +       if (expected && expected->expectfn)
1096 +               expected->expectfn(conntrack);
1097 +       return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
1098 +}
1099 +
1100 +/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1101 +static inline struct ip_conntrack *
1102 +resolve_normal_ct(struct sk_buff *skb,
1103 +                 struct ip_conntrack_protocol *proto,
1104 +                 int *set_reply,
1105 +                 unsigned int hooknum,
1106 +                 enum ip_conntrack_info *ctinfo)
1107 +{
1108 +       struct ip_conntrack_tuple tuple;
1109 +       struct ip_conntrack_tuple_hash *h;
1110 +
1111 +       IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
1112 +
1113 +       if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
1114 +               return NULL;
1115 +
1116 +       /* look for tuple match */
1117 +       h = ip_conntrack_find_get(&tuple, NULL);
1118 +       if (!h) {
1119 +               h = init_conntrack(&tuple, proto, skb);
1120 +               if (!h)
1121 +                       return NULL;
1122 +               if (IS_ERR(h))
1123 +                       return (void *)h;
1124 +       }
1125 +
1126 +       /* It exists; we have (non-exclusive) reference. */
1127 +       if (DIRECTION(h) == IP_CT_DIR_REPLY) {
1128 +               *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1129 +               /* Please set reply bit if this packet OK */
1130 +               *set_reply = 1;
1131 +       } else {
1132 +               /* Once we've had two way comms, always ESTABLISHED. */
1133 +               if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
1134 +                       DEBUGP("ip_conntrack_in: normal packet for %p\n",
1135 +                              h->ctrack);
1136 +                       *ctinfo = IP_CT_ESTABLISHED;
1137 +               } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
1138 +                       DEBUGP("ip_conntrack_in: related packet for %p\n",
1139 +                              h->ctrack);
1140 +                       *ctinfo = IP_CT_RELATED;
1141 +               } else {
1142 +                       DEBUGP("ip_conntrack_in: new packet for %p\n",
1143 +                              h->ctrack);
1144 +                       *ctinfo = IP_CT_NEW;
1145 +               }
1146 +               *set_reply = 0;
1147 +       }
1148 +       skb->nfct = &h->ctrack->infos[*ctinfo];
1149 +       return h->ctrack;
1150 +}
1151 +
1152 +/* Netfilter hook itself. */
1153 +unsigned int ip_conntrack_in(unsigned int hooknum,
1154 +                            struct sk_buff **pskb,
1155 +                            const struct net_device *in,
1156 +                            const struct net_device *out,
1157 +                            int (*okfn)(struct sk_buff *))
1158 +{
1159 +       struct ip_conntrack *ct;
1160 +       enum ip_conntrack_info ctinfo;
1161 +       struct ip_conntrack_protocol *proto;
1162 +       int set_reply;
1163 +       int ret;
1164 +
1165 +       /* FIXME: Do this right please. --RR */
1166 +       (*pskb)->nfcache |= NFC_UNKNOWN;
1167 +
1168 +/* Doesn't cover locally-generated broadcast, so not worth it. */
1169 +#if 0
1170 +       /* Ignore broadcast: no `connection'. */
1171 +       if ((*pskb)->pkt_type == PACKET_BROADCAST) {
1172 +               printk("Broadcast packet!\n");
1173 +               return NF_ACCEPT;
1174 +       } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
1175 +                  == htonl(0x000000FF)) {
1176 +               printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
1177 +                      NIPQUAD((*pskb)->nh.iph->saddr),
1178 +                      NIPQUAD((*pskb)->nh.iph->daddr),
1179 +                      (*pskb)->sk, (*pskb)->pkt_type);
1180 +       }
1181 +#endif
1182 +
1183 +       /* Previously seen (loopback)?  Ignore.  Do this before
1184 +           fragment check. */
1185 +       if ((*pskb)->nfct)
1186 +               return NF_ACCEPT;
1187 +
1188 +       /* Gather fragments. */
1189 +       if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
1190 +               *pskb = ip_ct_gather_frags(*pskb);
1191 +               if (!*pskb)
1192 +                       return NF_STOLEN;
1193 +       }
1194 +
1195 +       proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
1196 +
1197 +       /* It may be an icmp error... */
1198 +       if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP 
1199 +           && icmp_error_track(*pskb, &ctinfo, hooknum))
1200 +               return NF_ACCEPT;
1201 +
1202 +       if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
1203 +               /* Not valid part of a connection */
1204 +               return NF_ACCEPT;
1205 +
1206 +       if (IS_ERR(ct))
1207 +               /* Too stressed to deal. */
1208 +               return NF_DROP;
1209 +
1210 +       IP_NF_ASSERT((*pskb)->nfct);
1211 +
1212 +       ret = proto->packet(ct, *pskb, ctinfo);
1213 +       if (ret == -1) {
1214 +               /* Invalid */
1215 +               nf_conntrack_put((*pskb)->nfct);
1216 +               (*pskb)->nfct = NULL;
1217 +               return NF_ACCEPT;
1218 +       }
1219 +
1220 +       if (ret != NF_DROP && ct->helper) {
1221 +               ret = ct->helper->help(*pskb, ct, ctinfo);
1222 +               if (ret == -1) {
1223 +                       /* Invalid */
1224 +                       nf_conntrack_put((*pskb)->nfct);
1225 +                       (*pskb)->nfct = NULL;
1226 +                       return NF_ACCEPT;
1227 +               }
1228 +       }
1229 +       if (set_reply)
1230 +               set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
1231 +
1232 +       return ret;
1233 +}
1234 +
1235 +int invert_tuplepr(struct ip_conntrack_tuple *inverse,
1236 +                  const struct ip_conntrack_tuple *orig)
1237 +{
1238 +       return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
1239 +}
1240 +
1241 +static inline int resent_expect(const struct ip_conntrack_expect *i,
1242 +                               const struct ip_conntrack_tuple *tuple,
1243 +                               const struct ip_conntrack_tuple *mask)
1244 +{
1245 +       DEBUGP("resent_expect\n");
1246 +       DEBUGP("   tuple:   "); DUMP_TUPLE(&i->tuple);
1247 +       DEBUGP("ct_tuple:   "); DUMP_TUPLE(&i->ct_tuple);
1248 +       DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
1249 +       return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
1250 +                || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
1251 +               && ip_ct_tuple_equal(&i->mask, mask));
1252 +}
1253 +
1254 +/* Would two expected things clash? */
1255 +static inline int expect_clash(const struct ip_conntrack_expect *i,
1256 +                              const struct ip_conntrack_tuple *tuple,
1257 +                              const struct ip_conntrack_tuple *mask)
1258 +{
1259 +       /* Part covered by intersection of masks must be unequal,
1260 +           otherwise they clash */
1261 +       struct ip_conntrack_tuple intersect_mask
1262 +               = { { i->mask.src.ip & mask->src.ip,
1263 +                     { i->mask.src.u.all & mask->src.u.all } },
1264 +                   { i->mask.dst.ip & mask->dst.ip,
1265 +                     { i->mask.dst.u.all & mask->dst.u.all },
1266 +                     i->mask.dst.protonum & mask->dst.protonum } };
1267 +
1268 +       return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
1269 +}
1270 +
1271 +inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
1272 +{
1273 +       WRITE_LOCK(&ip_conntrack_lock);
1274 +       unexpect_related(expect);
1275 +       WRITE_UNLOCK(&ip_conntrack_lock);
1276 +}
1277 +       
1278 +static void expectation_timed_out(unsigned long ul_expect)
1279 +{
1280 +       struct ip_conntrack_expect *expect = (void *) ul_expect;
1281 +
1282 +       DEBUGP("expectation %p timed out\n", expect);   
1283 +       WRITE_LOCK(&ip_conntrack_lock);
1284 +       __unexpect_related(expect);
1285 +       WRITE_UNLOCK(&ip_conntrack_lock);
1286 +}
1287 +
1288 +/* Add a related connection. */
1289 +int ip_conntrack_expect_related(struct ip_conntrack *related_to,
1290 +                               struct ip_conntrack_expect *expect)
1291 +{
1292 +       struct ip_conntrack_expect *old, *new;
1293 +       int ret = 0;
1294 +
1295 +       WRITE_LOCK(&ip_conntrack_lock);
1296 +       /* Because of the write lock, no reader can walk the lists,
1297 +        * so there is no need to use the tuple lock too */
1298 +
1299 +       DEBUGP("ip_conntrack_expect_related %p\n", related_to);
1300 +       DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1301 +       DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1302 +
1303 +       old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
1304 +                       struct ip_conntrack_expect *, &expect->tuple, 
1305 +                       &expect->mask);
1306 +       if (old) {
1307 +               /* Helper private data may contain offsets but no pointers
1308 +                  pointing into the payload - otherwise we should have to copy 
1309 +                  the data filled out by the helper over the old one */
1310 +               DEBUGP("expect_related: resent packet\n");
1311 +               if (related_to->helper->timeout) {
1312 +                       if (!del_timer(&old->timeout)) {
1313 +                               /* expectation is dying. Fall through */
1314 +                               old = NULL;
1315 +                       } else {
1316 +                               old->timeout.expires = jiffies + 
1317 +                                       related_to->helper->timeout * HZ;
1318 +                               add_timer(&old->timeout);
1319 +                       }
1320 +               }
1321 +
1322 +               if (old) {
1323 +                       WRITE_UNLOCK(&ip_conntrack_lock);
1324 +                       return -EEXIST;
1325 +               }
1326 +       } else if (related_to->helper->max_expected && 
1327 +                  related_to->expecting >= related_to->helper->max_expected) {
1328 +               struct list_head *cur_item;
1329 +               /* old == NULL */
1330 +               if (!(related_to->helper->flags & 
1331 +                     IP_CT_HELPER_F_REUSE_EXPECT)) {
1332 +                       WRITE_UNLOCK(&ip_conntrack_lock);
1333 +                       if (net_ratelimit())
1334 +                               printk(KERN_WARNING
1335 +                                      "ip_conntrack: max number of expected "
1336 +                                      "connections %i of %s reached for "
1337 +                                      "%u.%u.%u.%u->%u.%u.%u.%u\n",
1338 +                                      related_to->helper->max_expected,
1339 +                                      related_to->helper->name,
1340 +                                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
1341 +                                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
1342 +                       return -EPERM;
1343 +               }
1344 +               DEBUGP("ip_conntrack: max number of expected "
1345 +                      "connections %i of %s reached for "
1346 +                      "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
1347 +                      related_to->helper->max_expected,
1348 +                      related_to->helper->name,
1349 +                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
1350 +                      NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
1351
1352 +               /* choose the oldest expectation to evict */
1353 +               list_for_each(cur_item, &related_to->sibling_list) { 
1354 +                       struct ip_conntrack_expect *cur;
1355 +
1356 +                       cur = list_entry(cur_item, 
1357 +                                        struct ip_conntrack_expect,
1358 +                                        expected_list);
1359 +                       if (cur->sibling == NULL) {
1360 +                               old = cur;
1361 +                               break;
1362 +                       }
1363 +               }
1364 +
1365 +               /* (!old) cannot happen, since related_to->expecting is the
1366 +                * number of unconfirmed expects */
1367 +               IP_NF_ASSERT(old);
1368 +
1369 +               /* newnat14 does not reuse the real allocated memory
1370 +                * structures but rather unexpects the old and
1371 +                * allocates a new.  unexpect_related will decrement
1372 +                * related_to->expecting. 
1373 +                */
1374 +               unexpect_related(old);
1375 +               ret = -EPERM;
1376 +       } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1377 +                            struct ip_conntrack_expect *, &expect->tuple, 
1378 +                            &expect->mask)) {
1379 +               WRITE_UNLOCK(&ip_conntrack_lock);
1380 +               DEBUGP("expect_related: busy!\n");
1381 +               return -EBUSY;
1382 +       }
1383 +       
1384 +       new = (struct ip_conntrack_expect *) 
1385 +             kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
1386 +       if (!new) {
1387 +               WRITE_UNLOCK(&ip_conntrack_lock);
1388 +               DEBUGP("expect_relaed: OOM allocating expect\n");
1389 +               return -ENOMEM;
1390 +       }
1391 +       
1392 +       DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
1393 +       memcpy(new, expect, sizeof(*expect));
1394 +       new->expectant = related_to;
1395 +       new->sibling = NULL;
1396 +       atomic_set(&new->use, 1);
1397 +       
1398 +       /* add to expected list for this connection */  
1399 +       list_add(&new->expected_list, &related_to->sibling_list);
1400 +       /* add to global list of expectations */
1401 +       list_prepend(&ip_conntrack_expect_list, &new->list);
1402 +       /* add and start timer if required */
1403 +       if (related_to->helper->timeout) {
1404 +               init_timer(&new->timeout);
1405 +               new->timeout.data = (unsigned long)new;
1406 +               new->timeout.function = expectation_timed_out;
1407 +               new->timeout.expires = jiffies + 
1408 +                                       related_to->helper->timeout * HZ;
1409 +               add_timer(&new->timeout);
1410 +       }
1411 +       related_to->expecting++;
1412 +
1413 +       WRITE_UNLOCK(&ip_conntrack_lock);
1414 +
1415 +       return ret;
1416 +}
1417 +
1418 +/* Change tuple in an existing expectation */
1419 +int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
1420 +                              struct ip_conntrack_tuple *newtuple)
1421 +{
1422 +       int ret;
1423 +
1424 +       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
1425 +       WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
1426 +
1427 +       DEBUGP("change_expect:\n");
1428 +       DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
1429 +       DEBUGP("exp mask:  "); DUMP_TUPLE(&expect->mask);
1430 +       DEBUGP("newtuple:  "); DUMP_TUPLE(newtuple);
1431 +       if (expect->ct_tuple.dst.protonum == 0) {
1432 +               /* Never seen before */
1433 +               DEBUGP("change expect: never seen before\n");
1434 +               if (!ip_ct_tuple_equal(&expect->tuple, newtuple) 
1435 +                   && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1436 +                                struct ip_conntrack_expect *, newtuple, &expect->mask)) {
1437 +                       /* Force NAT to find an unused tuple */
1438 +                       ret = -1;
1439 +               } else {
1440 +                       memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
1441 +                       memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
1442 +                       ret = 0;
1443 +               }
1444 +       } else {
1445 +               /* Resent packet */
1446 +               DEBUGP("change expect: resent packet\n");
1447 +               if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
1448 +                       ret = 0;
1449 +               } else {
1450 +                       /* Force NAT to choose again the same port */
1451 +                       ret = -1;
1452 +               }
1453 +       }
1454 +       WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
1455 +       
1456 +       return ret;
1457 +}
1458 +
1459 +/* Alter reply tuple (maybe alter helper).  If it's already taken,
1460 +   return 0 and don't do alteration. */
1461 +int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1462 +                            const struct ip_conntrack_tuple *newreply)
1463 +{
1464 +       WRITE_LOCK(&ip_conntrack_lock);
1465 +       if (__ip_conntrack_find(newreply, conntrack)) {
1466 +               WRITE_UNLOCK(&ip_conntrack_lock);
1467 +               return 0;
1468 +       }
1469 +       /* Should be unconfirmed, so not in hash table yet */
1470 +       IP_NF_ASSERT(!is_confirmed(conntrack));
1471 +
1472 +       DEBUGP("Altering reply tuple of %p to ", conntrack);
1473 +       DUMP_TUPLE(newreply);
1474 +
1475 +       conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1476 +       if (!conntrack->master)
1477 +               conntrack->helper = LIST_FIND(&helpers, helper_cmp,
1478 +                                             struct ip_conntrack_helper *,
1479 +                                             newreply);
1480 +       WRITE_UNLOCK(&ip_conntrack_lock);
1481 +
1482 +       return 1;
1483 +}
1484 +
1485 +int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1486 +{
1487 +       WRITE_LOCK(&ip_conntrack_lock);
1488 +       list_prepend(&helpers, me);
1489 +       WRITE_UNLOCK(&ip_conntrack_lock);
1490 +
1491 +       return 0;
1492 +}
1493 +
1494 +static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1495 +                        const struct ip_conntrack_helper *me)
1496 +{
1497 +       if (i->ctrack->helper == me) {
1498 +               /* Get rid of any expected. */
1499 +               remove_expectations(i->ctrack, 0);
1500 +               /* And *then* set helper to NULL */
1501 +               i->ctrack->helper = NULL;
1502 +       }
1503 +       return 0;
1504 +}
1505 +
1506 +void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1507 +{
1508 +       unsigned int i;
1509 +
1510 +       /* Need write lock here, to delete helper. */
1511 +       WRITE_LOCK(&ip_conntrack_lock);
1512 +       LIST_DELETE(&helpers, me);
1513 +
1514 +       /* Get rid of expecteds, set helpers to NULL. */
1515 +       for (i = 0; i < ip_conntrack_htable_size; i++)
1516 +               LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1517 +                           struct ip_conntrack_tuple_hash *, me);
1518 +       WRITE_UNLOCK(&ip_conntrack_lock);
1519 +
1520 +       /* Someone could be still looking at the helper in a bh. */
1521 +       synchronize_net();
1522 +}
1523 +
1524 +/* Refresh conntrack for this many jiffies. */
1525 +void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
1526 +{
1527 +       IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1528 +
1529 +       WRITE_LOCK(&ip_conntrack_lock);
1530 +       /* If not in hash table, timer will not be active yet */
1531 +       if (!is_confirmed(ct))
1532 +               ct->timeout.expires = extra_jiffies;
1533 +       else {
1534 +               /* Need del_timer for race avoidance (may already be dying). */
1535 +               if (del_timer(&ct->timeout)) {
1536 +                       ct->timeout.expires = jiffies + extra_jiffies;
1537 +                       add_timer(&ct->timeout);
1538 +               }
1539 +       }
1540 +       WRITE_UNLOCK(&ip_conntrack_lock);
1541 +}
1542 +
1543 +/* Returns new sk_buff, or NULL */
1544 +struct sk_buff *
1545 +ip_ct_gather_frags(struct sk_buff *skb)
1546 +{
1547 +       struct sock *sk = skb->sk;
1548 +#ifdef CONFIG_NETFILTER_DEBUG
1549 +       unsigned int olddebug = skb->nf_debug;
1550 +#endif
1551 +       if (sk) {
1552 +               sock_hold(sk);
1553 +               skb_orphan(skb);
1554 +       }
1555 +
1556 +       local_bh_disable(); 
1557 +       skb = ip_defrag(skb);
1558 +       local_bh_enable();
1559 +
1560 +       if (!skb) {
1561 +               if (sk)
1562 +                       sock_put(sk);
1563 +               return skb;
1564 +       }
1565 +
1566 +       if (sk) {
1567 +               skb_set_owner_w(skb, sk);
1568 +               sock_put(sk);
1569 +       }
1570 +
1571 +       ip_send_check(skb->nh.iph);
1572 +       skb->nfcache |= NFC_ALTERED;
1573 +#ifdef CONFIG_NETFILTER_DEBUG
1574 +       /* Packet path as if nothing had happened. */
1575 +       skb->nf_debug = olddebug;
1576 +#endif
1577 +       return skb;
1578 +}
1579 +
1580 +/* Used by ipt_REJECT. */
1581 +static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
1582 +{
1583 +       struct ip_conntrack *ct;
1584 +       enum ip_conntrack_info ctinfo;
1585 +
1586 +       ct = __ip_conntrack_get(nfct, &ctinfo);
1587 +
1588 +       /* This ICMP is in reverse direction to the packet which
1589 +           caused it */
1590 +       if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1591 +               ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1592 +       else
1593 +               ctinfo = IP_CT_RELATED;
1594 +
1595 +       /* Attach new skbuff, and increment count */
1596 +       nskb->nfct = &ct->infos[ctinfo];
1597 +       atomic_inc(&ct->ct_general.use);
1598 +}
1599 +
1600 +static inline int
1601 +do_kill(const struct ip_conntrack_tuple_hash *i,
1602 +       int (*kill)(const struct ip_conntrack *i, void *data),
1603 +       void *data)
1604 +{
1605 +       return kill(i->ctrack, data);
1606 +}
1607 +
1608 +/* Bring out ya dead! */
1609 +static struct ip_conntrack_tuple_hash *
1610 +get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
1611 +               void *data)
1612 +{
1613 +       struct ip_conntrack_tuple_hash *h = NULL;
1614 +       unsigned int i;
1615 +
1616 +       READ_LOCK(&ip_conntrack_lock);
1617 +       for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
1618 +               h = LIST_FIND(&ip_conntrack_hash[i], do_kill,
1619 +                             struct ip_conntrack_tuple_hash *, kill, data);
1620 +       }
1621 +       if (h)
1622 +               atomic_inc(&h->ctrack->ct_general.use);
1623 +       READ_UNLOCK(&ip_conntrack_lock);
1624 +
1625 +       return h;
1626 +}
1627 +
1628 +void
1629 +ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
1630 +                       void *data)
1631 +{
1632 +       struct ip_conntrack_tuple_hash *h;
1633 +
1634 +       /* This is order n^2, by the way. */
1635 +       while ((h = get_next_corpse(kill, data)) != NULL) {
1637 +               /* Time to push up daisies... */
1637 +               if (del_timer(&h->ctrack->timeout))
1638 +                       death_by_timeout((unsigned long)h->ctrack);
1639 +               /* ... else the timer will get him soon. */
1640 +
1641 +               ip_conntrack_put(h->ctrack);
1642 +       }
1643 +}
1644 +
1645 +/* Fast function for those who don't want to parse /proc (and I don't
1646 +   blame them). */
1647 +/* Reversing the socket's dst/src point of view gives us the reply
1648 +   mapping. */
1649 +static int
1650 +getorigdst(struct sock *sk, int optval, void *user, int *len)
1651 +{
1652 +       struct inet_opt *inet = inet_sk(sk);
1653 +       struct ip_conntrack_tuple_hash *h;
1654 +       struct ip_conntrack_tuple tuple;
1655 +       
1656 +       IP_CT_TUPLE_U_BLANK(&tuple);
1657 +       tuple.src.ip = inet->rcv_saddr;
1658 +       tuple.src.u.tcp.port = inet->sport;
1659 +       tuple.dst.ip = inet->daddr;
1660 +       tuple.dst.u.tcp.port = inet->dport;
1661 +       tuple.dst.protonum = IPPROTO_TCP;
1662 +
1663 +       /* We only do TCP at the moment: is there a better way? */
1664 +       if (strcmp(sk->sk_prot->name, "TCP")) {
1665 +               DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1666 +               return -ENOPROTOOPT;
1667 +       }
1668 +
1669 +       if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1670 +               DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1671 +                      *len, sizeof(struct sockaddr_in));
1672 +               return -EINVAL;
1673 +       }
1674 +
1675 +       h = ip_conntrack_find_get(&tuple, NULL);
1676 +       if (h) {
1677 +               struct sockaddr_in sin;
1678 +
1679 +               sin.sin_family = AF_INET;
1680 +               sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1681 +                       .tuple.dst.u.tcp.port;
1682 +               sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1683 +                       .tuple.dst.ip;
1684 +
1685 +               DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1686 +                      NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1687 +               ip_conntrack_put(h->ctrack);
1688 +               if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1689 +                       return -EFAULT;
1690 +               else
1691 +                       return 0;
1692 +       }
1693 +       DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1694 +              NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1695 +              NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1696 +       return -ENOENT;
1697 +}
1698 +
1699 +static struct nf_sockopt_ops so_getorigdst = {
1700 +       .pf             = PF_INET,
1701 +       .get_optmin     = SO_ORIGINAL_DST,
1702 +       .get_optmax     = SO_ORIGINAL_DST+1,
1703 +       .get            = &getorigdst,
1704 +};
1705 +
1706 +static int kill_all(const struct ip_conntrack *i, void *data)
1707 +{
1708 +       return 1;
1709 +}
1710 +
1711 +/* Mishearing the voices in his head, our hero wonders how he's
1712 +   supposed to kill the mall. */
1713 +void ip_conntrack_cleanup(void)
1714 +{
1715 +       ip_ct_attach = NULL;
1716 +       /* This makes sure all current packets have passed through
1717 +           netfilter framework.  Roll on, two-stage module
1718 +           delete... */
1719 +       synchronize_net();
1720
1721 + i_see_dead_people:
1722 +       ip_ct_selective_cleanup(kill_all, NULL);
1723 +       if (atomic_read(&ip_conntrack_count) != 0) {
1724 +               schedule();
1725 +               goto i_see_dead_people;
1726 +       }
1727 +
1728 +       kmem_cache_destroy(ip_conntrack_cachep);
1729 +       vfree(ip_conntrack_hash);
1730 +       nf_unregister_sockopt(&so_getorigdst);
1731 +}
1732 +
1733 +static int hashsize;
1734 +MODULE_PARM(hashsize, "i");
1735 +
1736 +int __init ip_conntrack_init(void)
1737 +{
1738 +       unsigned int i;
1739 +       int ret;
1740 +
1741 +       /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1742 +        * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1743 +       if (hashsize) {
1744 +               ip_conntrack_htable_size = hashsize;
1745 +       } else {
1746 +               ip_conntrack_htable_size
1747 +                       = (((num_physpages << PAGE_SHIFT) / 16384)
1748 +                          / sizeof(struct list_head));
1749 +               if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1750 +                       ip_conntrack_htable_size = 8192;
1751 +               if (ip_conntrack_htable_size < 16)
1752 +                       ip_conntrack_htable_size = 16;
1753 +       }
1754 +       ip_conntrack_max = 8 * ip_conntrack_htable_size;
1755 +
1756 +       printk("ip_conntrack version %s (%u buckets, %d max)"
1757 +              " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1758 +              ip_conntrack_htable_size, ip_conntrack_max,
1759 +              sizeof(struct ip_conntrack));
1760 +
1761 +       ret = nf_register_sockopt(&so_getorigdst);
1762 +       if (ret != 0) {
1763 +               printk(KERN_ERR "Unable to register netfilter socket option\n");
1764 +               return ret;
1765 +       }
1766 +
1767 +       ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1768 +                                   * ip_conntrack_htable_size);
1769 +       if (!ip_conntrack_hash) {
1770 +               printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1771 +               goto err_unreg_sockopt;
1772 +       }
1773 +
1774 +       ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1775 +                                               sizeof(struct ip_conntrack), 0,
1776 +                                               SLAB_HWCACHE_ALIGN, NULL, NULL);
1777 +       if (!ip_conntrack_cachep) {
1778 +               printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1779 +               goto err_free_hash;
1780 +       }
1781 +       /* Don't NEED lock here, but good form anyway. */
1782 +       WRITE_LOCK(&ip_conntrack_lock);
1783 +       /* Sew in builtin protocols. */
1784 +       list_append(&protocol_list, &ip_conntrack_protocol_tcp);
1785 +       list_append(&protocol_list, &ip_conntrack_protocol_udp);
1786 +       list_append(&protocol_list, &ip_conntrack_protocol_icmp);
1787 +       WRITE_UNLOCK(&ip_conntrack_lock);
1788 +
1789 +       for (i = 0; i < ip_conntrack_htable_size; i++)
1790 +               INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1791 +
1792 +       /* For use by ipt_REJECT */
1793 +       ip_ct_attach = ip_conntrack_attach;
1794 +       return ret;
1795 +
1796 +err_free_hash:
1797 +       vfree(ip_conntrack_hash);
1798 +err_unreg_sockopt:
1799 +       nf_unregister_sockopt(&so_getorigdst);
1800 +
1801 +       return -ENOMEM;
1802 +}
1803 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.rej linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.rej
1804 --- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.rej   1970-01-01 01:00:00.000000000 +0100
1805 +++ linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.rej       2003-12-17 14:02:02.000000000 +0100
1806 @@ -0,0 +1,17 @@
1807 +***************
1808 +*** 142,147 ****
1809 +       tuple->dst.ip = iph->daddr;
1810 +       tuple->dst.protonum = iph->protocol;
1811 +  
1812 +       ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
1813 +                                    len - 4*iph->ihl,
1814 +                                    tuple);
1815 +--- 142,149 ----
1816 +       tuple->dst.ip = iph->daddr;
1817 +       tuple->dst.protonum = iph->protocol;
1818 +  
1819 ++      tuple->src.u.all = tuple->dst.u.all = 0;
1820 ++ 
1821 +       ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
1822 +                                    len - 4*iph->ihl,
1823 +                                    tuple);
1824 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c
1825 --- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c     2003-11-26 21:43:07.000000000 +0100
1826 +++ linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c 2003-12-17 14:02:03.000000000 +0100
1827 @@ -432,7 +432,7 @@
1828         *tuple = *orig_tuple;
1829         while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
1830                != NULL) {
1831 -               DEBUGP("Found best for "); DUMP_TUPLE(tuple);
1832 +               DEBUGP("Found best for "); DUMP_TUPLE_RAW(tuple);
1833                 /* 3) The per-protocol part of the manip is made to
1834                    map into the range to make a unique tuple. */
1835  
1836 @@ -573,9 +573,9 @@
1837                        HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
1838                        conntrack);
1839                 DEBUGP("Original: ");
1840 -               DUMP_TUPLE(&orig_tp);
1841 +               DUMP_TUPLE_RAW(&orig_tp);
1842                 DEBUGP("New: ");
1843 -               DUMP_TUPLE(&new_tuple);
1844 +               DUMP_TUPLE_RAW(&new_tuple);
1845  #endif
1846  
1847                 /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
1848 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c.orig linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c.orig
1849 --- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c.orig        1970-01-01 01:00:00.000000000 +0100
1850 +++ linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c.orig    2003-11-26 21:43:07.000000000 +0100
1851 @@ -0,0 +1,1030 @@
1852 +/* NAT for netfilter; shared with compatibility layer. */
1853 +
1854 +/* (c) 1999 Paul `Rusty' Russell.  Licenced under the GNU General
1855 +   Public Licence. */
1856 +#include <linux/module.h>
1857 +#include <linux/types.h>
1858 +#include <linux/timer.h>
1859 +#include <linux/skbuff.h>
1860 +#include <linux/netfilter_ipv4.h>
1861 +#include <linux/vmalloc.h>
1862 +#include <net/checksum.h>
1863 +#include <net/icmp.h>
1864 +#include <net/ip.h>
1865 +#include <net/tcp.h>  /* For tcp_prot in getorigdst */
1866 +#include <linux/icmp.h>
1867 +#include <linux/udp.h>
1868 +
1869 +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
1870 +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
1871 +
1872 +#include <linux/netfilter_ipv4/ip_conntrack.h>
1873 +#include <linux/netfilter_ipv4/ip_conntrack_core.h>
1874 +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
1875 +#include <linux/netfilter_ipv4/ip_nat.h>
1876 +#include <linux/netfilter_ipv4/ip_nat_protocol.h>
1877 +#include <linux/netfilter_ipv4/ip_nat_core.h>
1878 +#include <linux/netfilter_ipv4/ip_nat_helper.h>
1879 +#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
1880 +#include <linux/netfilter_ipv4/listhelp.h>
1881 +
1882 +#if 0
1883 +#define DEBUGP printk
1884 +#else
1885 +#define DEBUGP(format, args...)
1886 +#endif
1887 +
1888 +DECLARE_RWLOCK(ip_nat_lock);
1889 +DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);
1890 +
1891 +/* Calculated at init based on memory size */
1892 +static unsigned int ip_nat_htable_size;
1893 +
1894 +static struct list_head *bysource;
1895 +static struct list_head *byipsproto;
1896 +LIST_HEAD(protos);
1897 +LIST_HEAD(helpers);
1898 +
1899 +extern struct ip_nat_protocol unknown_nat_protocol;
1900 +
1901 +/* We keep extra hashes for each conntrack, for fast searching. */
1902 +static inline size_t
1903 +hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
1904 +{
1905 +       /* Modified src and dst, to ensure we don't create two
1906 +           identical streams. */
1907 +       return (src + dst + proto) % ip_nat_htable_size;
1908 +}
1909 +
1910 +static inline size_t
1911 +hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
1912 +{
1913 +       /* Original src, to ensure we map it consistently if poss. */
1914 +       return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
1915 +}
1916 +
1917 +/* No one is using conntrack by the time this is called. */
1918 +static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
1919 +{
1920 +       struct ip_nat_info *info = &conn->nat.info;
1921 +       unsigned int hs, hp;
1922 +
1923 +       if (!info->initialized)
1924 +               return;
1925 +
1926 +       IP_NF_ASSERT(info->bysource.conntrack);
1927 +       IP_NF_ASSERT(info->byipsproto.conntrack);
1928 +
1929 +       hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
1930 +                        conn->tuplehash[IP_CT_DIR_ORIGINAL]
1931 +                        .tuple.dst.protonum);
1932 +
1933 +       hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
1934 +                             conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
1935 +                             conn->tuplehash[IP_CT_DIR_REPLY]
1936 +                             .tuple.dst.protonum);
1937 +
1938 +       WRITE_LOCK(&ip_nat_lock);
1939 +       LIST_DELETE(&bysource[hs], &info->bysource);
1940 +       LIST_DELETE(&byipsproto[hp], &info->byipsproto);
1941 +       WRITE_UNLOCK(&ip_nat_lock);
1942 +}
1943 +
1944 +/* We do checksum mangling, so if they were wrong before they're still
1945 + * wrong.  Also works for incomplete packets (eg. ICMP dest
1946 + * unreachables.) */
1947 +u_int16_t
1948 +ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
1949 +{
1950 +       u_int32_t diffs[] = { oldvalinv, newval };
1951 +       return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
1952 +                                     oldcheck^0xFFFF));
1953 +}
1954 +
1955 +static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
1956 +{
1957 +       return i->protonum == proto;
1958 +}
1959 +
1960 +struct ip_nat_protocol *
1961 +find_nat_proto(u_int16_t protonum)
1962 +{
1963 +       struct ip_nat_protocol *i;
1964 +
1965 +       MUST_BE_READ_LOCKED(&ip_nat_lock);
1966 +       i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
1967 +       if (!i)
1968 +               i = &unknown_nat_protocol;
1969 +       return i;
1970 +}
1971 +
1972 +/* Is this tuple already taken? (not by us) */
1973 +int
1974 +ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
1975 +                 const struct ip_conntrack *ignored_conntrack)
1976 +{
1977 +       /* Conntrack tracking doesn't keep track of outgoing tuples; only
1978 +          incoming ones.  NAT means they don't have a fixed mapping,
1979 +          so we invert the tuple and look for the incoming reply.
1980 +
1981 +          We could keep a separate hash if this proves too slow. */
1982 +       struct ip_conntrack_tuple reply;
1983 +
1984 +       invert_tuplepr(&reply, tuple);
1985 +       return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
1986 +}
1987 +
1988 +/* Does tuple + the source manip come within the range mr */
1989 +static int
1990 +in_range(const struct ip_conntrack_tuple *tuple,
1991 +        const struct ip_conntrack_manip *manip,
1992 +        const struct ip_nat_multi_range *mr)
1993 +{
1994 +       struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
1995 +       unsigned int i;
1996 +       struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
1997 +
1998 +       for (i = 0; i < mr->rangesize; i++) {
1999 +               /* If we are allowed to map IPs, then we must be in the
2000 +                  range specified, otherwise we must be unchanged. */
2001 +               if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
2002 +                       if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
2003 +                           || (ntohl(newtuple.src.ip)
2004 +                               > ntohl(mr->range[i].max_ip)))
2005 +                               continue;
2006 +               } else {
2007 +                       if (newtuple.src.ip != tuple->src.ip)
2008 +                               continue;
2009 +               }
2010 +
2011 +               if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
2012 +                   || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
2013 +                                      &mr->range[i].min, &mr->range[i].max))
2014 +                       return 1;
2015 +       }
2016 +       return 0;
2017 +}
2018 +
2019 +static inline int
2020 +src_cmp(const struct ip_nat_hash *i,
2021 +       const struct ip_conntrack_tuple *tuple,
2022 +       const struct ip_nat_multi_range *mr)
2023 +{
2024 +       return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
2025 +               == tuple->dst.protonum
2026 +               && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
2027 +               == tuple->src.ip
2028 +               && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
2029 +               == tuple->src.u.all
2030 +               && in_range(tuple,
2031 +                           &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2032 +                           .tuple.src,
2033 +                           mr));
2034 +}
2035 +
2036 +/* Only called for SRC manip */
2037 +static struct ip_conntrack_manip *
2038 +find_appropriate_src(const struct ip_conntrack_tuple *tuple,
2039 +                    const struct ip_nat_multi_range *mr)
2040 +{
2041 +       unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
2042 +       struct ip_nat_hash *i;
2043 +
2044 +       MUST_BE_READ_LOCKED(&ip_nat_lock);
2045 +       i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
2046 +       if (i)
2047 +               return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
2048 +       else
2049 +               return NULL;
2050 +}
2051 +
2052 +#ifdef CONFIG_IP_NF_NAT_LOCAL
2053 +/* If it's really a local destination manip, it may need to do a
2054 +   source manip too. */
2055 +static int
2056 +do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
2057 +{
2058 +       struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
2059 +       struct rtable *rt;
2060 +
2061 +       /* FIXME: IPTOS_TOS(iph->tos) --RR */
2062 +       if (ip_route_output_key(&rt, &fl) != 0) {
2063 +               DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
2064 +                      NIPQUAD(var_ip));
2065 +               return 0;
2066 +       }
2067 +
2068 +       *other_ipp = rt->rt_src;
2069 +       ip_rt_put(rt);
2070 +       return 1;
2071 +}
2072 +#endif
2073 +
2074 +/* Simple way to iterate through all. */
2075 +static inline int fake_cmp(const struct ip_nat_hash *i,
2076 +                          u_int32_t src, u_int32_t dst, u_int16_t protonum,
2077 +                          unsigned int *score,
2078 +                          const struct ip_conntrack *conntrack)
2079 +{
2080 +       /* Compare backwards: we're dealing with OUTGOING tuples, and
2081 +           inside the conntrack is the REPLY tuple.  Don't count this
2082 +           conntrack. */
2083 +       if (i->conntrack != conntrack
2084 +           && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
2085 +           && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
2086 +           && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
2087 +               == protonum))
2088 +               (*score)++;
2089 +       return 0;
2090 +}
2091 +
2092 +static inline unsigned int
2093 +count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
2094 +          const struct ip_conntrack *conntrack)
2095 +{
2096 +       unsigned int score = 0;
2097 +       unsigned int h;
2098 +
2099 +       MUST_BE_READ_LOCKED(&ip_nat_lock);
2100 +       h = hash_by_ipsproto(src, dst, protonum);
2101 +       LIST_FIND(&byipsproto[h], fake_cmp, struct ip_nat_hash *,
2102 +                 src, dst, protonum, &score, conntrack);
2103 +
2104 +       return score;
2105 +}
2106 +
2107 +/* For [FUTURE] fragmentation handling, we want the least-used
2108 +   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
2109 +   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
2110 +   1-65535, we don't do pro-rata allocation based on ports; we choose
2111 +   the ip with the lowest src-ip/dst-ip/proto usage.
2112 +
2113 +   If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
2114 +   range), we eliminate that and try again.  This is not the most
2115 +   efficient approach, but if you're worried about that, don't hand us
2116 +   ranges you don't really have.  */
2117 +static struct ip_nat_range *
2118 +find_best_ips_proto(struct ip_conntrack_tuple *tuple,
2119 +                   const struct ip_nat_multi_range *mr,
2120 +                   const struct ip_conntrack *conntrack,
2121 +                   unsigned int hooknum)
2122 +{
2123 +       unsigned int i;
2124 +       struct {
2125 +               const struct ip_nat_range *range;
2126 +               unsigned int score;
2127 +               struct ip_conntrack_tuple tuple;
2128 +       } best = { NULL,  0xFFFFFFFF };
2129 +       u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
2130 +       static unsigned int randomness;
2131 +
2132 +       if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
2133 +               var_ipp = &tuple->src.ip;
2134 +               saved_ip = tuple->dst.ip;
2135 +               other_ipp = &tuple->dst.ip;
2136 +       } else {
2137 +               var_ipp = &tuple->dst.ip;
2138 +               saved_ip = tuple->src.ip;
2139 +               other_ipp = &tuple->src.ip;
2140 +       }
2141 +       /* Don't do do_extra_mangle unless necessary (overrides
2142 +           explicit socket bindings, for example) */
2143 +       orig_dstip = tuple->dst.ip;
2144 +
2145 +       IP_NF_ASSERT(mr->rangesize >= 1);
2146 +       for (i = 0; i < mr->rangesize; i++) {
2147 +               /* Host order */
2148 +               u_int32_t minip, maxip, j;
2149 +
2150 +               /* Don't do ranges which are already eliminated. */
2151 +               if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
2152 +                       continue;
2153 +               }
2154 +
2155 +               if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
2156 +                       minip = ntohl(mr->range[i].min_ip);
2157 +                       maxip = ntohl(mr->range[i].max_ip);
2158 +               } else
2159 +                       minip = maxip = ntohl(*var_ipp);
2160 +
2161 +               randomness++;
2162 +               for (j = 0; j < maxip - minip + 1; j++) {
2163 +                       unsigned int score;
2164 +
2165 +                       *var_ipp = htonl(minip + (randomness + j) 
2166 +                                        % (maxip - minip + 1));
2167 +
2168 +                       /* Reset the other ip in case it was mangled by
2169 +                        * do_extra_mangle last time. */
2170 +                       *other_ipp = saved_ip;
2171 +
2172 +#ifdef CONFIG_IP_NF_NAT_LOCAL
2173 +                       if (hooknum == NF_IP_LOCAL_OUT
2174 +                           && *var_ipp != orig_dstip
2175 +                           && !do_extra_mangle(*var_ipp, other_ipp)) {
2176 +                               DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
2177 +                                      i, NIPQUAD(*var_ipp));
2178 +                               /* Can't route?  This whole range part is
2179 +                                * probably screwed, but keep trying
2180 +                                * anyway. */
2181 +                               continue;
2182 +                       }
2183 +#endif
2184 +
2185 +                       /* Count how many others map onto this. */
2186 +                       score = count_maps(tuple->src.ip, tuple->dst.ip,
2187 +                                          tuple->dst.protonum, conntrack);
2188 +                       if (score < best.score) {
2189 +                               /* Optimization: doesn't get any better than
2190 +                                  this. */
2191 +                               if (score == 0)
2192 +                                       return (struct ip_nat_range *)
2193 +                                               &mr->range[i];
2194 +
2195 +                               best.score = score;
2196 +                               best.tuple = *tuple;
2197 +                               best.range = &mr->range[i];
2198 +                       }
2199 +               }
2200 +       }
2201 +       *tuple = best.tuple;
2202 +
2203 +       /* Discard const. */
2204 +       return (struct ip_nat_range *)best.range;
2205 +}
2206 +
2207 +/* Fast version doesn't iterate through hash chains, but only handles
2208 +   common case of single IP address (null NAT, masquerade) */
2209 +static struct ip_nat_range *
2210 +find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
2211 +                        const struct ip_nat_multi_range *mr,
2212 +                        const struct ip_conntrack *conntrack,
2213 +                        unsigned int hooknum)
2214 +{
2215 +       if (mr->rangesize != 1
2216 +           || (mr->range[0].flags & IP_NAT_RANGE_FULL)
2217 +           || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
2218 +               && mr->range[0].min_ip != mr->range[0].max_ip))
2219 +               return find_best_ips_proto(tuple, mr, conntrack, hooknum);
2220 +
2221 +       if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
2222 +               if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
2223 +                       tuple->src.ip = mr->range[0].min_ip;
2224 +               else {
2225 +                       /* Only do extra mangle when required (breaks
2226 +                           socket binding) */
2227 +#ifdef CONFIG_IP_NF_NAT_LOCAL
2228 +                       if (tuple->dst.ip != mr->range[0].min_ip
2229 +                           && hooknum == NF_IP_LOCAL_OUT
2230 +                           && !do_extra_mangle(mr->range[0].min_ip,
2231 +                                               &tuple->src.ip))
2232 +                               return NULL;
2233 +#endif
2234 +                       tuple->dst.ip = mr->range[0].min_ip;
2235 +               }
2236 +       }
2237 +
2238 +       /* Discard const. */
2239 +       return (struct ip_nat_range *)&mr->range[0];
2240 +}
2241 +
2242 +static int
2243 +get_unique_tuple(struct ip_conntrack_tuple *tuple,
2244 +                const struct ip_conntrack_tuple *orig_tuple,
2245 +                const struct ip_nat_multi_range *mrr,
2246 +                struct ip_conntrack *conntrack,
2247 +                unsigned int hooknum)
2248 +{
2249 +       struct ip_nat_protocol *proto
2250 +               = find_nat_proto(orig_tuple->dst.protonum);
2251 +       struct ip_nat_range *rptr;
2252 +       unsigned int i;
2253 +       int ret;
2254 +
2255 +       /* We temporarily use flags for marking full parts, but we
2256 +          always clean up afterwards */
2257 +       struct ip_nat_multi_range *mr = (void *)mrr;
2258 +
2259 +       /* 1) If this srcip/proto/src-proto-part is currently mapped,
2260 +          and that same mapping gives a unique tuple within the given
2261 +          range, use that.
2262 +
2263 +          This is only required for source (ie. NAT/masq) mappings.
2264 +          So far, we don't do local source mappings, so multiple
2265 +          manips not an issue.  */
2266 +       if (hooknum == NF_IP_POST_ROUTING) {
2267 +               struct ip_conntrack_manip *manip;
2268 +
2269 +               manip = find_appropriate_src(orig_tuple, mr);
2270 +               if (manip) {
2271 +                       /* Apply same source manipulation. */
2272 +                       *tuple = ((struct ip_conntrack_tuple)
2273 +                                 { *manip, orig_tuple->dst });
2274 +                       DEBUGP("get_unique_tuple: Found current src map\n");
2275 +                       if (!ip_nat_used_tuple(tuple, conntrack))
2276 +                               return 1;
2277 +               }
2278 +       }
2279 +
2280 +       /* 2) Select the least-used IP/proto combination in the given
2281 +          range.
2282 +       */
2283 +       *tuple = *orig_tuple;
2284 +       while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
2285 +              != NULL) {
2286 +               DEBUGP("Found best for "); DUMP_TUPLE(tuple);
2287 +               /* 3) The per-protocol part of the manip is made to
2288 +                  map into the range to make a unique tuple. */
2289 +
2290 +               /* Only bother mapping if it's not already in range
2291 +                  and unique */
2292 +               if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
2293 +                    || proto->in_range(tuple, HOOK2MANIP(hooknum),
2294 +                                       &rptr->min, &rptr->max))
2295 +                   && !ip_nat_used_tuple(tuple, conntrack)) {
2296 +                       ret = 1;
2297 +                       goto clear_fulls;
2298 +               } else {
2299 +                       if (proto->unique_tuple(tuple, rptr,
2300 +                                               HOOK2MANIP(hooknum),
2301 +                                               conntrack)) {
2302 +                               /* Must be unique. */
2303 +                               IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
2304 +                                                               conntrack));
2305 +                               ret = 1;
2306 +                               goto clear_fulls;
2307 +                       } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
2308 +                               /* Try implicit source NAT; protocol
2309 +                                   may be able to play with ports to
2310 +                                   make it unique. */
2311 +                               struct ip_nat_range r
2312 +                                       = { IP_NAT_RANGE_MAP_IPS, 
2313 +                                           tuple->src.ip, tuple->src.ip,
2314 +                                           { 0 }, { 0 } };
2315 +                               DEBUGP("Trying implicit mapping\n");
2316 +                               if (proto->unique_tuple(tuple, &r,
2317 +                                                       IP_NAT_MANIP_SRC,
2318 +                                                       conntrack)) {
2319 +                                       /* Must be unique. */
2320 +                                       IP_NF_ASSERT(!ip_nat_used_tuple
2321 +                                                    (tuple, conntrack));
2322 +                                       ret = 1;
2323 +                                       goto clear_fulls;
2324 +                               }
2325 +                       }
2326 +                       DEBUGP("Protocol can't get unique tuple %u.\n",
2327 +                              hooknum);
2328 +               }
2329 +
2330 +               /* Eliminate that from range, and try again. */
2331 +               rptr->flags |= IP_NAT_RANGE_FULL;
2332 +               *tuple = *orig_tuple;
2333 +       }
2334 +
2335 +       ret = 0;
2336 +
2337 + clear_fulls:
2338 +       /* Clear full flags. */
2339 +       IP_NF_ASSERT(mr->rangesize >= 1);
2340 +       for (i = 0; i < mr->rangesize; i++)
2341 +               mr->range[i].flags &= ~IP_NAT_RANGE_FULL;
2342 +
2343 +       return ret;
2344 +}
2345 +
2346 +static inline int
2347 +helper_cmp(const struct ip_nat_helper *helper,
2348 +          const struct ip_conntrack_tuple *tuple)
2349 +{
2350 +       return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
2351 +}
2352 +
2353 +/* Where to manip the reply packets (will be reverse manip). */
2354 +static unsigned int opposite_hook[NF_IP_NUMHOOKS]
2355 += { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
2356 +    [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
2357 +#ifdef CONFIG_IP_NF_NAT_LOCAL
2358 +    [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
2359 +    [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
2360 +#endif
2361 +};
2362 +
2363 +unsigned int
2364 +ip_nat_setup_info(struct ip_conntrack *conntrack,
2365 +                 const struct ip_nat_multi_range *mr,
2366 +                 unsigned int hooknum)
2367 +{
2368 +       struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
2369 +       struct ip_conntrack_tuple orig_tp;
2370 +       struct ip_nat_info *info = &conntrack->nat.info;
2371 +       int in_hashes = info->initialized;
2372 +
2373 +       MUST_BE_WRITE_LOCKED(&ip_nat_lock);
2374 +       IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
2375 +                    || hooknum == NF_IP_POST_ROUTING
2376 +                    || hooknum == NF_IP_LOCAL_OUT);
2377 +       IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
2378 +       IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));
2379 +
2380 +       /* What we've got will look like inverse of reply. Normally
2381 +          this is what is in the conntrack, except for prior
2382 +          manipulations (future optimization: if num_manips == 0,
2383 +          orig_tp =
2384 +          conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
2385 +       invert_tuplepr(&orig_tp,
2386 +                      &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
2387 +
2388 +#if 0
2389 +       {
2390 +       unsigned int i;
2391 +
2392 +       DEBUGP("Hook %u (%s), ", hooknum,
2393 +              HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
2394 +       DUMP_TUPLE(&orig_tp);
2395 +       DEBUGP("Range %p: ", mr);
2396 +       for (i = 0; i < mr->rangesize; i++) {
2397 +               DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
2398 +                      i,
2399 +                      (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
2400 +                      ? " MAP_IPS" : "",
2401 +                      (mr->range[i].flags
2402 +                       & IP_NAT_RANGE_PROTO_SPECIFIED)
2403 +                      ? " PROTO_SPECIFIED" : "",
2404 +                      (mr->range[i].flags & IP_NAT_RANGE_FULL)
2405 +                      ? " FULL" : "",
2406 +                      NIPQUAD(mr->range[i].min_ip),
2407 +                      NIPQUAD(mr->range[i].max_ip),
2408 +                      mr->range[i].min.all,
2409 +                      mr->range[i].max.all);
2410 +       }
2411 +       }
2412 +#endif
2413 +
2414 +       do {
2415 +               if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
2416 +                                     hooknum)) {
2417 +                       DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
2418 +                              conntrack);
2419 +                       return NF_DROP;
2420 +               }
2421 +
2422 +#if 0
2423 +               DEBUGP("Hook %u (%s) %p\n", hooknum,
2424 +                      HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
2425 +                      conntrack);
2426 +               DEBUGP("Original: ");
2427 +               DUMP_TUPLE(&orig_tp);
2428 +               DEBUGP("New: ");
2429 +               DUMP_TUPLE(&new_tuple);
2430 +#endif
2431 +
2432 +               /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
2433 +                  the original (A/B/C/D') and the mangled one (E/F/G/H').
2434 +
2435 +                  We're only allowed to work with the SRC per-proto
2436 +                  part, so we create inverses of both to start, then
2437 +                  derive the other fields we need.  */
2438 +
2439 +               /* Reply connection: simply invert the new tuple
2440 +                   (G/H/E/F') */
2441 +               invert_tuplepr(&reply, &new_tuple);
2442 +
2443 +               /* Alter conntrack table so it recognizes replies.
2444 +                   If fail this race (reply tuple now used), repeat. */
2445 +       } while (!ip_conntrack_alter_reply(conntrack, &reply));
2446 +
2447 +       /* FIXME: We can simply use existing conntrack reply tuple
2448 +           here --RR */
2449 +       /* Create inverse of original: C/D/A/B' */
2450 +       invert_tuplepr(&inv_tuple, &orig_tp);
2451 +
2452 +       /* Has source changed?. */
2453 +       if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
2454 +               /* In this direction, a source manip. */
2455 +               info->manips[info->num_manips++] =
2456 +                       ((struct ip_nat_info_manip)
2457 +                        { IP_CT_DIR_ORIGINAL, hooknum,
2458 +                          IP_NAT_MANIP_SRC, new_tuple.src });
2459 +
2460 +               IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
2461 +
2462 +               /* In the reverse direction, a destination manip. */
2463 +               info->manips[info->num_manips++] =
2464 +                       ((struct ip_nat_info_manip)
2465 +                        { IP_CT_DIR_REPLY, opposite_hook[hooknum],
2466 +                          IP_NAT_MANIP_DST, orig_tp.src });
2467 +               IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
2468 +       }
2469 +
2470 +       /* Has destination changed? */
2471 +       if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
2472 +               /* In this direction, a destination manip */
2473 +               info->manips[info->num_manips++] =
2474 +                       ((struct ip_nat_info_manip)
2475 +                        { IP_CT_DIR_ORIGINAL, hooknum,
2476 +                          IP_NAT_MANIP_DST, reply.src });
2477 +
2478 +               IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
2479 +
2480 +               /* In the reverse direction, a source manip. */
2481 +               info->manips[info->num_manips++] =
2482 +                       ((struct ip_nat_info_manip)
2483 +                        { IP_CT_DIR_REPLY, opposite_hook[hooknum],
2484 +                          IP_NAT_MANIP_SRC, inv_tuple.src });
2485 +               IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
2486 +       }
2487 +
2488 +       /* If there's a helper, assign it; based on new tuple. */
2489 +       if (!conntrack->master)
2490 +               info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
2491 +                                        &reply);
2492 +
2493 +       /* It's done. */
2494 +       info->initialized |= (1 << HOOK2MANIP(hooknum));
2495 +
2496 +       if (in_hashes) {
2497 +               IP_NF_ASSERT(info->bysource.conntrack);
2498 +               replace_in_hashes(conntrack, info);
2499 +       } else {
2500 +               place_in_hashes(conntrack, info);
2501 +       }
2502 +
2503 +       return NF_ACCEPT;
2504 +}
2505 +
2506 +void replace_in_hashes(struct ip_conntrack *conntrack,
2507 +                      struct ip_nat_info *info)
2508 +{
2509 +       /* Source has changed, so replace in hashes. */
2510 +       unsigned int srchash
2511 +               = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2512 +                             .tuple.src,
2513 +                             conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2514 +                             .tuple.dst.protonum);
2515 +       /* We place packet as seen OUTGOING in byips_proto hash
2516 +           (ie. reverse dst and src of reply packet). */
2517 +       unsigned int ipsprotohash
2518 +               = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
2519 +                                  .tuple.dst.ip,
2520 +                                  conntrack->tuplehash[IP_CT_DIR_REPLY]
2521 +                                  .tuple.src.ip,
2522 +                                  conntrack->tuplehash[IP_CT_DIR_REPLY]
2523 +                                  .tuple.dst.protonum);
2524 +
2525 +       IP_NF_ASSERT(info->bysource.conntrack == conntrack);
2526 +       MUST_BE_WRITE_LOCKED(&ip_nat_lock);
2527 +
2528 +       list_del(&info->bysource.list);
2529 +       list_del(&info->byipsproto.list);
2530 +
2531 +       list_prepend(&bysource[srchash], &info->bysource);
2532 +       list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
2533 +}
2534 +
2535 +void place_in_hashes(struct ip_conntrack *conntrack,
2536 +                    struct ip_nat_info *info)
2537 +{
2538 +       unsigned int srchash
2539 +               = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2540 +                             .tuple.src,
2541 +                             conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2542 +                             .tuple.dst.protonum);
2543 +       /* We place packet as seen OUTGOING in byips_proto hash
2544 +           (ie. reverse dst and src of reply packet). */
2545 +       unsigned int ipsprotohash
2546 +               = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
2547 +                                  .tuple.dst.ip,
2548 +                                  conntrack->tuplehash[IP_CT_DIR_REPLY]
2549 +                                  .tuple.src.ip,
2550 +                                  conntrack->tuplehash[IP_CT_DIR_REPLY]
2551 +                                  .tuple.dst.protonum);
2552 +
2553 +       IP_NF_ASSERT(!info->bysource.conntrack);
2554 +
2555 +       MUST_BE_WRITE_LOCKED(&ip_nat_lock);
2556 +       info->byipsproto.conntrack = conntrack;
2557 +       info->bysource.conntrack = conntrack;
2558 +
2559 +       list_prepend(&bysource[srchash], &info->bysource);
2560 +       list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
2561 +}
2562 +
2563 +/* Returns true if succeeded. */
2564 +static int
2565 +manip_pkt(u_int16_t proto,
2566 +         struct sk_buff **pskb,
2567 +         unsigned int iphdroff,
2568 +         const struct ip_conntrack_manip *manip,
2569 +         enum ip_nat_manip_type maniptype)
2570 +{
2571 +       struct iphdr *iph;
2572 +
2573 +       (*pskb)->nfcache |= NFC_ALTERED;
2574 +       if (!skb_ip_make_writable(pskb, iphdroff+sizeof(iph)))
2575 +               return 0;
2576 +
2577 +       iph = (void *)(*pskb)->data + iphdroff;
2578 +
2579 +       /* Manipulate protocol part. */
2580 +       if (!find_nat_proto(proto)->manip_pkt(pskb,
2581 +                                             iphdroff + iph->ihl*4,
2582 +                                             manip, maniptype))
2583 +               return 0;
2584 +
2585 +       iph = (void *)(*pskb)->data + iphdroff;
2586 +
2587 +       if (maniptype == IP_NAT_MANIP_SRC) {
2588 +               iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
2589 +                                               iph->check);
2590 +               iph->saddr = manip->ip;
2591 +       } else {
2592 +               iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
2593 +                                               iph->check);
2594 +               iph->daddr = manip->ip;
2595 +       }
2596 +       return 1;
2597 +}
2598 +
2599 +static inline int exp_for_packet(struct ip_conntrack_expect *exp,
2600 +                                struct sk_buff *skb)
2601 +{
2602 +       struct ip_conntrack_protocol *proto;
2603 +       int ret = 1;
2604 +
2605 +       MUST_BE_READ_LOCKED(&ip_conntrack_lock);
2606 +       proto = __ip_ct_find_proto(skb->nh.iph->protocol);
2607 +       if (proto->exp_matches_pkt)
2608 +               ret = proto->exp_matches_pkt(exp, skb);
2609 +
2610 +       return ret;
2611 +}
2612 +
2613 +/* Do packet manipulations according to binding. */
2614 +unsigned int
2615 +do_bindings(struct ip_conntrack *ct,
2616 +           enum ip_conntrack_info ctinfo,
2617 +           struct ip_nat_info *info,
2618 +           unsigned int hooknum,
2619 +           struct sk_buff **pskb)
2620 +{
2621 +       unsigned int i;
2622 +       struct ip_nat_helper *helper;
2623 +       enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
2624 +       int proto = (*pskb)->nh.iph->protocol;
2625 +
2626 +       /* Need nat lock to protect against modification, but neither
2627 +          conntrack (referenced) and helper (deleted with
2628 +          synchronize_bh()) can vanish. */
2629 +       READ_LOCK(&ip_nat_lock);
2630 +       for (i = 0; i < info->num_manips; i++) {
2631 +               if (info->manips[i].direction == dir
2632 +                   && info->manips[i].hooknum == hooknum) {
2633 +                       DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
2634 +                              *pskb,
2635 +                              info->manips[i].maniptype == IP_NAT_MANIP_SRC
2636 +                              ? "SRC" : "DST",
2637 +                              NIPQUAD(info->manips[i].manip.ip),
2638 +                              htons(info->manips[i].manip.u.all));
2639 +                       if (!manip_pkt(proto, pskb, 0,
2640 +                                      &info->manips[i].manip,
2641 +                                      info->manips[i].maniptype)) {
2642 +                               READ_UNLOCK(&ip_nat_lock);
2643 +                               return NF_DROP;
2644 +                       }
2645 +               }
2646 +       }
2647 +       helper = info->helper;
2648 +       READ_UNLOCK(&ip_nat_lock);
2649 +
2650 +       if (helper) {
2651 +               struct ip_conntrack_expect *exp = NULL;
2652 +               struct list_head *cur_item;
2653 +               int ret = NF_ACCEPT;
2654 +               int helper_called = 0;
2655 +
2656 +               DEBUGP("do_bindings: helper existing for (%p)\n", ct);
2657 +
2658 +               /* Always defragged for helpers */
2659 +               IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
2660 +                              & htons(IP_MF|IP_OFFSET)));
2661 +
2662 +               /* Have to grab read lock before sibling_list traversal */
2663 +               READ_LOCK(&ip_conntrack_lock);
2664 +               list_for_each(cur_item, &ct->sibling_list) { 
2665 +                       exp = list_entry(cur_item, struct ip_conntrack_expect, 
2666 +                                        expected_list);
2667 +                                        
2668 +                       /* if this expectation is already established, skip */
2669 +                       if (exp->sibling)
2670 +                               continue;
2671 +
2672 +                       if (exp_for_packet(exp, *pskb)) {
2673 +                               /* FIXME: May be true multiple times in the
2674 +                                * case of UDP!! */
2675 +                               DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
2676 +                               ret = helper->help(ct, exp, info, ctinfo, 
2677 +                                                  hooknum, pskb);
2678 +                               if (ret != NF_ACCEPT) {
2679 +                                       READ_UNLOCK(&ip_conntrack_lock);
2680 +                                       return ret;
2681 +                               }
2682 +                               helper_called = 1;
2683 +                       }
2684 +               }
2685 +               /* Helper might want to manip the packet even when there is no
2686 +                * matching expectation for this packet */
2687 +               if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
2688 +                       DEBUGP("calling nat helper for packet without expectation\n");
2689 +                       ret = helper->help(ct, NULL, info, ctinfo, 
2690 +                                          hooknum, pskb);
2691 +                       if (ret != NF_ACCEPT) {
2692 +                               READ_UNLOCK(&ip_conntrack_lock);
2693 +                               return ret;
2694 +                       }
2695 +               }
2696 +               READ_UNLOCK(&ip_conntrack_lock);
2697 +               
2698 +               /* Adjust sequence number only once per packet 
2699 +                * (helper is called at all hooks) */
2700 +               if (proto == IPPROTO_TCP
2701 +                   && (hooknum == NF_IP_POST_ROUTING
2702 +                       || hooknum == NF_IP_LOCAL_IN)) {
2703 +                       DEBUGP("ip_nat_core: adjusting sequence number\n");
2704 +                       /* future: put this in a l4-proto specific function,
2705 +                        * and call this function here. */
2706 +                       if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
2707 +                               ret = NF_DROP;
2708 +               }
2709 +
2710 +               return ret;
2711 +
2712 +       } else 
2713 +               return NF_ACCEPT;
2714 +
2715 +       /* not reached */
2716 +}
2717 +
2718 +int
2719 +icmp_reply_translation(struct sk_buff **pskb,
2720 +                      struct ip_conntrack *conntrack,
2721 +                      unsigned int hooknum,
2722 +                      int dir)
2723 +{
2724 +       struct {
2725 +               struct icmphdr icmp;
2726 +               struct iphdr ip;
2727 +       } *inside;
2728 +       unsigned int i;
2729 +       struct ip_nat_info *info = &conntrack->nat.info;
2730 +       int hdrlen;
2731 +
2732 +       if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
2733 +               return 0;
2734 +       inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
2735 +
2736 +       /* We're actually going to mangle it beyond trivial checksum
2737 +          adjustment, so make sure the current checksum is correct. */
2738 +       if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
2739 +               hdrlen = (*pskb)->nh.iph->ihl * 4;
2740 +               if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
2741 +                                               (*pskb)->len - hdrlen, 0)))
2742 +                       return 0;
2743 +       }
2744 +
2745 +       /* Must be RELATED */
2746 +       IP_NF_ASSERT((*pskb)->nfct
2747 +                    - (struct ip_conntrack *)(*pskb)->nfct->master
2748 +                    == IP_CT_RELATED
2749 +                    || (*pskb)->nfct
2750 +                    - (struct ip_conntrack *)(*pskb)->nfct->master
2751 +                    == IP_CT_RELATED+IP_CT_IS_REPLY);
2752 +
2753 +       /* Redirects on non-null nats must be dropped, else they'll
2754 +           start talking to each other without our translation, and be
2755 +           confused... --RR */
2756 +       if (inside->icmp.type == ICMP_REDIRECT) {
2757 +               /* Don't care about races here. */
2758 +               if (info->initialized
2759 +                   != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
2760 +                   || info->num_manips != 0)
2761 +                       return 0;
2762 +       }
2763 +
2764 +       DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
2765 +              *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
2766 +       /* Note: May not be from a NAT'd host, but probably safest to
2767 +          do translation always as if it came from the host itself
2768 +          (even though a "host unreachable" coming from the host
2769 +          itself is a bit weird).
2770 +
2771 +          More explanation: some people use NAT for anonymizing.
2772 +          Also, CERT recommends dropping all packets from private IP
2773 +          addresses (although ICMP errors from internal links with
2774 +          such addresses are not too uncommon, as Alan Cox points
2775 +          out) */
2776 +
2777 +       READ_LOCK(&ip_nat_lock);
2778 +       for (i = 0; i < info->num_manips; i++) {
2779 +               DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
2780 +                      i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
2781 +                      "ORIG" : "REPLY", info->manips[i].hooknum);
2782 +
2783 +               if (info->manips[i].direction != dir)
2784 +                       continue;
2785 +
2786 +               /* Mapping the inner packet is just like a normal
2787 +                  packet, except it was never src/dst reversed, so
2788 +                  where we would normally apply a dst manip, we apply
2789 +                  a src, and vice versa. */
2790 +               if (info->manips[i].hooknum == hooknum) {
2791 +                       DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
2792 +                              info->manips[i].maniptype == IP_NAT_MANIP_SRC
2793 +                              ? "DST" : "SRC",
2794 +                              NIPQUAD(info->manips[i].manip.ip),
2795 +                              ntohs(info->manips[i].manip.u.udp.port));
2796 +                       if (!manip_pkt(inside->ip.protocol, pskb,
2797 +                                      (*pskb)->nh.iph->ihl*4
2798 +                                      + sizeof(inside->icmp),
2799 +                                      &info->manips[i].manip,
2800 +                                      !info->manips[i].maniptype))
2801 +                               goto unlock_fail;
2802 +
2803 +                       /* Outer packet needs to have IP header NATed like
2804 +                          it's a reply. */
2805 +
2806 +                       /* Use mapping to map outer packet: 0 gives no
2807 +                           per-proto mapping */
2808 +                       DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
2809 +                              info->manips[i].maniptype == IP_NAT_MANIP_SRC
2810 +                              ? "SRC" : "DST",
2811 +                              NIPQUAD(info->manips[i].manip.ip));
2812 +                       if (!manip_pkt(0, pskb, 0,
2813 +                                      &info->manips[i].manip,
2814 +                                      info->manips[i].maniptype))
2815 +                               goto unlock_fail;
2816 +               }
2817 +       }
2818 +       READ_UNLOCK(&ip_nat_lock);
2819 +
2820 +       hdrlen = (*pskb)->nh.iph->ihl * 4;
2821 +
2822 +       inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
2823 +
2824 +       inside->icmp.checksum = 0;
2825 +       inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
2826 +                                                      (*pskb)->len - hdrlen,
2827 +                                                      0));
2828 +       return 1;
2829 +
2830 + unlock_fail:
2831 +       READ_UNLOCK(&ip_nat_lock);
2832 +       return 0;
2833 +}
2834 +
2835 +int __init ip_nat_init(void)
2836 +{
2837 +       size_t i;
2838 +
2839 +       /* Leave them the same for the moment. */
2840 +       ip_nat_htable_size = ip_conntrack_htable_size;
2841 +
2842 +       /* One vmalloc for both hash tables */
2843 +       bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
2844 +       if (!bysource) {
2845 +               return -ENOMEM;
2846 +       }
2847 +       byipsproto = bysource + ip_nat_htable_size;
2848 +
2849 +       /* Sew in builtin protocols. */
2850 +       WRITE_LOCK(&ip_nat_lock);
2851 +       list_append(&protos, &ip_nat_protocol_tcp);
2852 +       list_append(&protos, &ip_nat_protocol_udp);
2853 +       list_append(&protos, &ip_nat_protocol_icmp);
2854 +       WRITE_UNLOCK(&ip_nat_lock);
2855 +
2856 +       for (i = 0; i < ip_nat_htable_size; i++) {
2857 +               INIT_LIST_HEAD(&bysource[i]);
2858 +               INIT_LIST_HEAD(&byipsproto[i]);
2859 +       }
2860 +
2861 +       /* FIXME: Man, this is a hack.  <SIGH> */
2862 +       IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
2863 +       ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
2864 +
2865 +       return 0;
2866 +}
2867 +
2868 +/* Clear NAT section of all conntracks, in case we're loaded again. */
2869 +static int clean_nat(const struct ip_conntrack *i, void *data)
2870 +{
2871 +       memset((void *)&i->nat, 0, sizeof(i->nat));
2872 +       return 0;
2873 +}
2874 +
2875 +/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
2876 +void ip_nat_cleanup(void)
2877 +{
2878 +       ip_ct_selective_cleanup(&clean_nat, NULL);
2879 +       ip_conntrack_destroyed = NULL;
2880 +       vfree(bysource);
2881 +}
This page took 0.252907 seconds and 3 git commands to generate.