1 diff -Nur linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h
2 --- linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h 2003-11-26 21:44:58.000000000 +0100
3 +++ linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h 2003-12-17 14:02:02.000000000 +0100
5 union ip_conntrack_manip_proto
7 /* Add other protocols here. */
22 /* The manipulable part of the tuple. */
26 /* Add other protocols here. */
47 #define DUMP_TUPLE(tp) \
48 -DEBUGP("tuple %p: %u %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n", \
49 +DEBUGP("tuple %p: %u %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n", \
50 (tp), (tp)->dst.protonum, \
51 - NIPQUAD((tp)->src.ip), ntohs((tp)->src.u.all), \
52 - NIPQUAD((tp)->dst.ip), ntohs((tp)->dst.u.all))
53 + NIPQUAD((tp)->src.ip), ntohl((tp)->src.u.all), \
54 + NIPQUAD((tp)->dst.ip), ntohl((tp)->dst.u.all))
56 +#define DUMP_TUPLE_RAW(x) \
57 + DEBUGP("tuple %p: %u %u.%u.%u.%u:0x%08x -> %u.%u.%u.%u:0x%08x\n",\
58 + (x), (x)->dst.protonum, \
59 + NIPQUAD((x)->src.ip), ntohl((x)->src.u.all), \
60 + NIPQUAD((x)->dst.ip), ntohl((x)->dst.u.all))
62 #define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
64 diff -Nur linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig
65 --- linux-2.6.0-test11.org/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig 1970-01-01 01:00:00.000000000 +0100
66 +++ linux-2.6.0-test11/include/linux/netfilter_ipv4/ip_conntrack_tuple.h.orig 2003-11-26 21:44:58.000000000 +0100
68 +#ifndef _IP_CONNTRACK_TUPLE_H
69 +#define _IP_CONNTRACK_TUPLE_H
71 +/* A `tuple' is a structure containing the information to uniquely
72 + identify a connection. ie. if two packets have the same tuple, they
73 + are in the same connection; if not, they are not.
75 + We divide the structure along "manipulatable" and
76 + "non-manipulatable" lines, for the benefit of the NAT code.
79 +/* The protocol-specific manipulable parts of the tuple: always in
81 +union ip_conntrack_manip_proto
83 + /* Add other protocols here. */
97 +/* The manipulable part of the tuple. */
98 +struct ip_conntrack_manip
101 + union ip_conntrack_manip_proto u;
104 +/* This contains the information to distinguish a connection. */
105 +struct ip_conntrack_tuple
107 + struct ip_conntrack_manip src;
109 + /* These are the parts of the tuple which are fixed. */
113 + /* Add other protocols here. */
123 + u_int8_t type, code;
127 + /* The protocol. */
128 + u_int16_t protonum;
132 +/* This is optimized opposed to a memset of the whole structure. Everything we
133 + * really care about is the source/destination unions */
134 +#define IP_CT_TUPLE_U_BLANK(tuple) \
136 + (tuple)->src.u.all = 0; \
137 + (tuple)->dst.u.all = 0; \
140 +enum ip_conntrack_dir
142 + IP_CT_DIR_ORIGINAL,
149 +#define DUMP_TUPLE(tp) \
150 +DEBUGP("tuple %p: %u %u.%u.%u.%u:%hu -> %u.%u.%u.%u:%hu\n", \
151 + (tp), (tp)->dst.protonum, \
152 + NIPQUAD((tp)->src.ip), ntohs((tp)->src.u.all), \
153 + NIPQUAD((tp)->dst.ip), ntohs((tp)->dst.u.all))
155 +#define CTINFO2DIR(ctinfo) ((ctinfo) >= IP_CT_IS_REPLY ? IP_CT_DIR_REPLY : IP_CT_DIR_ORIGINAL)
157 +/* If we're the first tuple, it's the original dir. */
158 +#define DIRECTION(h) ((enum ip_conntrack_dir)(&(h)->ctrack->tuplehash[1] == (h)))
160 +/* Connections have two entries in the hash table: one for each way */
161 +struct ip_conntrack_tuple_hash
163 + struct list_head list;
165 + struct ip_conntrack_tuple tuple;
167 + /* this == &ctrack->tuplehash[DIRECTION(this)]. */
168 + struct ip_conntrack *ctrack;
171 +#endif /* __KERNEL__ */
173 +static inline int ip_ct_tuple_src_equal(const struct ip_conntrack_tuple *t1,
174 + const struct ip_conntrack_tuple *t2)
176 + return t1->src.ip == t2->src.ip
177 + && t1->src.u.all == t2->src.u.all;
180 +static inline int ip_ct_tuple_dst_equal(const struct ip_conntrack_tuple *t1,
181 + const struct ip_conntrack_tuple *t2)
183 + return t1->dst.ip == t2->dst.ip
184 + && t1->dst.u.all == t2->dst.u.all
185 + && t1->dst.protonum == t2->dst.protonum;
188 +static inline int ip_ct_tuple_equal(const struct ip_conntrack_tuple *t1,
189 + const struct ip_conntrack_tuple *t2)
191 + return ip_ct_tuple_src_equal(t1, t2) && ip_ct_tuple_dst_equal(t1, t2);
194 +static inline int ip_ct_tuple_mask_cmp(const struct ip_conntrack_tuple *t,
195 + const struct ip_conntrack_tuple *tuple,
196 + const struct ip_conntrack_tuple *mask)
198 + return !(((t->src.ip ^ tuple->src.ip) & mask->src.ip)
199 + || ((t->dst.ip ^ tuple->dst.ip) & mask->dst.ip)
200 + || ((t->src.u.all ^ tuple->src.u.all) & mask->src.u.all)
201 + || ((t->dst.u.all ^ tuple->dst.u.all) & mask->dst.u.all)
202 + || ((t->dst.protonum ^ tuple->dst.protonum)
203 + & mask->dst.protonum));
206 +#endif /* _IP_CONNTRACK_TUPLE_H */
207 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile linux-2.6.0-test11/net/ipv4/netfilter/Makefile
208 --- linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile 2003-11-26 21:43:25.000000000 +0100
209 +++ linux-2.6.0-test11/net/ipv4/netfilter/Makefile 2003-12-17 14:02:02.000000000 +0100
211 # connection tracking
212 obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
214 +# connection tracking protocol helpers
215 +obj-$(CONFIG_IP_NF_CT_PROTO_GRE) += ip_conntrack_proto_gre.o
216 +ifdef CONFIG_IP_NF_CT_PROTO_GRE
217 + export-objs += ip_conntrack_proto_gre.o
220 +# NAT protocol helpers
221 +obj-$(CONFIG_IP_NF_NAT_PROTO_GRE) += ip_nat_proto_gre.o
223 # connection tracking helpers
224 obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
225 obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
226 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile.orig linux-2.6.0-test11/net/ipv4/netfilter/Makefile.orig
227 --- linux-2.6.0-test11.org/net/ipv4/netfilter/Makefile.orig 1970-01-01 01:00:00.000000000 +0100
228 +++ linux-2.6.0-test11/net/ipv4/netfilter/Makefile.orig 2003-11-26 21:43:25.000000000 +0100
231 +# Makefile for the netfilter modules on top of IPv4.
234 +# objects for the conntrack and NAT core (used by standalone and backw. compat)
235 +ip_nf_conntrack-objs := ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
236 +ip_nf_nat-objs := ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
238 +# objects for the standalone - connection tracking / NAT
239 +ip_conntrack-objs := ip_conntrack_standalone.o $(ip_nf_conntrack-objs)
240 +iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o $(ip_nf_nat-objs)
242 +# objects for backwards compatibility mode
243 +ip_nf_compat-objs := ip_fw_compat.o ip_fw_compat_redir.o ip_fw_compat_masq.o $(ip_nf_conntrack-objs) $(ip_nf_nat-objs)
245 +ipfwadm-objs := $(ip_nf_compat-objs) ipfwadm_core.o
246 +ipchains-objs := $(ip_nf_compat-objs) ipchains_core.o
248 +# connection tracking
249 +obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
251 +# connection tracking helpers
252 +obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
253 +obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
254 +obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
255 +obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
258 +obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
259 +obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
260 +obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
261 +obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
264 +obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
266 +# the three instances of ip_tables
267 +obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
268 +obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
269 +obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
272 +obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
273 +obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
274 +obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
275 +obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
276 +obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
278 +obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o
279 +obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o
280 +obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
281 +obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
283 +obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
285 +obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
286 +obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o
287 +obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o
289 +obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o
291 +obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
292 +obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
293 +obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
294 +obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
296 +obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
299 +obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
300 +obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o
301 +obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
302 +obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o
303 +obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o
304 +obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
305 +obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
306 +obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
307 +obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
308 +obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o
309 +obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o
310 +obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
311 +obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
312 +obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
314 +# generic ARP tables
315 +obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
316 +obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
318 +# just filtering instance of ARP tables for now
319 +obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
321 +# backwards compatibility
322 +obj-$(CONFIG_IP_NF_COMPAT_IPCHAINS) += ipchains.o
323 +obj-$(CONFIG_IP_NF_COMPAT_IPFWADM) += ipfwadm.o
325 +obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
326 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c
327 --- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c 2003-11-26 21:42:40.000000000 +0100
328 +++ linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c 2003-12-17 14:02:02.000000000 +0100
330 inverse->dst.ip = orig->src.ip;
331 inverse->dst.protonum = orig->dst.protonum;
333 + inverse->src.u.all = inverse->dst.u.all = 0;
335 return protocol->invert_tuple(inverse, orig);
339 * so there is no need to use the tuple lock too */
341 DEBUGP("ip_conntrack_expect_related %p\n", related_to);
342 - DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
343 - DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
344 + DEBUGP("tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
345 + DEBUGP("mask: "); DUMP_TUPLE_RAW(&expect->mask);
347 old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
348 struct ip_conntrack_expect *, &expect->tuple,
349 @@ -1051,15 +1053,14 @@
351 MUST_BE_READ_LOCKED(&ip_conntrack_lock);
352 WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
354 DEBUGP("change_expect:\n");
355 - DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
356 - DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask);
357 - DEBUGP("newtuple: "); DUMP_TUPLE(newtuple);
358 + DEBUGP("exp tuple: "); DUMP_TUPLE_RAW(&expect->tuple);
359 + DEBUGP("exp mask: "); DUMP_TUPLE_RAW(&expect->mask);
360 + DEBUGP("newtuple: "); DUMP_TUPLE_RAW(newtuple);
361 if (expect->ct_tuple.dst.protonum == 0) {
362 /* Never seen before */
363 DEBUGP("change expect: never seen before\n");
364 - if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
365 + if (!ip_ct_tuple_mask_cmp(&expect->tuple, newtuple, &expect->mask)
366 && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
367 struct ip_conntrack_expect *, newtuple, &expect->mask)) {
368 /* Force NAT to find an unused tuple */
369 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.orig linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.orig
370 --- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.orig 1970-01-01 01:00:00.000000000 +0100
371 +++ linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.orig 2003-11-26 21:42:40.000000000 +0100
373 +/* Connection state tracking for netfilter. This is separated from,
374 + but required by, the NAT layer; it can also be used by an iptables
377 +/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
380 + * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
381 + * - new API and handling of conntrack/nat helpers
382 + * - now capable of multiple expectations for one master
383 + * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
384 + * - add usage/reference counts to ip_conntrack_expect
385 + * - export ip_conntrack[_expect]_{find_get,put} functions
388 +#include <linux/config.h>
389 +#include <linux/types.h>
390 +#include <linux/icmp.h>
391 +#include <linux/ip.h>
392 +#include <linux/netfilter.h>
393 +#include <linux/netfilter_ipv4.h>
394 +#include <linux/module.h>
395 +#include <linux/skbuff.h>
396 +#include <linux/proc_fs.h>
397 +#include <linux/vmalloc.h>
398 +#include <net/checksum.h>
399 +#include <linux/stddef.h>
400 +#include <linux/sysctl.h>
401 +#include <linux/slab.h>
402 +#include <linux/random.h>
403 +#include <linux/jhash.h>
404 +/* For ERR_PTR(). Yeah, I know... --RR */
405 +#include <linux/fs.h>
407 +/* This rwlock protects the main hash table, protocol/helper/expected
408 + registrations, conntrack timers*/
409 +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
410 +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
412 +#include <linux/netfilter_ipv4/ip_conntrack.h>
413 +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
414 +#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
415 +#include <linux/netfilter_ipv4/ip_conntrack_core.h>
416 +#include <linux/netfilter_ipv4/listhelp.h>
418 +#define IP_CONNTRACK_VERSION "2.1"
421 +#define DEBUGP printk
423 +#define DEBUGP(format, args...)
426 +DECLARE_RWLOCK(ip_conntrack_lock);
427 +DECLARE_RWLOCK(ip_conntrack_expect_tuple_lock);
429 +void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
430 +LIST_HEAD(ip_conntrack_expect_list);
431 +LIST_HEAD(protocol_list);
432 +static LIST_HEAD(helpers);
433 +unsigned int ip_conntrack_htable_size = 0;
434 +int ip_conntrack_max;
435 +static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
436 +struct list_head *ip_conntrack_hash;
437 +static kmem_cache_t *ip_conntrack_cachep;
439 +extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
441 +static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
444 + return protocol == curr->proto;
447 +struct ip_conntrack_protocol *__ip_ct_find_proto(u_int8_t protocol)
449 + struct ip_conntrack_protocol *p;
451 + MUST_BE_READ_LOCKED(&ip_conntrack_lock);
452 + p = LIST_FIND(&protocol_list, proto_cmpfn,
453 + struct ip_conntrack_protocol *, protocol);
455 + p = &ip_conntrack_generic_protocol;
460 +struct ip_conntrack_protocol *ip_ct_find_proto(u_int8_t protocol)
462 + struct ip_conntrack_protocol *p;
464 + READ_LOCK(&ip_conntrack_lock);
465 + p = __ip_ct_find_proto(protocol);
466 + READ_UNLOCK(&ip_conntrack_lock);
471 +ip_conntrack_put(struct ip_conntrack *ct)
474 + IP_NF_ASSERT(ct->infos[0].master);
475 + /* nf_conntrack_put wants to go via an info struct, so feed it
477 + nf_conntrack_put(&ct->infos[0]);
480 +static int ip_conntrack_hash_rnd_initted;
481 +static unsigned int ip_conntrack_hash_rnd;
484 +hash_conntrack(const struct ip_conntrack_tuple *tuple)
489 + return (jhash_3words(tuple->src.ip,
490 + (tuple->dst.ip ^ tuple->dst.protonum),
491 + (tuple->src.u.all | (tuple->dst.u.all << 16)),
492 + ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
496 +get_tuple(const struct iphdr *iph,
497 + const struct sk_buff *skb,
498 + unsigned int dataoff,
499 + struct ip_conntrack_tuple *tuple,
500 + const struct ip_conntrack_protocol *protocol)
503 + if (iph->frag_off & htons(IP_OFFSET)) {
504 + printk("ip_conntrack_core: Frag of proto %u.\n",
509 + tuple->src.ip = iph->saddr;
510 + tuple->dst.ip = iph->daddr;
511 + tuple->dst.protonum = iph->protocol;
513 + return protocol->pkt_to_tuple(skb, dataoff, tuple);
517 +invert_tuple(struct ip_conntrack_tuple *inverse,
518 + const struct ip_conntrack_tuple *orig,
519 + const struct ip_conntrack_protocol *protocol)
521 + inverse->src.ip = orig->dst.ip;
522 + inverse->dst.ip = orig->src.ip;
523 + inverse->dst.protonum = orig->dst.protonum;
525 + return protocol->invert_tuple(inverse, orig);
529 +/* ip_conntrack_expect helper functions */
531 +/* Compare tuple parts depending on mask. */
532 +static inline int expect_cmp(const struct ip_conntrack_expect *i,
533 + const struct ip_conntrack_tuple *tuple)
535 + MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
536 + return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
540 +destroy_expect(struct ip_conntrack_expect *exp)
542 + DEBUGP("destroy_expect(%p) use=%d\n", exp, atomic_read(&exp->use));
543 + IP_NF_ASSERT(atomic_read(&exp->use));
544 + IP_NF_ASSERT(!timer_pending(&exp->timeout));
550 +inline void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
554 + if (atomic_dec_and_test(&exp->use)) {
555 + /* usage count dropped to zero */
556 + destroy_expect(exp);
560 +static inline struct ip_conntrack_expect *
561 +__ip_ct_expect_find(const struct ip_conntrack_tuple *tuple)
563 + MUST_BE_READ_LOCKED(&ip_conntrack_lock);
564 + MUST_BE_READ_LOCKED(&ip_conntrack_expect_tuple_lock);
565 + return LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
566 + struct ip_conntrack_expect *, tuple);
569 +/* Find a expectation corresponding to a tuple. */
570 +struct ip_conntrack_expect *
571 +ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
573 + struct ip_conntrack_expect *exp;
575 + READ_LOCK(&ip_conntrack_lock);
576 + READ_LOCK(&ip_conntrack_expect_tuple_lock);
577 + exp = __ip_ct_expect_find(tuple);
579 + atomic_inc(&exp->use);
580 + READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
581 + READ_UNLOCK(&ip_conntrack_lock);
586 +/* remove one specific expectation from all lists and drop refcount,
587 + * does _NOT_ delete the timer. */
588 +static void __unexpect_related(struct ip_conntrack_expect *expect)
590 + DEBUGP("unexpect_related(%p)\n", expect);
591 + MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
593 + /* we're not allowed to unexpect a confirmed expectation! */
594 + IP_NF_ASSERT(!expect->sibling);
596 + /* delete from global and local lists */
597 + list_del(&expect->list);
598 + list_del(&expect->expected_list);
600 + /* decrement expect-count of master conntrack */
601 + if (expect->expectant)
602 + expect->expectant->expecting--;
604 + ip_conntrack_expect_put(expect);
607 +/* remove one specific expectation from all lists, drop refcount
608 + * and expire timer.
609 + * This function can _NOT_ be called for confirmed expects! */
610 +static void unexpect_related(struct ip_conntrack_expect *expect)
612 + IP_NF_ASSERT(expect->expectant);
613 + IP_NF_ASSERT(expect->expectant->helper);
614 + /* if we are supposed to have a timer, but we can't delete
615 + * it: race condition. __unexpect_related will
616 + * be called by timeout function
617 + if (expect->expectant->helper->timeout
618 + && !del_timer(&expect->timeout))
621 + __unexpect_related(expect);
624 +/* delete all unconfirmed expectations for this conntrack */
625 +static void remove_expectations(struct ip_conntrack *ct, int drop_refcount)
627 + struct list_head *exp_entry, *next;
628 + struct ip_conntrack_expect *exp;
630 + DEBUGP("remove_expectations(%p)\n", ct);
632 + list_for_each_safe(exp_entry, next, &ct->sibling_list) {
633 + exp = list_entry(exp_entry, struct ip_conntrack_expect,
636 + /* we skip established expectations, as we want to delete
637 + * the un-established ones only */
638 + if (exp->sibling) {
639 + DEBUGP("remove_expectations: skipping established %p of %p\n", exp->sibling, ct);
640 + if (drop_refcount) {
641 + /* Indicate that this expectations parent is dead */
642 + ip_conntrack_put(exp->expectant);
643 + exp->expectant = NULL;
648 + IP_NF_ASSERT(list_inlist(&ip_conntrack_expect_list, exp));
649 + IP_NF_ASSERT(exp->expectant == ct);
651 + /* delete expectation from global and private lists */
652 + unexpect_related(exp);
657 +clean_from_lists(struct ip_conntrack *ct)
659 + unsigned int ho, hr;
661 + DEBUGP("clean_from_lists(%p)\n", ct);
662 + MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
664 + ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
665 + hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
666 + LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
667 + LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
669 + /* Destroy all un-established, pending expectations */
670 + remove_expectations(ct, 1);
674 +destroy_conntrack(struct nf_conntrack *nfct)
676 + struct ip_conntrack *ct = (struct ip_conntrack *)nfct, *master = NULL;
677 + struct ip_conntrack_protocol *proto;
679 + DEBUGP("destroy_conntrack(%p)\n", ct);
680 + IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
681 + IP_NF_ASSERT(!timer_pending(&ct->timeout));
683 + /* To make sure we don't get any weird locking issues here:
684 + * destroy_conntrack() MUST NOT be called with a write lock
685 + * to ip_conntrack_lock!!! -HW */
686 + proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
687 + if (proto && proto->destroy)
688 + proto->destroy(ct);
690 + if (ip_conntrack_destroyed)
691 + ip_conntrack_destroyed(ct);
693 + WRITE_LOCK(&ip_conntrack_lock);
694 + /* Delete us from our own list to prevent corruption later */
695 + list_del(&ct->sibling_list);
697 + /* Delete our master expectation */
699 + if (ct->master->expectant) {
700 + /* can't call __unexpect_related here,
701 + * since it would screw up expect_list */
702 + list_del(&ct->master->expected_list);
703 + master = ct->master->expectant;
707 + WRITE_UNLOCK(&ip_conntrack_lock);
710 + ip_conntrack_put(master);
712 + DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
713 + kmem_cache_free(ip_conntrack_cachep, ct);
714 + atomic_dec(&ip_conntrack_count);
717 +static void death_by_timeout(unsigned long ul_conntrack)
719 + struct ip_conntrack *ct = (void *)ul_conntrack;
721 + WRITE_LOCK(&ip_conntrack_lock);
722 + clean_from_lists(ct);
723 + WRITE_UNLOCK(&ip_conntrack_lock);
724 + ip_conntrack_put(ct);
728 +conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
729 + const struct ip_conntrack_tuple *tuple,
730 + const struct ip_conntrack *ignored_conntrack)
732 + MUST_BE_READ_LOCKED(&ip_conntrack_lock);
733 + return i->ctrack != ignored_conntrack
734 + && ip_ct_tuple_equal(tuple, &i->tuple);
737 +static struct ip_conntrack_tuple_hash *
738 +__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
739 + const struct ip_conntrack *ignored_conntrack)
741 + struct ip_conntrack_tuple_hash *h;
742 + unsigned int hash = hash_conntrack(tuple);
744 + MUST_BE_READ_LOCKED(&ip_conntrack_lock);
745 + h = LIST_FIND(&ip_conntrack_hash[hash],
746 + conntrack_tuple_cmp,
747 + struct ip_conntrack_tuple_hash *,
748 + tuple, ignored_conntrack);
752 +/* Find a connection corresponding to a tuple. */
753 +struct ip_conntrack_tuple_hash *
754 +ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
755 + const struct ip_conntrack *ignored_conntrack)
757 + struct ip_conntrack_tuple_hash *h;
759 + READ_LOCK(&ip_conntrack_lock);
760 + h = __ip_conntrack_find(tuple, ignored_conntrack);
762 + atomic_inc(&h->ctrack->ct_general.use);
763 + READ_UNLOCK(&ip_conntrack_lock);
768 +static inline struct ip_conntrack *
769 +__ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
771 + struct ip_conntrack *ct
772 + = (struct ip_conntrack *)nfct->master;
774 + /* ctinfo is the index of the nfct inside the conntrack */
775 + *ctinfo = nfct - ct->infos;
776 + IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
780 +/* Return conntrack and conntrack_info given skb->nfct->master */
781 +struct ip_conntrack *
782 +ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
785 + return __ip_conntrack_get(skb->nfct, ctinfo);
789 +/* Confirm a connection given skb->nfct; places it in hash table */
791 +__ip_conntrack_confirm(struct nf_ct_info *nfct)
793 + unsigned int hash, repl_hash;
794 + struct ip_conntrack *ct;
795 + enum ip_conntrack_info ctinfo;
797 + ct = __ip_conntrack_get(nfct, &ctinfo);
799 + /* ipt_REJECT uses ip_conntrack_attach to attach related
800 + ICMP/TCP RST packets in other direction. Actual packet
801 + which created connection will be IP_CT_NEW or for an
802 + expected connection, IP_CT_RELATED. */
803 + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
806 + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
807 + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
809 + /* We're not in hash table, and we refuse to set up related
810 + connections for unconfirmed conns. But packet copies and
811 + REJECT will give spurious warnings here. */
812 + /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
814 +	/* No external references means no one else could have
816 + IP_NF_ASSERT(!is_confirmed(ct));
817 + DEBUGP("Confirming conntrack %p\n", ct);
819 + WRITE_LOCK(&ip_conntrack_lock);
820 + /* See if there's one in the list already, including reverse:
821 + NAT could have grabbed it without realizing, since we're
822 + not in the hash. If there is, we lost race. */
823 + if (!LIST_FIND(&ip_conntrack_hash[hash],
824 + conntrack_tuple_cmp,
825 + struct ip_conntrack_tuple_hash *,
826 + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
827 + && !LIST_FIND(&ip_conntrack_hash[repl_hash],
828 + conntrack_tuple_cmp,
829 + struct ip_conntrack_tuple_hash *,
830 + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
831 + list_prepend(&ip_conntrack_hash[hash],
832 + &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
833 + list_prepend(&ip_conntrack_hash[repl_hash],
834 + &ct->tuplehash[IP_CT_DIR_REPLY]);
835 + /* Timer relative to confirmation time, not original
836 + setting time, otherwise we'd get timer wrap in
837 + weird delay cases. */
838 + ct->timeout.expires += jiffies;
839 + add_timer(&ct->timeout);
840 + atomic_inc(&ct->ct_general.use);
841 + set_bit(IPS_CONFIRMED_BIT, &ct->status);
842 + WRITE_UNLOCK(&ip_conntrack_lock);
846 + WRITE_UNLOCK(&ip_conntrack_lock);
850 +/* Returns true if a connection corresponds to the tuple (required
853 +ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
854 + const struct ip_conntrack *ignored_conntrack)
856 + struct ip_conntrack_tuple_hash *h;
858 + READ_LOCK(&ip_conntrack_lock);
859 + h = __ip_conntrack_find(tuple, ignored_conntrack);
860 + READ_UNLOCK(&ip_conntrack_lock);
865 +/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
866 +struct ip_conntrack *
867 +icmp_error_track(struct sk_buff *skb,
868 + enum ip_conntrack_info *ctinfo,
869 + unsigned int hooknum)
871 + struct ip_conntrack_tuple innertuple, origtuple;
873 + struct icmphdr icmp;
876 + struct ip_conntrack_protocol *innerproto;
877 + struct ip_conntrack_tuple_hash *h;
880 + IP_NF_ASSERT(skb->nfct == NULL);
882 + /* Not enough header? */
883 + if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &inside, sizeof(inside))!=0)
886 + if (inside.icmp.type != ICMP_DEST_UNREACH
887 + && inside.icmp.type != ICMP_SOURCE_QUENCH
888 + && inside.icmp.type != ICMP_TIME_EXCEEDED
889 + && inside.icmp.type != ICMP_PARAMETERPROB
890 + && inside.icmp.type != ICMP_REDIRECT)
893 + /* Ignore ICMP's containing fragments (shouldn't happen) */
894 + if (inside.ip.frag_off & htons(IP_OFFSET)) {
895 + DEBUGP("icmp_error_track: fragment of proto %u\n",
896 + inside.ip.protocol);
900 + innerproto = ip_ct_find_proto(inside.ip.protocol);
901 + dataoff = skb->nh.iph->ihl*4 + sizeof(inside.icmp) + inside.ip.ihl*4;
902 + /* Are they talking about one of our connections? */
903 + if (!get_tuple(&inside.ip, skb, dataoff, &origtuple, innerproto)) {
904 + DEBUGP("icmp_error: ! get_tuple p=%u", inside.ip.protocol);
908 + /* Ordinarily, we'd expect the inverted tupleproto, but it's
909 + been preserved inside the ICMP. */
910 + if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
911 + DEBUGP("icmp_error_track: Can't invert tuple\n");
915 + *ctinfo = IP_CT_RELATED;
917 + h = ip_conntrack_find_get(&innertuple, NULL);
919 + /* Locally generated ICMPs will match inverted if they
920 + haven't been SNAT'ed yet */
921 + /* FIXME: NAT code has to handle half-done double NAT --RR */
922 + if (hooknum == NF_IP_LOCAL_OUT)
923 + h = ip_conntrack_find_get(&origtuple, NULL);
926 + DEBUGP("icmp_error_track: no match\n");
929 + /* Reverse direction from that found */
930 + if (DIRECTION(h) != IP_CT_DIR_REPLY)
931 + *ctinfo += IP_CT_IS_REPLY;
933 + if (DIRECTION(h) == IP_CT_DIR_REPLY)
934 + *ctinfo += IP_CT_IS_REPLY;
937 + /* Update skb to refer to this connection */
938 + skb->nfct = &h->ctrack->infos[*ctinfo];
942 +/* There's a small race here where we may free a just-assured
943 + connection. Too bad: we're in trouble anyway. */
944 +static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
946 + return !(test_bit(IPS_ASSURED_BIT, &i->ctrack->status));
949 +static int early_drop(struct list_head *chain)
951 + /* Traverse backwards: gives us oldest, which is roughly LRU */
952 + struct ip_conntrack_tuple_hash *h;
955 + READ_LOCK(&ip_conntrack_lock);
956 + h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
958 + atomic_inc(&h->ctrack->ct_general.use);
959 + READ_UNLOCK(&ip_conntrack_lock);
964 + if (del_timer(&h->ctrack->timeout)) {
965 + death_by_timeout((unsigned long)h->ctrack);
968 + ip_conntrack_put(h->ctrack);
972 +static inline int helper_cmp(const struct ip_conntrack_helper *i,
973 + const struct ip_conntrack_tuple *rtuple)
975 + return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
978 +struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
980 + return LIST_FIND(&helpers, helper_cmp,
981 + struct ip_conntrack_helper *,
985 +/* Allocate a new conntrack: we return -ENOMEM if classification
986 + failed due to stress. Otherwise it really is unclassifiable. */
987 +static struct ip_conntrack_tuple_hash *
988 +init_conntrack(const struct ip_conntrack_tuple *tuple,
989 + struct ip_conntrack_protocol *protocol,
990 + struct sk_buff *skb)
992 + struct ip_conntrack *conntrack;
993 + struct ip_conntrack_tuple repl_tuple;
995 + struct ip_conntrack_expect *expected;
997 + static unsigned int drop_next;
999 + if (!ip_conntrack_hash_rnd_initted) {
1000 + get_random_bytes(&ip_conntrack_hash_rnd, 4);
1001 + ip_conntrack_hash_rnd_initted = 1;
1004 + hash = hash_conntrack(tuple);
1006 + if (ip_conntrack_max &&
1007 + atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
1008 + /* Try dropping from random chain, or else from the
1009 + chain about to put into (in case they're trying to
1010 + bomb one hash chain). */
1011 + unsigned int next = (drop_next++)%ip_conntrack_htable_size;
1013 + if (!early_drop(&ip_conntrack_hash[next])
1014 + && !early_drop(&ip_conntrack_hash[hash])) {
1015 + if (net_ratelimit())
1016 + printk(KERN_WARNING
1017 + "ip_conntrack: table full, dropping"
1019 + return ERR_PTR(-ENOMEM);
1023 + if (!invert_tuple(&repl_tuple, tuple, protocol)) {
1024 + DEBUGP("Can't invert tuple.\n");
1028 + conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
1030 + DEBUGP("Can't allocate conntrack.\n");
1031 + return ERR_PTR(-ENOMEM);
1034 + memset(conntrack, 0, sizeof(*conntrack));
1035 + atomic_set(&conntrack->ct_general.use, 1);
1036 + conntrack->ct_general.destroy = destroy_conntrack;
1037 + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
1038 + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
1039 + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
1040 + conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
1041 + for (i=0; i < IP_CT_NUMBER; i++)
1042 + conntrack->infos[i].master = &conntrack->ct_general;
1044 + if (!protocol->new(conntrack, skb)) {
1045 + kmem_cache_free(ip_conntrack_cachep, conntrack);
1048 + /* Don't set timer yet: wait for confirmation */
1049 + init_timer(&conntrack->timeout);
1050 + conntrack->timeout.data = (unsigned long)conntrack;
1051 + conntrack->timeout.function = death_by_timeout;
1053 + INIT_LIST_HEAD(&conntrack->sibling_list);
1055 + WRITE_LOCK(&ip_conntrack_lock);
1056 + /* Need finding and deleting of expected ONLY if we win race */
1057 + READ_LOCK(&ip_conntrack_expect_tuple_lock);
1058 + expected = LIST_FIND(&ip_conntrack_expect_list, expect_cmp,
1059 + struct ip_conntrack_expect *, tuple);
1060 + READ_UNLOCK(&ip_conntrack_expect_tuple_lock);
1062 + /* If master is not in hash table yet (ie. packet hasn't left
1063 + this machine yet), how can other end know about expected?
1064 + Hence these are not the droids you are looking for (if
1065 + master ct never got confirmed, we'd hold a reference to it
1066 + and weird things would happen to future packets). */
1067 + if (expected && !is_confirmed(expected->expectant))
1070 + /* Look up the conntrack helper for master connections only */
1072 + conntrack->helper = ip_ct_find_helper(&repl_tuple);
1074 + /* If the expectation is dying, then this is a loser. */
1076 + && expected->expectant->helper->timeout
1077 + && ! del_timer(&expected->timeout))
1081 + DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
1082 + conntrack, expected);
1083 + /* Welcome, Mr. Bond. We've been expecting you... */
1084 + IP_NF_ASSERT(master_ct(conntrack));
1085 + __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
1086 + conntrack->master = expected;
1087 + expected->sibling = conntrack;
1088 + LIST_DELETE(&ip_conntrack_expect_list, expected);
1089 + expected->expectant->expecting--;
1090 + nf_conntrack_get(&master_ct(conntrack)->infos[0]);
1092 + atomic_inc(&ip_conntrack_count);
1093 + WRITE_UNLOCK(&ip_conntrack_lock);
1095 + if (expected && expected->expectfn)
1096 + expected->expectfn(conntrack);
1097 + return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
1100 +/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1101 +static inline struct ip_conntrack *
1102 +resolve_normal_ct(struct sk_buff *skb,
1103 + struct ip_conntrack_protocol *proto,
1105 + unsigned int hooknum,
1106 + enum ip_conntrack_info *ctinfo)
1108 + struct ip_conntrack_tuple tuple;
1109 + struct ip_conntrack_tuple_hash *h;
1111 + IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
1113 + if (!get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, &tuple, proto))
1116 + /* look for tuple match */
1117 + h = ip_conntrack_find_get(&tuple, NULL);
1119 + h = init_conntrack(&tuple, proto, skb);
1126 + /* It exists; we have (non-exclusive) reference. */
1127 + if (DIRECTION(h) == IP_CT_DIR_REPLY) {
1128 + *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1129 + /* Please set reply bit if this packet OK */
1132 + /* Once we've had two way comms, always ESTABLISHED. */
1133 + if (test_bit(IPS_SEEN_REPLY_BIT, &h->ctrack->status)) {
1134 + DEBUGP("ip_conntrack_in: normal packet for %p\n",
1136 + *ctinfo = IP_CT_ESTABLISHED;
1137 + } else if (test_bit(IPS_EXPECTED_BIT, &h->ctrack->status)) {
1138 + DEBUGP("ip_conntrack_in: related packet for %p\n",
1140 + *ctinfo = IP_CT_RELATED;
1142 + DEBUGP("ip_conntrack_in: new packet for %p\n",
1144 + *ctinfo = IP_CT_NEW;
1148 + skb->nfct = &h->ctrack->infos[*ctinfo];
1152 +/* Netfilter hook itself. */
1153 +unsigned int ip_conntrack_in(unsigned int hooknum,
1154 + struct sk_buff **pskb,
1155 + const struct net_device *in,
1156 + const struct net_device *out,
1157 + int (*okfn)(struct sk_buff *))
1159 + struct ip_conntrack *ct;
1160 + enum ip_conntrack_info ctinfo;
1161 + struct ip_conntrack_protocol *proto;
1165 + /* FIXME: Do this right please. --RR */
1166 + (*pskb)->nfcache |= NFC_UNKNOWN;
1168 +/* Doesn't cover locally-generated broadcast, so not worth it. */
1170 + /* Ignore broadcast: no `connection'. */
1171 + if ((*pskb)->pkt_type == PACKET_BROADCAST) {
1172 + printk("Broadcast packet!\n");
1174 + } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
1175 + == htonl(0x000000FF)) {
1176 + printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
1177 + NIPQUAD((*pskb)->nh.iph->saddr),
1178 + NIPQUAD((*pskb)->nh.iph->daddr),
1179 + (*pskb)->sk, (*pskb)->pkt_type);
1183 + /* Previously seen (loopback)? Ignore. Do this before
1184 + fragment check. */
1185 + if ((*pskb)->nfct)
1188 + /* Gather fragments. */
1189 + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
1190 + *pskb = ip_ct_gather_frags(*pskb);
1195 + proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
1197 + /* It may be an icmp error... */
1198 + if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
1199 + && icmp_error_track(*pskb, &ctinfo, hooknum))
1202 + if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo)))
1203 + /* Not valid part of a connection */
1207 + /* Too stressed to deal. */
1210 + IP_NF_ASSERT((*pskb)->nfct);
1212 + ret = proto->packet(ct, *pskb, ctinfo);
1215 + nf_conntrack_put((*pskb)->nfct);
1216 + (*pskb)->nfct = NULL;
1220 + if (ret != NF_DROP && ct->helper) {
1221 + ret = ct->helper->help(*pskb, ct, ctinfo);
1224 + nf_conntrack_put((*pskb)->nfct);
1225 + (*pskb)->nfct = NULL;
1230 + set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
1235 +int invert_tuplepr(struct ip_conntrack_tuple *inverse,
1236 + const struct ip_conntrack_tuple *orig)
1238 + return invert_tuple(inverse, orig, ip_ct_find_proto(orig->dst.protonum));
1241 +static inline int resent_expect(const struct ip_conntrack_expect *i,
1242 + const struct ip_conntrack_tuple *tuple,
1243 + const struct ip_conntrack_tuple *mask)
1245 + DEBUGP("resent_expect\n");
1246 + DEBUGP(" tuple: "); DUMP_TUPLE(&i->tuple);
1247 + DEBUGP("ct_tuple: "); DUMP_TUPLE(&i->ct_tuple);
1248 + DEBUGP("test tuple: "); DUMP_TUPLE(tuple);
1249 + return (((i->ct_tuple.dst.protonum == 0 && ip_ct_tuple_equal(&i->tuple, tuple))
1250 + || (i->ct_tuple.dst.protonum && ip_ct_tuple_equal(&i->ct_tuple, tuple)))
1251 + && ip_ct_tuple_equal(&i->mask, mask));
1254 +/* Would two expected things clash? */
1255 +static inline int expect_clash(const struct ip_conntrack_expect *i,
1256 + const struct ip_conntrack_tuple *tuple,
1257 + const struct ip_conntrack_tuple *mask)
1259 + /* Part covered by intersection of masks must be unequal,
1260 + otherwise they clash */
1261 + struct ip_conntrack_tuple intersect_mask
1262 + = { { i->mask.src.ip & mask->src.ip,
1263 + { i->mask.src.u.all & mask->src.u.all } },
1264 + { i->mask.dst.ip & mask->dst.ip,
1265 + { i->mask.dst.u.all & mask->dst.u.all },
1266 + i->mask.dst.protonum & mask->dst.protonum } };
1268 + return ip_ct_tuple_mask_cmp(&i->tuple, tuple, &intersect_mask);
1271 +inline void ip_conntrack_unexpect_related(struct ip_conntrack_expect *expect)
1273 + WRITE_LOCK(&ip_conntrack_lock);
1274 + unexpect_related(expect);
1275 + WRITE_UNLOCK(&ip_conntrack_lock);
1278 +static void expectation_timed_out(unsigned long ul_expect)
1280 + struct ip_conntrack_expect *expect = (void *) ul_expect;
1282 + DEBUGP("expectation %p timed out\n", expect);
1283 + WRITE_LOCK(&ip_conntrack_lock);
1284 + __unexpect_related(expect);
1285 + WRITE_UNLOCK(&ip_conntrack_lock);
1288 +/* Add a related connection. */
1289 +int ip_conntrack_expect_related(struct ip_conntrack *related_to,
1290 + struct ip_conntrack_expect *expect)
1292 + struct ip_conntrack_expect *old, *new;
1295 + WRITE_LOCK(&ip_conntrack_lock);
1296 + /* Because of the write lock, no reader can walk the lists,
1297 + * so there is no need to use the tuple lock too */
1299 + DEBUGP("ip_conntrack_expect_related %p\n", related_to);
1300 + DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1301 + DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1303 + old = LIST_FIND(&ip_conntrack_expect_list, resent_expect,
1304 + struct ip_conntrack_expect *, &expect->tuple,
1307 + /* Helper private data may contain offsets but no pointers
1308 + pointing into the payload - otherwise we should have to copy
1309 + the data filled out by the helper over the old one */
1310 + DEBUGP("expect_related: resent packet\n");
1311 + if (related_to->helper->timeout) {
1312 + if (!del_timer(&old->timeout)) {
1313 + /* expectation is dying. Fall through */
1316 + old->timeout.expires = jiffies +
1317 + related_to->helper->timeout * HZ;
1318 + add_timer(&old->timeout);
1323 + WRITE_UNLOCK(&ip_conntrack_lock);
1326 + } else if (related_to->helper->max_expected &&
1327 + related_to->expecting >= related_to->helper->max_expected) {
1328 + struct list_head *cur_item;
1330 + if (!(related_to->helper->flags &
1331 + IP_CT_HELPER_F_REUSE_EXPECT)) {
1332 + WRITE_UNLOCK(&ip_conntrack_lock);
1333 + if (net_ratelimit())
1334 + printk(KERN_WARNING
1335 + "ip_conntrack: max number of expected "
1336 + "connections %i of %s reached for "
1337 + "%u.%u.%u.%u->%u.%u.%u.%u\n",
1338 + related_to->helper->max_expected,
1339 + related_to->helper->name,
1340 + NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
1341 + NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
1344 + DEBUGP("ip_conntrack: max number of expected "
1345 + "connections %i of %s reached for "
1346 + "%u.%u.%u.%u->%u.%u.%u.%u, reusing\n",
1347 + related_to->helper->max_expected,
1348 + related_to->helper->name,
1349 + NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip),
1350 + NIPQUAD(related_to->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip));
 1352 + /* choose the oldest expectation to evict */
1353 + list_for_each(cur_item, &related_to->sibling_list) {
1354 + struct ip_conntrack_expect *cur;
1356 + cur = list_entry(cur_item,
1357 + struct ip_conntrack_expect,
1359 + if (cur->sibling == NULL) {
1365 + /* (!old) cannot happen, since related_to->expecting is the
1366 + * number of unconfirmed expects */
1367 + IP_NF_ASSERT(old);
1369 + /* newnat14 does not reuse the real allocated memory
1370 + * structures but rather unexpects the old and
1371 + * allocates a new. unexpect_related will decrement
1372 + * related_to->expecting.
1374 + unexpect_related(old);
1376 + } else if (LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1377 + struct ip_conntrack_expect *, &expect->tuple,
1379 + WRITE_UNLOCK(&ip_conntrack_lock);
1380 + DEBUGP("expect_related: busy!\n");
1384 + new = (struct ip_conntrack_expect *)
1385 + kmalloc(sizeof(struct ip_conntrack_expect), GFP_ATOMIC);
1387 + WRITE_UNLOCK(&ip_conntrack_lock);
1388 + DEBUGP("expect_relaed: OOM allocating expect\n");
1392 + DEBUGP("new expectation %p of conntrack %p\n", new, related_to);
1393 + memcpy(new, expect, sizeof(*expect));
1394 + new->expectant = related_to;
1395 + new->sibling = NULL;
1396 + atomic_set(&new->use, 1);
1398 + /* add to expected list for this connection */
1399 + list_add(&new->expected_list, &related_to->sibling_list);
1400 + /* add to global list of expectations */
1401 + list_prepend(&ip_conntrack_expect_list, &new->list);
1402 + /* add and start timer if required */
1403 + if (related_to->helper->timeout) {
1404 + init_timer(&new->timeout);
1405 + new->timeout.data = (unsigned long)new;
1406 + new->timeout.function = expectation_timed_out;
1407 + new->timeout.expires = jiffies +
1408 + related_to->helper->timeout * HZ;
1409 + add_timer(&new->timeout);
1411 + related_to->expecting++;
1413 + WRITE_UNLOCK(&ip_conntrack_lock);
1418 +/* Change tuple in an existing expectation */
1419 +int ip_conntrack_change_expect(struct ip_conntrack_expect *expect,
1420 + struct ip_conntrack_tuple *newtuple)
1424 + MUST_BE_READ_LOCKED(&ip_conntrack_lock);
1425 + WRITE_LOCK(&ip_conntrack_expect_tuple_lock);
1427 + DEBUGP("change_expect:\n");
1428 + DEBUGP("exp tuple: "); DUMP_TUPLE(&expect->tuple);
1429 + DEBUGP("exp mask: "); DUMP_TUPLE(&expect->mask);
1430 + DEBUGP("newtuple: "); DUMP_TUPLE(newtuple);
1431 + if (expect->ct_tuple.dst.protonum == 0) {
1432 + /* Never seen before */
1433 + DEBUGP("change expect: never seen before\n");
1434 + if (!ip_ct_tuple_equal(&expect->tuple, newtuple)
1435 + && LIST_FIND(&ip_conntrack_expect_list, expect_clash,
1436 + struct ip_conntrack_expect *, newtuple, &expect->mask)) {
1437 + /* Force NAT to find an unused tuple */
1440 + memcpy(&expect->ct_tuple, &expect->tuple, sizeof(expect->tuple));
1441 + memcpy(&expect->tuple, newtuple, sizeof(expect->tuple));
1445 + /* Resent packet */
1446 + DEBUGP("change expect: resent packet\n");
1447 + if (ip_ct_tuple_equal(&expect->tuple, newtuple)) {
1450 + /* Force NAT to choose again the same port */
1454 + WRITE_UNLOCK(&ip_conntrack_expect_tuple_lock);
1459 +/* Alter reply tuple (maybe alter helper). If it's already taken,
1460 + return 0 and don't do alteration. */
1461 +int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1462 + const struct ip_conntrack_tuple *newreply)
1464 + WRITE_LOCK(&ip_conntrack_lock);
1465 + if (__ip_conntrack_find(newreply, conntrack)) {
1466 + WRITE_UNLOCK(&ip_conntrack_lock);
1469 + /* Should be unconfirmed, so not in hash table yet */
1470 + IP_NF_ASSERT(!is_confirmed(conntrack));
1472 + DEBUGP("Altering reply tuple of %p to ", conntrack);
1473 + DUMP_TUPLE(newreply);
1475 + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1476 + if (!conntrack->master)
1477 + conntrack->helper = LIST_FIND(&helpers, helper_cmp,
1478 + struct ip_conntrack_helper *,
1480 + WRITE_UNLOCK(&ip_conntrack_lock);
1485 +int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1487 + WRITE_LOCK(&ip_conntrack_lock);
1488 + list_prepend(&helpers, me);
1489 + WRITE_UNLOCK(&ip_conntrack_lock);
1494 +static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1495 + const struct ip_conntrack_helper *me)
1497 + if (i->ctrack->helper == me) {
1498 + /* Get rid of any expected. */
1499 + remove_expectations(i->ctrack, 0);
1500 + /* And *then* set helper to NULL */
1501 + i->ctrack->helper = NULL;
1506 +void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1510 + /* Need write lock here, to delete helper. */
1511 + WRITE_LOCK(&ip_conntrack_lock);
1512 + LIST_DELETE(&helpers, me);
1514 + /* Get rid of expecteds, set helpers to NULL. */
1515 + for (i = 0; i < ip_conntrack_htable_size; i++)
1516 + LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1517 + struct ip_conntrack_tuple_hash *, me);
1518 + WRITE_UNLOCK(&ip_conntrack_lock);
1520 + /* Someone could be still looking at the helper in a bh. */
1521 + synchronize_net();
1524 +/* Refresh conntrack for this many jiffies. */
1525 +void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
1527 + IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1529 + WRITE_LOCK(&ip_conntrack_lock);
1530 + /* If not in hash table, timer will not be active yet */
1531 + if (!is_confirmed(ct))
1532 + ct->timeout.expires = extra_jiffies;
1534 + /* Need del_timer for race avoidance (may already be dying). */
1535 + if (del_timer(&ct->timeout)) {
1536 + ct->timeout.expires = jiffies + extra_jiffies;
1537 + add_timer(&ct->timeout);
1540 + WRITE_UNLOCK(&ip_conntrack_lock);
1543 +/* Returns new sk_buff, or NULL */
1545 +ip_ct_gather_frags(struct sk_buff *skb)
1547 + struct sock *sk = skb->sk;
1548 +#ifdef CONFIG_NETFILTER_DEBUG
1549 + unsigned int olddebug = skb->nf_debug;
1556 + local_bh_disable();
1557 + skb = ip_defrag(skb);
1558 + local_bh_enable();
1567 + skb_set_owner_w(skb, sk);
1571 + ip_send_check(skb->nh.iph);
1572 + skb->nfcache |= NFC_ALTERED;
1573 +#ifdef CONFIG_NETFILTER_DEBUG
1574 + /* Packet path as if nothing had happened. */
1575 + skb->nf_debug = olddebug;
1580 +/* Used by ipt_REJECT. */
1581 +static void ip_conntrack_attach(struct sk_buff *nskb, struct nf_ct_info *nfct)
1583 + struct ip_conntrack *ct;
1584 + enum ip_conntrack_info ctinfo;
1586 + ct = __ip_conntrack_get(nfct, &ctinfo);
1588 + /* This ICMP is in reverse direction to the packet which
1590 + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1591 + ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1593 + ctinfo = IP_CT_RELATED;
1595 + /* Attach new skbuff, and increment count */
1596 + nskb->nfct = &ct->infos[ctinfo];
1597 + atomic_inc(&ct->ct_general.use);
1601 +do_kill(const struct ip_conntrack_tuple_hash *i,
1602 + int (*kill)(const struct ip_conntrack *i, void *data),
1605 + return kill(i->ctrack, data);
1608 +/* Bring out ya dead! */
1609 +static struct ip_conntrack_tuple_hash *
1610 +get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
1613 + struct ip_conntrack_tuple_hash *h = NULL;
1616 + READ_LOCK(&ip_conntrack_lock);
1617 + for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
1618 + h = LIST_FIND(&ip_conntrack_hash[i], do_kill,
1619 + struct ip_conntrack_tuple_hash *, kill, data);
1622 + atomic_inc(&h->ctrack->ct_general.use);
1623 + READ_UNLOCK(&ip_conntrack_lock);
1629 +ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
1632 + struct ip_conntrack_tuple_hash *h;
1634 + /* This is order n^2, by the way. */
1635 + while ((h = get_next_corpse(kill, data)) != NULL) {
 1636 + /* Time to push up daisies... */
1637 + if (del_timer(&h->ctrack->timeout))
1638 + death_by_timeout((unsigned long)h->ctrack);
1639 + /* ... else the timer will get him soon. */
1641 + ip_conntrack_put(h->ctrack);
1645 +/* Fast function for those who don't want to parse /proc (and I don't
1647 +/* Reversing the socket's dst/src point of view gives us the reply
1650 +getorigdst(struct sock *sk, int optval, void *user, int *len)
1652 + struct inet_opt *inet = inet_sk(sk);
1653 + struct ip_conntrack_tuple_hash *h;
1654 + struct ip_conntrack_tuple tuple;
1656 + IP_CT_TUPLE_U_BLANK(&tuple);
1657 + tuple.src.ip = inet->rcv_saddr;
1658 + tuple.src.u.tcp.port = inet->sport;
1659 + tuple.dst.ip = inet->daddr;
1660 + tuple.dst.u.tcp.port = inet->dport;
1661 + tuple.dst.protonum = IPPROTO_TCP;
1663 + /* We only do TCP at the moment: is there a better way? */
1664 + if (strcmp(sk->sk_prot->name, "TCP")) {
1665 + DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1666 + return -ENOPROTOOPT;
1669 + if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1670 + DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1671 + *len, sizeof(struct sockaddr_in));
1675 + h = ip_conntrack_find_get(&tuple, NULL);
1677 + struct sockaddr_in sin;
1679 + sin.sin_family = AF_INET;
1680 + sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1681 + .tuple.dst.u.tcp.port;
1682 + sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
1685 + DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1686 + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1687 + ip_conntrack_put(h->ctrack);
1688 + if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1693 + DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1694 + NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1695 + NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1699 +static struct nf_sockopt_ops so_getorigdst = {
1701 + .get_optmin = SO_ORIGINAL_DST,
1702 + .get_optmax = SO_ORIGINAL_DST+1,
1703 + .get = &getorigdst,
1706 +static int kill_all(const struct ip_conntrack *i, void *data)
1711 +/* Mishearing the voices in his head, our hero wonders how he's
1712 + supposed to kill the mall. */
1713 +void ip_conntrack_cleanup(void)
1715 + ip_ct_attach = NULL;
1716 + /* This makes sure all current packets have passed through
1717 + netfilter framework. Roll on, two-stage module
1719 + synchronize_net();
1721 + i_see_dead_people:
1722 + ip_ct_selective_cleanup(kill_all, NULL);
1723 + if (atomic_read(&ip_conntrack_count) != 0) {
1725 + goto i_see_dead_people;
1728 + kmem_cache_destroy(ip_conntrack_cachep);
1729 + vfree(ip_conntrack_hash);
1730 + nf_unregister_sockopt(&so_getorigdst);
1733 +static int hashsize;
1734 +MODULE_PARM(hashsize, "i");
1736 +int __init ip_conntrack_init(void)
1741 + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1742 + * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
1744 + ip_conntrack_htable_size = hashsize;
1746 + ip_conntrack_htable_size
1747 + = (((num_physpages << PAGE_SHIFT) / 16384)
1748 + / sizeof(struct list_head));
1749 + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1750 + ip_conntrack_htable_size = 8192;
1751 + if (ip_conntrack_htable_size < 16)
1752 + ip_conntrack_htable_size = 16;
1754 + ip_conntrack_max = 8 * ip_conntrack_htable_size;
1756 + printk("ip_conntrack version %s (%u buckets, %d max)"
1757 + " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1758 + ip_conntrack_htable_size, ip_conntrack_max,
1759 + sizeof(struct ip_conntrack));
1761 + ret = nf_register_sockopt(&so_getorigdst);
1763 + printk(KERN_ERR "Unable to register netfilter socket option\n");
1767 + ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1768 + * ip_conntrack_htable_size);
1769 + if (!ip_conntrack_hash) {
1770 + printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1771 + goto err_unreg_sockopt;
1774 + ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1775 + sizeof(struct ip_conntrack), 0,
1776 + SLAB_HWCACHE_ALIGN, NULL, NULL);
1777 + if (!ip_conntrack_cachep) {
1778 + printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1779 + goto err_free_hash;
1781 + /* Don't NEED lock here, but good form anyway. */
1782 + WRITE_LOCK(&ip_conntrack_lock);
1783 + /* Sew in builtin protocols. */
1784 + list_append(&protocol_list, &ip_conntrack_protocol_tcp);
1785 + list_append(&protocol_list, &ip_conntrack_protocol_udp);
1786 + list_append(&protocol_list, &ip_conntrack_protocol_icmp);
1787 + WRITE_UNLOCK(&ip_conntrack_lock);
1789 + for (i = 0; i < ip_conntrack_htable_size; i++)
1790 + INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1792 + /* For use by ipt_REJECT */
1793 + ip_ct_attach = ip_conntrack_attach;
1797 + vfree(ip_conntrack_hash);
1799 + nf_unregister_sockopt(&so_getorigdst);
1803 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.rej linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.rej
1804 --- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_conntrack_core.c.rej 1970-01-01 01:00:00.000000000 +0100
1805 +++ linux-2.6.0-test11/net/ipv4/netfilter/ip_conntrack_core.c.rej 2003-12-17 14:02:02.000000000 +0100
1809 + tuple->dst.ip = iph->daddr;
1810 + tuple->dst.protonum = iph->protocol;
1812 + ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
1816 + tuple->dst.ip = iph->daddr;
1817 + tuple->dst.protonum = iph->protocol;
1819 ++ tuple->src.u.all = tuple->dst.u.all = 0;
1821 + ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
1824 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c
1825 --- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c 2003-11-26 21:43:07.000000000 +0100
1826 +++ linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c 2003-12-17 14:02:03.000000000 +0100
1828 *tuple = *orig_tuple;
1829 while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
1831 - DEBUGP("Found best for "); DUMP_TUPLE(tuple);
1832 + DEBUGP("Found best for "); DUMP_TUPLE_RAW(tuple);
1833 /* 3) The per-protocol part of the manip is made to
1834 map into the range to make a unique tuple. */
1837 HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
1839 DEBUGP("Original: ");
1840 - DUMP_TUPLE(&orig_tp);
1841 + DUMP_TUPLE_RAW(&orig_tp);
1843 - DUMP_TUPLE(&new_tuple);
1844 + DUMP_TUPLE_RAW(&new_tuple);
1847 /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
1848 diff -Nur linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c.orig linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c.orig
1849 --- linux-2.6.0-test11.org/net/ipv4/netfilter/ip_nat_core.c.orig 1970-01-01 01:00:00.000000000 +0100
1850 +++ linux-2.6.0-test11/net/ipv4/netfilter/ip_nat_core.c.orig 2003-11-26 21:43:07.000000000 +0100
1852 +/* NAT for netfilter; shared with compatibility layer. */
1854 +/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
1855 + Public Licence. */
1856 +#include <linux/module.h>
1857 +#include <linux/types.h>
1858 +#include <linux/timer.h>
1859 +#include <linux/skbuff.h>
1860 +#include <linux/netfilter_ipv4.h>
1861 +#include <linux/vmalloc.h>
1862 +#include <net/checksum.h>
1863 +#include <net/icmp.h>
1864 +#include <net/ip.h>
1865 +#include <net/tcp.h> /* For tcp_prot in getorigdst */
1866 +#include <linux/icmp.h>
1867 +#include <linux/udp.h>
1869 +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
1870 +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
1872 +#include <linux/netfilter_ipv4/ip_conntrack.h>
1873 +#include <linux/netfilter_ipv4/ip_conntrack_core.h>
1874 +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
1875 +#include <linux/netfilter_ipv4/ip_nat.h>
1876 +#include <linux/netfilter_ipv4/ip_nat_protocol.h>
1877 +#include <linux/netfilter_ipv4/ip_nat_core.h>
1878 +#include <linux/netfilter_ipv4/ip_nat_helper.h>
1879 +#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
1880 +#include <linux/netfilter_ipv4/listhelp.h>
1883 +#define DEBUGP printk
1885 +#define DEBUGP(format, args...)
1888 +DECLARE_RWLOCK(ip_nat_lock);
1889 +DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);
1891 +/* Calculated at init based on memory size */
1892 +static unsigned int ip_nat_htable_size;
1894 +static struct list_head *bysource;
1895 +static struct list_head *byipsproto;
1897 +LIST_HEAD(helpers);
1899 +extern struct ip_nat_protocol unknown_nat_protocol;
1901 +/* We keep extra hashes for each conntrack, for fast searching. */
1902 +static inline size_t
1903 +hash_by_ipsproto(u_int32_t src, u_int32_t dst, u_int16_t proto)
1905 + /* Modified src and dst, to ensure we don't create two
1906 + identical streams. */
1907 + return (src + dst + proto) % ip_nat_htable_size;
1910 +static inline size_t
1911 +hash_by_src(const struct ip_conntrack_manip *manip, u_int16_t proto)
1913 + /* Original src, to ensure we map it consistently if poss. */
1914 + return (manip->ip + manip->u.all + proto) % ip_nat_htable_size;
1917 +/* Noone using conntrack by the time this called. */
1918 +static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
1920 + struct ip_nat_info *info = &conn->nat.info;
1921 + unsigned int hs, hp;
1923 + if (!info->initialized)
1926 + IP_NF_ASSERT(info->bysource.conntrack);
1927 + IP_NF_ASSERT(info->byipsproto.conntrack);
1929 + hs = hash_by_src(&conn->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src,
1930 + conn->tuplehash[IP_CT_DIR_ORIGINAL]
1931 + .tuple.dst.protonum);
1933 + hp = hash_by_ipsproto(conn->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip,
1934 + conn->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip,
1935 + conn->tuplehash[IP_CT_DIR_REPLY]
1936 + .tuple.dst.protonum);
1938 + WRITE_LOCK(&ip_nat_lock);
1939 + LIST_DELETE(&bysource[hs], &info->bysource);
1940 + LIST_DELETE(&byipsproto[hp], &info->byipsproto);
1941 + WRITE_UNLOCK(&ip_nat_lock);
1944 +/* We do checksum mangling, so if they were wrong before they're still
1945 + * wrong. Also works for incomplete packets (eg. ICMP dest
1946 + * unreachables.) */
1948 +ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
1950 + u_int32_t diffs[] = { oldvalinv, newval };
1951 + return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
1952 + oldcheck^0xFFFF));
1955 +static inline int cmp_proto(const struct ip_nat_protocol *i, int proto)
1957 + return i->protonum == proto;
1960 +struct ip_nat_protocol *
1961 +find_nat_proto(u_int16_t protonum)
1963 + struct ip_nat_protocol *i;
1965 + MUST_BE_READ_LOCKED(&ip_nat_lock);
1966 + i = LIST_FIND(&protos, cmp_proto, struct ip_nat_protocol *, protonum);
1968 + i = &unknown_nat_protocol;
1972 +/* Is this tuple already taken? (not by us) */
1974 +ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
1975 + const struct ip_conntrack *ignored_conntrack)
1977 + /* Conntrack tracking doesn't keep track of outgoing tuples; only
1978 + incoming ones. NAT means they don't have a fixed mapping,
1979 + so we invert the tuple and look for the incoming reply.
1981 + We could keep a separate hash if this proves too slow. */
1982 + struct ip_conntrack_tuple reply;
1984 + invert_tuplepr(&reply, tuple);
1985 + return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
1988 +/* Does tuple + the source manip come within the range mr */
1990 +in_range(const struct ip_conntrack_tuple *tuple,
1991 + const struct ip_conntrack_manip *manip,
1992 + const struct ip_nat_multi_range *mr)
1994 + struct ip_nat_protocol *proto = find_nat_proto(tuple->dst.protonum);
1996 + struct ip_conntrack_tuple newtuple = { *manip, tuple->dst };
1998 + for (i = 0; i < mr->rangesize; i++) {
1999 + /* If we are allowed to map IPs, then we must be in the
2000 + range specified, otherwise we must be unchanged. */
2001 + if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
2002 + if (ntohl(newtuple.src.ip) < ntohl(mr->range[i].min_ip)
2003 + || (ntohl(newtuple.src.ip)
2004 + > ntohl(mr->range[i].max_ip)))
2007 + if (newtuple.src.ip != tuple->src.ip)
2011 + if (!(mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED)
2012 + || proto->in_range(&newtuple, IP_NAT_MANIP_SRC,
2013 + &mr->range[i].min, &mr->range[i].max))
2020 +src_cmp(const struct ip_nat_hash *i,
2021 + const struct ip_conntrack_tuple *tuple,
2022 + const struct ip_nat_multi_range *mr)
2024 + return (i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
2025 + == tuple->dst.protonum
2026 + && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
2028 + && i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
2029 + == tuple->src.u.all
2030 + && in_range(tuple,
2031 + &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2036 +/* Only called for SRC manip */
2037 +static struct ip_conntrack_manip *
2038 +find_appropriate_src(const struct ip_conntrack_tuple *tuple,
2039 + const struct ip_nat_multi_range *mr)
2041 + unsigned int h = hash_by_src(&tuple->src, tuple->dst.protonum);
2042 + struct ip_nat_hash *i;
2044 + MUST_BE_READ_LOCKED(&ip_nat_lock);
2045 + i = LIST_FIND(&bysource[h], src_cmp, struct ip_nat_hash *, tuple, mr);
2047 + return &i->conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src;
2052 +#ifdef CONFIG_IP_NF_NAT_LOCAL
2053 +/* If it's really a local destination manip, it may need to do a
2054 + source manip too. */
2056 +do_extra_mangle(u_int32_t var_ip, u_int32_t *other_ipp)
2058 + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = var_ip } } };
2059 + struct rtable *rt;
2061 + /* FIXME: IPTOS_TOS(iph->tos) --RR */
2062 + if (ip_route_output_key(&rt, &fl) != 0) {
2063 + DEBUGP("do_extra_mangle: Can't get route to %u.%u.%u.%u\n",
2068 + *other_ipp = rt->rt_src;
2074 +/* Simple way to iterate through all. */
2075 +static inline int fake_cmp(const struct ip_nat_hash *i,
2076 + u_int32_t src, u_int32_t dst, u_int16_t protonum,
2077 + unsigned int *score,
2078 + const struct ip_conntrack *conntrack)
2080 + /* Compare backwards: we're dealing with OUTGOING tuples, and
2081 + inside the conntrack is the REPLY tuple. Don't count this
2083 + if (i->conntrack != conntrack
2084 + && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip == dst
2085 + && i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip == src
2086 + && (i->conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum
2092 +static inline unsigned int
2093 +count_maps(u_int32_t src, u_int32_t dst, u_int16_t protonum,
2094 + const struct ip_conntrack *conntrack)
2096 + unsigned int score = 0;
2099 + MUST_BE_READ_LOCKED(&ip_nat_lock);
2100 + h = hash_by_ipsproto(src, dst, protonum);
2101 + LIST_FIND(&byipsproto[h], fake_cmp, struct ip_nat_hash *,
2102 + src, dst, protonum, &score, conntrack);
2107 +/* For [FUTURE] fragmentation handling, we want the least-used
2108 + src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
2109 + if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
2110 + 1-65535, we don't do pro-rata allocation based on ports; we choose
2111 + the ip with the lowest src-ip/dst-ip/proto usage.
2113 + If an allocation then fails (eg. all 6 ports used in the 1.2.3.4
2114 + range), we eliminate that and try again. This is not the most
2115 + efficient approach, but if you're worried about that, don't hand us
2116 + ranges you don't really have. */
2117 +static struct ip_nat_range *
2118 +find_best_ips_proto(struct ip_conntrack_tuple *tuple,
2119 + const struct ip_nat_multi_range *mr,
2120 + const struct ip_conntrack *conntrack,
2121 + unsigned int hooknum)
2125 + const struct ip_nat_range *range;
2126 + unsigned int score;
2127 + struct ip_conntrack_tuple tuple;
2128 + } best = { NULL, 0xFFFFFFFF };
2129 + u_int32_t *var_ipp, *other_ipp, saved_ip, orig_dstip;
2130 + static unsigned int randomness;
2132 + if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC) {
2133 + var_ipp = &tuple->src.ip;
2134 + saved_ip = tuple->dst.ip;
2135 + other_ipp = &tuple->dst.ip;
2137 + var_ipp = &tuple->dst.ip;
2138 + saved_ip = tuple->src.ip;
2139 + other_ipp = &tuple->src.ip;
2141 + /* Don't do do_extra_mangle unless necessary (overrides
2142 + explicit socket bindings, for example) */
2143 + orig_dstip = tuple->dst.ip;
2145 + IP_NF_ASSERT(mr->rangesize >= 1);
2146 + for (i = 0; i < mr->rangesize; i++) {
2148 + u_int32_t minip, maxip, j;
2150 + /* Don't do ranges which are already eliminated. */
2151 + if (mr->range[i].flags & IP_NAT_RANGE_FULL) {
2155 + if (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) {
2156 + minip = ntohl(mr->range[i].min_ip);
2157 + maxip = ntohl(mr->range[i].max_ip);
2159 + minip = maxip = ntohl(*var_ipp);
2162 + for (j = 0; j < maxip - minip + 1; j++) {
2163 + unsigned int score;
2165 + *var_ipp = htonl(minip + (randomness + j)
2166 + % (maxip - minip + 1));
2168 + /* Reset the other ip in case it was mangled by
2169 + * do_extra_mangle last time. */
2170 + *other_ipp = saved_ip;
2172 +#ifdef CONFIG_IP_NF_NAT_LOCAL
2173 + if (hooknum == NF_IP_LOCAL_OUT
2174 + && *var_ipp != orig_dstip
2175 + && !do_extra_mangle(*var_ipp, other_ipp)) {
2176 + DEBUGP("Range %u %u.%u.%u.%u rt failed!\n",
2177 + i, NIPQUAD(*var_ipp));
2178 + /* Can't route? This whole range part is
2179 + * probably screwed, but keep trying
2185 + /* Count how many others map onto this. */
2186 + score = count_maps(tuple->src.ip, tuple->dst.ip,
2187 + tuple->dst.protonum, conntrack);
2188 + if (score < best.score) {
2189 + /* Optimization: doesn't get any better than
2192 + return (struct ip_nat_range *)
2195 + best.score = score;
2196 + best.tuple = *tuple;
2197 + best.range = &mr->range[i];
2201 + *tuple = best.tuple;
2203 + /* Discard const. */
2204 + return (struct ip_nat_range *)best.range;
2207 +/* Fast version doesn't iterate through hash chains, but only handles
2208 + common case of single IP address (null NAT, masquerade) */
2209 +static struct ip_nat_range *
2210 +find_best_ips_proto_fast(struct ip_conntrack_tuple *tuple,
2211 + const struct ip_nat_multi_range *mr,
2212 + const struct ip_conntrack *conntrack,
2213 + unsigned int hooknum)
2215 + if (mr->rangesize != 1
2216 + || (mr->range[0].flags & IP_NAT_RANGE_FULL)
2217 + || ((mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
2218 + && mr->range[0].min_ip != mr->range[0].max_ip))
2219 + return find_best_ips_proto(tuple, mr, conntrack, hooknum);
2221 + if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
2222 + if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC)
2223 + tuple->src.ip = mr->range[0].min_ip;
2225 + /* Only do extra mangle when required (breaks
2226 + socket binding) */
2227 +#ifdef CONFIG_IP_NF_NAT_LOCAL
2228 + if (tuple->dst.ip != mr->range[0].min_ip
2229 + && hooknum == NF_IP_LOCAL_OUT
2230 + && !do_extra_mangle(mr->range[0].min_ip,
2234 + tuple->dst.ip = mr->range[0].min_ip;
2238 + /* Discard const. */
2239 + return (struct ip_nat_range *)&mr->range[0];
2243 +get_unique_tuple(struct ip_conntrack_tuple *tuple,
2244 + const struct ip_conntrack_tuple *orig_tuple,
2245 + const struct ip_nat_multi_range *mrr,
2246 + struct ip_conntrack *conntrack,
2247 + unsigned int hooknum)
2249 + struct ip_nat_protocol *proto
2250 + = find_nat_proto(orig_tuple->dst.protonum);
2251 + struct ip_nat_range *rptr;
2255 + /* We temporarily use flags for marking full parts, but we
2256 + always clean up afterwards */
2257 + struct ip_nat_multi_range *mr = (void *)mrr;
2259 + /* 1) If this srcip/proto/src-proto-part is currently mapped,
2260 + and that same mapping gives a unique tuple within the given
2263 + This is only required for source (ie. NAT/masq) mappings.
2264 + So far, we don't do local source mappings, so multiple
2265 + manips not an issue. */
2266 + if (hooknum == NF_IP_POST_ROUTING) {
2267 + struct ip_conntrack_manip *manip;
2269 + manip = find_appropriate_src(orig_tuple, mr);
2271 + /* Apply same source manipulation. */
2272 + *tuple = ((struct ip_conntrack_tuple)
2273 + { *manip, orig_tuple->dst });
2274 + DEBUGP("get_unique_tuple: Found current src map\n");
2275 + if (!ip_nat_used_tuple(tuple, conntrack))
2280 + /* 2) Select the least-used IP/proto combination in the given
2283 + *tuple = *orig_tuple;
2284 + while ((rptr = find_best_ips_proto_fast(tuple, mr, conntrack, hooknum))
2286 + DEBUGP("Found best for "); DUMP_TUPLE(tuple);
2287 + /* 3) The per-protocol part of the manip is made to
2288 + map into the range to make a unique tuple. */
2290 + /* Only bother mapping if it's not already in range
2292 + if ((!(rptr->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
2293 + || proto->in_range(tuple, HOOK2MANIP(hooknum),
2294 + &rptr->min, &rptr->max))
2295 + && !ip_nat_used_tuple(tuple, conntrack)) {
2299 + if (proto->unique_tuple(tuple, rptr,
2300 + HOOK2MANIP(hooknum),
2302 + /* Must be unique. */
2303 + IP_NF_ASSERT(!ip_nat_used_tuple(tuple,
2307 + } else if (HOOK2MANIP(hooknum) == IP_NAT_MANIP_DST) {
2308 + /* Try implicit source NAT; protocol
2309 + may be able to play with ports to
2310 + make it unique. */
2311 + struct ip_nat_range r
2312 + = { IP_NAT_RANGE_MAP_IPS,
2313 + tuple->src.ip, tuple->src.ip,
2315 + DEBUGP("Trying implicit mapping\n");
2316 + if (proto->unique_tuple(tuple, &r,
2319 + /* Must be unique. */
2320 + IP_NF_ASSERT(!ip_nat_used_tuple
2321 + (tuple, conntrack));
2326 + DEBUGP("Protocol can't get unique tuple %u.\n",
2330 + /* Eliminate that from range, and try again. */
2331 + rptr->flags |= IP_NAT_RANGE_FULL;
2332 + *tuple = *orig_tuple;
2338 + /* Clear full flags. */
2339 + IP_NF_ASSERT(mr->rangesize >= 1);
2340 + for (i = 0; i < mr->rangesize; i++)
2341 + mr->range[i].flags &= ~IP_NAT_RANGE_FULL;
2347 +helper_cmp(const struct ip_nat_helper *helper,
2348 + const struct ip_conntrack_tuple *tuple)
2350 + return ip_ct_tuple_mask_cmp(tuple, &helper->tuple, &helper->mask);
2353 +/* Where to manip the reply packets (will be reverse manip). */
2354 +static unsigned int opposite_hook[NF_IP_NUMHOOKS]
2355 += { [NF_IP_PRE_ROUTING] = NF_IP_POST_ROUTING,
2356 + [NF_IP_POST_ROUTING] = NF_IP_PRE_ROUTING,
2357 +#ifdef CONFIG_IP_NF_NAT_LOCAL
2358 + [NF_IP_LOCAL_OUT] = NF_IP_LOCAL_IN,
2359 + [NF_IP_LOCAL_IN] = NF_IP_LOCAL_OUT,
2364 +ip_nat_setup_info(struct ip_conntrack *conntrack,
2365 + const struct ip_nat_multi_range *mr,
2366 + unsigned int hooknum)
2368 + struct ip_conntrack_tuple new_tuple, inv_tuple, reply;
2369 + struct ip_conntrack_tuple orig_tp;
2370 + struct ip_nat_info *info = &conntrack->nat.info;
2371 + int in_hashes = info->initialized;
2373 + MUST_BE_WRITE_LOCKED(&ip_nat_lock);
2374 + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
2375 + || hooknum == NF_IP_POST_ROUTING
2376 + || hooknum == NF_IP_LOCAL_OUT);
2377 + IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
2378 + IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum))));
2380 + /* What we've got will look like inverse of reply. Normally
2381 + this is what is in the conntrack, except for prior
2382 + manipulations (future optimization: if num_manips == 0,
2384 + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
2385 + invert_tuplepr(&orig_tp,
2386 + &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
2392 + DEBUGP("Hook %u (%s), ", hooknum,
2393 + HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST");
2394 + DUMP_TUPLE(&orig_tp);
2395 + DEBUGP("Range %p: ", mr);
2396 + for (i = 0; i < mr->rangesize; i++) {
2397 + DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n",
2399 + (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS)
2400 + ? " MAP_IPS" : "",
2401 + (mr->range[i].flags
2402 + & IP_NAT_RANGE_PROTO_SPECIFIED)
2403 + ? " PROTO_SPECIFIED" : "",
2404 + (mr->range[i].flags & IP_NAT_RANGE_FULL)
2406 + NIPQUAD(mr->range[i].min_ip),
2407 + NIPQUAD(mr->range[i].max_ip),
2408 + mr->range[i].min.all,
2409 + mr->range[i].max.all);
2415 + if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack,
2417 + DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n",
2423 + DEBUGP("Hook %u (%s) %p\n", hooknum,
2424 + HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST",
2426 + DEBUGP("Original: ");
2427 + DUMP_TUPLE(&orig_tp);
2429 + DUMP_TUPLE(&new_tuple);
2432 + /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT):
2433 + the original (A/B/C/D') and the mangled one (E/F/G/H').
2435 + We're only allowed to work with the SRC per-proto
2436 + part, so we create inverses of both to start, then
2437 + derive the other fields we need. */
2439 + /* Reply connection: simply invert the new tuple
2441 + invert_tuplepr(&reply, &new_tuple);
2443 + /* Alter conntrack table so it recognizes replies.
2444 + If fail this race (reply tuple now used), repeat. */
2445 + } while (!ip_conntrack_alter_reply(conntrack, &reply));
2447 +	/* FIXME: We can simply use existing conntrack reply tuple
2449 + /* Create inverse of original: C/D/A/B' */
2450 + invert_tuplepr(&inv_tuple, &orig_tp);
2452 + /* Has source changed?. */
2453 + if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) {
2454 + /* In this direction, a source manip. */
2455 + info->manips[info->num_manips++] =
2456 + ((struct ip_nat_info_manip)
2457 + { IP_CT_DIR_ORIGINAL, hooknum,
2458 + IP_NAT_MANIP_SRC, new_tuple.src });
2460 + IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
2462 + /* In the reverse direction, a destination manip. */
2463 + info->manips[info->num_manips++] =
2464 + ((struct ip_nat_info_manip)
2465 + { IP_CT_DIR_REPLY, opposite_hook[hooknum],
2466 + IP_NAT_MANIP_DST, orig_tp.src });
2467 + IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
2470 + /* Has destination changed? */
2471 + if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) {
2472 + /* In this direction, a destination manip */
2473 + info->manips[info->num_manips++] =
2474 + ((struct ip_nat_info_manip)
2475 + { IP_CT_DIR_ORIGINAL, hooknum,
2476 + IP_NAT_MANIP_DST, reply.src });
2478 + IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS);
2480 + /* In the reverse direction, a source manip. */
2481 + info->manips[info->num_manips++] =
2482 + ((struct ip_nat_info_manip)
2483 + { IP_CT_DIR_REPLY, opposite_hook[hooknum],
2484 + IP_NAT_MANIP_SRC, inv_tuple.src });
2485 + IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS);
2488 + /* If there's a helper, assign it; based on new tuple. */
2489 + if (!conntrack->master)
2490 + info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *,
2494 + info->initialized |= (1 << HOOK2MANIP(hooknum));
2497 + IP_NF_ASSERT(info->bysource.conntrack);
2498 + replace_in_hashes(conntrack, info);
2500 + place_in_hashes(conntrack, info);
2506 +void replace_in_hashes(struct ip_conntrack *conntrack,
2507 + struct ip_nat_info *info)
2509 + /* Source has changed, so replace in hashes. */
2510 + unsigned int srchash
2511 + = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2513 + conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2514 + .tuple.dst.protonum);
2515 +	/* We place packet as seen OUTGOING in byips_proto hash
2516 +	   (ie. reverse dst and src of reply packet). */
2517 + unsigned int ipsprotohash
2518 + = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
2520 + conntrack->tuplehash[IP_CT_DIR_REPLY]
2522 + conntrack->tuplehash[IP_CT_DIR_REPLY]
2523 + .tuple.dst.protonum);
2525 + IP_NF_ASSERT(info->bysource.conntrack == conntrack);
2526 + MUST_BE_WRITE_LOCKED(&ip_nat_lock);
2528 + list_del(&info->bysource.list);
2529 + list_del(&info->byipsproto.list);
2531 + list_prepend(&bysource[srchash], &info->bysource);
2532 + list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
2535 +void place_in_hashes(struct ip_conntrack *conntrack,
2536 + struct ip_nat_info *info)
2538 + unsigned int srchash
2539 + = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2541 + conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
2542 + .tuple.dst.protonum);
2543 +	/* We place packet as seen OUTGOING in byips_proto hash
2544 +	   (ie. reverse dst and src of reply packet). */
2545 + unsigned int ipsprotohash
2546 + = hash_by_ipsproto(conntrack->tuplehash[IP_CT_DIR_REPLY]
2548 + conntrack->tuplehash[IP_CT_DIR_REPLY]
2550 + conntrack->tuplehash[IP_CT_DIR_REPLY]
2551 + .tuple.dst.protonum);
2553 + IP_NF_ASSERT(!info->bysource.conntrack);
2555 + MUST_BE_WRITE_LOCKED(&ip_nat_lock);
2556 + info->byipsproto.conntrack = conntrack;
2557 + info->bysource.conntrack = conntrack;
2559 + list_prepend(&bysource[srchash], &info->bysource);
2560 + list_prepend(&byipsproto[ipsprotohash], &info->byipsproto);
2563 +/* Returns true if succeeded. */
2565 +manip_pkt(u_int16_t proto,
2566 + struct sk_buff **pskb,
2567 + unsigned int iphdroff,
2568 + const struct ip_conntrack_manip *manip,
2569 + enum ip_nat_manip_type maniptype)
2571 + struct iphdr *iph;
2573 + (*pskb)->nfcache |= NFC_ALTERED;
2574 + if (!skb_ip_make_writable(pskb, iphdroff+sizeof(iph)))
2577 + iph = (void *)(*pskb)->data + iphdroff;
2579 +	/* Manipulate protocol part. */
2580 + if (!find_nat_proto(proto)->manip_pkt(pskb,
2581 + iphdroff + iph->ihl*4,
2582 + manip, maniptype))
2585 + iph = (void *)(*pskb)->data + iphdroff;
2587 + if (maniptype == IP_NAT_MANIP_SRC) {
2588 + iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip,
2590 + iph->saddr = manip->ip;
2592 + iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip,
2594 + iph->daddr = manip->ip;
2599 +static inline int exp_for_packet(struct ip_conntrack_expect *exp,
2600 + struct sk_buff *skb)
2602 + struct ip_conntrack_protocol *proto;
2605 + MUST_BE_READ_LOCKED(&ip_conntrack_lock);
2606 + proto = __ip_ct_find_proto(skb->nh.iph->protocol);
2607 + if (proto->exp_matches_pkt)
2608 + ret = proto->exp_matches_pkt(exp, skb);
2613 +/* Do packet manipulations according to binding. */
2615 +do_bindings(struct ip_conntrack *ct,
2616 + enum ip_conntrack_info ctinfo,
2617 + struct ip_nat_info *info,
2618 + unsigned int hooknum,
2619 + struct sk_buff **pskb)
2622 + struct ip_nat_helper *helper;
2623 + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
2624 + int proto = (*pskb)->nh.iph->protocol;
2626 + /* Need nat lock to protect against modification, but neither
2627 + conntrack (referenced) and helper (deleted with
2628 + synchronize_bh()) can vanish. */
2629 + READ_LOCK(&ip_nat_lock);
2630 + for (i = 0; i < info->num_manips; i++) {
2631 + if (info->manips[i].direction == dir
2632 + && info->manips[i].hooknum == hooknum) {
2633 + DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n",
2635 + info->manips[i].maniptype == IP_NAT_MANIP_SRC
2637 + NIPQUAD(info->manips[i].manip.ip),
2638 + htons(info->manips[i].manip.u.all));
2639 + if (!manip_pkt(proto, pskb, 0,
2640 + &info->manips[i].manip,
2641 + info->manips[i].maniptype)) {
2642 + READ_UNLOCK(&ip_nat_lock);
2647 + helper = info->helper;
2648 + READ_UNLOCK(&ip_nat_lock);
2651 + struct ip_conntrack_expect *exp = NULL;
2652 + struct list_head *cur_item;
2653 + int ret = NF_ACCEPT;
2654 + int helper_called = 0;
2656 + DEBUGP("do_bindings: helper existing for (%p)\n", ct);
2658 + /* Always defragged for helpers */
2659 + IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
2660 + & htons(IP_MF|IP_OFFSET)));
2662 + /* Have to grab read lock before sibling_list traversal */
2663 + READ_LOCK(&ip_conntrack_lock);
2664 + list_for_each(cur_item, &ct->sibling_list) {
2665 + exp = list_entry(cur_item, struct ip_conntrack_expect,
2668 + /* if this expectation is already established, skip */
2672 + if (exp_for_packet(exp, *pskb)) {
2673 + /* FIXME: May be true multiple times in the
2674 + * case of UDP!! */
2675 + DEBUGP("calling nat helper (exp=%p) for packet\n", exp);
2676 + ret = helper->help(ct, exp, info, ctinfo,
2678 + if (ret != NF_ACCEPT) {
2679 + READ_UNLOCK(&ip_conntrack_lock);
2682 + helper_called = 1;
2685 + /* Helper might want to manip the packet even when there is no
2686 + * matching expectation for this packet */
2687 + if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) {
2688 + DEBUGP("calling nat helper for packet without expectation\n");
2689 + ret = helper->help(ct, NULL, info, ctinfo,
2691 + if (ret != NF_ACCEPT) {
2692 + READ_UNLOCK(&ip_conntrack_lock);
2696 + READ_UNLOCK(&ip_conntrack_lock);
2698 + /* Adjust sequence number only once per packet
2699 + * (helper is called at all hooks) */
2700 + if (proto == IPPROTO_TCP
2701 + && (hooknum == NF_IP_POST_ROUTING
2702 + || hooknum == NF_IP_LOCAL_IN)) {
2703 + DEBUGP("ip_nat_core: adjusting sequence number\n");
2704 + /* future: put this in a l4-proto specific function,
2705 + * and call this function here. */
2706 + if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
2719 +icmp_reply_translation(struct sk_buff **pskb,
2720 + struct ip_conntrack *conntrack,
2721 + unsigned int hooknum,
2725 + struct icmphdr icmp;
2729 + struct ip_nat_info *info = &conntrack->nat.info;
2732 + if (!skb_ip_make_writable(pskb,(*pskb)->nh.iph->ihl*4+sizeof(*inside)))
2734 + inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
2736 + /* We're actually going to mangle it beyond trivial checksum
2737 + adjustment, so make sure the current checksum is correct. */
2738 + if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
2739 + hdrlen = (*pskb)->nh.iph->ihl * 4;
2740 + if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
2741 + (*pskb)->len - hdrlen, 0)))
2745 + /* Must be RELATED */
2746 + IP_NF_ASSERT((*pskb)->nfct
2747 + - (struct ip_conntrack *)(*pskb)->nfct->master
2750 + - (struct ip_conntrack *)(*pskb)->nfct->master
2751 + == IP_CT_RELATED+IP_CT_IS_REPLY);
2753 + /* Redirects on non-null nats must be dropped, else they'll
2754 + start talking to each other without our translation, and be
2755 + confused... --RR */
2756 + if (inside->icmp.type == ICMP_REDIRECT) {
2757 + /* Don't care about races here. */
2758 + if (info->initialized
2759 + != ((1 << IP_NAT_MANIP_SRC) | (1 << IP_NAT_MANIP_DST))
2760 + || info->num_manips != 0)
2764 + DEBUGP("icmp_reply_translation: translating error %p hook %u dir %s\n",
2765 + *pskb, hooknum, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
2766 + /* Note: May not be from a NAT'd host, but probably safest to
2767 + do translation always as if it came from the host itself
2768 + (even though a "host unreachable" coming from the host
2769 + itself is a bit weird).
2771 + More explanation: some people use NAT for anonymizing.
2772 + Also, CERT recommends dropping all packets from private IP
2773 + addresses (although ICMP errors from internal links with
2774 + such addresses are not too uncommon, as Alan Cox points
2777 + READ_LOCK(&ip_nat_lock);
2778 + for (i = 0; i < info->num_manips; i++) {
2779 + DEBUGP("icmp_reply: manip %u dir %s hook %u\n",
2780 + i, info->manips[i].direction == IP_CT_DIR_ORIGINAL ?
2781 + "ORIG" : "REPLY", info->manips[i].hooknum);
2783 + if (info->manips[i].direction != dir)
2786 + /* Mapping the inner packet is just like a normal
2787 + packet, except it was never src/dst reversed, so
2788 + where we would normally apply a dst manip, we apply
2789 + a src, and vice versa. */
2790 + if (info->manips[i].hooknum == hooknum) {
2791 + DEBUGP("icmp_reply: inner %s -> %u.%u.%u.%u %u\n",
2792 + info->manips[i].maniptype == IP_NAT_MANIP_SRC
2794 + NIPQUAD(info->manips[i].manip.ip),
2795 + ntohs(info->manips[i].manip.u.udp.port));
2796 + if (!manip_pkt(inside->ip.protocol, pskb,
2797 + (*pskb)->nh.iph->ihl*4
2798 + + sizeof(inside->icmp),
2799 + &info->manips[i].manip,
2800 + !info->manips[i].maniptype))
2803 + /* Outer packet needs to have IP header NATed like
2806 + /* Use mapping to map outer packet: 0 give no
2807 + per-proto mapping */
2808 + DEBUGP("icmp_reply: outer %s -> %u.%u.%u.%u\n",
2809 + info->manips[i].maniptype == IP_NAT_MANIP_SRC
2811 + NIPQUAD(info->manips[i].manip.ip));
2812 + if (!manip_pkt(0, pskb, 0,
2813 + &info->manips[i].manip,
2814 + info->manips[i].maniptype))
2818 + READ_UNLOCK(&ip_nat_lock);
2820 + hdrlen = (*pskb)->nh.iph->ihl * 4;
2822 + inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
2824 + inside->icmp.checksum = 0;
2825 + inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
2826 + (*pskb)->len - hdrlen,
2831 + READ_UNLOCK(&ip_nat_lock);
2835 +int __init ip_nat_init(void)
2839 + /* Leave them the same for the moment. */
2840 + ip_nat_htable_size = ip_conntrack_htable_size;
2842 + /* One vmalloc for both hash tables */
2843 + bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size*2);
2847 + byipsproto = bysource + ip_nat_htable_size;
2849 + /* Sew in builtin protocols. */
2850 + WRITE_LOCK(&ip_nat_lock);
2851 + list_append(&protos, &ip_nat_protocol_tcp);
2852 + list_append(&protos, &ip_nat_protocol_udp);
2853 + list_append(&protos, &ip_nat_protocol_icmp);
2854 + WRITE_UNLOCK(&ip_nat_lock);
2856 + for (i = 0; i < ip_nat_htable_size; i++) {
2857 + INIT_LIST_HEAD(&bysource[i]);
2858 + INIT_LIST_HEAD(&byipsproto[i]);
2861 + /* FIXME: Man, this is a hack. <SIGH> */
2862 + IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
2863 + ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
2868 +/* Clear NAT section of all conntracks, in case we're loaded again. */
2869 +static int clean_nat(const struct ip_conntrack *i, void *data)
2871 + memset((void *)&i->nat, 0, sizeof(i->nat));
2875 +/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
2876 +void ip_nat_cleanup(void)
2878 + ip_ct_selective_cleanup(&clean_nat, NULL);
2879 + ip_conntrack_destroyed = NULL;